<a href="https://colab.research.google.com/github/olcaykursun/ML/blob/main/Fall2025/Week7/bmi_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import pandas as pd

# number of samples
N = 50

# generate synthetic heights (in meters, approx. 1.4–2.0 m)
height = np.random.rand(N) * 0.6 + 1.4

# generate synthetic weights (in kg, uniformly between 50–150 kg)
weight = np.random.uniform(50, 150, N)

# calculate Body Mass Index (BMI = weight / height^2)
bmi = weight / (height ** 2)

# add Gaussian noise to height (mean 0, std 0.05)
noisy_height = height + np.random.normal(0, 0.05, N)

# add Gaussian noise to weight (mean 0, std 0.05)
noisy_weight = weight + np.random.normal(0, 0.05, N)

# alternative way of adding random noise to weight (using randn * 0.05)
noise_weight = weight + np.random.randn(N) * 0.05

# create a DataFrame with noisy features and original BMI values
df = pd.DataFrame({'height': noisy_height,
                   'weight': noisy_weight,
                   'bmi': bmi})

# preview first few rows
df.head()


Unnamed: 0,height,weight,bmi
0,1.748003,148.782761,48.813732
1,1.900137,111.901492,29.105301
2,2.000351,67.897561,17.643967
3,1.808556,97.769318,32.136432
4,1.647647,132.412321,45.330085


In [8]:
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# initialize Leave-One-Out cross-validator
loo = LeaveOneOut()

# initialize linear regression model
model = LinearRegression()

# feature matrix (height and weight) and target vector (bmi)
X = df[['height', 'weight']].values
y = df['bmi'].values

# alternative equivalent formulations for creating X and y:
# X = df[['height', 'weight']].to_numpy()
# y = df['bmi'].to_numpy()
# or
# X = df.drop(columns=['bmi']).values
# y = df['bmi'].values

# lists to collect predictions and true values
y_preds = []
y_tests = []

# perform Leave-One-Out CV
for train_index, test_index in loo.split(X):
    # split into training and test sets (one test sample each time)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit the model on training data
    model.fit(X_train, y_train)

    # predict for the held-out test sample
    y_pred = model.predict(X_test)

    # store true and predicted values
    y_tests.append(y_test)
    y_preds.append(y_pred[0])

# evaluate model performance across all LOO splits
r2_score(y_tests, y_preds)


0.9204604740598912

In [9]:
df.describe()

Unnamed: 0,height,weight,bmi
count,50.0,50.0,50.0
mean,1.764579,104.747955,34.796131
std,0.174881,28.534927,11.729039
min,1.391719,51.087242,17.643967
25%,1.647775,84.468366,26.138205
50%,1.813414,101.515727,32.078453
75%,1.893607,132.27198,41.905538
max,2.053886,148.782761,68.537664


In [10]:
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# initialize Leave-One-Out cross-validator
loo = LeaveOneOut()

# define KNN regression model with k = 3 neighbors
model = KNeighborsRegressor(n_neighbors=3)

# feature matrix (height and weight) and target (bmi)
X = df[['height', 'weight']].values
y = df['bmi'].values

# scale features so both height and weight are in comparable ranges
# (important for distance-based models like KNN, otherwise weight dominates height)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# store predictions and ground truth
y_preds = []
y_tests = []

# perform Leave-One-Out CV
for train_index, test_index in loo.split(X):
    # leave one sample out for testing, rest for training
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit the model on training data
    model.fit(X_train, y_train)

    # predict the BMI for the left-out sample
    y_pred = model.predict(X_test)

    # save results for later evaluation
    y_tests.append(y_test)
    y_preds.append(y_pred[0])

# compute overall R² score across all LOO folds
r2_score(y_tests, y_preds)


0.9172554360721551

In [11]:
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# initialize Leave-One-Out cross-validator
loo = LeaveOneOut()

# define KNN classifier with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

# features: height and weight, target: obese vs. non-obese
X = df[['height', 'weight']].values

# binary labels: True if BMI >= 30 (obese), False otherwise (non-obese)
y = df['bmi'].values >= 30
print(y)   # just to inspect the class distribution (imbalanced datasets can matter)

# scale features so that height and weight contribute comparably to distance
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# collect predictions and true labels across all LOO folds
y_preds = []
y_tests = []

for train_index, test_index in loo.split(X):
    # split into training and test sets (1 test sample each loop)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit the KNN model on the training data
    model.fit(X_train, y_train)

    # predict the left-out sample
    y_pred = model.predict(X_test)

    # save ground truth and prediction
    y_tests.append(y_test)
    y_preds.append(y_pred[0])

# overall classification metrics
print("Accuracy:", accuracy_score(y_tests, y_preds))          # proportion of correct predictions
print("Confusion matrix:\n", confusion_matrix(y_tests, y_preds))  # counts of TP, FP, FN, TN

print("Precision:", precision_score(y_tests, y_preds))        # of predicted positives, how many are correct
#Precision = of all samples predicted obese, what fraction were actually obese.

print("Recall:", recall_score(y_tests, y_preds))              # of actual positives, how many were detected
#Recall = of all truly obese samples, what fraction did the classifier correctly detect.

[ True False False  True  True False  True False  True  True  True False
 False  True  True False  True  True False  True  True False  True False
 False  True  True False  True False  True  True  True False False  True
 False  True  True False False  True  True False  True  True  True  True
  True  True]
Accuracy: 0.9
Confusion matrix:
 [[16  3]
 [ 2 29]]
Precision: 0.90625
Recall: 0.9354838709677419


In [12]:
# Precision (for non-obese, BMI < 30):
# Of all samples predicted as non-obese, how many were truly non-obese?
precision_nonobese = precision_score(y_tests, y_preds, pos_label=0)
# by default, 'positive' means BMI >= 30 (label=1),
# but with pos_label=0 we treat BMI < 30 (non-obese) as the positive class
print("Precision (non-obese):", precision_nonobese)

# Recall (for non-obese, BMI < 30):
# Of all truly non-obese samples, how many did the classifier correctly identify as non-obese?
recall_nonobese = recall_score(y_tests, y_preds, pos_label=0)
print("Recall (non-obese):", recall_nonobese)


Precision (non-obese): 0.8888888888888888
Recall (non-obese): 0.8421052631578947


In [None]:
# # Exercise (for practice, not part of the midterm):
# 1. Try training a simple neural network to predict BMI categories
#    (obese vs. non-obese) using 10-fold cross-validation.
#
# 2. Why might Leave-One-Out (LOO) cross-validation not be a good choice
#    for neural networks? (Think about training cost and model stability.)
#
# 3. Do we need a separate validation set when we are already using cross-validation?
#    How would you justify your answer?
