In [2]:
# Exercise 5.2 - run this cell
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# 1) Load the height/weight dataset into a DataFrame.
# The file name is resolved against the notebook's working directory;
# change it if your local copy of the CSV is called something else.
wpath = 'weight-height (3).csv'
dfw = pd.read_csv(wpath)

# Sanity check: report (rows, columns), then show the leading rows.
print(f"Shape: {dfw.shape}")
display(dfw.head())

# 2) Pick the target y (weight) and the single feature X (height).
print("Columns:", dfw.columns.tolist())

# Case-insensitive substring match on the header text, so the exact spelling
# of the column names does not have to be hard-coded. str() guards against
# non-string labels (e.g. integer headers), which have no .lower().
weight_matches = [c for c in dfw.columns if 'weight' in str(c).lower()]
height_matches = [c for c in dfw.columns if 'height' in str(c).lower()]

if not weight_matches or not height_matches:
    print("Couldn't detect height/weight columns automatically.")
    # Raise a regular exception: SystemExit is meant for terminating the
    # interpreter, whereas a ValueError yields a normal traceback in a
    # notebook and stops the cell with a clear cause.
    raise ValueError("Please rename columns or set manually.")

# If several columns match, the first one in column order is used.
weight_col = weight_matches[0]
height_col = height_matches[0]
print("Using weight_col =", weight_col, "height_col =", height_col)

y = dfw[weight_col]       # Target variable (single column -> Series)
X = dfw[[height_col]]     # Feature matrix (list selector keeps it a 2-D DataFrame)

# 3) Hold out 20% of the rows for testing; the other 80% are used for training.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(f"Train/test shapes: {X_train.shape} {X_test.shape}")

# 4) Scale the feature two ways: min-max normalization and z-score standardization.
# Each scaler is fitted on the training rows only (fit_transform) and then
# applied as-is to the test rows (transform), so the test data never
# influences the scaling parameters.

# Min-max normalization
mm = MinMaxScaler()
X_train_mm = mm.fit_transform(X_train)
X_test_mm = mm.transform(X_test)

# Standardization
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

# Show the first five scaled training values from each scaler.
print("Normalized sample (first 5):\n", X_train_mm[:5])
print("Standardized sample (first 5):\n", X_train_ss[:5])

# 5) KNN regression with k=5: unscaled baseline, then the two scaled variants.

# Baseline: fit and evaluate on the raw (unscaled) feature.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

r2 = r2_score(y_test, y_pred)
print("KNN (k=5) on unscaled data -> R^2 score:", r2)

# Repeat the same model on the normalized and standardized features from
# step 4, so the "unscaled" score above has something to be compared against.
# NOTE: with a single feature both scalers are monotonic affine maps, so the
# nearest-neighbour ordering is preserved and the scores should match the
# baseline (up to floating-point ties). Scaling matters for KNN once several
# features with different ranges are combined.
scaled_sets = {
    "normalized": (X_train_mm, X_test_mm),
    "standardized": (X_train_ss, X_test_ss),
}
r2_scaled = {}
for label, (X_tr_scaled, X_te_scaled) in scaled_sets.items():
    knn_scaled = KNeighborsRegressor(n_neighbors=5).fit(X_tr_scaled, y_train)
    r2_scaled[label] = r2_score(y_test, knn_scaled.predict(X_te_scaled))
    print(f"KNN (k=5) on {label} data -> R^2 score:", r2_scaled[label])

# Comparison table: test-set feature values next to the true weights and the
# predictions of the k=5 KNN model.
# .values drops y_test's index so the numbers are attached positionally.
comp = X_test.assign(
    actual_weight=y_test.values,
    pred_weight_knn_k5=y_pred,
)
display(comp.head(20))


Shape: (10000, 3)


Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


Columns: ['Gender', 'Height', 'Weight']
Using weight_col = Weight height_col = Height
Train/test shapes: (8000, 1) (2000, 1)
Normalized sample (first 5):
 [[0.28818819]
 [0.82611687]
 [0.57587754]
 [0.46630679]
 [0.52146168]]
Standardized sample (first 5):
 [[-1.29189725]
 [ 2.16821664]
 [ 0.5586044 ]
 [-0.14618656]
 [ 0.20858579]]
KNN (k=5) on unscaled data -> R^2 score: 0.8346485438169171


Unnamed: 0,Height,actual_weight,pred_weight_knn_k5
6252,68.687232,173.115813,177.369952
4684,68.829334,195.162042,179.351321
1731,66.398128,170.217451,166.130596
4742,71.92934,215.04966,207.234087
4521,67.042903,182.721452,158.388172
6340,65.310436,136.434393,153.248702
576,66.045444,169.326284,152.662487
5202,66.759071,150.88133,165.536139
6363,62.598648,114.766882,136.308846
439,63.596455,144.591922,135.724496
