# Step 1: Read the data

In [12]:
import pandas as pd 

# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="calories.csv", index_col=0)

# Preview the first five rows.
df.head(5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


# Step 2: Define the input and target features

In [18]:
# Input features: every column except the target. `columns=` already selects
# the axis, so no separate `axis` argument is needed.
X = df.drop(columns=['Calories'])

# Encode the Gender labels as integers.
gender_codes = {"female": 0, "male": 1}
X["Gender"] = X["Gender"].map(gender_codes)

# `.map` silently yields NaN for any label missing from the dict; fail here,
# next to the encoding, rather than later inside an estimator.
assert X["Gender"].notna().all(), "Gender contains labels other than 'female'/'male'"

# Target feature.
y = df['Calories']

In [22]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14733363,1,68,190.0,94.0,29.0,105.0,40.8
14861698,0,20,166.0,60.0,14.0,94.0,40.3
11179863,1,69,179.0,79.0,5.0,88.0,38.7
16180408,0,34,179.0,71.0,13.0,100.0,40.5
17771927,0,27,154.0,58.0,10.0,81.0,39.8


In [23]:
# Summary statistics per feature (quartiles spelled out; these are the defaults).
X.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,0.496467,42.7898,174.465133,74.966867,15.5306,95.518533,40.025453
std,0.500004,16.980264,14.258114,15.035657,8.319203,9.583328,0.77923
min,0.0,20.0,123.0,36.0,1.0,67.0,37.1
25%,0.0,28.0,164.0,63.0,8.0,88.0,39.6
50%,0.0,39.0,175.0,74.0,16.0,96.0,40.2
75%,1.0,56.0,185.0,87.0,23.0,103.0,40.6
max,1.0,79.0,222.0,132.0,30.0,128.0,41.5


# Step 3: Run 5-fold cross-validation of a k-NN regressor with k=10 neighbours

In [20]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor

# 5-fold splitter; shuffling with a fixed seed keeps the folds identical between runs
skf = KFold(n_splits=5, shuffle=True, random_state=42)

# Baseline model: k-NN with 10 neighbours on the features as they are (no scaling)
knn = KNeighborsRegressor(n_neighbors=10)

# One score per fold; scikit-learn reports RMSE negated so that larger is better
knn_cv = cross_val_score(
    estimator=knn,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=skf,
    n_jobs=-1,
)

# The mean is sign-flipped back to a positive RMSE; the spread is unaffected by the sign
print(f"Cross-validation RMSE: {-knn_cv.mean():.2f} ± {knn_cv.std():.2f}")

Cross-validation RMSE: 7.05 ± 0.21


In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor

# 5-fold splitter; shuffling with a fixed seed keeps the folds identical between runs
skf = KFold(n_splits=5, shuffle=True, random_state=42)

# k-NN (k=10) preceded by standardisation; as a pipeline step the scaler is
# fitted on the training part of each fold only
knn = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=10),
)

# One score per fold; scikit-learn reports RMSE negated so that larger is better
knn_cv = cross_val_score(
    estimator=knn,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=skf,
    n_jobs=-1,
)

# The mean is sign-flipped back to a positive RMSE; the spread is unaffected by the sign
print(f"Cross-validation RMSE: {-knn_cv.mean():.2f} ± {knn_cv.std():.2f}")

Cross-validation RMSE: 4.94 ± 0.06


# Step 4: Run an Optuna study to tune the k-NN model

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor

import optuna

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of a scaled k-NN regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample 'k' (integer in [2, 30], passed as ``n_neighbors``)
        and 'weights' ('uniform' or 'distance').

    Returns
    -------
    Mean RMSE over the folds, sign-flipped to be positive (lower is better).

    Notes
    -----
    Reads the notebook-level ``X`` and ``y``.
    """
    # Sample the hyperparameters for this trial
    n_neighbors = trial.suggest_int('k', 2, 30)
    weighting = trial.suggest_categorical('weights', ['uniform', 'distance'])

    # Scaling is a pipeline step, so it is fitted inside each training fold
    model = make_pipeline(
        StandardScaler(),
        KNeighborsRegressor(n_neighbors=n_neighbors, weights=weighting),
    )

    # Same seeded splitter for every trial, so trials are compared on identical folds
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        model,
        X,
        y,
        cv=folds,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )

    # scikit-learn returns negated RMSE; flip the sign so the study can minimise it
    return -fold_scores.mean()

# Seed the sampler so the sequence of suggested hyperparameters, and therefore
# best_params / best_value, is reproducible on Restart & Run All. The seed
# matches the random_state used for the CV folds.
sampler = optuna.samplers.TPESampler(seed=42)

# Minimise the cross-validated RMSE returned by `objective` over 50 trials.
# Trials run sequentially (optimize's default), which seeded reproducibility requires.
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=50)

[I 2025-08-07 17:27:33,492] A new study created in memory with name: no-name-77055858-e130-41d6-b349-48d02a0b120f
[I 2025-08-07 17:27:34,189] Trial 0 finished with value: 5.268488968599594 and parameters: {'k': 24, 'weights': 'uniform'}. Best is trial 0 with value: 5.268488968599594.
[I 2025-08-07 17:27:34,578] Trial 1 finished with value: 4.950453442208027 and parameters: {'k': 12, 'weights': 'uniform'}. Best is trial 1 with value: 4.950453442208027.
[I 2025-08-07 17:27:34,985] Trial 2 finished with value: 5.106199044213506 and parameters: {'k': 26, 'weights': 'distance'}. Best is trial 1 with value: 4.950453442208027.
[I 2025-08-07 17:27:35,384] Trial 3 finished with value: 4.845451455144096 and parameters: {'k': 7, 'weights': 'distance'}. Best is trial 3 with value: 4.845451455144096.
[I 2025-08-07 17:27:35,802] Trial 4 finished with value: 5.1961676311345615 and parameters: {'k': 30, 'weights': 'distance'}. Best is trial 3 with value: 4.845451455144096.
[I 2025-08-07 17:27:36,220] 

In [26]:
study.best_params

{'k': 11, 'weights': 'distance'}

In [27]:
study.best_value

4.767305725654351