# Step 1: Read the data

In [9]:
import pandas as pd

# Load the dataset from disk; the first CSV column is used as the row index
df = pd.read_csv("calories.csv", index_col=0)

# Preview the first five rows
df.head(n=5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


# Step 2: Define the input and target features

In [10]:
# Predictors are every column except the target; the target is "Calories".
X = df.drop(columns="Calories")
y = df["Calories"]

# Encode gender as a 0/1 indicator. Series.map turns any label that is missing
# from the dictionary into NaN without raising, so check right away that every
# row was encoded instead of letting a NaN travel into the model cells.
gender_codes = {"female": 0, "male": 1}
X["Gender"] = X["Gender"].map(gender_codes)
assert X["Gender"].notna().all(), "Gender contains values other than 'female'/'male'"

# Step 3: Run a 5-fold cross validation

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import Ridge

# Baseline estimator: standardise every feature, then fit a ridge regression
# with its default regularisation strength.
ridge_md = make_pipeline(StandardScaler(), Ridge())

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
skf = KFold(n_splits=5, shuffle=True, random_state=10)

# The scorer returns negated RMSE values, which is why the mean is
# sign-flipped when it is printed below.
ridge_cv = cross_val_score(
    estimator=ridge_md,
    X=X,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=skf,
    n_jobs=-1,
)

print(f"Ridge Regression CV RMSE: {-ridge_cv.mean():.2f} +/- {ridge_cv.std():.2f}")

Ridge Regression CV RMSE: 11.31 +/- 0.14


In [12]:
from itertools import combinations

# Only the base predictors take part in the products. Columns produced by an
# earlier run of this cell carry the "_x_" marker and are skipped, so running
# the cell again rewrites the same interaction columns instead of building
# interactions of interactions.
cols = [col for col in X.columns if "_x_" not in col]

# Add one pairwise-product column per unordered pair of base features
for col1, col2 in combinations(cols, 2):
    X[f"{col1}_x_{col2}"] = X[col1] * X[col2]


In [13]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Gender_x_Age,Gender_x_Height,Gender_x_Weight,...,Height_x_Weight,Height_x_Duration,Height_x_Heart_Rate,Height_x_Body_Temp,Weight_x_Duration,Weight_x_Heart_Rate,Weight_x_Body_Temp,Duration_x_Heart_Rate,Duration_x_Body_Temp,Heart_Rate_x_Body_Temp
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14733363,1,68,190.0,94.0,29.0,105.0,40.8,68,190.0,94.0,...,17860.0,5510.0,19950.0,7752.0,2726.0,9870.0,3835.2,3045.0,1183.2,4284.0
14861698,0,20,166.0,60.0,14.0,94.0,40.3,0,0.0,0.0,...,9960.0,2324.0,15604.0,6689.8,840.0,5640.0,2418.0,1316.0,564.2,3788.2
11179863,1,69,179.0,79.0,5.0,88.0,38.7,69,179.0,79.0,...,14141.0,895.0,15752.0,6927.3,395.0,6952.0,3057.3,440.0,193.5,3405.6
16180408,0,34,179.0,71.0,13.0,100.0,40.5,0,0.0,0.0,...,12709.0,2327.0,17900.0,7249.5,923.0,7100.0,2875.5,1300.0,526.5,4050.0
17771927,0,27,154.0,58.0,10.0,81.0,39.8,0,0.0,0.0,...,8932.0,1540.0,12474.0,6129.2,580.0,4698.0,2308.4,810.0,398.0,3223.8


In [14]:
# Define the model 
ridge_md = make_pipeline(StandardScaler(), Ridge())

# Cross validation
skf = KFold(n_splits=5, shuffle=True, random_state=10)
ridge_cv = cross_val_score(ridge_md, X, y, cv=skf, scoring="neg_root_mean_squared_error", n_jobs=-1)

print(f"Ridge Regression CV RMSE: {-ridge_cv.mean():.2f} +/- {ridge_cv.std():.2f}")

Ridge Regression CV RMSE: 3.04 +/- 0.02


# Finding the optimal value for lambda (the regularization strength, called `alpha` in scikit-learn)

In [16]:
import numpy as np
from sklearn.linear_model import RidgeCV

# Candidate regularisation strengths (lambda is called `alpha` in scikit-learn)
alphas = np.linspace(0.01, 10, 100)

# The ridge models evaluated in this notebook standardise the features before
# the ridge step, and the best penalty depends on the scale of the features.
# The search therefore runs behind the same StandardScaler, so the selected
# alpha is valid for the pipeline it is plugged into afterwards.
alpha_search = make_pipeline(
    StandardScaler(),
    RidgeCV(alphas=alphas, scoring="neg_root_mean_squared_error", cv=skf),
).fit(X, y)

# Keep `ridge_cv` bound to the fitted RidgeCV step so `ridge_cv.alpha_`
# remains available to the cells that read it.
ridge_cv = alpha_search[-1]

# NOTE(review): an optimum that lands on either end of `alphas` means the grid
# does not bracket the best value and should be widened in that direction.
print(f"Optimal alpha: {ridge_cv.alpha_}")

Optimal alpha: 0.01


In [17]:
# Rebuild the standardise-then-ridge pipeline with the penalty chosen by the
# alpha search.
best_alpha = ridge_cv.alpha_
ridge_md = make_pipeline(StandardScaler(), Ridge(alpha=best_alpha))

# Same seeded 5-fold split as the earlier evaluations, so scores are comparable
skf = KFold(n_splits=5, shuffle=True, random_state=10)

# The fold scores get their own name: overwriting `ridge_cv` with the score
# array would make `ridge_cv.alpha_` fail the next time this cell is run.
ridge_tuned_scores = cross_val_score(
    ridge_md, X, y, cv=skf, scoring="neg_root_mean_squared_error", n_jobs=-1
)

print(f"Ridge Regression CV RMSE: {-ridge_tuned_scores.mean():.2f} +/- {ridge_tuned_scores.std():.2f}")

Ridge Regression CV RMSE: 3.00 +/- 0.03
