# Step 1: Read the data

In [43]:
import pandas as pd

# Load calories.csv, using the file's first column as the row index.
df = pd.read_csv(filepath_or_buffer="calories.csv", index_col=0)

# Preview the first five records.
df.head(5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


# Step 2: Define the input and target features

In [44]:
# Target: the "Calories" column.
y = df["Calories"]

# Inputs: every remaining column. `drop` returns a new frame, so `df` is untouched.
X = df.drop(columns="Calories")

In [45]:
# Preview the first five rows of the input features.
X.head(5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14733363,male,68,190.0,94.0,29.0,105.0,40.8
14861698,female,20,166.0,60.0,14.0,94.0,40.3
11179863,male,69,179.0,79.0,5.0,88.0,38.7
16180408,female,34,179.0,71.0,13.0,100.0,40.5
17771927,female,27,154.0,58.0,10.0,81.0,39.8


In [47]:
# Encode Gender as a 0/1 indicator (male -> 1, female -> 0).
# The mapping reads from the raw `df` column instead of X["Gender"] so that
# re-running this cell is safe: mapping the already-encoded values would find
# no matching keys and silently replace the whole column with NaN.
X["Gender"] = df["Gender"].map({"male": 1, "female": 0})

# Series.map leaves NaN for any label missing from the dict; fail loudly instead.
assert X["Gender"].notna().all(), "Unexpected Gender label (expected 'male' or 'female')"

In [48]:
# Preview the features again to check the Gender column after encoding.
X.head(5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14733363,1,68,190.0,94.0,29.0,105.0,40.8
14861698,0,20,166.0,60.0,14.0,94.0,40.3
11179863,1,69,179.0,79.0,5.0,88.0,38.7
16180408,0,34,179.0,71.0,13.0,100.0,40.5
17771927,0,27,154.0,58.0,10.0,81.0,39.8


# Step 3: Run a 5-fold cross validation

In [49]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression

# Cross-validation splitter: five shuffled folds with a fixed seed so the
# same splits are reused by every model scored with `skf`.
skf = KFold(n_splits=5, shuffle=True, random_state=42)

# Baseline estimator: ordinary least-squares linear regression.
lm_md = LinearRegression()

# One score per fold. scikit-learn reports RMSE negated ("neg_" prefix),
# so the mean is negated again when printed below.
lm_cv = cross_val_score(
    estimator=lm_md,
    X=X,
    y=y,
    cv=skf,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

print("The LR model 5-fold CV RMSE is: {:.2f} +/- {:.2f}".format(-lm_cv.mean(), lm_cv.std()))

The LR model 5-fold CV RMSE is: 11.32 +/- 0.16


In [50]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Same linear model, but with the features standardized first. Wrapping both
# steps in a pipeline means the scaler is fitted inside each CV training fold.
lm_md_new = make_pipeline(StandardScaler(), LinearRegression())

# Score on the same folds (`skf`) as the unscaled model.
lm_cv_new = cross_val_score(
    estimator=lm_md_new,
    X=X,
    y=y,
    cv=skf,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

print("The LR model with scaling 5-fold CV RMSE is: {:.2f} +/- {:.2f}".format(-lm_cv_new.mean(), lm_cv_new.std()))

The LR model with scaling 5-fold CV RMSE is: 11.32 +/- 0.16


In [51]:
# Show the feature matrix once more before derived columns are added to it.
X.head(5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14733363,1,68,190.0,94.0,29.0,105.0,40.8
14861698,0,20,166.0,60.0,14.0,94.0,40.3
11179863,1,69,179.0,79.0,5.0,88.0,38.7
16180408,0,34,179.0,71.0,13.0,100.0,40.5
17771927,0,27,154.0,58.0,10.0,81.0,39.8


In [53]:
# Base numeric predictors used to build pairwise interaction terms.
# Taken from the raw `df` columns (minus the target and the binary Gender flag)
# rather than from X, so re-running this cell after interaction columns have
# been added to X still yields only the six original features.
cols = df.columns.drop(["Gender", "Calories"]).tolist()
cols

['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

In [54]:
from itertools import combinations

# Add one product column per unordered pair of base features, named "<a>x<b>".
# Re-running is harmless: each product column is simply overwritten.
for first, second in combinations(cols, 2):
    X[f"{first}x{second}"] = X[first].mul(X[second])

In [56]:
# Lift the column display limit so the widened frame is shown in full.
# Note: this changes the pandas option for the rest of the session.
pd.options.display.max_columns = None

X.head(5)

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,AgexHeight,AgexWeight,AgexDuration,AgexHeart_Rate,AgexBody_Temp,HeightxWeight,HeightxDuration,HeightxHeart_Rate,HeightxBody_Temp,WeightxDuration,WeightxHeart_Rate,WeightxBody_Temp,DurationxHeart_Rate,DurationxBody_Temp,Heart_RatexBody_Temp
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
14733363,1,68,190.0,94.0,29.0,105.0,40.8,12920.0,6392.0,1972.0,7140.0,2774.4,17860.0,5510.0,19950.0,7752.0,2726.0,9870.0,3835.2,3045.0,1183.2,4284.0
14861698,0,20,166.0,60.0,14.0,94.0,40.3,3320.0,1200.0,280.0,1880.0,806.0,9960.0,2324.0,15604.0,6689.8,840.0,5640.0,2418.0,1316.0,564.2,3788.2
11179863,1,69,179.0,79.0,5.0,88.0,38.7,12351.0,5451.0,345.0,6072.0,2670.3,14141.0,895.0,15752.0,6927.3,395.0,6952.0,3057.3,440.0,193.5,3405.6
16180408,0,34,179.0,71.0,13.0,100.0,40.5,6086.0,2414.0,442.0,3400.0,1377.0,12709.0,2327.0,17900.0,7249.5,923.0,7100.0,2875.5,1300.0,526.5,4050.0
17771927,0,27,154.0,58.0,10.0,81.0,39.8,4158.0,1566.0,270.0,2187.0,1074.6,8932.0,1540.0,12474.0,6129.2,580.0,4698.0,2308.4,810.0,398.0,3223.8


In [57]:
# Fresh linear regression, now scored on X including the pairwise interaction
# columns. This rebinds `lm_md` and `lm_cv`, replacing the earlier baseline values.
lm_md = LinearRegression()

# Same folds (`skf`) and metric as before, so the RMSE values are comparable.
lm_cv = cross_val_score(
    estimator=lm_md,
    X=X,
    y=y,
    cv=skf,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

print("The LR model 5-fold CV RMSE is: {:.2f} +/- {:.2f}".format(-lm_cv.mean(), lm_cv.std()))

The LR model 5-fold CV RMSE is: 4.55 +/- 0.07
