# Step 1: Read the data

In [2]:
import pandas as pd

# Load the calories dataset, using the first CSV column as the row index
df = pd.read_csv(
    "calories.csv",
    index_col=0,
)

# Preview the first rows to sanity-check the columns that were read
df.head()

Unnamed: 0_level_0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


# Step 2: Build a linear model that uses weight and age as input variables to predict calories as the target output.

In [3]:
from statsmodels.formula.api import ols

# Fit an ordinary-least-squares model with Calories as the response and
# Weight and Age as the only predictors
lm_md = ols(
    formula="Calories ~ Weight + Age",
    data=df,
).fit()

# Render the regression summary (fit statistics and coefficient table)
lm_md.summary()

0,1,2,3
Dep. Variable:,Calories,R-squared:,0.024
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,186.8
Date:,"Tue, 26 Aug 2025",Prob (F-statistic):,7.33e-81
Time:,11:49:48,Log-Likelihood:,-83116.0
No. Observations:,15000,AIC:,166200.0
Df Residuals:,14997,BIC:,166300.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,58.7755,2.768,21.236,0.000,53.351,64.200
Weight,0.0903,0.034,2.685,0.007,0.024,0.156
Age,0.5607,0.030,18.822,0.000,0.502,0.619

0,1,2,3
Omnibus:,1936.721,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,915.484
Skew:,0.438,Prob(JB):,1.6e-199
Kurtosis:,2.166,Cond. No.,483.0


# Step 3: Build a linear model where calories is the target variable and all other features are used as input variables.

In [4]:
# Fit an OLS model of Calories on every other column of the dataset.
# Gender is wrapped in C(...) so the formula treats it as categorical.
lm_md = ols(
    formula=(
        "Calories ~  C(Gender) + Age + Height + Weight"
        " + Duration + Heart_Rate + Body_Temp"
    ),
    data=df,
).fit()

# Render the regression summary (fit statistics and coefficient table)
lm_md.summary()

0,1,2,3
Dep. Variable:,Calories,R-squared:,0.967
Model:,OLS,Adj. R-squared:,0.967
Method:,Least Squares,F-statistic:,63160.0
Date:,"Tue, 26 Aug 2025",Prob (F-statistic):,0.0
Time:,11:55:04,Log-Likelihood:,-57671.0
No. Observations:,15000,AIC:,115400.0
Df Residuals:,14992,BIC:,115400.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,464.5795,11.102,41.847,0.000,442.819,486.340
C(Gender)[T.male],-1.2679,0.310,-4.087,0.000,-1.876,-0.660
Age,0.5009,0.006,86.809,0.000,0.490,0.512
Height,-0.1831,0.024,-7.481,0.000,-0.231,-0.135
Weight,0.3010,0.027,11.342,0.000,0.249,0.353
Duration,6.6403,0.032,210.643,0.000,6.579,6.702
Heart_Rate,1.9903,0.018,107.785,0.000,1.954,2.027
Body_Temp,-16.9815,0.276,-61.484,0.000,-17.523,-16.440

0,1,2,3
Omnibus:,3006.038,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8086.256
Skew:,1.078,Prob(JB):,0.0
Kurtosis:,5.879,Cond. No.,26700.0


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Refit the full-feature model using only the training rows, so the test
# rows stay unseen for evaluation
lm_md = ols(
    formula=(
        "Calories ~  C(Gender) + Age + Height + Weight"
        " + Duration + Heart_Rate + Body_Temp"
    ),
    data=train,
).fit()

# Render the summary of the model fitted on the training set
lm_md.summary()

0,1,2,3
Dep. Variable:,Calories,R-squared:,0.967
Model:,OLS,Adj. R-squared:,0.967
Method:,Least Squares,F-statistic:,50460.0
Date:,"Tue, 26 Aug 2025",Prob (F-statistic):,0.0
Time:,11:58:19,Log-Likelihood:,-46090.0
No. Observations:,12000,AIC:,92200.0
Df Residuals:,11992,BIC:,92250.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,461.8627,12.372,37.330,0.000,437.611,486.115
C(Gender)[T.male],-1.3742,0.344,-4.000,0.000,-2.048,-0.701
Age,0.5015,0.006,77.935,0.000,0.489,0.514
Height,-0.1697,0.027,-6.203,0.000,-0.223,-0.116
Weight,0.2863,0.030,9.668,0.000,0.228,0.344
Duration,6.6280,0.035,188.865,0.000,6.559,6.697
Heart_Rate,1.9909,0.021,96.757,0.000,1.951,2.031
Body_Temp,-16.9425,0.308,-54.923,0.000,-17.547,-16.338

0,1,2,3
Omnibus:,2452.311,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6759.76
Skew:,1.089,Prob(JB):,0.0
Kurtosis:,5.962,Cond. No.,26700.0


In [10]:
# Predict Calories for the held-out test rows with the fitted model
predictions = lm_md.predict(test)

# Show actual and predicted values side by side. Both pieces get a fresh
# positional index so the rows line up in the concatenation, and the
# prediction Series is given a name (on a copy; `predictions` itself is left
# unchanged) so its column is labelled "Predicted_Calories" rather than the
# default, uninformative 0.
pd.concat(
    [
        test.reset_index(drop=True),
        predictions.rename("Predicted_Calories").reset_index(drop=True),
    ],
    axis=1,
)

Unnamed: 0,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,0
0,female,45,154.0,52.0,26.0,107.0,40.6,173.0,170.676460
1,male,21,187.0,90.0,29.0,111.0,40.5,189.0,192.088578
2,male,58,176.0,77.0,11.0,90.0,40.0,53.0,56.147964
3,male,35,182.0,89.0,24.0,108.0,40.8,161.0,155.476415
4,female,67,171.0,67.0,29.0,116.0,41.1,226.0,212.450667
...,...,...,...,...,...,...,...,...,...
2995,female,61,166.0,66.0,28.0,106.0,41.0,186.0,185.161209
2996,female,73,165.0,66.0,10.0,92.0,40.0,53.0,61.115451
2997,female,38,169.0,66.0,20.0,104.0,40.2,120.0,129.665701
2998,female,25,163.0,54.0,5.0,86.0,39.2,20.0,2.415278


In [11]:
from sklearn.metrics import root_mean_squared_error

# Root-mean-squared error between the actual test-set Calories and the
# model's predictions for the same rows
rmse = root_mean_squared_error(
    y_true=test["Calories"],
    y_pred=predictions,
)
print(f"RMSE: {rmse}")

RMSE: 11.488940149152942
