In [74]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [75]:
data = pd.read_csv('data.csv')

In [76]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB


In [77]:
data.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [78]:
data.describe()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,169.0,169.0,169.0,164.0
mean,63.846154,107.461538,134.047337,375.790244
std,42.299949,14.510259,16.450434,266.379919
min,15.0,80.0,100.0,50.3
25%,45.0,100.0,124.0,250.925
50%,60.0,105.0,131.0,318.6
75%,60.0,111.0,141.0,387.6
max,300.0,159.0,184.0,1860.4


Based on the information presented above, we can see that the count for duration, pulse and maxpulse are all 169 except for calories which has 164.
This tells us that there are missing values present in the column of calories.

From the observation as well, we can see that only 5 columns do not have data.
We can either remove the rows with missing data or fill with the average of the column.
It is usually advisable to fill the missing data with the average because you do not want to look too much information.

But in this case, I am just going to remove the rows with missing data since it is just 5.

In [79]:
data.dropna(inplace=True)

In [80]:
data.corr()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.160661,0.005679,0.922717
Pulse,-0.160661,1.0,0.784631,0.025121
Maxpulse,0.005679,0.784631,1.0,0.203813
Calories,0.922717,0.025121,0.203813,1.0


From the correlation table above, we can see that there is good correlation between Duration and Calories, which means that the more an individual works out, the probability of burning more calories is high.

Now, we are going to see how to predict the calories burnt based on our exercise duration, pulse and maxpulse

In [83]:
y = data['Calories']
y = np.array(y)

In [84]:
X = data[['Duration', 'Pulse', 'Maxpulse']]
X = np.array(X)

I am going to train the data but I also want an accuracy which is greater than 0.9 hence I will use a while to check this

In [87]:
while True:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    if model.score(X_test, y_test) > 0.9:
        print("Accuracy score", model.score(X_test, y_test))
        break

Accuracy score 0.9214950177353385


In [88]:
predictions = model.predict(X_test)

I am going to print out the first 10 predictions made by our LinearRegression model

In [93]:
for i in range(len(predictions)):
    if i < 10:
        string = f"Model predicts that if you work out for {X_train[i][0]}mins at a Pulse of {X_train[i][1]}"
        string += f" and Maxpluse of {X_train[i][2]}, you may burn {predictions[i]} calories.\n"
        string += f"Data shows that the actual calories burnt was {y_test[i]} \n"
        print(string)

Model predicts that if you work out for 60mins at a Pulse of 118 and Maxpluse of 121, you may burn 308.44127637322146 calories. Data shows that the actual calories burnt was 277.4
Model predicts that if you work out for 60mins at a Pulse of 104 and Maxpluse of 132, you may burn 112.36266153515504 calories. Data shows that the actual calories burnt was 131.4
Model predicts that if you work out for 60mins at a Pulse of 103 and Maxpluse of 136, you may burn 333.48400634763294 calories. Data shows that the actual calories burnt was 382.0
Model predicts that if you work out for 75mins at a Pulse of 120 and Maxpluse of 150, you may burn 313.65556214665986 calories. Data shows that the actual calories burnt was 275.3
Model predicts that if you work out for 45mins at a Pulse of 90 and Maxpluse of 112, you may burn 331.6941997222621 calories. Data shows that the actual calories burnt was 380.3
Model predicts that if you work out for 45mins at a Pulse of 110 and Maxpluse of 141, you may burn 226