In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
file_path = '/content/drive/MyDrive/HSM564/'

In [None]:
# Load the dataset
obesity = pd.read_csv(file_path + 'obesity_data.csv')

# Display the first few rows
obesity.head()

Unnamed: 0,Age,Gender,Height,Weight,BMI,PhysicalActivityLevel,ObesityCategory
0,56,Male,173.575262,71.982051,23.891783,4,Normal weight
1,69,Male,164.127306,89.959256,33.395209,2,Obese
2,46,Female,168.072202,72.930629,25.817737,4,Overweight
3,32,Male,168.459633,84.886912,29.912247,3,Overweight
4,60,Male,183.568568,69.038945,20.487903,3,Normal weight


In [None]:
obesity.shape

(1000, 7)

In [None]:
import sys
# append the function .py file file path to the system path
# function.py is uploaded to /content/drive/MyDrive/Functions folder
if '/content/drive/MyDrive/HSM564' not in sys.path:
  sys.path.append('/content/drive/MyDrive/HSM564')

In [None]:
import function as fun

In [None]:
# check the data types
obesity.dtypes

Age                        int64
Gender                    object
Height                   float64
Weight                   float64
BMI                      float64
PhysicalActivityLevel      int64
ObesityCategory           object
dtype: object

In [None]:
fun.metadata(obesity)

Unnamed: 0,column_name,datatype,missing_percent,unique,mean,std,min,25%,50%,75%,max
0,Age,int64,0.0,62,49.857,18.114267,18.0,35.0,50.0,66.0,79.0
1,Gender,object,0.0,2,,,,,,,
2,Height,float64,0.0,1000,170.052417,10.309971,136.115719,163.514205,169.801665,177.353596,201.41967
3,Weight,float64,0.0,1000,71.205769,15.509849,26.06573,61.129629,71.929072,81.133746,118.907366
4,BMI,float64,0.0,1000,24.888317,6.193912,8.470572,20.918068,24.698647,28.732132,50.791898
5,PhysicalActivityLevel,int64,0.0,4,,,,,,,
6,ObesityCategory,object,0.0,4,,,,,,,


In [None]:
# Using pandas get_dummies
obesity_encoded = pd.get_dummies(obesity, columns=['Gender', 'PhysicalActivityLevel', 'ObesityCategory']).astype(int)
obesity_encoded.head()

Unnamed: 0,Age,Height,Weight,BMI,Gender_Female,Gender_Male,PhysicalActivityLevel_1,PhysicalActivityLevel_2,PhysicalActivityLevel_3,PhysicalActivityLevel_4,ObesityCategory_Normal weight,ObesityCategory_Obese,ObesityCategory_Overweight,ObesityCategory_Underweight
0,56,173,71,23,0,1,0,0,0,1,1,0,0,0
1,69,164,89,33,0,1,0,1,0,0,0,1,0,0
2,46,168,72,25,1,0,0,0,0,1,0,0,1,0
3,32,168,84,29,0,1,0,0,1,0,0,0,1,0
4,60,183,69,20,0,1,0,0,1,0,1,0,0,0


In [None]:
fun.metadata(obesity_encoded)

Unnamed: 0,column_name,datatype,missing_percent,unique,mean,std,min,25%,50%,75%,max
0,Age,int64,0.0,62,49.857,18.114267,18.0,35.0,50.0,66.0,79.0
1,Height,int64,0.0,60,169.549,10.316202,136.0,163.0,169.0,177.0,201.0
2,Weight,int64,0.0,83,70.685,15.50749,26.0,61.0,71.0,81.0,118.0
3,BMI,int64,0.0,39,24.384,6.204437,8.0,20.0,24.0,28.0,50.0
4,Gender_Female,int64,0.0,2,,,,,,,
5,Gender_Male,int64,0.0,2,,,,,,,
6,PhysicalActivityLevel_1,int64,0.0,2,,,,,,,
7,PhysicalActivityLevel_2,int64,0.0,2,,,,,,,
8,PhysicalActivityLevel_3,int64,0.0,2,,,,,,,
9,PhysicalActivityLevel_4,int64,0.0,2,,,,,,,


In [None]:
# Compute the correlation matrix
correlation_matrix_obesity = obesity_encoded.corr()

# Display the correlation matrix
print(correlation_matrix_obesity)

                                    Age    Height    Weight       BMI  \
Age                            1.000000  0.019405 -0.061498 -0.059791   
Height                         0.019405  1.000000  0.013915 -0.478429   
Weight                        -0.061498  0.013915  1.000000  0.860183   
BMI                           -0.059791 -0.478429  0.860183  1.000000   
Gender_Female                 -0.036248 -0.016286  0.034134  0.037074   
Gender_Male                    0.036248  0.016286 -0.034134 -0.037074   
PhysicalActivityLevel_1        0.030458 -0.000730 -0.074387 -0.062682   
PhysicalActivityLevel_2        0.029113 -0.036791  0.048142  0.063622   
PhysicalActivityLevel_3       -0.089913  0.000001 -0.036513 -0.035488   
PhysicalActivityLevel_4        0.031146  0.036928  0.061351  0.033697   
ObesityCategory_Normal weight  0.039223  0.200828 -0.270202 -0.357668   
ObesityCategory_Obese         -0.022719 -0.361165  0.562661  0.703398   
ObesityCategory_Overweight    -0.059323 -0.083354  

In [None]:
correlation_with_BMI = obesity_encoded.corrwith(obesity_encoded['BMI'])

# Display the correlation values
print(correlation_with_BMI)

Age                             -0.059791
Height                          -0.478429
Weight                           0.860183
BMI                              1.000000
Gender_Female                    0.037074
Gender_Male                     -0.037074
PhysicalActivityLevel_1         -0.062682
PhysicalActivityLevel_2          0.063622
PhysicalActivityLevel_3         -0.035488
PhysicalActivityLevel_4          0.033697
ObesityCategory_Normal weight   -0.357668
ObesityCategory_Obese            0.703398
ObesityCategory_Overweight       0.253430
ObesityCategory_Underweight     -0.626422
dtype: float64


In [None]:
df_obesity_reg = obesity_encoded.drop('Gender_Female', axis = 1)
df_obesity_reg.head()

Unnamed: 0,Age,Height,Weight,BMI,Gender_Male,PhysicalActivityLevel_1,PhysicalActivityLevel_2,PhysicalActivityLevel_3,PhysicalActivityLevel_4,ObesityCategory_Normal weight,ObesityCategory_Obese,ObesityCategory_Overweight,ObesityCategory_Underweight
0,56,173,71,23,1,0,0,0,1,1,0,0,0
1,69,164,89,33,1,0,1,0,0,0,1,0,0
2,46,168,72,25,0,0,0,0,1,0,0,1,0
3,32,168,84,29,1,0,0,1,0,0,0,1,0
4,60,183,69,20,1,0,0,1,0,1,0,0,0


In [None]:
df_obesity_reg.describe()

Unnamed: 0,Age,Height,Weight,BMI,Gender_Male,PhysicalActivityLevel_1,PhysicalActivityLevel_2,PhysicalActivityLevel_3,PhysicalActivityLevel_4,ObesityCategory_Normal weight,ObesityCategory_Obese,ObesityCategory_Overweight,ObesityCategory_Underweight
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,49.857,169.549,70.685,24.384,0.523,0.239,0.247,0.255,0.259,0.371,0.191,0.295,0.143
std,18.114267,10.316202,15.50749,6.204437,0.499721,0.426686,0.431483,0.436079,0.438305,0.483314,0.393286,0.456271,0.350248
min,18.0,136.0,26.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35.0,163.0,61.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50.0,169.0,71.0,24.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,66.0,177.0,81.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
max,79.0,201.0,118.0,50.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
X = df_obesity_reg.drop('BMI', axis=1)
y = df_obesity_reg['BMI']

In [None]:
X.shape

(1000, 12)

In [None]:
y.shape

(1000,)

Stepwise Selection

In [None]:
selected_features_stepwise = fun.stepwise_selection(X, y)
selected_features_stepwise

['Weight', 'Height', 'ObesityCategory_Obese', 'ObesityCategory_Underweight']

In [None]:
# Re-fit the model using Forward Selection results and display statistics
model_stepwise = sm.OLS(y, X[['Weight', 'Height', 'ObesityCategory_Obese', 'ObesityCategory_Underweight']]).fit()

# Display the model summary which includes similar statistics to what SPSS would provide
model_summary_stepwise = model_stepwise.summary()
print(model_summary_stepwise)

                                 OLS Regression Results                                
Dep. Variable:                    BMI   R-squared (uncentered):                   0.988
Model:                            OLS   Adj. R-squared (uncentered):              0.988
Method:                 Least Squares   F-statistic:                          2.018e+04
Date:                Fri, 10 May 2024   Prob (F-statistic):                        0.00
Time:                        01:45:38   Log-Likelihood:                         -2440.6
No. Observations:                1000   AIC:                                      4889.
Df Residuals:                     996   BIC:                                      4909.
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                                  coef    std err          t      P>|t|      [0.025      0.975]
------------------------

Enter Method

In [None]:
model_enter_summary =fun.enter_selection_method(X, y)
print(model_enter_summary)

                            OLS Regression Results                            
Dep. Variable:                    BMI   R-squared:                       0.984
Model:                            OLS   Adj. R-squared:                  0.983
Method:                 Least Squares   F-statistic:                     5901.
Date:                Fri, 10 May 2024   Prob (F-statistic):               0.00
Time:                        01:46:15   Log-Likelihood:                -1191.0
No. Observations:                1000   AIC:                             2404.
Df Residuals:                     989   BIC:                             2458.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         