# Data exploration 

Dataset: https://www.kaggle.com/datasets/abdullah0a/comprehensive-weight-change-prediction

This dataset includes features involving aspects that may influence weight gain or loss

### Data Cleaning

In [342]:
# Load dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import random
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor



df = pd.read_csv('weight_change_dataset.csv')
df.head()

Unnamed: 0,Participant ID,Age,Gender,Current Weight (lbs),BMR (Calories),Daily Calories Consumed,Daily Caloric Surplus/Deficit,Weight Change (lbs),Duration (weeks),Physical Activity Level,Sleep Quality,Stress Level,Final Weight (lbs)
0,1,56,M,228.4,3102.3,3916.0,813.7,0.2,1,Sedentary,Excellent,6,228.6
1,2,46,F,165.4,2275.5,3823.0,1547.5,2.4,6,Very Active,Excellent,6,167.8
2,3,32,F,142.8,2119.4,2785.4,666.0,1.4,7,Sedentary,Good,3,144.2
3,4,25,F,145.5,2181.3,2587.3,406.0,0.8,8,Sedentary,Fair,2,146.3
4,5,38,M,155.5,2463.8,3312.8,849.0,2.0,10,Lightly Active,Good,1,157.5


In [343]:
df.columns

Index(['Participant ID', 'Age', 'Gender', 'Current Weight (lbs)',
       'BMR (Calories)', 'Daily Calories Consumed',
       'Daily Caloric Surplus/Deficit', 'Weight Change (lbs)',
       'Duration (weeks)', 'Physical Activity Level', 'Sleep Quality',
       'Stress Level', 'Final Weight (lbs)'],
      dtype='object')

In [344]:
df.columns = df.columns.str.replace(' ', '_')

df.columns

Index(['Participant_ID', 'Age', 'Gender', 'Current_Weight_(lbs)',
       'BMR_(Calories)', 'Daily_Calories_Consumed',
       'Daily_Caloric_Surplus/Deficit', 'Weight_Change_(lbs)',
       'Duration_(weeks)', 'Physical_Activity_Level', 'Sleep_Quality',
       'Stress_Level', 'Final_Weight_(lbs)'],
      dtype='object')

In [345]:
df = df.drop('Participant_ID', axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            100 non-null    int64  
 1   Gender                         100 non-null    object 
 2   Current_Weight_(lbs)           100 non-null    float64
 3   BMR_(Calories)                 100 non-null    float64
 4   Daily_Calories_Consumed        100 non-null    float64
 5   Daily_Caloric_Surplus/Deficit  100 non-null    float64
 6   Weight_Change_(lbs)            100 non-null    float64
 7   Duration_(weeks)               100 non-null    int64  
 8   Physical_Activity_Level        100 non-null    object 
 9   Sleep_Quality                  100 non-null    object 
 10  Stress_Level                   100 non-null    int64  
 11  Final_Weight_(lbs)             100 non-null    float64
dtypes: float64(6), int64(3), object(3)
memory usage: 9.

In [346]:
df.isnull().sum()

Age                              0
Gender                           0
Current_Weight_(lbs)             0
BMR_(Calories)                   0
Daily_Calories_Consumed          0
Daily_Caloric_Surplus/Deficit    0
Weight_Change_(lbs)              0
Duration_(weeks)                 0
Physical_Activity_Level          0
Sleep_Quality                    0
Stress_Level                     0
Final_Weight_(lbs)               0
dtype: int64

In [347]:
cat_cols = [col for col in df.columns if df[col].dtype == 'object']

cat_df = df[cat_cols]
cat_df.head()

Unnamed: 0,Gender,Physical_Activity_Level,Sleep_Quality
0,M,Sedentary,Excellent
1,F,Very Active,Excellent
2,F,Sedentary,Good
3,F,Sedentary,Fair
4,M,Lightly Active,Good


In [348]:
for col in cat_df.columns:
    print(f'{col}: \n{cat_df[col].unique()}')

Gender: 
['M' 'F']
Physical_Activity_Level: 
['Sedentary' 'Very Active' 'Lightly Active' 'Moderately Active']
Sleep_Quality: 
['Excellent' 'Good' 'Fair' 'Poor']


In [349]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

In [350]:
le = LabelEncoder()
df.loc[:, 'Physical_Activity_Level'] = le.fit_transform(
    df['Physical_Activity_Level'])  

df.head()

Unnamed: 0,Age,Gender,Current_Weight_(lbs),BMR_(Calories),Daily_Calories_Consumed,Daily_Caloric_Surplus/Deficit,Weight_Change_(lbs),Duration_(weeks),Physical_Activity_Level,Sleep_Quality,Stress_Level,Final_Weight_(lbs)
0,56,1,228.4,3102.3,3916.0,813.7,0.2,1,2,Excellent,6,228.6
1,46,0,165.4,2275.5,3823.0,1547.5,2.4,6,3,Excellent,6,167.8
2,32,0,142.8,2119.4,2785.4,666.0,1.4,7,2,Good,3,144.2
3,25,0,145.5,2181.3,2587.3,406.0,0.8,8,2,Fair,2,146.3
4,38,1,155.5,2463.8,3312.8,849.0,2.0,10,0,Good,1,157.5


In [351]:
le = LabelEncoder()
df.loc[:, 'Sleep_Quality'] = le.fit_transform(
    df['Sleep_Quality'])

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            100 non-null    int64  
 1   Gender                         100 non-null    int64  
 2   Current_Weight_(lbs)           100 non-null    float64
 3   BMR_(Calories)                 100 non-null    float64
 4   Daily_Calories_Consumed        100 non-null    float64
 5   Daily_Caloric_Surplus/Deficit  100 non-null    float64
 6   Weight_Change_(lbs)            100 non-null    float64
 7   Duration_(weeks)               100 non-null    int64  
 8   Physical_Activity_Level        100 non-null    object 
 9   Sleep_Quality                  100 non-null    object 
 10  Stress_Level                   100 non-null    int64  
 11  Final_Weight_(lbs)             100 non-null    float64
dtypes: float64(6), int64(4), object(2)
memory usage: 9.

In [352]:
df['Physical_Activity_Level'] = df['Physical_Activity_Level'].astype(object).astype(int)
df['Sleep_Quality'] = df['Sleep_Quality'].astype(object).astype(int)

In [353]:
df = df.drop(columns=['Final_Weight_(lbs)'])

In [354]:
x = df.drop(['Weight_Change_(lbs)'], axis = 1)
y = df['Weight_Change_(lbs)']

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.8, random_state = 23)

In [355]:
y_train = y_train.fillna(0)

numerical_features = x.columns

numeric_transformer = StandardScaler()

In [356]:
preProcess = ColumnTransformer(transformers = [('num', numeric_transformer, numerical_features)])

pipeline = Pipeline([('preprocess', preProcess),
                     ('regressor', RandomForestRegressor())])

param_grid = [{'regressor': [RandomForestRegressor()],
               'regressor__max_depth': [1, 2, 3, 4],
              'regressor__n_estimators': [100, 500]}]

In [358]:
random.seed(1)
cv_mine = KFold(n_splits = 3)
gridSearch = GridSearchCV(pipeline, param_grid, cv=cv_mine,
                          scoring='r2', n_jobs=-1)
gridSearch.fit(x_train, y_train)