In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
import patsy
import statsmodels.api as sm

In [31]:
obesity=pd.read_csv('obesity.csv')
obesity.isnull().sum()
obesity.info()
#the data does not have any missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [33]:
#first we rename the variable types. 

obesity.rename(columns={'family_history_with_overweight': 'Family_Obesity'}, inplace=True)
obesity.rename(columns={'FAVC': 'High_Calories'}, inplace=True)
obesity.rename(columns={'FCVC': 'Vegetables'}, inplace=True)
obesity.rename(columns={'NCP': 'Meals'}, inplace=True)
obesity.rename(columns={'CAEC': 'Food_Between_Meals'}, inplace=True)
obesity.rename(columns={'SMOKE': 'Smoke'}, inplace=True)
obesity.rename(columns={'CH2O': 'Water'}, inplace=True)
obesity.rename(columns={'SCC': 'Monitor_Calories'}, inplace=True)
obesity.rename(columns={'FAF': 'Excercise'}, inplace=True)
obesity.rename(columns={'TUE': 'Technology'}, inplace=True)
obesity.rename(columns={'CALC': 'Alchohol'}, inplace=True)
obesity.rename(columns={'MTRANS': 'Transportation_Movement'}, inplace=True)


In [40]:
obesity.head()

Unnamed: 0,Gender,Age,Height,Weight,Family_Obesity,High_Calories,Vegetables,Meals,Food_Between_Meals,Smoke,Water,Monitor_Calories,Excercise,Technology,Alchohol,Transportation_Movement,NObeyesdad
0,Female,-0.522124,-0.875589,-0.862558,yes,no,-0.785019,0.404153,Sometimes,no,-0.013073,no,-1.188039,0.561997,no,Public_Transportation,Normal_Weight
1,Female,-0.522124,-1.947599,-1.168077,yes,no,1.088342,0.404153,Sometimes,yes,1.618759,yes,2.33975,-1.080625,Sometimes,Public_Transportation,Normal_Weight
2,Male,-0.206889,1.054029,-0.36609,yes,no,-0.785019,0.404153,Sometimes,no,-0.013073,no,1.16382,0.561997,Frequently,Public_Transportation,Normal_Weight
3,Male,0.423582,1.054029,0.015808,no,no,1.088342,0.404153,Sometimes,no,-0.013073,no,1.16382,-1.080625,Frequently,Walking,Overweight_Level_I
4,Male,-0.364507,0.839627,0.12274,no,no,-0.785019,-2.167023,Sometimes,no,-0.013073,no,-1.188039,-1.080625,Sometimes,Public_Transportation,Overweight_Level_II


In [35]:
# Identify the categorical columns to be one-hot encoded
categorical_columns = ['Gender', 'Family_Obesity', 'High_Calories', 'Food_Between_Meals', 'Smoke', 'Monitor_Calories', 'Transportation_Movement', 'NObeyesdad']

# Perform one-hot encoding
one_hot_encoded_data = pd.get_dummies(obesity, columns=categorical_columns)

# Display the encoded DataFrame
print(one_hot_encoded_data.head())

    Age  Height  Weight  Vegetables  Meals  Water  Excercise  Technology  \
0  21.0    1.62    64.0         2.0    3.0    2.0        0.0         1.0   
1  21.0    1.52    56.0         3.0    3.0    3.0        3.0         0.0   
2  23.0    1.80    77.0         2.0    3.0    2.0        2.0         1.0   
3  27.0    1.80    87.0         3.0    3.0    2.0        2.0         0.0   
4  22.0    1.78    89.8         2.0    1.0    2.0        0.0         0.0   

     Alchohol  Gender_Female  ...  Transportation_Movement_Motorbike  \
0          no           True  ...                              False   
1   Sometimes           True  ...                              False   
2  Frequently          False  ...                              False   
3  Frequently          False  ...                              False   
4   Sometimes          False  ...                              False   

   Transportation_Movement_Public_Transportation  \
0                                           True   
1     

In [38]:
from sklearn.preprocessing import StandardScaler

numerical_columns = ['Age', 'Height', 'Weight', 'Vegetables', 'Meals', 'Water', 'Excercise', 'Technology']
scaler = StandardScaler()
obesity[numerical_columns] = scaler.fit_transform(obesity[numerical_columns])

print(obesity.head())

   Gender       Age    Height    Weight Family_Obesity High_Calories  \
0  Female -0.522124 -0.875589 -0.862558            yes            no   
1  Female -0.522124 -1.947599 -1.168077            yes            no   
2    Male -0.206889  1.054029 -0.366090            yes            no   
3    Male  0.423582  1.054029  0.015808             no            no   
4    Male -0.364507  0.839627  0.122740             no            no   

   Vegetables     Meals Food_Between_Meals Smoke     Water Monitor_Calories  \
0   -0.785019  0.404153          Sometimes    no -0.013073               no   
1    1.088342  0.404153          Sometimes   yes  1.618759              yes   
2   -0.785019  0.404153          Sometimes    no -0.013073               no   
3    1.088342  0.404153          Sometimes    no -0.013073               no   
4   -0.785019 -2.167023          Sometimes    no -0.013073               no   

   Excercise  Technology    Alchohol Transportation_Movement  \
0  -1.188039    0.561997    

In [39]:
from sklearn.model_selection import train_test_split

X = obesity.drop(columns=['NObeyesdad'])  
y = obesity['NObeyesdad'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (1688, 16) (1688,)
Testing set shape: (423, 16) (423,)
