In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
import patsy
import statsmodels.api as sm

In [2]:
obesity=pd.read_csv('obesity.csv')
obesity.isnull().sum()
obesity.info()
#the data does not have any missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [3]:
#first we rename the variable types. 

obesity.rename(columns={'family_history_with_overweight': 'Family_Obesity'}, inplace=True)
obesity.rename(columns={'FAVC': 'High_Calories'}, inplace=True)
obesity.rename(columns={'FCVC': 'Vegetables'}, inplace=True)
obesity.rename(columns={'NCP': 'Meals'}, inplace=True)
obesity.rename(columns={'CAEC': 'Food_Between_Meals'}, inplace=True)
obesity.rename(columns={'SMOKE': 'Smoke'}, inplace=True)
obesity.rename(columns={'CH2O': 'Water'}, inplace=True)
obesity.rename(columns={'SCC': 'Monitor_Calories'}, inplace=True)
obesity.rename(columns={'FAF': 'Excercise'}, inplace=True)
obesity.rename(columns={'TUE': 'Technology'}, inplace=True)
obesity.rename(columns={'CALC': 'Alchohol'}, inplace=True)
obesity.rename(columns={'MTRANS': 'Transportation_Movement'}, inplace=True)


In [4]:
obesity.head()

Unnamed: 0,Gender,Age,Height,Weight,Family_Obesity,High_Calories,Vegetables,Meals,Food_Between_Meals,Smoke,Water,Monitor_Calories,Excercise,Technology,Alchohol,Transportation_Movement,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [5]:
from sklearn.preprocessing import StandardScaler

numerical_columns = ['Age', 'Height', 'Weight', 'Vegetables', 'Meals', 'Water', 'Excercise', 'Technology']
scaler = StandardScaler()
obesity[numerical_columns] = scaler.fit_transform(obesity[numerical_columns])

In [6]:
obesity.head()

Unnamed: 0,Gender,Age,Height,Weight,Family_Obesity,High_Calories,Vegetables,Meals,Food_Between_Meals,Smoke,Water,Monitor_Calories,Excercise,Technology,Alchohol,Transportation_Movement,NObeyesdad
0,Female,-0.522124,-0.875589,-0.862558,yes,no,-0.785019,0.404153,Sometimes,no,-0.013073,no,-1.188039,0.561997,no,Public_Transportation,Normal_Weight
1,Female,-0.522124,-1.947599,-1.168077,yes,no,1.088342,0.404153,Sometimes,yes,1.618759,yes,2.33975,-1.080625,Sometimes,Public_Transportation,Normal_Weight
2,Male,-0.206889,1.054029,-0.36609,yes,no,-0.785019,0.404153,Sometimes,no,-0.013073,no,1.16382,0.561997,Frequently,Public_Transportation,Normal_Weight
3,Male,0.423582,1.054029,0.015808,no,no,1.088342,0.404153,Sometimes,no,-0.013073,no,1.16382,-1.080625,Frequently,Walking,Overweight_Level_I
4,Male,-0.364507,0.839627,0.12274,no,no,-0.785019,-2.167023,Sometimes,no,-0.013073,no,-1.188039,-1.080625,Sometimes,Public_Transportation,Overweight_Level_II


In [7]:
# Identify the categorical columns to be one-hot encoded
categorical_columns = ['Gender', 'Family_Obesity', 'High_Calories', 'Food_Between_Meals', 'Smoke', 'Monitor_Calories','Alchohol', 'Transportation_Movement']

# Perform one-hot encoding
obesity = pd.get_dummies(obesity, columns=categorical_columns)

# Display the encoded DataFrame
print(obesity.head())

        Age    Height    Weight  Vegetables     Meals     Water  Excercise  \
0 -0.522124 -0.875589 -0.862558   -0.785019  0.404153 -0.013073  -1.188039   
1 -0.522124 -1.947599 -1.168077    1.088342  0.404153  1.618759   2.339750   
2 -0.206889  1.054029 -0.366090   -0.785019  0.404153 -0.013073   1.163820   
3  0.423582  1.054029  0.015808    1.088342  0.404153 -0.013073   1.163820   
4 -0.364507  0.839627  0.122740   -0.785019 -2.167023 -0.013073  -1.188039   

   Technology           NObeyesdad  Gender_Female  ...  Monitor_Calories_yes  \
0    0.561997        Normal_Weight           True  ...                 False   
1   -1.080625        Normal_Weight           True  ...                  True   
2    0.561997        Normal_Weight          False  ...                 False   
3   -1.080625   Overweight_Level_I          False  ...                 False   
4   -1.080625  Overweight_Level_II          False  ...                 False   

   Alchohol_Always  Alchohol_Frequently  Alchohol_

In [8]:
obesity.head()

Unnamed: 0,Age,Height,Weight,Vegetables,Meals,Water,Excercise,Technology,NObeyesdad,Gender_Female,...,Monitor_Calories_yes,Alchohol_Always,Alchohol_Frequently,Alchohol_Sometimes,Alchohol_no,Transportation_Movement_Automobile,Transportation_Movement_Bike,Transportation_Movement_Motorbike,Transportation_Movement_Public_Transportation,Transportation_Movement_Walking
0,-0.522124,-0.875589,-0.862558,-0.785019,0.404153,-0.013073,-1.188039,0.561997,Normal_Weight,True,...,False,False,False,False,True,False,False,False,True,False
1,-0.522124,-1.947599,-1.168077,1.088342,0.404153,1.618759,2.33975,-1.080625,Normal_Weight,True,...,True,False,False,True,False,False,False,False,True,False
2,-0.206889,1.054029,-0.36609,-0.785019,0.404153,-0.013073,1.16382,0.561997,Normal_Weight,False,...,False,False,True,False,False,False,False,False,True,False
3,0.423582,1.054029,0.015808,1.088342,0.404153,-0.013073,1.16382,-1.080625,Overweight_Level_I,False,...,False,False,True,False,False,False,False,False,False,True
4,-0.364507,0.839627,0.12274,-0.785019,-2.167023,-0.013073,-1.188039,-1.080625,Overweight_Level_II,False,...,False,False,False,True,False,False,False,False,True,False


In [9]:
obesity_labels = obesity.columns.tolist()
print(obesity_labels)

['Age', 'Height', 'Weight', 'Vegetables', 'Meals', 'Water', 'Excercise', 'Technology', 'NObeyesdad', 'Gender_Female', 'Gender_Male', 'Family_Obesity_no', 'Family_Obesity_yes', 'High_Calories_no', 'High_Calories_yes', 'Food_Between_Meals_Always', 'Food_Between_Meals_Frequently', 'Food_Between_Meals_Sometimes', 'Food_Between_Meals_no', 'Smoke_no', 'Smoke_yes', 'Monitor_Calories_no', 'Monitor_Calories_yes', 'Alchohol_Always', 'Alchohol_Frequently', 'Alchohol_Sometimes', 'Alchohol_no', 'Transportation_Movement_Automobile', 'Transportation_Movement_Bike', 'Transportation_Movement_Motorbike', 'Transportation_Movement_Public_Transportation', 'Transportation_Movement_Walking']


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X = obesity.iloc[:, :-1]
y = obesity.iloc[:, -1]
 
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [11]:
X_train

Unnamed: 0,Age,Height,Weight,Vegetables,Meals,Water,Excercise,Technology,NObeyesdad,Gender_Female,...,Monitor_Calories_no,Monitor_Calories_yes,Alchohol_Always,Alchohol_Frequently,Alchohol_Sometimes,Alchohol_no,Transportation_Movement_Automobile,Transportation_Movement_Bike,Transportation_Movement_Motorbike,Transportation_Movement_Public_Transportation
1295,2.180837,-1.542294,-0.264440,-0.785019,0.404153,-1.196520,-1.188039,-1.080625,Obesity_Type_I,True,...,True,False,False,False,True,False,True,False,False,False
1627,2.530338,0.222106,0.869314,-0.256800,1.394543,-1.644905,0.376558,-1.080625,Obesity_Type_II,False,...,True,False,False,False,False,True,True,False,False,False
1388,2.417280,-1.381675,-0.248216,-0.650845,0.375753,-0.880723,-1.188039,-1.080625,Obesity_Type_I,True,...,True,False,False,False,False,True,True,False,False,False
1318,-0.169486,0.636030,0.429379,-0.785019,0.404153,1.600435,1.669380,2.168551,Obesity_Type_I,False,...,True,False,False,False,False,True,False,False,False,True
648,-0.278243,0.222803,-1.325371,0.457808,0.404153,-1.577820,-0.253878,1.205819,Insufficient_Weight,True,...,True,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,-0.731460,-0.023620,-0.327900,-0.925401,-0.420811,1.250035,-1.188039,-0.861226,Overweight_Level_II,False,...,True,False,False,False,False,True,False,False,False,True
1731,0.983906,0.830364,1.299023,0.472452,0.404153,0.224143,-0.150035,-0.105044,Obesity_Type_II,False,...,True,False,False,False,True,False,True,False,False,False
763,-0.522124,-0.875589,-0.633419,-0.785019,-2.167023,1.618759,-0.012109,-1.080625,Overweight_Level_I,False,...,True,False,False,False,True,False,False,False,False,True
835,0.789315,-0.634323,-0.636982,-0.097591,0.404153,-0.132887,0.671390,-1.080625,Overweight_Level_I,True,...,True,False,False,False,True,False,True,False,False,False
