In [2]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
from sklearn.datasets import make_circles

In [4]:
#Read CSV file into pandas df
df = pd.read_csv("ObesityDataSet.csv")
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [6]:
#Transform NObeyesdad Column with get_dummies
obesity_dummies = pd.get_dummies(df["NObeyesdad"])
obesity_dummies.tail()

Unnamed: 0,Insufficient_Weight,Normal_Weight,Obesity_Type_I,Obesity_Type_II,Obesity_Type_III,Overweight_Level_I,Overweight_Level_II
2106,0,0,0,0,1,0,0
2107,0,0,0,0,1,0,0
2108,0,0,0,0,1,0,0
2109,0,0,0,0,1,0,0
2110,0,0,0,0,1,0,0


In [7]:
df = pd.concat([df, obesity_dummies], axis=1)

# Drop the original NObeyesdad column
df = df.drop(columns=["NObeyesdad"])

# Display the DataFrame
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,...,TUE,CALC,MTRANS,Insufficient_Weight,Normal_Weight,Obesity_Type_I,Obesity_Type_II,Obesity_Type_III,Overweight_Level_I,Overweight_Level_II
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,...,1.0,no,Public_Transportation,0,1,0,0,0,0,0
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,...,0.0,Sometimes,Public_Transportation,0,1,0,0,0,0,0
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,...,1.0,Frequently,Public_Transportation,0,1,0,0,0,0,0
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,...,0.0,Frequently,Walking,0,0,0,0,0,1,0
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,...,0.0,Sometimes,Public_Transportation,0,0,0,0,0,0,1


In [8]:
#Do the same with other non numeric columns
gender_dummies = pd.get_dummies(df["Gender"])
gender_dummies.tail()

Unnamed: 0,Female,Male
2106,1,0
2107,1,0
2108,1,0
2109,1,0
2110,1,0


In [9]:
df = pd.concat([df, gender_dummies], axis=1)

# Drop the original column
df = df.drop(columns=["Gender"])

In [10]:
family_dummies = pd.get_dummies(df["family_history_with_overweight"])
family_dummies.tail()

Unnamed: 0,no,yes
2106,0,1
2107,0,1
2108,0,1
2109,0,1
2110,0,1


In [11]:
df = pd.concat([df, family_dummies], axis=1)

# Drop the original column
df = df.drop(columns=["family_history_with_overweight"])

In [12]:
favc_dummies = pd.get_dummies(df["FAVC"])
favc_dummies.tail()

Unnamed: 0,no,yes
2106,0,1
2107,0,1
2108,0,1
2109,0,1
2110,0,1


In [13]:
df = pd.concat([df, favc_dummies], axis=1)

# Drop the original column
df = df.drop(columns=["FAVC"])

In [14]:
caec_dummies = pd.get_dummies(df["CAEC"])
caec_dummies.tail()

Unnamed: 0,Always,Frequently,Sometimes,no
2106,0,0,1,0
2107,0,0,1,0
2108,0,0,1,0
2109,0,0,1,0
2110,0,0,1,0


In [15]:
df = pd.concat([df, caec_dummies], axis=1)

# Drop the original column
df = df.drop(columns=["CAEC"])

In [16]:
smoke_dummies = pd.get_dummies(df["SMOKE"])
smoke_dummies.tail()

Unnamed: 0,no,yes
2106,1,0
2107,1,0
2108,1,0
2109,1,0
2110,1,0


In [17]:
df = pd.concat([df, smoke_dummies], axis=1)

# Drop the original column
df = df.drop(columns=["SMOKE"])

In [18]:
scc_dummies = pd.get_dummies(df["SCC"])
scc_dummies.tail()

Unnamed: 0,no,yes
2106,1,0
2107,1,0
2108,1,0
2109,1,0
2110,1,0


In [19]:
df = pd.concat([df, scc_dummies], axis=1)

# Drop the original column
df = df.drop(columns=["SCC"])

In [20]:
calc_dummies = pd.get_dummies(df["CALC"])
calc_dummies.tail()

Unnamed: 0,Always,Frequently,Sometimes,no
2106,0,0,1,0
2107,0,0,1,0
2108,0,0,1,0
2109,0,0,1,0
2110,0,0,1,0


In [21]:
df = pd.concat([df, calc_dummies], axis=1)

# Drop the original column
df = df.drop(columns=["CALC"])

In [22]:
transit_dummies = pd.get_dummies(df["MTRANS"])
transit_dummies.tail()

Unnamed: 0,Automobile,Bike,Motorbike,Public_Transportation,Walking
2106,0,0,0,1,0
2107,0,0,0,1,0
2108,0,0,0,1,0
2109,0,0,0,1,0
2110,0,0,0,1,0


In [23]:
df = pd.concat([df, transit_dummies], axis=1)

# Drop the original column
df = df.drop(columns=["MTRANS"])

In [24]:
df.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Insufficient_Weight,Normal_Weight,...,yes,Always,Frequently,Sometimes,no,Automobile,Bike,Motorbike,Public_Transportation,Walking
0,21.0,1.62,64.0,2.0,3.0,2.0,0.0,1.0,0,1,...,0,0,0,0,1,0,0,0,1,0
1,21.0,1.52,56.0,3.0,3.0,3.0,3.0,0.0,0,1,...,1,0,0,1,0,0,0,0,1,0
2,23.0,1.8,77.0,2.0,3.0,2.0,2.0,1.0,0,1,...,0,0,1,0,0,0,0,0,1,0
3,27.0,1.8,87.0,3.0,3.0,2.0,2.0,0.0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,22.0,1.78,89.8,2.0,1.0,2.0,0.0,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [25]:
df.to_csv('clean_df.csv', index=False)  