In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [58]:
# Load the obesity survey CSV (expected next to the notebook) and preview the first rows.
DATA_PATH = 'ObesityDataSet_raw_and_data_sinthetic.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [7]:
# Count missing values per column (isna is the pandas alias of isnull).
data.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [59]:
# Columns stored with the generic object dtype ("O") are treated as categorical.
categorical_columns = [
    name for name, column_dtype in data.dtypes.items() if column_dtype == "O"
]
categorical_columns

['Gender',
 'family_history_with_overweight',
 'FAVC',
 'CAEC',
 'SMOKE',
 'SCC',
 'CALC',
 'MTRANS',
 'NObeyesdad']

In [46]:
# Every column whose dtype is not object ("O") is treated as numerical.
numerical_columns = [
    name for name, column_dtype in data.dtypes.items() if column_dtype != "O"
]
numerical_columns

['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Preprocessing data

In [61]:
# Integer-encode every categorical feature; the last entry of
# categorical_columns is skipped (the displayed list shows it is the target).
# A separate fitted encoder is kept per column so each mapping can later be
# inspected via encoders[col].classes_ or undone with inverse_transform.
# Refitting one shared encoder would keep only the last column's mapping.
encoders = {}
for column in categorical_columns[:-1]:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    encoders[column] = encoder

In [75]:
desired_output = {"Normal_Weight":'Normal',
                  "Overweight_Level_I":'Overweight',
                  "Overweight_Level_II":'Overweight',
                  "Obesity_Type_I":'Obesity',
                  "Insufficient_Weight":'Underweight',
                  "Obesity_Type_II":'Obesity',
                  "Obesity_Type_III":'Extreme_Obesity'}

In [78]:
# Required output: replace the raw target labels with the grouped categories.
# Labels that are not keys of desired_output (for example values already
# remapped by an earlier run of this cell) are passed through unchanged, so
# re-running the cell does not turn the whole column into NaN.
data['NObeyesdad'] = data['NObeyesdad'].map(
    lambda label: desired_output.get(label, label)
)

In [79]:
# Preview the first rows of the frame in its current state.
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,Normal
1,0,21.0,1.52,56.0,1,0,3.0,3.0,2,1,3.0,1,3.0,0.0,2,3,Normal
2,1,23.0,1.8,77.0,1,0,2.0,3.0,2,0,2.0,0,2.0,1.0,1,3,Normal
3,1,27.0,1.8,87.0,0,0,3.0,3.0,2,0,2.0,0,2.0,0.0,1,4,Overweight
4,1,22.0,1.78,89.8,0,0,2.0,1.0,2,0,2.0,0,0.0,0.0,2,3,Overweight


In [63]:
# Target is the NObeyesdad column; the features are every other column.
y = data.loc[:, 'NObeyesdad']
x = data.drop('NObeyesdad', axis=1)

In [80]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# train models

# DecisionTreeClassifier

In [81]:
# Decision Tree Classifier
# A fixed random_state makes the fitted tree, and therefore the accuracy
# reported below, identical on every Restart & Run All.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(x_train, y_train)

DecisionTreeClassifier()

In [82]:
# Predict the held-out rows with the fitted decision tree.
dt_predictions = dt_clf.predict(X=x_test)

In [84]:
# Share of test rows the decision tree labelled correctly.
accuracy_score(y_true=y_test, y_pred=dt_predictions)

0.9408983451536643

# DecisionTreeClassifier accuracy = `0.9408983451536643`

# LogisticRegression

In [86]:
# Logistic Regression Classifier
lr_clf = LogisticRegression(n_jobs=10,max_iter=100)
lr_clf.fit(x_train, y_train)
lr_predictions = lr_clf.predict(x_test)

In [87]:
# Share of test rows logistic regression labelled correctly.
accuracy_score(y_true=y_test, y_pred=lr_predictions)

0.735224586288416

# LogisticRegression accuracy = `0.735224586288416`

# RandomForestClassifier

In [89]:
# Random Forest Classifier
# A fixed random_state pins the bootstrap samples and feature sub-sampling,
# so the forest and its reported accuracy are reproducible across runs.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(x_train, y_train)
rf_predictions = rf_clf.predict(x_test)

In [90]:
# Share of test rows the random forest labelled correctly.
accuracy_score(y_true=y_test, y_pred=rf_predictions)

0.9527186761229315

# RandomForestClassifier accuracy = `0.9527186761229315`

# SVC

In [91]:
# Support Vector Classifier with library defaults; fit() returns the
# estimator itself, so construction and training are chained.
svm_clf = SVC().fit(x_train, y_train)
svm_predictions = svm_clf.predict(X=x_test)

In [92]:
# Share of test rows the SVC labelled correctly.
accuracy_score(y_true=y_test, y_pred=svm_predictions)

0.6761229314420804

# SVC accuracy = `0.6761229314420804`

# RandomForestClassifier gives the highest accuracy among all the trained models