In [79]:
import pandas as pd
from sklearn import preprocessing, metrics, svm, tree
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier

le = preprocessing.LabelEncoder()
gnb = GaussianNB()
svm_ovo = svm.SVC(decision_function_shape='ovo')
svm_ovr = svm.SVC(decision_function_shape='ovr')
svm_lin = svm.LinearSVC(max_iter = 5000)
nc = NearestCentroid()
dt = tree.DecisionTreeClassifier()

dataset = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

print(dataset)

Age       24.312600
Height     1.701677
Weight    86.586058
FCVC       2.419043
NCP        2.685628
CH2O       2.008011
FAF        1.010298
TUE        0.657866
dtype: float64


In [80]:
dataset["SCC"] = le.fit_transform(dataset["SCC"])
dataset["SMOKE"] = le.fit_transform(dataset["SMOKE"])
dataset["Gender"] = le.fit_transform(dataset["Gender"]) 
dataset["family_history_with_overweight"] = le.fit_transform(dataset["family_history_with_overweight"])
dataset["FAVC"] = le.fit_transform(dataset["FAVC"])
dataset["NObeyesdad"] = le.fit_transform(dataset["NObeyesdad"])
dataset["CAEC"] = le.fit_transform(dataset["CAEC"])
dataset["CALC"] = le.fit_transform(dataset["CALC"])
dataset["MTRANS"] = le.fit_transform(dataset["MTRANS"])

minVal = min(dataset["Age"]);
maxVal = max(dataset["Age"]);

dataset["Age"] = (dataset["Age"] - minVal) / ( maxVal - minVal )

target = dataset['NObeyesdad']
dataset = dataset.drop(['Height', 'Weight', 'NObeyesdad'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(dataset,target, test_size=0.2,random_state=104) # 80% training and 20% test

# print(dataset)
print(dataset.mean())
print(dataset.std())
print(dataset.mode())
print(dataset.skew())


Gender                            0.505921
Age                               0.219417
family_history_with_overweight    0.817622
FAVC                              0.883941
FCVC                              2.419043
NCP                               2.685628
CAEC                              1.859308
SMOKE                             0.020843
CH2O                              2.008011
SCC                               0.045476
FAF                               1.010298
TUE                               0.657866
CALC                              2.268593
MTRANS                            2.365230
dtype: float64
Gender                            0.500083
Age                               0.135021
family_history_with_overweight    0.386247
FAVC                              0.320371
FCVC                              0.533927
NCP                               0.778039
CAEC                              0.468543
SMOKE                             0.142893
CH2O                              0.612

In [32]:
gnb.fit(X_train, y_train) 

y_pred = gnb.predict(X_test)

print("Accuracy gnb:",accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy gnb: 0.5933806146572104
Confusion matrix:
 [[39  1  7  8  0  0  0]
 [31  7  6  2  0  3  2]
 [ 0  2 42 20  0  1  2]
 [ 0  1  0 67  0  0  1]
 [ 1  0  0  0 68  0  0]
 [ 2  5 19 16  0 17  0]
 [ 4  3 11 23  0  1 11]]


In [31]:
svm_ovo.fit(X_train, y_train)

y_pred = svm_ovo.predict(X_test)

print("Accuracy svm_ovo:",metrics.accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy svm_ovo: 0.6903073286052009
Confusion matrix:
 [[47  3  4  1  0  0  0]
 [15 20  3  1  1 10  1]
 [ 1  0 44 13  1  2  6]
 [ 0  2  2 63  0  0  2]
 [ 0  1  0  0 68  0  0]
 [ 2  7 17  9  0 24  0]
 [ 2  4  8 11  1  1 26]]


In [33]:
svm_ovr.fit(X_train, y_train)

y_pred = svm_ovr.predict(X_test)

print("Accuracy svm_ovr:",metrics.accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy svm_ovr: 0.6903073286052009
Confusion matrix:
 [[47  3  4  1  0  0  0]
 [15 20  3  1  1 10  1]
 [ 1  0 44 13  1  2  6]
 [ 0  2  2 63  0  0  2]
 [ 0  1  0  0 68  0  0]
 [ 2  7 17  9  0 24  0]
 [ 2  4  8 11  1  1 26]]


In [34]:
svm_lin.fit(X_train, y_train)

y_pred = svm_lin.predict(X_test)

print("Accuracy svm_lin:",metrics.accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy svm_lin: 0.6099290780141844
Confusion matrix:
 [[39 11  2  3  0  0  0]
 [23 14  4  3  3  2  2]
 [ 2  0 48 12  1  2  2]
 [ 0  1  6 61  0  0  1]
 [ 1  0  0  0 68  0  0]
 [ 4  5 13 15  5 17  0]
 [ 3  3 18 15  2  1 11]]


In [63]:
nc.fit(X_train, y_train)

y_pred = nc.predict(X_test)

print("Accuracy nc:",metrics.accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy nc: 0.41843971631205673
Confusion matrix:
 [[20 14  0 10  6  4  1]
 [12 18  1  8  4  5  3]
 [ 9  4 25 15  3  9  2]
 [ 2 17  0 28  2 15  5]
 [ 8  1  0  0 60  0  0]
 [ 2  7  7 11  9 21  2]
 [ 8  4 11 10  5 10  5]]


In [84]:
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
print("Accuracy dt:",metrics.accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy dt: 0.7990543735224587
Confusion matrix:
 [[51  2  1  0  0  1  0]
 [ 6 33  3  1  0  4  4]
 [ 0  2 54  2  0  3  6]
 [ 1  1  0 62  0  0  5]
 [ 0  3  0  0 66  0  0]
 [ 4  1  6  1  0 43  4]
 [ 6  5  6  2  0  5 29]]
