In [1]:
import pandas as pd  
import numpy as np 
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler  
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the dataset and give the two long column names short aliases in a single pass.
column_aliases = {
    'family_history_with_overweight': 'FHWOW',
    'NObeyesdad': 'Obesity',
}
data = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv').rename(columns=column_aliases)
print(data.shape)
data.head(10)

(2111, 17)


Unnamed: 0,Gender,Age,Height,Weight,FHWOW,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,29.0,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight
6,Female,23.0,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal_Weight
7,Male,22.0,1.64,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
8,Male,24.0,1.78,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal_Weight
9,Male,22.0,1.72,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight


In [3]:
# Count the null entries in each column.
missing_values = data.isna().sum()
print(missing_values)

Gender     0
Age        0
Height     0
Weight     0
FHWOW      0
FAVC       0
FCVC       0
NCP        0
CAEC       0
SMOKE      0
CH2O       0
SCC        0
FAF        0
TUE        0
CALC       0
MTRANS     0
Obesity    0
dtype: int64


In [4]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
object_dtypes = ['object']
categorical_columns = data.select_dtypes(include=object_dtypes).columns
numerical_columns = data.select_dtypes(exclude=object_dtypes).columns

In [5]:
# Show the names of the categorical (object-dtype) columns.
print(categorical_columns.values)

['Gender' 'FHWOW' 'FAVC' 'CAEC' 'SMOKE' 'SCC' 'CALC' 'MTRANS' 'Obesity']


In [6]:
# Show the names of the numerical (non-object-dtype) columns.
print(numerical_columns.values)

['Age' 'Height' 'Weight' 'FCVC' 'NCP' 'CH2O' 'FAF' 'TUE']


In [7]:
# Replace every categorical column with integer codes from a LabelEncoder.
# One fitted encoder is kept per column in `label_encoders`, so the codes
# (including the 'Obesity' target) can be mapped back to the original labels,
# e.g. label_encoders['Obesity'].classes_ or .inverse_transform(codes).
# Run this once per freshly loaded `data`: re-running it on already-encoded
# columns would refit the encoders on the integer codes and lose the labels.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder
data.head(10)

Unnamed: 0,Gender,Age,Height,Weight,FHWOW,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Obesity
0,0,21.0,1.62,64.0,1,0,2.0,3.0,2,0,2.0,0,0.0,1.0,3,3,1
1,0,21.0,1.52,56.0,1,0,3.0,3.0,2,1,3.0,1,3.0,0.0,2,3,1
2,1,23.0,1.8,77.0,1,0,2.0,3.0,2,0,2.0,0,2.0,1.0,1,3,1
3,1,27.0,1.8,87.0,0,0,3.0,3.0,2,0,2.0,0,2.0,0.0,1,4,5
4,1,22.0,1.78,89.8,0,0,2.0,1.0,2,0,2.0,0,0.0,0.0,2,3,6
5,1,29.0,1.62,53.0,0,1,2.0,3.0,2,0,2.0,0,0.0,0.0,2,0,1
6,0,23.0,1.5,55.0,1,1,3.0,3.0,2,0,2.0,0,1.0,0.0,2,2,1
7,1,22.0,1.64,53.0,0,0,2.0,3.0,2,0,2.0,0,3.0,0.0,2,3,1
8,1,24.0,1.78,64.0,1,1,3.0,3.0,2,0,2.0,0,1.0,1.0,1,3,1
9,1,22.0,1.72,68.0,1,1,2.0,3.0,2,0,2.0,0,1.0,1.0,3,3,1


In [8]:
# Number of instances per (encoded) target class.
print(data['Obesity'].value_counts())

Obesity
2    351
4    324
3    297
5    290
6    290
1    287
0    272
Name: count, dtype: int64


In [None]:
# Split into features (X) and target (y).
X = data.drop('Obesity', axis=1)
y = data['Obesity']

# Hold out 20% as the TEST set. The remaining 80% is rebound to X / y and is
# split into training and validation folds later, inside cross-validation.
# (train_test_split comes from the notebook's import cell.)
# NOTE(review): the split is not stratified; passing stratify=y would keep the
# class proportions equal in both parts, but it would change the reported results.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# k-NN classifier: 5 neighbours, Euclidean distance, distance-weighted votes.
# NOTE(review): the features are fed in unscaled although StandardScaler is
# imported at the top of the notebook -- confirm that this is intentional.
knn_params = {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
modelKnn = KNeighborsClassifier(**knn_params)

# 5-fold cross-validation over X / y; this is where the data is divided into
# training and validation folds.
scores = cross_val_score(modelKnn, X, y, cv=5)

print("Scores: ", scores)
print("Média dos scores:", scores.mean())

Scores:  [0.8816568  0.84911243 0.85502959 0.884273   0.87537092]
Média dos scores: 0.8690885466963989


In [None]:
# Final model: refit on all of X / y, i.e. the whole 80% portion that
# cross-validation divided into training and validation folds.
modelKnn.fit(X, y)

In [None]:
# Resubstitution check: predict on X / y, the very rows modelKnn was fitted on.
# This is NOT a validation estimate. The model has already seen every row, and
# with weights='distance' each row is its own zero-distance neighbour, so scores
# at or near 1.0 are expected here. Judge generalisation from the
# cross-validation scores and the test-set metrics instead.
predictionsKnn = modelKnn.predict(X)

print('Accuracy:', accuracy_score(y, predictionsKnn))
macro_metrics = (
    ('F1:', f1_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
)
for label, metric in macro_metrics:
    print(label, metric(y, predictionsKnn, average='macro'))



Accuracy: 1.0
F1: 1.0
Precision: 1.0
Recall: 1.0


In [None]:
# Evaluate the fitted model on the held-out test set.
predictionsKnn = modelKnn.predict(X_test)

print('Accuracy:', accuracy_score(y_test, predictionsKnn))
# The remaining metrics are macro-averaged over the classes.
macro_metrics = (
    ('F1:', f1_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
)
for label, metric in macro_metrics:
    print(label, metric(y_test, predictionsKnn, average='macro'))

Accuracy: 0.8865248226950354
F1: 0.8739308156513157
Precision: 0.8871655915657648
Recall: 0.8861879645329511


In [14]:
# Confusion matrix of y_test against predictionsKnn
# (scikit-learn convention: rows are true classes, columns are predicted classes).
cm = confusion_matrix(y_test, predictionsKnn)
print('Confusion Matrix:', cm, sep='\n')

Confusion Matrix:
[[55  1  0  0  0  0  0]
 [13 27  2  0  0 15  5]
 [ 0  0 76  1  1  0  0]
 [ 0  0  1 57  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  3  0  0  0 52  1]
 [ 0  0  5  0  0  0 45]]
