# Autor: Natan Nobre Chaves
## Bacharelando em Engenharia de Computação

## Title: Pima Indians Diabetes Database

### Context:<br>
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

### Content:<br>
The datasets consists of several medical predictor variables and one target variable, Outcome. Predictor variables includes the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

Source: https://www.kaggle.com/uciml/pima-indians-diabetes-database

# Bibliotecas

In [575]:
from matplotlib import pyplot
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import plotly.express as px

from sklearn import pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

from sklearn import model_selection
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Generate and plot a synthetic imbalanced classification dataset
from numpy import where
from collections import Counter
import imblearn
from imblearn.over_sampling import SMOTE

# Importação do Dataset

In [576]:
diabetes = pd.read_csv("dataset/diabetes.csv")
print(diabetes.shape)
diabetes.head()

(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [577]:
diabetes.groupby("Outcome").count()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,500,500,500,500,500,500,500,500
1,268,268,268,268,268,268,268,268


# Questão 1

In [578]:
y = diabetes['Outcome'].values
X = diabetes.copy()
del X['Outcome']
X = X.values
print(y.shape)
print(X.shape)

(768,)
(768, 8)


In [579]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)
X_treino_holdout, X_validacao_holdout, y_treino_holdout, y_validacao_holdout = train_test_split(X_treino, y_treino, train_size=0.7, random_state=42, stratify=y_treino)
print(X_treino_holdout.shape)
print(X_validacao_holdout.shape)

(429, 8)
(185, 8)


## Modelos de KNN SEM escalonamento de features

In [580]:
quantidade_de_modelos = 55

# criando os modelos kNN
modelokNN = []
for idx in range(quantidade_de_modelos) :
    modelokNN.append(KNeighborsClassifier(n_neighbors=(idx+1)))

# treinando o modelo
for idx in range(quantidade_de_modelos) :
    modelokNN[idx].fit(X_treino_holdout, y_treino_holdout)

# predizendo as amostras de teste
y_pred = []
for idx in range(quantidade_de_modelos) :
    y_pred.append(np.array(modelokNN[idx].predict(X_validacao_holdout)))

In [581]:
# Calculando os F1-Scores
f1_scores_nao_escalonado = []
for idx in range(quantidade_de_modelos) :
    f1_scores_nao_escalonado.append([metrics.f1_score(y_validacao_holdout, y_pred[idx], average='weighted', zero_division=0), "K = " + str(idx+1), idx+1])

# Ordenando do maior para o menor
f1_scores_nao_escalonado.sort(reverse=True)
for idx in range(quantidade_de_modelos) :
    print(f1_scores_nao_escalonado[idx])

[0.8098258905806076, 'K = 51', 51]
[0.8098258905806076, 'K = 49', 49]
[0.8083248078448253, 'K = 50', 50]
[0.8000160160160161, 'K = 52', 52]
[0.7948146619902345, 'K = 47', 47]
[0.7933197348291688, 'K = 42', 42]
[0.7900434235816151, 'K = 48', 48]
[0.7898485008260396, 'K = 39', 39]
[0.7883586226502256, 'K = 43', 43]
[0.7882522522522524, 'K = 54', 54]
[0.7850914512199558, 'K = 53', 53]
[0.784898884898885, 'K = 41', 41]
[0.7834154765452476, 'K = 45', 45]
[0.7834154765452476, 'K = 31', 31]
[0.7818374978752337, 'K = 34', 34]
[0.7818374978752337, 'K = 32', 32]
[0.78016038016038, 'K = 55', 55]
[0.7769185481988864, 'K = 37', 37]
[0.7769185481988864, 'K = 36', 36]
[0.7769185481988864, 'K = 35', 35]
[0.7769185481988864, 'K = 33', 33]
[0.7769185481988864, 'K = 23', 23]
[0.7752487797770817, 'K = 44', 44]
[0.7752487797770817, 'K = 40', 40]
[0.7734747729075208, 'K = 46', 46]
[0.7703552609212987, 'K = 25', 25]
[0.7703552609212987, 'K = 21', 21]
[0.7685898738530318, 'K = 38', 38]
[0.7654784737475474, 'K

## Criando o melhor modelo KNN sem escalonamento

In [582]:
# criando
print("K = "+str(f1_scores_nao_escalonado[0][2]))
melhor_modelo = KNeighborsClassifier(n_neighbors=f1_scores_nao_escalonado[0][2])

# treinando
melhor_modelo.fit(X_treino, y_treino)

# predizendo
y_pred = melhor_modelo.predict(X_teste)

K = 51


In [583]:
print(metrics.accuracy_score(y_teste, y_pred))
print(metrics.classification_report(y_teste, y_pred, zero_division=0))

0.6753246753246753
              precision    recall  f1-score   support

           0       0.72      0.81      0.76       100
           1       0.55      0.43      0.48        54

    accuracy                           0.68       154
   macro avg       0.64      0.62      0.62       154
weighted avg       0.66      0.68      0.66       154



## Modelos de KNN COM escalonamento de features

In [584]:
quantidade_de_modelos = 55

# criando os pipelines
pipe = []
for idx in range(quantidade_de_modelos) :
    pipe.append(Pipeline(steps = [('scale',StandardScaler()),('KNN',KNeighborsClassifier(n_neighbors=(idx+1)))]))

# treinando
for idx in range(quantidade_de_modelos) :
    pipe[idx].fit(X_treino_holdout, y_treino_holdout)

# predizendo as amostras de teste
y_pred = []
for idx in range(quantidade_de_modelos) :
    y_pred.append(np.array(pipe[idx].predict(X_validacao_holdout)))

In [585]:
# Calculando os F1-Scores
f1_scores_escalonado = []
for idx in range(quantidade_de_modelos) :
    f1_scores_escalonado.append([metrics.f1_score(y_validacao_holdout, y_pred[idx], average='weighted', zero_division=0), "K = " + str(idx+1), idx+1])

# Ordenando do maior para o menor
f1_scores_escalonado.sort(reverse=True)
for idx in range(quantidade_de_modelos) :
    print(f1_scores_escalonado[idx])

[0.7573967790184007, 'K = 19', 19]
[0.7555475266815472, 'K = 37', 37]
[0.7550506608452175, 'K = 29', 29]
[0.7525485618508875, 'K = 17', 17]
[0.7483921878082461, 'K = 51', 51]
[0.7481646908788137, 'K = 25', 25]
[0.7481646908788137, 'K = 23', 23]
[0.7477108319365817, 'K = 15', 15]
[0.7461129082750705, 'K = 11', 11]
[0.7459608551859475, 'K = 33', 33]
[0.7459608551859475, 'K = 31', 31]
[0.745448861238335, 'K = 21', 21]
[0.7433864065997517, 'K = 8', 8]
[0.742598324844869, 'K = 16', 16]
[0.7411436699857753, 'K = 42', 42]
[0.7411436699857753, 'K = 39', 39]
[0.7411436699857753, 'K = 36', 36]
[0.7388744687064015, 'K = 55', 55]
[0.7388744687064015, 'K = 35', 35]
[0.7388744687064015, 'K = 27', 27]
[0.7385146413445003, 'K = 46', 46]
[0.7380622566300316, 'K = 9', 9]
[0.7364108634181626, 'K = 45', 45]
[0.7364108634181626, 'K = 41', 41]
[0.7364108634181626, 'K = 30', 30]
[0.7338783549309865, 'K = 14', 14]
[0.7338783549309865, 'K = 12', 12]
[0.7337983682109491, 'K = 53', 53]
[0.7337983682109491, 'K = 

## Criando o melhor modelo KNN com escalonamento

In [586]:
# criando
print("K = "+str(f1_scores_escalonado[0][2]))
melhor_modelo = KNeighborsClassifier(n_neighbors=f1_scores_escalonado[0][2])

# treinando
melhor_modelo.fit(X_treino, y_treino)

# predizendo
y_pred = melhor_modelo.predict(X_teste)

K = 19


In [587]:
print(metrics.accuracy_score(y_teste, y_pred))
print(metrics.classification_report(y_teste, y_pred, zero_division=0))

0.7272727272727273
              precision    recall  f1-score   support

           0       0.77      0.83      0.80       100
           1       0.63      0.54      0.58        54

    accuracy                           0.73       154
   macro avg       0.70      0.68      0.69       154
weighted avg       0.72      0.73      0.72       154



# Questão 2

## Modelos Sem Escalonamento

In [588]:
quantidade_de_modelos = 55

# criando os modelos kNN
modelokNN = []
for idx in range(quantidade_de_modelos) :
    modelokNN.append(KNeighborsClassifier(n_neighbors=(idx+1)))

cv = []
for idx in range(quantidade_de_modelos) :
    cv.append([cross_val_score(modelokNN[idx],X_treino,y_treino,cv=5,scoring='f1_weighted').mean(), "K = " + str(idx+1), idx+1])

In [589]:
# Ordenando do maior para o menor
cv.sort(reverse=True)
for idx in range(quantidade_de_modelos) :
    print(cv[idx])

[0.7606241388532532, 'K = 12', 12]
[0.758260766571087, 'K = 14', 14]
[0.7564372883827548, 'K = 15', 15]
[0.7503549799350728, 'K = 11', 11]
[0.7500613027439541, 'K = 17', 17]
[0.7492766940816857, 'K = 16', 16]
[0.7488404793577346, 'K = 10', 10]
[0.747555881225926, 'K = 19', 19]
[0.7448849237389392, 'K = 27', 27]
[0.7436937094136458, 'K = 13', 13]
[0.7432099292202186, 'K = 9', 9]
[0.7428826323389677, 'K = 29', 29]
[0.7405528023366097, 'K = 36', 36]
[0.7399650411141376, 'K = 23', 23]
[0.7395763029378862, 'K = 28', 28]
[0.7392051294083578, 'K = 54', 54]
[0.7391921881495435, 'K = 25', 25]
[0.7384521433641553, 'K = 26', 26]
[0.7374244968245739, 'K = 20', 20]
[0.7371472480935382, 'K = 21', 21]
[0.7367600481660022, 'K = 52', 52]
[0.7366779405819885, 'K = 35', 35]
[0.7354765596511476, 'K = 34', 34]
[0.7353565440374894, 'K = 53', 53]
[0.7353416764765986, 'K = 33', 33]
[0.7352273454313959, 'K = 31', 31]
[0.7348283833554563, 'K = 37', 37]
[0.7346699681169848, 'K = 49', 49]
[0.7346491747446551, 'K 

## Criando o melhor modelo sem escalonamento

In [590]:
# criando
print("K = "+str(cv[0][2]))
melhor_modelo = KNeighborsClassifier(n_neighbors=cv[0][2])

# treinando
melhor_modelo.fit(X_treino, y_treino)

# predizendo
y_pred = melhor_modelo.predict(X_teste)

K = 12


In [591]:
print(metrics.accuracy_score(y_teste, y_pred))
print(metrics.classification_report(y_teste, y_pred, zero_division=0))

0.7012987012987013
              precision    recall  f1-score   support

           0       0.74      0.84      0.79       100
           1       0.60      0.44      0.51        54

    accuracy                           0.70       154
   macro avg       0.67      0.64      0.65       154
weighted avg       0.69      0.70      0.69       154



## Modelos Com Escalonamento

In [592]:
quantidade_de_modelos = 55

# criando os pipelines
pipe = []
for idx in range(quantidade_de_modelos) :
    pipe.append(Pipeline(steps = [('scale',StandardScaler()),('KNN',KNeighborsClassifier(n_neighbors=(idx+1)))]))

cv = []
for idx in range(quantidade_de_modelos) :
    cv.append([cross_val_score(pipe[idx],X_treino,y_treino,cv=5,scoring='f1_weighted').mean(), "K = " + str(idx+1), idx+1])

In [593]:
# Ordenando do maior para o menor
cv.sort(reverse=True)
for idx in range(quantidade_de_modelos) :
    print(cv[idx])

[0.7507420007654881, 'K = 31', 31]
[0.7485312592517452, 'K = 30', 30]
[0.7473043240771392, 'K = 13', 13]
[0.7467381782824891, 'K = 29', 29]
[0.7465483105942291, 'K = 41', 41]
[0.7449011096556681, 'K = 35', 35]
[0.7448486457570255, 'K = 33', 33]
[0.744209984300043, 'K = 15', 15]
[0.7412629069950082, 'K = 14', 14]
[0.7401998752328218, 'K = 24', 24]
[0.739968342864092, 'K = 25', 25]
[0.7394140965024014, 'K = 7', 7]
[0.7392839849411941, 'K = 9', 9]
[0.7390136639665208, 'K = 5', 5]
[0.7388287629854672, 'K = 45', 45]
[0.738820473041174, 'K = 32', 32]
[0.7380539193535964, 'K = 42', 42]
[0.7380397871255525, 'K = 37', 37]
[0.7367209127258352, 'K = 11', 11]
[0.7353603495110055, 'K = 39', 39]
[0.7349044065501291, 'K = 17', 17]
[0.7348863779724158, 'K = 20', 20]
[0.7347352032580259, 'K = 23', 23]
[0.7346151154816634, 'K = 49', 49]
[0.734143346654964, 'K = 10', 10]
[0.7339137258082209, 'K = 40', 40]
[0.733848984933099, 'K = 43', 43]
[0.7338093400301827, 'K = 21', 21]
[0.7336668652908517, 'K = 55', 

## Criando o melhor modelo com escalonamento

In [594]:
# criando
print("K = "+str(cv[0][2]))
melhor_modelo = KNeighborsClassifier(n_neighbors=cv[0][2])

# treinando
melhor_modelo.fit(X_treino, y_treino)

# predizendo
y_pred = melhor_modelo.predict(X_teste)

K = 31


In [595]:
print(metrics.accuracy_score(y_teste, y_pred))
print(metrics.classification_report(y_teste, y_pred, zero_division=0))

0.6688311688311688
              precision    recall  f1-score   support

           0       0.72      0.81      0.76       100
           1       0.54      0.41      0.46        54

    accuracy                           0.67       154
   macro avg       0.63      0.61      0.61       154
weighted avg       0.65      0.67      0.66       154



# Questão 3

## Modelos Sem Escalonamento

In [596]:
knn = KNeighborsClassifier()

param_range = range(1,quantidade_de_modelos)

#param_grid = [{'kneighborsclassifier__n_neighbors': param_range}]
param_grid = [{'n_neighbors': param_range}]

In [597]:
gs = model_selection.GridSearchCV(estimator=knn, param_grid=param_grid, 
                    return_train_score=False, scoring='f1_weighted', cv=5)
gs = gs.fit(X_treino, y_treino)

print(gs.best_score_)
print(gs.best_params_)

0.7606241388532532
{'n_neighbors': 12}


In [598]:
df = pd.DataFrame(gs.cv_results_)
df.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,0.000798,0.000398922,0.003989,3.504023e-07,12,{'n_neighbors': 12},0.754521,0.776571,0.729023,0.736149,0.806857,0.760624,0.028379,1
13,0.000598,0.0004885776,0.004189,0.0003992797,14,{'n_neighbors': 14},0.752093,0.793758,0.690636,0.732693,0.822124,0.758261,0.046051,2
14,0.000798,0.0003989698,0.004189,0.0003990889,15,{'n_neighbors': 15},0.764077,0.79778,0.679818,0.732146,0.808366,0.756437,0.046743,3
10,0.000997,1.507891e-07,0.003989,1.907349e-07,11,{'n_neighbors': 11},0.734783,0.801087,0.700802,0.72193,0.793173,0.750355,0.039782,4
16,0.000804,0.0005113873,0.004195,0.000513407,17,{'n_neighbors': 17},0.761841,0.775363,0.700802,0.711541,0.80076,0.750061,0.038104,5


In [599]:
melhor_n = df.sort_values(by='rank_test_score')['param_n_neighbors'].iloc[0]
print(melhor_n)

12


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000997,1.168008e-07,0.00379,0.0003985883,1,{'n_neighbors': 1},0.647306,0.644077,0.686524,0.723577,0.779294,0.696156,0.050706,54
1,0.000798,0.0003989221,0.00379,0.0003990653,2,{'n_neighbors': 2},0.672499,0.692774,0.672499,0.721875,0.746645,0.701259,0.029021,53
2,0.000997,2.861023e-07,0.004388,0.0007982254,3,{'n_neighbors': 3},0.718207,0.683752,0.682569,0.72501,0.805228,0.722953,0.044639,50
3,0.000598,0.0004884998,0.004189,0.0003988744,4,{'n_neighbors': 4},0.732146,0.700968,0.679231,0.692774,0.729172,0.706858,0.020658,52
4,0.0,0.0,0.003989,3.371748e-07,5,{'n_neighbors': 5},0.739949,0.73269,0.680114,0.747225,0.773267,0.734649,0.030514,29


## Criando o melhor modelo sem escalonamento

In [600]:
# criando
melhor_modelo = KNeighborsClassifier(n_neighbors=melhor_n)

# treinando
melhor_modelo.fit(X_treino, y_treino)

# predizendo
y_pred = melhor_modelo.predict(X_teste)

In [601]:
print(metrics.accuracy_score(y_teste, y_pred))
print(metrics.classification_report(y_teste, y_pred, zero_division=0))

0.7012987012987013
              precision    recall  f1-score   support

           0       0.74      0.84      0.79       100
           1       0.60      0.44      0.51        54

    accuracy                           0.70       154
   macro avg       0.67      0.64      0.65       154
weighted avg       0.69      0.70      0.69       154



## Modelos Com Escalonamento

In [602]:
param_grid = [{'kneighborsclassifier__n_neighbors': param_range}]
pipe = pipeline.make_pipeline(StandardScaler(), KNeighborsClassifier())
gs = model_selection.GridSearchCV(estimator=pipe, param_grid=param_grid, 
                    return_train_score=False, scoring='f1_weighted', cv=5)
gs = gs.fit(X_treino, y_treino)

print(gs.best_score_)
print(gs.best_params_)

0.7507420007654881
{'kneighborsclassifier__n_neighbors': 31}


In [603]:
df = pd.DataFrame(gs.cv_results_)
df.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
30,0.001396,0.000489,0.004987,3.693565e-07,31,{'kneighborsclassifier__n_neighbors': 31},0.732146,0.793758,0.732146,0.728918,0.766743,0.750742,0.02559,1
29,0.001197,0.000399,0.004987,2.611745e-07,30,{'kneighborsclassifier__n_neighbors': 30},0.736149,0.791448,0.729023,0.724797,0.76124,0.748531,0.024902,2
12,0.001463,0.000452,0.004629,0.000366447,13,{'kneighborsclassifier__n_neighbors': 13},0.717899,0.764077,0.754521,0.736149,0.763877,0.747304,0.017873,3
28,0.001197,0.000399,0.004987,2.431402e-07,29,{'kneighborsclassifier__n_neighbors': 29},0.707823,0.791448,0.749427,0.710846,0.774147,0.746738,0.033347,4
40,0.001396,0.000489,0.004987,2.611745e-07,41,{'kneighborsclassifier__n_neighbors': 41},0.761013,0.756689,0.703932,0.74699,0.764118,0.746548,0.022078,5


In [604]:
melhor_n = df.sort_values(by='rank_test_score')['param_kneighborsclassifier__n_neighbors'].iloc[0]
print(melhor_n)

31


## Criando o melhor modelo com escalonamento

In [605]:
# criando
melhor_modelo = KNeighborsClassifier(n_neighbors=melhor_n)

# treinando
melhor_modelo.fit(X_treino, y_treino)

# predizendo
y_pred = melhor_modelo.predict(X_teste)

In [606]:
print(metrics.accuracy_score(y_teste, y_pred))
print(metrics.classification_report(y_teste, y_pred, zero_division=0))

0.6688311688311688
              precision    recall  f1-score   support

           0       0.72      0.81      0.76       100
           1       0.54      0.41      0.46        54

    accuracy                           0.67       154
   macro avg       0.63      0.61      0.61       154
weighted avg       0.65      0.67      0.66       154

