In [419]:
import numpy as np
import pandas as pd
from statistics import mean
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [420]:
df = pd.read_csv('dataR2.csv')
df

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.500000,70,2.707,0.467409,8.8071,9.702400,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.124670,91,4.498,1.009651,17.9393,22.432040,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.169560,12.76600,928.220,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.819240,10.57635,773.920,1
...,...,...,...,...,...,...,...,...,...,...
111,45,26.850000,92,3.330,0.755688,54.6800,12.100000,10.96000,268.230,2
112,62,26.840000,100,4.530,1.117400,12.4500,21.420000,7.32000,330.160,2
113,65,32.050000,97,5.730,1.370998,61.4800,22.540000,10.33000,314.050,2
114,72,25.590000,82,2.820,0.570392,24.9600,33.750000,3.27000,392.460,2


## Separando Labels e Features e normalizando as features

Normalização é necessária para melhor performance na utilização do KNN

In [421]:
scaler = StandardScaler()

In [422]:
data_raw = df.values[:, :-1]
labels = df.values[:, -1]

data = scaler.fit_transform(data_raw)

## KNN básico

Aplicando o KNN básico, sem utilização de KFold para ter uma base

In [423]:
trainData, testData, trainLabels, testLabels = train_test_split(data,
labels, stratify=labels, test_size=0.2, random_state=42)

kVals = range(1, 30, 2)
accuracies = []
for k in kVals:
        # train the k-Nearest Neighbor classifier with the current value of `k`
        model = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        model.fit(trainData, trainLabels)
        # evaluate the model and update the accuracies list
        score = model.score(testData, testLabels)
        print("k=%d, accuracy=%.2f%%" % (k, score * 100))
        accuracies.append(score)

model = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
model.fit(trainData, trainLabels)
score = model.score(testData, testLabels)
print("accuracy=%.2f%%" % (score * 100))

k=1, accuracy=70.83%
k=3, accuracy=70.83%
k=5, accuracy=87.50%
k=7, accuracy=75.00%
k=9, accuracy=75.00%
k=11, accuracy=75.00%
k=13, accuracy=70.83%
k=15, accuracy=66.67%
k=17, accuracy=66.67%
k=19, accuracy=70.83%
k=21, accuracy=66.67%
k=23, accuracy=66.67%
k=25, accuracy=66.67%
k=27, accuracy=62.50%
k=29, accuracy=58.33%
accuracy=87.50%


## KNN com K Fold externo e interno

In [424]:
skf = StratifiedKFold(n_splits=5)
kVals = [1, 3, 5, 11, 21, 31]

# Etapa 1 - Separar em treino e teste
accuracies = []
for train, test in skf.split(data, labels):

    # Etapa 2 - Para cada valor de hiperparametro
    accuracies_kvals = []
    for k in kVals:

        # Etapa 2 - Para cada fold
        models_accuracies = []
        for t_train, t_test in skf.split(data[train], labels[train]):

            # Etapa 2 - Treinar um modelo e medir a acurácia
            model = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
            model.fit(data[train[t_train]], labels[train[t_train]])
            score = model.score(data[train[t_test]], labels[train[t_test]])
            models_accuracies.append(score)
        
        # Etapa 2 - Salvar a acurácia média dos k folds para cada hiperparametro
        kVal_mean_accuracy = mean(models_accuracies)
        accuracies_kvals.append(kVal_mean_accuracy)

    # Etapa 3 - Com o melhor hiperparâmetro, treina o modelo com todos os k folds dos dados de treino
    print(accuracies_kvals)
    best_k = kVals[accuracies_kvals.index(max(accuracies_kvals))]

    model = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean')
    model.fit(data[train], labels[train])
    
    score = model.score(data[test], labels[test])
    accuracies.append(score)
    print(f"\n Best K: {best_k}, Accuracy: {score*100} \n")

# Etapa 4 - Tira a média das k acurácias
mean_accuracy = mean(accuracies)
print("========================================\n Mean Accuracy \n========================================")
print("mean accuracy: %.2f%%" % (mean_accuracy*100))

[0.5654970760233918, 0.564327485380117, 0.5742690058479532, 0.5654970760233918, 0.5754385964912281, 0.6187134502923977]

 Best K: 31, Accuracy: 70.83333333333334 

[0.6111111111111112, 0.6421052631578947, 0.6309941520467837, 0.62046783625731, 0.6947368421052631, 0.643859649122807]

 Best K: 21, Accuracy: 56.52173913043478 

[0.5923976608187135, 0.5912280701754385, 0.6011695906432748, 0.57953216374269, 0.5578947368421052, 0.6251461988304093]

 Best K: 31, Accuracy: 78.26086956521739 

[0.5900584795321637, 0.6116959064327485, 0.5894736842105263, 0.5900584795321637, 0.5690058479532164, 0.504093567251462]

 Best K: 3, Accuracy: 65.21739130434783 

[0.6883040935672514, 0.7, 0.6777777777777777, 0.6900584795321637, 0.7327485380116959, 0.6567251461988304]

 Best K: 21, Accuracy: 47.82608695652174 

 Mean Accuracy 
mean accuracy: 63.73%


Agora treinando o modelo com todo o conjunto de dados de treino, utilizando o melhor K obtido da validação cruzada K-Fold

In [425]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, stratify=labels, test_size=0.2, random_state=42)

best_k = 31

model = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean')
model.fit(X_train, y_train)

# evalute model
score = model.score(X_test, y_test)
print("k=%d, accuracy=%.2f%%" % (best_k, score * 100))

k=31, accuracy=62.50%


## Decision Tree com K-Fold externo e interno

Decision Tree não precisa que os dados estejam normalizados, então irei utilizar as features da forma que são retiradas do DataFrame

In [426]:
data = df.values[:, :-1]
labels = df.values[:, -1]

In [427]:
skf = StratifiedKFold(n_splits=5)
max_depth_vals = np.arange(10, 101, 10)
criterion_vals = ['gini', 'entropy', 'log_loss']

# Etapa 1 - Separar em treino e teste
accuracies = []
for train, test in skf.split(data, labels):

    # Etapa 2 - Para cada valor de hiperparametro
    accuracies_max_depth_vals = []
    best_criterion_for_x_max_depth = []
    for max_depth in max_depth_vals:

        accuracies_criterion_vals = []
        for criterion in criterion_vals:
        # Etapa 2 - Para cada fold
            models_accuracies = []
            for t_train, t_test in skf.split(data[train], labels[train]):

                # Etapa 2 - Treinar um modelo e medir a acurácia
                model = DecisionTreeClassifier(max_depth=max_depth, criterion=criterion, random_state=42)
                model.fit(data[train[t_train]], labels[train[t_train]])
                score = model.score(data[train[t_test]], labels[train[t_test]])
                models_accuracies.append(score)

            # Etapa 2 - Salvar a acurácia média dos k folds para cada hiperparametro
            criterion_mean_accuracy = mean(models_accuracies)
            accuracies_criterion_vals.append(criterion_mean_accuracy)

        # Etapa 2 - Salvar o melhor hiperparametro 'criterion' para cada X max depth
        best_criterion_for_x_max_depth.append(criterion_vals[accuracies_criterion_vals.index(max(accuracies_criterion_vals))])
        accuracies_max_depth_vals.append(max(accuracies_criterion_vals))

    print(accuracies_max_depth_vals)
    print(best_criterion_for_x_max_depth)

    # Etapa 3 - Com os melhores hiperparâmetros, treinar o modelo com todos os k folds dos dados de treino
    best_max_depth = max_depth_vals[accuracies_max_depth_vals.index(max(accuracies_max_depth_vals))]
    best_criterion = best_criterion_for_x_max_depth[accuracies_max_depth_vals.index(max(accuracies_max_depth_vals))]

    model = DecisionTreeClassifier(max_depth=best_max_depth, criterion=best_criterion, random_state=42)
    model.fit(data[train], labels[train])

    # Etapa 3 - Para cada K fold calcula a acurácia do modelo
    y_pred = model.predict(data[test])
    score = accuracy_score(labels[test], y_pred)
    accuracies.append(score)

    print(f"\n Best max_depth: {best_max_depth}, Best criterion: {best_criterion}, Accuracy: {score*100} \n")

# Etapa 4 - Tira a média das k acurácias
mean_accuracy = mean(accuracies)
print("========================================\n Mean Accuracy \n========================================")
print("mean accuracy: %.2f%%" % (mean_accuracy*100))

[0.6099415204678362, 0.6099415204678362, 0.6099415204678362, 0.6099415204678362, 0.6099415204678362, 0.6099415204678362, 0.6099415204678362, 0.6099415204678362, 0.6099415204678362, 0.6099415204678362]
['entropy', 'entropy', 'entropy', 'entropy', 'entropy', 'entropy', 'entropy', 'entropy', 'entropy', 'entropy']

 Best max_depth: 10, Best criterion: entropy, Accuracy: 62.5 

[0.6888888888888889, 0.6888888888888889, 0.6888888888888889, 0.6888888888888889, 0.6888888888888889, 0.6888888888888889, 0.6888888888888889, 0.6888888888888889, 0.6888888888888889, 0.6888888888888889]
['gini', 'gini', 'gini', 'gini', 'gini', 'gini', 'gini', 'gini', 'gini', 'gini']

 Best max_depth: 10, Best criterion: gini, Accuracy: 73.91304347826086 

[0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666]
['entropy', 'entropy', 'entropy', 'entropy', 'entropy', 'entropy', 

In [428]:
X_train, X_test, y_train, y_test= train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)

best_max_depth = 10
best_criterion = 'entropy'
model = DecisionTreeClassifier(max_depth=best_max_depth, criterion=best_criterion, random_state=42)
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("========================================\n Model Final Accuracy \n========================================")
print("max_depth: %d, criterion: %s, accuracy: %.2f%%" % (best_max_depth, best_criterion, mean_accuracy*100))

 Model Final Accuracy 
max_depth: 10, criterion: entropy, accuracy: 74.24%


## Conclusão

<p>Com base nas acurácias finais com os melhores hiperparâmetros escolhidos para cada modelo, infere-se que a Árvore de Decisão é melhor nesse caso.</p>
<p>Entretanto, é curioso que na utilização do KNN sem o Kfold, percebe-se que o melhor K é 5 e em nenhum momento durante a utilização da validação cruzada, o K = 5 aparece com uma boa acurácia, mas os melhores valores foram K = 3, 21 e 31, que ao serem utilizados com todos os dados não criam um modelo tão preciso quanto ao K = 5. </p> 
<p>Portanto, imagina-se que a quantidade de dados (cerca de 100 samples nesse dataset) não foi suficiente para que o modelo convergisse completamente, visto que cada conjunto do K-Fold interno utilizado para decidir o melhor K, tinha um tamaho de aproximadamente 20 amostras</p>