In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
column_names_pt_br = [
    'classe',
    'alcool',
    'acido_malico',
    'cinzas',
    'alcalinidade_de_cinzas',
    'magnesio',
    'fenois_totais',
    'flavanoides',
    'fenois_nao_flavanoides',
    'proantocianinas',
    'intensidade_de_cor',
    'matiz',
    'od280_od315_de_vinhos_diluidos',
    'prolina'
]
vinhos = pd.read_csv(url, names=column_names_pt_br, dtype={'classe': object})


X = vinhos.drop('classe', axis=1)
y = vinhos['classe']


def knn_cross_validation(X, y, k_values, random_states):
    results = []

    for random_state in random_states:

        kf = KFold(n_splits=10, shuffle=True, random_state=random_state)

        accuracies_k3 = []
        accuracies_k5 = []

        print(f"Random State: {random_state}")


        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            for k in k_values:

                knn = KNeighborsClassifier(n_neighbors=k)
                knn.fit(X_train, y_train)

                y_pred = knn.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)

                print(f"  Fold {len(accuracies_k3) + len(accuracies_k5) + 1} | k={k} | Accuracy = {accuracy:.4f}")

                if k == 3:
                    accuracies_k3.append(accuracy)
                else:
                    accuracies_k5.append(accuracy)

        mean_accuracy_k3 = np.mean(accuracies_k3)
        std_accuracy_k3 = np.std(accuracies_k3)
        mean_accuracy_k5 = np.mean(accuracies_k5)
        std_accuracy_k5 = np.std(accuracies_k5)

        results.append({
            'random_state': random_state,
            'mean_accuracy_k3': mean_accuracy_k3,
            'std_accuracy_k3': std_accuracy_k3,
            'mean_accuracy_k5': mean_accuracy_k5,
            'std_accuracy_k5': std_accuracy_k5
        })

        print(f"\nResumo para random_state={random_state}:")
        print(f"K=3: Média = {mean_accuracy_k3:.4f}, Desvio Padrão = {std_accuracy_k3:.4f}")
        print(f"K=5: Média = {mean_accuracy_k5:.4f}, Desvio Padrão = {std_accuracy_k5:.4f}")

        if mean_accuracy_k3 > mean_accuracy_k5:
            print(f"Melhor k: 3\n")
        elif mean_accuracy_k5 > mean_accuracy_k3:
            print(f"Melhor k: 5\n")
        else:
            print(f"Empate entre k=3 e k=5\n")

    results_df = pd.DataFrame(results)
    return results_df


k_values = [3, 5]
random_states = [42, 17, 24]

results_df = knn_cross_validation(X, y, k_values, random_states)

print("\nTabela Final de Resultados:")
print(results_df)



Random State: 42
  Fold 1 | k=3 | Accuracy = 0.8333
  Fold 2 | k=5 | Accuracy = 0.7222
  Fold 3 | k=3 | Accuracy = 0.8333
  Fold 4 | k=5 | Accuracy = 0.7222
  Fold 5 | k=3 | Accuracy = 0.7778
  Fold 6 | k=5 | Accuracy = 0.6667
  Fold 7 | k=3 | Accuracy = 0.5000
  Fold 8 | k=5 | Accuracy = 0.6111
  Fold 9 | k=3 | Accuracy = 0.7222
  Fold 10 | k=5 | Accuracy = 0.6667
  Fold 11 | k=3 | Accuracy = 0.6111
  Fold 12 | k=5 | Accuracy = 0.5556
  Fold 13 | k=3 | Accuracy = 0.6667
  Fold 14 | k=5 | Accuracy = 0.7222
  Fold 15 | k=3 | Accuracy = 0.4444
  Fold 16 | k=5 | Accuracy = 0.4444
  Fold 17 | k=3 | Accuracy = 0.7647
  Fold 18 | k=5 | Accuracy = 0.8235
  Fold 19 | k=3 | Accuracy = 0.8824
  Fold 20 | k=5 | Accuracy = 0.7059

Resumo para random_state=42:
K=3: Média = 0.7036, Desvio Padrão = 0.1392
K=5: Média = 0.6641, Desvio Padrão = 0.1001
Melhor k: 3

Random State: 17
  Fold 1 | k=3 | Accuracy = 0.6111
  Fold 2 | k=5 | Accuracy = 0.6667
  Fold 3 | k=3 | Accuracy = 0.6667
  Fold 4 | k=5 | Ac