# 1. Leitura dos Dados

In [2]:
import pandas as pd

# URL do dataset Wine
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

# Nome das colunas em português BR
column_names_pt_br = [
    'classe',
    'alcool',
    'acido_malico',
    'cinzas',
    'alcalinidade_de_cinzas',
    'magnesio',
    'fenois_totais',
    'flavanoides',
    'fenois_nao_flavanoides',
    'proantocianinas',
    'intensidade_de_cor',
    'matiz',
    'od280_od315_de_vinhos_diluidos',
    'prolina'
]

# Ler o arquivo CSV com as colunas especificadas, definindo a coluna 'classe' como object
vinhos = pd.read_csv(url, names=column_names_pt_br, dtype={'classe': object})

2. Pré-processamento dos Dados

In [3]:
# Separando as variáveis
X = vinhos.drop('classe', axis=1)
y = vinhos['classe']

3. Configuração do Experimento

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Configuração do Experimento
random_states = [42, 17, 24]
k_values = [3, 5]

for random_state in random_states:
    print(f"Experimento com random_state = {random_state}")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state, stratify=y)

    for k in k_values:
        print(f"  KNN com k = {k}")
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"    Acurácia: {accuracy}")

Experimento com random_state = 42
  KNN com k = 3
    Acurácia: 0.6851851851851852
  KNN com k = 5
    Acurácia: 0.7222222222222222
Experimento com random_state = 17
  KNN com k = 3
    Acurácia: 0.7592592592592593
  KNN com k = 5
    Acurácia: 0.7592592592592593
Experimento com random_state = 24
  KNN com k = 3
    Acurácia: 0.7037037037037037
  KNN com k = 5
    Acurácia: 0.6851851851851852


4. Estrutura de Cross-Validation com KFold

In [5]:
from sklearn.model_selection import KFold
import numpy as np



# Estrutura de Cross-Validation com KFold
random_states = [42, 17, 24]
k_values = [3, 5]
n_splits = 10

for random_state in random_states:
    print(f"Experimento com random_state = {random_state}")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for k in k_values:
        print(f"  KNN com k = {k}")
        accuracies = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

        mean_accuracy = np.mean(accuracies)
        std_accuracy = np.std(accuracies)
        print(f"    Acurácia Média: {mean_accuracy}")
        print(f"    Desvio Padrão da Acurácia: {std_accuracy}")

Experimento com random_state = 42
  KNN com k = 3
    Acurácia Média: 0.70359477124183
    Desvio Padrão da Acurácia: 0.13918691592617136
  KNN com k = 5
    Acurácia Média: 0.6640522875816993
    Desvio Padrão da Acurácia: 0.10007472964927165
Experimento com random_state = 17
  KNN com k = 3
    Acurácia Média: 0.6986928104575163
    Desvio Padrão da Acurácia: 0.10403044405660643
  KNN com k = 5
    Acurácia Média: 0.6875816993464052
    Desvio Padrão da Acurácia: 0.11211770116307625
Experimento com random_state = 24
  KNN com k = 3
    Acurácia Média: 0.7127450980392157
    Desvio Padrão da Acurácia: 0.09681644876344142
  KNN com k = 5
    Acurácia Média: 0.7127450980392157
    Desvio Padrão da Acurácia: 0.09397285976504524


5. Análise dos Resultados

In [6]:
# Análise dos Resultados
results = []

for random_state in random_states:
    best_k = None
    best_accuracy = -1
    for k in k_values:
        accuracies = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

        mean_accuracy = np.mean(accuracies)

        if mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            best_k = k
        elif mean_accuracy == best_accuracy:
            if best_k is not None:
                best_k = "Empate entre k=3 e k=5" # Registra o empate

        results.append({'random_state': random_state, 'k': k, 'mean_accuracy': mean_accuracy, 'std_accuracy': np.std(accuracies)})

    print(f"Melhor valor de k para random_state={random_state}: {best_k}")

results_df = pd.DataFrame(results)
print("\nResultados completos:")
results_df

Melhor valor de k para random_state=42: Empate entre k=3 e k=5
Melhor valor de k para random_state=17: Empate entre k=3 e k=5
Melhor valor de k para random_state=24: Empate entre k=3 e k=5

Resultados completos:


Unnamed: 0,random_state,k,mean_accuracy,std_accuracy
0,42,3,0.712745,0.096816
1,42,5,0.712745,0.093973
2,17,3,0.712745,0.096816
3,17,5,0.712745,0.093973
4,24,3,0.712745,0.096816
5,24,5,0.712745,0.093973
