In [4]:
import pandas as pd

# Location of the UCI Wine dataset (a header-less CSV file).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

# Column names in Brazilian Portuguese; the first column is the class label,
# the remaining thirteen are the features.
column_names_pt_br = [
    'classe',
    'alcool', 'acido_malico', 'cinzas', 'alcalinidade_de_cinzas', 'magnesio',
    'fenois_totais', 'flavanoides', 'fenois_nao_flavanoides', 'proantocianinas',
    'intensidade_de_cor', 'matiz', 'od280_od315_de_vinhos_diluidos', 'prolina',
]

# Read the file with the names above. 'classe' is forced to dtype object so the
# label is kept as-is instead of being parsed as a number.
vinhos = pd.read_csv(
    url,
    header=None,
    names=column_names_pt_br,
    dtype={'classe': object},
)

# Features (X) are every column except the label; the target (y) is the label.
X = vinhos.drop(columns=['classe'])
y = vinhos.loc[:, 'classe']

# (Optional) show the first rows of X and y as a sanity check.
for heading, preview in (("X (features):\n", X.head()), ("\ny (target):\n", y.head())):
    print(heading, preview)

X (features):
    alcool  acido_malico  cinzas  alcalinidade_de_cinzas  magnesio  \
0   14.23          1.71    2.43                    15.6       127   
1   13.20          1.78    2.14                    11.2       100   
2   13.16          2.36    2.67                    18.6       101   
3   14.37          1.95    2.50                    16.8       113   
4   13.24          2.59    2.87                    21.0       118   

   fenois_totais  flavanoides  fenois_nao_flavanoides  proantocianinas  \
0           2.80         3.06                    0.28             2.29   
1           2.65         2.76                    0.26             1.28   
2           2.80         3.24                    0.30             2.81   
3           3.85         3.49                    0.24             2.18   
4           2.80         2.69                    0.39             1.82   

   intensidade_de_cor  matiz  od280_od315_de_vinhos_diluidos  prolina  
0                5.64   1.04                         

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


# Hold-out evaluation: one stratified 70/30 split per seed, with KNN scored on
# the held-out part for each candidate k.
random_states = [42, 17, 24]
k_values = [3, 5]

# results[random_state]["k=<k>"] -> accuracy on the test partition
results = {}

for random_state in random_states:
    # stratify=y is passed so the split is stratified on the class labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=random_state,
        stratify=y,
    )

    # Standardise the features (important for KNN). The scaler is fitted on
    # the training partition only and then applied to both partitions.
    scaler = StandardScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    seed_scores = {}
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        seed_scores[f"k={k}"] = accuracy
    results[random_state] = seed_scores

# Print the results grouped by seed.
for random_state in results:
    print(f"Resultados para random_state={random_state}:")
    for k, accuracy in results[random_state].items():
        print(f"  {k}: {accuracy:.4f}")

Resultados para random_state=42:
  k=3: 0.9444
  k=5: 0.9444
Resultados para random_state=17:
  k=3: 0.9630
  k=5: 0.9815
Resultados para random_state=24:
  k=3: 0.9815
  k=5: 0.9815


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Shuffled 10-fold cross-validation, repeated for three shuffle seeds: mean and
# standard deviation of the KNN accuracy across folds, for each candidate k.
random_states = [42, 17, 24]
k_values = [3, 5]
n_splits = 10

# results[random_state]["k=<k>"] -> {"mean": ..., "std": ...} over the folds
results = {}

for random_state in random_states:
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # One list of per-fold accuracies per k, created in k_values order so the
    # report below lists the k values in that same order.
    fold_accuracies = {k: [] for k in k_values}

    # The folds are generated once per seed and shared by every k, so each
    # fold is split and standardised a single time instead of once per k.
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Standardise the features (important for KNN). The scaler is fitted
        # on the training fold only and then applied to the held-out fold.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        for k, scores in fold_accuracies.items():
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            scores.append(accuracy_score(y_test, y_pred))

    # np.std uses its default ddof=0 here (population standard deviation).
    results[random_state] = {
        f"k={k}": {"mean": np.mean(scores), "std": np.std(scores)}
        for k, scores in fold_accuracies.items()
    }

# Print the results grouped by seed.
for random_state, k_results in results.items():
    print(f"Resultados para random_state={random_state}:")
    for k, metrics in k_results.items():
        print(f"  {k}: Acurácia média = {metrics['mean']:.4f}, Desvio Padrão = {metrics['std']:.4f}")

Resultados para random_state=42:
  k=3: Acurácia média = 0.9493, Desvio Padrão = 0.0400
  k=5: Acurácia média = 0.9663, Desvio Padrão = 0.0371
Resultados para random_state=17:
  k=3: Acurácia média = 0.9546, Desvio Padrão = 0.0619
  k=5: Acurácia média = 0.9660, Desvio Padrão = 0.0456
Resultados para random_state=24:
  k=3: Acurácia média = 0.9373, Desvio Padrão = 0.0699
  k=5: Acurácia média = 0.9601, Desvio Padrão = 0.0568


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Pick the best-scoring k for each random_state from `results`, which is read
# as results[random_state][<k label>] -> {"mean": ..., "std": ...}.
# best_results[random_state] -> {"best_k": <label or tie message>, "accuracy": <best mean>}
best_results = {}

for random_state, k_results in results.items():
    # Labels of every k whose mean accuracy matches the best one seen so far.
    # Tracking them in a list (rather than comparing against a 0 sentinel and
    # rewriting best_k as a message) keeps a first mean of 0 from being
    # reported as a tie with None, and keeps 3+-way ties from nesting messages.
    tied_ks = []
    best_accuracy = 0  # reported unchanged when k_results has no usable entry
    print(f"\nResultados para random_state={random_state}:")
    for k, metrics in k_results.items():
        print(f"  {k}: Acurácia média = {metrics['mean']:.4f}, Desvio Padrão = {metrics['std']:.4f}")
        mean_accuracy = metrics['mean']
        if np.isnan(mean_accuracy):
            # An undefined mean can neither win nor tie.
            continue
        if tied_ks and np.isclose(mean_accuracy, best_accuracy):
            # Tolerance-based comparison: means that differ only by
            # floating-point rounding still count as a tie.
            tied_ks.append(k)
        elif not tied_ks or mean_accuracy > best_accuracy:
            best_accuracy = mean_accuracy
            tied_ks = [k]

    if len(tied_ks) > 1:
        # Same wording as before for two labels; extra labels are comma-separated.
        best_k = "Empate entre " + ", ".join(map(str, tied_ks[:-1])) + f" e {tied_ks[-1]}"
    else:
        best_k = tied_ks[0] if tied_ks else None

    best_results[random_state] = {"best_k": best_k, "accuracy": best_accuracy}

# Print the best result for each random_state.
print("\nMelhores resultados para cada random_state:")
for random_state, best_result in best_results.items():
    print(f"  random_state={random_state}: Melhor k = {best_result['best_k']}, Acurácia = {best_result['accuracy']:.4f}")


Resultados para random_state=42:
  k=3: Acurácia média = 0.9493, Desvio Padrão = 0.0400
  k=5: Acurácia média = 0.9663, Desvio Padrão = 0.0371

Resultados para random_state=17:
  k=3: Acurácia média = 0.9546, Desvio Padrão = 0.0619
  k=5: Acurácia média = 0.9660, Desvio Padrão = 0.0456

Resultados para random_state=24:
  k=3: Acurácia média = 0.9373, Desvio Padrão = 0.0699
  k=5: Acurácia média = 0.9601, Desvio Padrão = 0.0568

Melhores resultados para cada random_state:
  random_state=42: Melhor k = k=5, Acurácia = 0.9663
  random_state=17: Melhor k = k=5, Acurácia = 0.9660
  random_state=24: Melhor k = k=5, Acurácia = 0.9601
