In [2]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Source of the UCI Wine dataset.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'

# Column labels (Brazilian Portuguese), in the order they are assigned to the
# file's columns: the class label first, followed by the thirteen measurements.
column_names_pt_br = [
    'classe',
    'alcool', 'acido_malico', 'cinzas', 'alcalinidade_de_cinzas',
    'magnesio', 'fenois_totais', 'flavanoides', 'fenois_nao_flavanoides',
    'proantocianinas', 'intensidade_de_cor', 'matiz',
    'od280_od315_de_vinhos_diluidos', 'prolina',
]

# Read the CSV using the labels above. No row of the file is used as a header
# (the labels come from `names`), and 'classe' is kept as object dtype instead
# of letting pandas infer a numeric type for it.
vinhos = pd.read_csv(
    url,
    header=None,
    names=column_names_pt_br,
    dtype={'classe': object},
)

In [12]:
# Separate the label column ('classe') from the remaining predictor columns.
y = vinhos['classe']
X = vinhos.drop(columns='classe')

# Experiment grid: shuffle seeds for the K-Fold splitter and the neighbour
# counts to try for the KNN classifier.
random_states = [42, 17, 24]
k_values = [3, 5]

# Accumulator for one summary record per (random_state, k) combination.
results = list()

# Repeat shuffled 10-fold cross-validation once per seed in `random_states`.
# NOTE(review): the features are fed to KNN exactly as loaded (no scaling) and
# the splitter is a plain KFold (not stratified) -- confirm this is the
# intended experimental setup.
for random_state in random_states:
    kf = KFold(n_splits=10, shuffle=True, random_state=random_state)

    # One list of per-fold accuracies for every candidate k.
    accuracies = dict((k, []) for k in k_values)

    # Every fold is evaluated with all k values before moving to the next fold,
    # so the printed lines are grouped by fold.
    for train_index, test_index in kf.split(X):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        for k in k_values:
            # Fit on the training part of the fold, score on the held-out part.
            knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
            y_pred = knn.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            # Keep the fold score for the summary and echo it for inspection.
            accuracies[k].append(accuracy)
            print(f"random_state={random_state}, k={k}, Fold Accuracy: {accuracy:.4f}")

    # Summarise each k for this seed: arithmetic mean and population standard
    # deviation (divides by the number of folds, not folds - 1).
    for k in k_values:
        fold_scores = accuracies[k]
        n_folds = len(fold_scores)
        mean_accuracy = sum(fold_scores) / n_folds
        squared_deviations = [(score - mean_accuracy) ** 2 for score in fold_scores]
        std_accuracy = (sum(squared_deviations) / n_folds) ** 0.5
        results.append({
            'random_state': random_state,
            'k': k,
            'mean_accuracy': mean_accuracy,
            'std_accuracy': std_accuracy,
        })

# Collect the per-(random_state, k) summaries into a table and show it.
results_df = pd.DataFrame(results)

print("\nMean and Standard Deviation for each random_state and k:")
print(results_df)

# Report the best k for each random_state, listing every k involved in a tie.
print("\nBest k for each random_state:")

# Mean accuracies that differ by no more than this are treated as tied. The
# means are computed floating-point values, so an exact `==` comparison could
# miss a tie because of rounding in the last bits.
TIE_TOLERANCE = 1e-9

for random_state in random_states:
    # Rows for the current seed, best mean accuracy first.
    best_k_results = results_df[results_df['random_state'] == random_state]
    best_k_results = best_k_results.sort_values(by='mean_accuracy', ascending=False)

    # Highest mean accuracy obtained with this seed.
    best_mean_accuracy = best_k_results['mean_accuracy'].iloc[0]

    # Every k whose mean accuracy matches the best one (within the tolerance).
    is_best = (best_mean_accuracy - best_k_results['mean_accuracy']) <= TIE_TOLERANCE
    best_k_values = best_k_results[is_best]

    if len(best_k_values) > 1:
        print(f"For random_state={random_state}, there is a tie between k values:")
        # itertuples keeps each column's own dtype, so k is printed as an
        # integer. iterrows would return the row as one float Series and
        # print k as e.g. "3.0".
        for row in best_k_values.itertuples(index=False):
            print(f"  k={row.k} with mean accuracy {row.mean_accuracy:.4f}")
    else:
        # Selecting a single row yields a float Series (mixed int/float
        # columns are upcast), so convert k back to int before printing.
        best_k = best_k_values.iloc[0]
        print(f"For random_state={random_state}, the best k is {int(best_k['k'])} with mean accuracy {best_k['mean_accuracy']:.4f}")

random_state=42, k=3, Fold Accuracy: 0.8333
random_state=42, k=5, Fold Accuracy: 0.7222
random_state=42, k=3, Fold Accuracy: 0.8333
random_state=42, k=5, Fold Accuracy: 0.7222
random_state=42, k=3, Fold Accuracy: 0.7778
random_state=42, k=5, Fold Accuracy: 0.6667
random_state=42, k=3, Fold Accuracy: 0.5000
random_state=42, k=5, Fold Accuracy: 0.6111
random_state=42, k=3, Fold Accuracy: 0.7222
random_state=42, k=5, Fold Accuracy: 0.6667
random_state=42, k=3, Fold Accuracy: 0.6111
random_state=42, k=5, Fold Accuracy: 0.5556
random_state=42, k=3, Fold Accuracy: 0.6667
random_state=42, k=5, Fold Accuracy: 0.7222
random_state=42, k=3, Fold Accuracy: 0.4444
random_state=42, k=5, Fold Accuracy: 0.4444
random_state=42, k=3, Fold Accuracy: 0.7647
random_state=42, k=5, Fold Accuracy: 0.8235
random_state=42, k=3, Fold Accuracy: 0.8824
random_state=42, k=5, Fold Accuracy: 0.7059
random_state=17, k=3, Fold Accuracy: 0.6111
random_state=17, k=5, Fold Accuracy: 0.6667
random_state=17, k=3, Fold Accur