In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.datasets import load_iris, load_wine
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import requests
import io

# Funkcja do wczytywania danych zbioru GLASS
def load_glass_data():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data"

    col_names = ['Id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']
    glass_data = pd.read_csv(url, names=col_names, index_col='Id')
    X = glass_data.drop('Type', axis=1)
    y = glass_data['Type']
    return X, y

def load_dataset(dataset_name):
    if dataset_name == 'IRIS':
        data = load_iris()
        X, y = data.data, data.target
    elif dataset_name == 'WINE':
        data = load_wine()
        X, y = data.data, data.target
    elif dataset_name == 'GLASS':
        X, y = load_glass_data()
    else:
        return "Nieznany zbiór danych"

    return X, y

# Funkcja do przeprowadzenia eksperymentu z k-NN
def k_nn_experiment(dataset_name, k_values, voting_methods, distance_metrics, folds, shuffle=True):
    results = []

    X, y = load_dataset(dataset_name)

    # Standaryzacja danych
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    for k in k_values:
        for voting in voting_methods:
            for distance in distance_metrics:
                # Konfiguracja algorytmu k-NN
                clf = KNeighborsClassifier(n_neighbors=k, weights=voting, metric=distance)

                # Walidacja krzyżowa
                if folds > 1:
                    cv = StratifiedKFold(n_splits=folds, shuffle=shuffle) if dataset_name != 'GLASS' else KFold(n_splits=folds, shuffle=shuffle)
                    scores = cross_val_score(clf, X_scaled, y, cv=cv, scoring='accuracy')
                    results.append((dataset_name, k, voting, distance, folds, scores.mean()))
                else:
                    # Podział na zbiór treningowy i testowy
                    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
                    clf.fit(X_train, y_train)
                    y_pred = clf.predict(X_test)
                    accuracy = accuracy_score(y_test, y_pred)
                    results.append((dataset_name, k, voting, distance, folds, accuracy))

    return pd.DataFrame(results, columns=['Dataset', 'K', 'Voting', 'Distance', 'Folds', 'Accuracy'])

# Przykładowe parametry eksperymentu
k_values = [1, 5, 10]
voting_methods = ['uniform', 'distance']  # uniform to głosowanie większościowe, distance to głosowanie ważone odległością
distance_metrics = ['euclidean', 'manhattan', 'minkowski']
folds = 5
datasets = ['IRIS', 'WINE', 'GLASS']
results = []

for dataset in datasets:
    results.append(k_nn_experiment(dataset, k_values, voting_methods, distance_metrics, folds))
    print(f'Zakończono eksperyment dla zbioru {dataset}:')
    print(results[-1])

Zakończono eksperyment dla zbioru IRIS:
   Dataset   K    Voting   Distance  Folds  Accuracy
0     IRIS   1   uniform  euclidean      5  0.953333
1     IRIS   1   uniform  manhattan      5  0.926667
2     IRIS   1   uniform  minkowski      5  0.940000
3     IRIS   1  distance  euclidean      5  0.946667
4     IRIS   1  distance  manhattan      5  0.933333
5     IRIS   1  distance  minkowski      5  0.946667
6     IRIS   5   uniform  euclidean      5  0.953333
7     IRIS   5   uniform  manhattan      5  0.953333
8     IRIS   5   uniform  minkowski      5  0.953333
9     IRIS   5  distance  euclidean      5  0.940000
10    IRIS   5  distance  manhattan      5  0.953333
11    IRIS   5  distance  minkowski      5  0.946667
12    IRIS  10   uniform  euclidean      5  0.960000
13    IRIS  10   uniform  manhattan      5  0.933333
14    IRIS  10   uniform  minkowski      5  0.953333
15    IRIS  10  distance  euclidean      5  0.966667
16    IRIS  10  distance  manhattan      5  0.940000
17    

In [4]:
def perform_cross_validation(dataset_name, n_splits=5, n_neighbors=5, weights='uniform', metric='minkowski', p=2):
    """Przeprowadza walidację krzyżową na podanym zbiorze danych."""
    
    X, y = load_dataset(dataset_name)
    
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Parametry dla walidacji krzyżowej
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric, p=p)
    scores = cross_val_score(knn, X, y, cv=kf, scoring='accuracy')
    
    return scores.mean(), scores.std()

for dataset in ['IRIS', 'WINE', 'GLASS']:
    mean, std = perform_cross_validation(dataset)
    print(f'Zbiór: {dataset}, średnia dokładność: {mean}, odchylenie standardowe: {std}')
    




Zbiór: IRIS, średnia dokładność: 0.9666666666666668, odchylenie standardowe: 0.029814239699997188
Zbiór: WINE, średnia dokładność: 0.6801587301587302, odchylenie standardowe: 0.0426981176599236
Zbiór: GLASS, średnia dokładność: 0.6586932447397563, odchylenie standardowe: 0.042095935194034914


In [5]:
from sklearn.model_selection import LeaveOneOut

# Funkcja do przeprowadzenia eksperymentu z k-NN z rozszerzeniem o LOOCV
def k_nn_experiment_extended(dataset_name, k_values, voting_methods, distance_metrics, folds, shuffle=True, loocv=False):
    results = []

    X, y = load_dataset(dataset_name)

    # Standaryzacja danych
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Wybór metody walidacji krzyżowej
    if loocv:
        cv = LeaveOneOut()
    else:
        if folds > 1:
            if dataset_name != 'GLASS':
                cv = StratifiedKFold(n_splits=folds, shuffle=shuffle)
            else:
                cv = KFold(n_splits=folds, shuffle=shuffle)
        else:
            raise ValueError("Invalid number of folds for non-LOOCV method.")

    for k in k_values:
        for voting in voting_methods:
            for distance in distance_metrics:
                clf = KNeighborsClassifier(n_neighbors=k, weights=voting, metric=distance)
                
                # Walidacja krzyżowa
                scores = cross_val_score(clf, X_scaled, y, cv=cv, scoring='accuracy')
                results.append((dataset_name, k, voting, distance, "LOOCV" if loocv else f"{folds}-fold", scores.mean(), scores.std()))

    return pd.DataFrame(results, columns=['Dataset', 'K', 'Voting', 'Distance', 'Validation', 'Mean Accuracy', 'Std Dev'])

# Przykładowe użycie z rozszerzeniem o LOOCV dla zbioru IRIS
results_loocv = k_nn_experiment_extended('IRIS', k_values, voting_methods, distance_metrics, folds=0, loocv=True)
results_loocv.head()


Unnamed: 0,Dataset,K,Voting,Distance,Validation,Mean Accuracy,Std Dev
0,IRIS,1,uniform,euclidean,LOOCV,0.946667,0.224697
1,IRIS,1,uniform,manhattan,LOOCV,0.926667,0.260683
2,IRIS,1,uniform,minkowski,LOOCV,0.946667,0.224697
3,IRIS,1,distance,euclidean,LOOCV,0.946667,0.224697
4,IRIS,1,distance,manhattan,LOOCV,0.926667,0.260683
