In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
# toglie la colonna id e monk_id e le prime righe di commenti
def monk_create_df(path):
    columns = ["id", "output", "a1", "a2", "a3", "a4", "a5", "a6", "monk_id"]
    df = pd.read_csv(path, names=columns, delimiter=" ")

    df = df.drop('id', axis='columns') #droppiamo id
    df = df.drop("monk_id", axis='columns') #droppiamo id del dato monk
    return df

#MONK 1
monk1_tr=monk_create_df("/content/monks-1.train")
monk1_ts=monk_create_df("/content/monks-1.test")
X_train_1 = monk1_tr.drop(columns=["output"]) 
y_train_1 = monk1_tr["output"]
X_test_1 = monk1_ts.drop(columns=["output"])
y_test_1 = monk1_ts["output"]

#MONK2
monk2_tr=monk_create_df("/content/monks-2.train")
monk2_ts=monk_create_df("/content/monks-2.test")
X_train_2 = monk2_tr.drop(columns=["output"]) 
y_train_2 = monk2_tr["output"]
X_test_2 = monk2_ts.drop(columns=["output"])
y_test_2 = monk2_ts["output"]

#MONK3
monk3_tr=monk_create_df("/content/monks-3.train")
monk3_ts=monk_create_df("/content/monks-3.test")
X_train_3 = monk3_tr.drop(columns=["output"]) 
y_train_3 = monk3_tr["output"]
X_test_3 = monk3_ts.drop(columns=["output"])
y_test_3 = monk3_ts["output"]


In [3]:
def monk_knn(X_train, y_train, X_test, y_test):
    pipe = Pipeline([
        ('scaler', None),
        ('knn', KNeighborsClassifier())
                    ])
    # specify the parameters for the grid search
    param_grid = {'knn__n_neighbors': [1,2,3,4,5,10,15,20,21,22,23,24,25,30,40],
                  'scaler': [StandardScaler(), MinMaxScaler()],
                  'knn__weights': ['uniform', 'distance'],
                  'knn__metric': ['euclidean', 'manhattan', 'minkowski'],
                  'knn__p': [2, 3, 4]
                  }
    # create the grid search object
    knn_gs = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, refit=True, scoring='accuracy')

    # fit the grid search to the data
    knn_gs.fit(X_train, y_train.ravel()) # .ravel will convert that array shape to (n, ) (i.e. flatten it) 
  
    #predict
    y_pred = knn_gs.predict(X_test)

    #print results
    print("Best parameters: ", knn_gs.best_params_)
    print("Best score accuracy: ", knn_gs.best_score_)
    print("Best score mse: ", knn_gs.cv_results_['mean_test_score'][0])

print("------------1--------------", monk_knn(X_train_1, y_train_1, X_test_1, y_test_1))
print("------------2--------------", monk_knn(X_train_2, y_train_2, X_test_2, y_test_2))
print("------------3--------------", monk_knn( X_train_3, y_train_3, X_test_3, y_test_3))

Best parameters:  {'knn__metric': 'minkowski', 'knn__n_neighbors': 25, 'knn__p': 4, 'knn__weights': 'uniform', 'scaler': StandardScaler()}
Best score accuracy:  0.7333333333333333
Best score mse:  0.5253333333333333
------------1-------------- None
Best parameters:  {'knn__metric': 'euclidean', 'knn__n_neighbors': 4, 'knn__p': 2, 'knn__weights': 'distance', 'scaler': MinMaxScaler()}
Best score accuracy:  0.6811051693404634
Best score mse:  0.6450980392156863
------------2-------------- None
Best parameters:  {'knn__metric': 'manhattan', 'knn__n_neighbors': 20, 'knn__p': 2, 'knn__weights': 'distance', 'scaler': StandardScaler()}
Best score accuracy:  0.8936666666666667
Best score mse:  0.7776666666666666
------------3-------------- None
