In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report




In [114]:
def read_ds(path):
  """
  parse CSV data set and
  returns a tuple (input, target)
  """
  df = pd.read_csv(path, sep=" ", names=['NaN','y','x1','x2','x3','x4','x5','x6','garbage'])
  y, df = df['y'], df.drop(columns=['NaN','garbage','y'])
  
  # One-hot encoding categorical variables
  df = pd.get_dummies(df, columns=['x1','x2','x3','x4','x5','x6']).astype('int')

  return (df, y)

In [115]:
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=7)

In [116]:
# Function extracting each grid from dictionary of grids
def list_grids(grids_dict):
    return [grids_dict[item] for item in grids_dict]

In [117]:
# Function that prints the classification report
def print_report_score(test_label, test_pred):
    print(classification_report(test_label, 
                            test_pred, 
                            target_names=['0', '1']))

In [118]:
GRID_GAUSSIAN_NB = { 'var_smoothing': np.logspace(0,-9, num=100) }
GRID_BERNULLI_NB = { 'alpha': np.linspace(0,1, num=100), "force_alpha":[True] }
GRID_KNN = { 
            'n_neighbors' : range(1,25), 
            'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'], 
            'metric' : ['euclidean', 'manhattan', 'chebyshev', 'minkowski'], 
            'weights' : ['distance', 'uniform'] 
            }
GRID_MNB = {  }

GRID_SVM = {
  "linear_rbf_sigmoid": {
   "kernel": ['linear', 'rbf','sigmoid'],
   "C":[0.01,10,100],
   "gamma" : ['scale', 'auto']
  },
  "poly": {
    "kernel": ['poly'],
    "C":[0.01,10,100],
    "degree": [2,3,5],
    "gamma" : ['scale', 'auto']
  }
    } 

params_map = {
    'gaussian_nb': GRID_GAUSSIAN_NB,
    'bernulli_nb': GRID_BERNULLI_NB,
    'multinomial_nb': GRID_MNB,
    'knn': GRID_KNN,
    "svm": GRID_SVM
    }

In [119]:
def execute_gridesearch(X, y, model, model_name):
  """
  Performs a GridSearchCV with the given model and parameters
  """
  params = params_map[model_name]
  if(model_name == "svm"):
    params = list_grids(params)
  grid = GridSearchCV(model(), params,scoring="accuracy",refit=True, cv=cv_strategy, n_jobs=-1,verbose=10).fit(X, y)
  results = pd.DataFrame(grid.cv_results_)
  print("Mean validation accuracy: ", results["mean_test_score"][grid.best_index_])
  print("Mean std validation accuracy: ", results["std_test_score"][grid.best_index_])
  return grid



In [120]:

def grid_search_models(X,y,X_test,y_test):
    models_to_use = [ (GaussianNB, "gaussian_nb"), (KNeighborsClassifier, "knn"), (BernoulliNB, "bernulli_nb"), (MultinomialNB, "multinomial_nb"),(SVC,"svm")]

    for model, name in models_to_use:
        grid = execute_gridesearch(X, y, model, name)
        print("Model used: " + name + ", best parameters: " + str(grid.best_params_) )
        y_pred = grid.best_estimator_.predict(X_test)
        print("accuracy on test set {:.3f}".format(accuracy_score(y_test,y_pred)))
        print_report_score(y_test,y_pred)


# Monk 1

In [121]:
# Datasets Path
TR_PATH = "./monks/datasets/monks-1.train"
TS_PATH = "./monks/datasets/monks-1.test"

In [122]:
# read training and test set
X_train, y_train = read_ds(TR_PATH)
X_test,  y_test  = read_ds(TS_PATH)

In [123]:
grid_search_models(X_train,y_train,X_test,y_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Mean validation accuracy:  0.7333333333333334
Mean std validation accuracy:  0.12161871201057473
Model used: gaussian_nb, best parameters: {'var_smoothing': 0.02310129700083159}
accuracy on test set 0.748
              precision    recall  f1-score   support

           0       0.67      0.96      0.79       216
           1       0.93      0.54      0.68       216

    accuracy                           0.75       432
   macro avg       0.80      0.75      0.74       432
weighted avg       0.80      0.75      0.74       432

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
Mean validation accuracy:  0.8306666666666667
Mean std validation accuracy:  0.029694743268426766
Model used: knn, best parameters: {'algorithm': 'ball_tree', 'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
accuracy on test set 0.880
              precision    recall  f1-score   support

           0       0.88      0.88   

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b


Mean validation accuracy:  0.7333333333333334
Mean std validation accuracy:  0.12161871201057473
Model used: bernulli_nb, best parameters: {'alpha': 0.0, 'force_alpha': True}
accuracy on test set 0.750
              precision    recall  f1-score   support

           0       0.67      1.00      0.80       216
           1       1.00      0.50      0.67       216

    accuracy                           0.75       432
   macro avg       0.83      0.75      0.73       432
weighted avg       0.83      0.75      0.73       432

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Mean validation accuracy:  0.7020000000000001
Mean std validation accuracy:  0.0988736567544662
Model used: multinomial_nb, best parameters: {}
accuracy on test set 0.713
              precision    recall  f1-score   support

           0       0.69      0.76      0.73       216
           1       0.74      0.66      0.70       216

    accuracy                           0.71       432
   macro avg       0.72

# Monk 2

In [124]:
# Datasets Path
TR_PATH = "./monks/datasets/monks-2.train"
TS_PATH = "./monks/datasets/monks-2.test"

In [125]:
# read training and test set
X_train, y_train = read_ds(TR_PATH)
X_test,  y_test  = read_ds(TS_PATH)

In [126]:
grid_search_models(X_train,y_train,X_test,y_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Mean validation accuracy:  0.5859180035650623
Mean std validation accuracy:  0.06877106070495258
Model used: gaussian_nb, best parameters: {'var_smoothing': 1.0}
accuracy on test set 0.630
              precision    recall  f1-score   support

           0       0.67      0.88      0.76       290
           1       0.32      0.11      0.17       142

    accuracy                           0.63       432
   macro avg       0.50      0.50      0.46       432
weighted avg       0.56      0.63      0.57       432

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
Mean validation accuracy:  0.669162210338681
Mean std validation accuracy:  0.06615424523400157
Model used: knn, best parameters: {'algorithm': 'ball_tree', 'metric': 'euclidean', 'n_neighbors': 23, 'weights': 'uniform'}
accuracy on test set 0.650
              precision    recall  f1-score   support

           0       0.68      0.91      0.78       290


# Monk 3

In [127]:
# Datasets Path
TR_PATH = "./monks/datasets/monks-3.train"
TS_PATH = "./monks/datasets/monks-3.test"

In [128]:
# read training and test set
X_train, y_train = read_ds(TR_PATH)
X_test,  y_test  = read_ds(TS_PATH)

In [129]:
grid_search_models(X_train,y_train,X_test,y_test)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Mean validation accuracy:  0.9339999999999999
Mean std validation accuracy:  0.03383949040856128
Model used: gaussian_nb, best parameters: {'var_smoothing': 0.533669923120631}
accuracy on test set 0.972
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       204
           1       1.00      0.95      0.97       228

    accuracy                           0.97       432
   macro avg       0.97      0.97      0.97       432
weighted avg       0.97      0.97      0.97       432

Fitting 5 folds for each of 768 candidates, totalling 3840 fits
Mean validation accuracy:  0.9183333333333333
Mean std validation accuracy:  0.03619392214170773
Model used: knn, best parameters: {'algorithm': 'ball_tree', 'metric': 'manhattan', 'n_neighbors': 23, 'weights': 'distance'}
accuracy on test set 0.933
              precision    recall  f1-score   support

           0       0.91      0.96     