In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from monk_helpers import CV
from sklearn.metrics import mean_squared_error



In [2]:
def read_ds(path):
    """
    Load one MONK data file and split it into features and labels.

    The file is read as space-separated text without a header row; its nine
    fields are named 'NaN', 'y', 'x1'..'x6' and 'garbage'. The 'NaN' and
    'garbage' fields are discarded, 'y' becomes the target and the six
    attributes 'x1'..'x6' are one-hot encoded into integer columns.

    Returns a tuple (input, target): the encoded DataFrame and the 'y' Series.

    NOTE: the dummy columns are derived from the values present in this file
    only, so two files produce the same columns only when they contain the
    same attribute values.
    """
    attributes = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']
    header = ['NaN', 'y', *attributes, 'garbage']

    raw = pd.read_csv(path, sep=" ", names=header)
    target = raw['y']

    # One-hot encode the categorical attributes and store them as integers.
    features = pd.get_dummies(raw[attributes], columns=attributes).astype('int')

    return (features, target)

In [3]:
# MONK-1: training (TR) and test (TS) file paths
TR_PATH_1 = "./monks/datasets/monks-1.train"
TS_PATH_1 = "./monks/datasets/monks-1.test"
# MONK-2: training (TR) and test (TS) file paths
TR_PATH_2 = "./monks/datasets/monks-2.train"
TS_PATH_2 = "./monks/datasets/monks-2.test"
# MONK-3: training (TR) and test (TS) file paths
TR_PATH_3 = "./monks/datasets/monks-3.train"
TS_PATH_3 = "./monks/datasets/monks-3.test"

In [4]:


# Cross-validation strategy passed as `cv=` to GridSearchCV in execute_gridesearch.
# `CV` comes from the project-local monk_helpers module, which is not visible here.
# TODO confirm its number of folds and whether it shuffles with a fixed seed.
cv_strategy = CV

In [5]:
def list_grids(grids_dict):
    """Return the grids stored in a dictionary of named grids.

    The names (keys) are dropped; the grids (values) are returned as a list,
    in the dictionary's iteration order.
    """
    return list(grids_dict.values())

In [6]:
def print_report_score(test_label, test_pred):
    """Print the mean squared error and the classification report of predictions.

    test_label -- true labels
    test_pred  -- predicted labels

    The report rows are labelled '0' and '1'. Nothing is returned.
    """
    print("MSE: ", mean_squared_error(test_label, test_pred))
    report = classification_report(test_label, test_pred, target_names=['0', '1'])
    print(report)

In [7]:
# --- Naive Bayes search spaces ---
# GaussianNB: 100 log-spaced var_smoothing values from 1 down to 1e-9.
GRID_GAUSSIAN_NB = {
    'var_smoothing': np.logspace(0, -9, num=100),
}
# BernoulliNB: 100 evenly spaced alpha values in [0, 1], with force_alpha pinned to True.
# NOTE(review): the grid includes alpha == 0 — confirm this is intended.
GRID_BERNULLI_NB = {
    'alpha': np.linspace(0, 1, num=100),
    "force_alpha": [True],
}
# MultinomialNB: empty grid, i.e. no hyper-parameter is searched.
GRID_MNB = {}

# --- k-nearest neighbours search space (n_neighbors from 1 to 24) ---
GRID_KNN = {
    'n_neighbors': range(1, 25),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
    'weights': ['distance', 'uniform'],
}

# SVC search space: two named sub-grids keyed by kernel family.
# Only the "poly" sub-grid carries `degree`.
GRID_SVM = {
    "linear_rbf_sigmoid": {
        "kernel": ['linear', 'rbf', 'sigmoid'],
        "C": [0.01, 10, 100],
        "gamma": ['scale', 'auto'],
    },
    "poly": {
        "kernel": ['poly'],
        "C": [0.01, 10, 100],
        "degree": [2, 3, 5],
        "gamma": ['scale', 'auto'],
    },
}
# LogisticRegression search space: one named sub-grid per group of solvers,
# each listing only the penalties used with that group.
#
# `l1_ratio` is only read by LogisticRegression when penalty == 'elasticnet'.
# It therefore lives in its own "saga_elasticnet" sub-grid instead of being
# crossed with 'l1', 'l2' and None, where every candidate would be fitted four
# times with identical results (64 saga candidates for 28 distinct models) and
# would emit an "l1_ratio parameter is only used when penalty is 'elasticnet'"
# warning on each fit.
GRID_LGREG = {
    "lbfgs_newton-cg_newton-cholesky_sag": {
        "penalty": ['l2', None],
        "fit_intercept": [True, False],
        "class_weight": [{0: 0.6, 1: 0.4}, "balanced"],
        "solver": ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
    },
    "liblinear": {
        "penalty": ['l1', 'l2'],
        "fit_intercept": [True, False],
        "class_weight": [{0: 0.6, 1: 0.4}, "balanced"],
        "solver": ['liblinear'],
    },
    "saga": {
        "penalty": ['l1', 'l2', None],
        "fit_intercept": [True, False],
        "class_weight": [{0: 0.6, 1: 0.4}, "balanced"],
        "solver": ['saga'],
    },
    "saga_elasticnet": {
        "penalty": ['elasticnet'],
        "l1_ratio": [0.5, 0.6, 0.7, 0.3],
        "fit_intercept": [True, False],
        "class_weight": [{0: 0.6, 1: 0.4}, "balanced"],
        "solver": ['saga'],
    },
}

# DecisionTreeClassifier search space.
GRID_DT = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [5, 10, None],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 4, 8, 16],
    'min_samples_leaf': [1, 2, 4, 8],
}

# Ensemble methods: RandomForestClassifier search space.
GRID_RF = {
    'max_depth': [5, 15, None],
    'max_features': ['log2', None],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [32, 64, 128],
    "bootstrap": [True, False],
    "criterion": ["entropy", "gini", "log_loss"],
}

# Registry used by execute_gridesearch: model name -> hyper-parameter grid.
# The "svm" and "logistic_regression" entries are dictionaries of named
# sub-grids; every other entry is a single flat grid.
params_map = {
    'gaussian_nb': GRID_GAUSSIAN_NB,
    'bernulli_nb': GRID_BERNULLI_NB,
    'multinomial_nb': GRID_MNB,
    'knn': GRID_KNN,
    "svm": GRID_SVM,
    "logistic_regression": GRID_LGREG,
    "decision_tree": GRID_DT,
    "random_forest": GRID_RF,
}

In [8]:
def execute_gridesearch(X, y, model, model_name):
    """
    Run an accuracy-scored GridSearchCV for `model` over the grid registered
    under `model_name` in `params_map`, then print the cross-validated accuracy
    (mean and standard deviation across folds) of the best candidate.

    X, y       -- training inputs and targets handed to `GridSearchCV.fit`
    model      -- unfitted estimator to tune
    model_name -- key into `params_map`

    Returns the fitted GridSearchCV object (refit=True, so it also holds the
    best estimator retrained on all of X, y).

    Raises KeyError when `model_name` is not a key of `params_map`.
    """
    params = params_map[model_name]

    # A registry entry is either one flat grid (parameter name -> candidates)
    # or a dictionary of named sub-grids. The latter is recognised by all of
    # its values being dicts and is handed to GridSearchCV as a list of grids.
    # The `params and` guard keeps an empty grid ({}) flat.
    if params and all(isinstance(sub_grid, dict) for sub_grid in params.values()):
        params = list_grids(params)

    search = GridSearchCV(
        model,
        params,
        scoring="accuracy",
        refit=True,
        cv=cv_strategy,
        n_jobs=-1,
        verbose=10,
    )
    grid = search.fit(X, y)

    # Report the best candidate's validation score once (mean, then std).
    best = grid.best_index_
    print("Mean validation accuracy: ", grid.cv_results_["mean_test_score"][best])
    print("Mean std validation accuracy: ", grid.cv_results_["std_test_score"][best])
    return grid



In [9]:
def print_confusion_matrix(test_label, pred_label):
    """Plot the confusion matrix of `pred_label` against `test_label`.

    The matrix is computed with `confusion_matrix`, drawn with display labels
    "0" and "1", and shown with `plt.show()`. Nothing is returned.
    """
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(test_label, pred_label),
        display_labels=["0", "1"],
    ).plot()
    plt.show()

In [10]:

def do_sklearn_GridSearchCV(X,y,X_test,y_test,model,model_name,i):
    """
    Tune `model` on one training set and report how the best estimator found
    performs on the matching test set.

    X, y           -- training inputs and targets used for the grid search
    X_test, y_test -- held-out inputs and targets used only for the final report
    model          -- unfitted estimator to tune
    model_name     -- key of the hyper-parameter grid (see `execute_gridesearch`)
    i              -- number of the MONK problem, used only in the log header

    Prints the best parameters, the test accuracy, the classification report
    and the confusion matrix, followed by a separator line.

    Returns the fitted search object from `execute_gridesearch`, so the caller
    can inspect it (previously nothing was returned and the result was lost).
    """
    print("Model Used: " + model_name + " On monk: ",i)
    grid = execute_gridesearch(X, y, model, model_name)
    print("Model used: " + model_name + ", best parameters: " + str(grid.best_params_) )

    # Evaluate the best estimator of the search on the held-out test set.
    y_pred = grid.best_estimator_.predict(X_test)
    print("accuracy on test set {:.3f}".format(accuracy_score(y_test,y_pred)))
    print_report_score(y_test,y_pred)
    print_confusion_matrix(y_test,y_pred)
    print("------------------------------------------------------------------------------------------------------")
    return grid


In [11]:
def grid_search_model(model,model_name):
    """
    Run the grid search for `model` on the three MONK problems, in order.

    For each problem the training and test files are loaded with `read_ds`
    and handed to `do_sklearn_GridSearchCV` together with the problem number
    (1, 2, 3).

    model      -- unfitted estimator to tune
    model_name -- key of the hyper-parameter grid to use

    Returns None; all results are printed or plotted.
    """
    monk_files = (
        (TR_PATH_1, TS_PATH_1),
        (TR_PATH_2, TS_PATH_2),
        (TR_PATH_3, TS_PATH_3),
    )
    for monk_number, (train_path, test_path) in enumerate(monk_files, start=1):
        X_train, y_train = read_ds(train_path)
        X_test, y_test = read_ds(test_path)
        do_sklearn_GridSearchCV(X_train, y_train, X_test, y_test, model, model_name, monk_number)

# Logistic Regression

In [None]:
# Fixed seed: the 'sag', 'saga' and 'liblinear' solvers in the grid shuffle the
# data, so without it the search results change from one run to the next.
grid_search_model(LogisticRegression(random_state=42),"logistic_regression")


# Naive Bayes

GaussianNB

In [None]:
grid_search_model(GaussianNB(),"gaussian_nb")

BernoulliNB

In [None]:
grid_search_model(BernoulliNB(),"bernulli_nb")

MultinomialNB

In [None]:
grid_search_model(MultinomialNB(),"multinomial_nb")

# K-NN

In [None]:
grid_search_model(KNeighborsClassifier(),"knn")

# SVM

In [None]:
grid_search_model(SVC(),"svm")

# Decision tree

In [None]:
# Fixed seed: tree construction is randomised (feature order at each split),
# so without it the search results change from one run to the next.
grid_search_model(DecisionTreeClassifier(random_state=42),"decision_tree")

# Ensemble methods

Random Forest Classifier

In [None]:
# Fixed seed: bootstrap sampling and per-split feature sampling are random,
# so without it the search results change from one run to the next.
grid_search_model(RandomForestClassifier(random_state=42),"random_forest")
