In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

In [73]:
# Datasets Path
TR_PATH = "./monks/datasets/monks-1.train"
TS_PATH = "./monks/datasets/monks-1.test"

In [74]:
def read_ds(path):
  """
  parse CSV data set and
  returns a tuple (input, target)
  """
  df = pd.read_csv(path, sep=" ", names=['NaN','y','x1','x2','x3','x4','x5','x6','garbage'])
  y, df = df['y'], df.drop(columns=['NaN','garbage','y'])
  
  # One-hot encoding categorical variables
  df = pd.get_dummies(df, columns=['x1','x2','x3','x4','x5','x6']).astype('int')

  return (df, y)

In [75]:
# read training and test set
X_train, y_train = read_ds(TR_PATH)
X_test,  y_test  = read_ds(TS_PATH)

In [76]:
GRID_GAUSSIAN_NB = { 'var_smoothing': np.logspace(0,-9, num=100) }
GRID_BERNULLI_NB = { 'alpha': np.linspace(0,1, num=100) }
GRID_KNN = { 
              'n_neighbors' : range(1,25), 
            'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'], 
            'metric' : ['euclidean', 'manhattan', 'chebyshev', 'minkowski'], 
            'weights' : ['distance', 'uniform'] 
            }
GRID_MNB = {  }

params_map = {
    'gaussian_nb': GRID_GAUSSIAN_NB,
    'bernulli_nb': GRID_BERNULLI_NB,
    'multinomial_nb': GRID_MNB,
    'knn': GRID_KNN
    }

In [77]:
def execute_gridesearch(X, y, model, model_name):
  """
  Performs a GridSearchCV with the given model and parameters
  """
  cv = KFold(n_splits=5, shuffle=True, random_state=7)
  params = params_map[model_name]
  
  grid = GridSearchCV(model, params, cv=cv, n_jobs=-1).fit(X, y)
  results = pd.DataFrame(grid.cv_results_)
  best_result = abs(results["mean_test_accuracy"][grid.best_index_])
  best_std = results["std_test_accuracy"][grid.best_index_]
  return grid.best_estimator_, best_result, best_std, grid.best_params_


In [78]:
from sklearn.metrics import accuracy_score


models_to_use = [ {GaussianNB(), "gaussian_nb"}, {KNeighborsClassifier(), "knn"}, {BernoulliNB(), "bernulli_nb"}, {MultinomialNB(), "multinomial_nb"}]

for model, name in models_to_use:
    best = execute_gridesearch(X_train, y_train, model, name)
    print("Model used: " + name )

    y_pred = best.predict(X_test)
    print("accuracy on test set {:.3f}".format(accuracy_score(y_test,y_pred)))
    print("Model used: " + name + ", best parameters: " + str(best.best_params_) )


Model used: gaussian_nb


AttributeError: 'tuple' object has no attribute 'predict'

In [None]:
# training the model on training set

# TODO MULTINOMIAL (original dataset), BERNOULLI 
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# making predictions on the testing set
y_pred = gnb.predict(X_test)

# comparing actual response values (y_test) with predicted response values (y_pred)
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)


Gaussian Naive Bayes model accuracy(in %): 75.0
