In [233]:
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.svm import SVC
#from utilities.monk_helpers import read_ds
from sklearn.metrics import classification_report



In [234]:
import pandas as pd
def read_ds(path, categories=None):
  """
  Parse a space-separated data set file and return a tuple (input, target).

  The file is read as nine unnamed, space-separated columns. The first and
  the last column are discarded (presumably the empty field produced by a
  leading space and a per-row identifier -- verify against the data files),
  the second column is the target ``y`` and the remaining six columns
  ``x1`` .. ``x6`` are one-hot encoded.

  Parameters
  ----------
  path : str or path-like
      Location of the file, forwarded to ``pandas.read_csv``.
  categories : dict, optional
      Mapping ``column name -> list of allowed levels`` (e.g.
      ``{'x1': [1, 2, 3]}``). When given, the listed columns are converted
      to categoricals with exactly those levels before encoding, so every
      level gets a column even if it never occurs in this file. This keeps
      the columns of separately loaded train and test files identical.
      Values outside the given levels are encoded as an all-zero group.
      With the default ``None`` only the levels observed in the file
      produce columns.

  Returns
  -------
  tuple (pandas.DataFrame, pandas.Series)
      The integer 0/1 one-hot encoded features and the target column.

  Raises
  ------
  KeyError
      If ``categories`` names a column that is not one of ``x1`` .. ``x6``.
  """
  feature_cols = ['x1','x2','x3','x4','x5','x6']
  df = pd.read_csv(path, sep=" ", names=['NaN','y'] + feature_cols + ['garbage'])
  y, df = df['y'], df.drop(columns=['NaN','garbage','y'])

  # Pin the level set where requested, so the encoded schema does not depend
  # on which values happen to appear in this particular file.
  if categories:
    df = df.astype({col: pd.CategoricalDtype(levels) for col, levels in categories.items()})

  # One-hot encoding categorical variables
  df = pd.get_dummies(df, columns=feature_cols).astype('int')

  return (df, y)

In [235]:
# Cross-validation splitter shared by the grid searches below: 5 stratified
# folds, shuffled with a fixed seed (42) so the fold assignment is repeatable.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [236]:
# Function extracting each grid from dictionary of grids
def list_grids(grids_dict):
    """Return the grids stored in ``grids_dict`` as a list.

    Parameters
    ----------
    grids_dict : dict
        Mapping ``grid name -> parameter grid``. The names are dropped.

    Returns
    -------
    list
        The parameter grids in the dictionary's iteration order. The grid
        objects themselves are not copied. An empty dictionary yields ``[]``.
    """
    return list(grids_dict.values())

# Function performing gridsearch cv according to sklearn and returning the fitted search
def do_sklearn_GridSearchCV(model_name,model,param_grid,scoring,refit,cv,return_train_score,n_jobs,X,y,verbose=10):
    """Build a ``GridSearchCV`` from the given settings, fit it and return it.

    Nothing is written to disk; callers that want a CSV of the results can
    export the returned object's ``cv_results_`` themselves.

    Parameters
    ----------
    model_name : str
        Currently unused by this function; kept so existing positional
        callers keep working.
    model : estimator
        Estimator handed to ``GridSearchCV`` as its first argument.
    param_grid, scoring, refit, cv, return_train_score, n_jobs
        Forwarded unchanged to the ``GridSearchCV`` keyword arguments of the
        same name.
    X, y
        Training inputs and targets passed to ``fit``.
    verbose : int, optional
        Verbosity forwarded to ``GridSearchCV``. Defaults to 10, the value
        that was previously hard-coded.

    Returns
    -------
    The object returned by ``GridSearchCV(...).fit(X, y)``.
    """
    hp_search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring=scoring,
        refit=refit,
        cv=cv,
        return_train_score=return_train_score,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    return hp_search.fit(X, y)

In [237]:
def get_grid_params(grid):
    """Print the parameter setting stored at the search's best index.

    ``grid`` must expose ``best_index_`` and a ``cv_results_`` mapping with a
    ``'params'`` sequence (as a fitted scikit-learn search object does).
    The function only prints; it returns ``None``.
    """
    winner = grid.best_index_
    candidates = grid.cv_results_['params']
    print(f"best parameters {candidates[winner]}")

In [238]:
# Function that prints the classification report
def print_report_score(test_label, test_pred):
    """Print a classification report comparing predictions with true labels.

    Parameters
    ----------
    test_label : array-like
        Ground-truth labels.
    test_pred : array-like
        Predicted labels.

    The two classes are displayed under the names ``'0'`` and ``'1'``.
    Returns ``None``.
    """
    class_names = ['0', '1']
    report = classification_report(test_label, test_pred, target_names=class_names)
    print(report)

In [239]:
# Relative locations of the train (TR) and test (TS) files for the three
# MONK problems; resolved against the notebook's working directory.
monk1_TR_PATH = "../monks/datasets/monks-1.train"
monk1_TS_PATH = "../monks/datasets/monks-1.test"
monk2_TR_PATH = "../monks/datasets/monks-2.train"
monk2_TS_PATH = "../monks/datasets/monks-2.test"
monk3_TR_PATH = "../monks/datasets/monks-3.train"
monk3_TS_PATH = "../monks/datasets/monks-3.test"

In [240]:
# Load the monks-1 split into module-level variables: one-hot encoded
# features (X_*) and the target column (y_*), as returned by read_ds.
X_train, y_train = read_ds(monk1_TR_PATH)
X_test, y_test = read_ds(monk1_TS_PATH)

In [241]:
# Display the one-hot encoded monks-1 training features loaded above.
X_train

Unnamed: 0,x1_1,x1_2,x1_3,x2_1,x2_2,x2_3,x3_1,x3_2,x4_1,x4_2,x4_3,x5_1,x5_2,x5_3,x5_4,x6_1,x6_2
0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,1,0
1,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1
2,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0
3,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1
4,1,0,0,1,0,0,0,1,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,0,0,1,0,0,1,0,1,1,0,0,0,0,0,1,0,1
120,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,1
121,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1
122,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1


# SVM

In [242]:
# Hyper-parameter grids for the SVC search, keyed by a descriptive name.
# Only the values are used by the search; the names are labels for the reader.
params = {
    # Kernels that share the same C / gamma search space.
    # NOTE(review): per the scikit-learn SVC docs gamma only applies to the
    # 'rbf', 'poly' and 'sigmoid' kernels, so the 'linear' candidates appear to
    # be evaluated once per gamma value with no effect -- confirm this is intended.
    "linear_rbf_sigmoid": {
        "kernel": ['linear', 'rbf', 'sigmoid'],
        "C": [0.01, 10, 100],
        "gamma": ['scale', 'auto'],
    },
    # Polynomial kernel: same C / gamma values plus the polynomial degree.
    "poly": {
        "kernel": ['poly'],
        "C": [0.01, 10, 100],
        "degree": [2, 3, 5],
        "gamma": ['scale', 'auto'],
    },
}

In [245]:
def grid_search_svm_monk(datasets=None):
    """Grid-search an SVC on each MONK problem and report its test performance.

    For every (train file, test file) pair the function loads both files with
    ``read_ds``, runs ``do_sklearn_GridSearchCV`` over the grids in the
    module-level ``params`` using ``cv_strategy`` (refit on accuracy, 4 jobs),
    prints the best parameter setting and then prints a classification report
    of the refitted best estimator on the test file.

    Parameters
    ----------
    datasets : sequence of (train_path, test_path), optional
        Pairs to process, numbered from 1 in the printed headings. Defaults
        to the three MONK problems defined by the ``monk*_TR_PATH`` /
        ``monk*_TS_PATH`` constants.

    Returns
    -------
    None
        All results are printed.
    """
    if datasets is None:
        datasets = [
            (monk1_TR_PATH, monk1_TS_PATH),
            (monk2_TR_PATH, monk2_TS_PATH),
            (monk3_TR_PATH, monk3_TS_PATH),
        ]
    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    for monk_id, (train_path, test_path) in enumerate(datasets, start=1):
        X_train, y_train = read_ds(train_path)
        X_test, y_test = read_ds(test_path)

        # Train and test are one-hot encoded independently, so a level missing
        # from one file would give the two frames different columns. Align the
        # test frame to the training schema: absent levels become all-zero
        # columns, levels never seen in training are dropped.
        X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

        grid_svm = do_sklearn_GridSearchCV("SVC", SVC(), list_grids(params), scoring,
                                           "accuracy", cv_strategy, True, 4, X_train, y_train)
        print(f"Best grid parameters for monk {monk_id}")
        get_grid_params(grid_svm)

        svm_test_pred = grid_svm.best_estimator_.predict(X_test)
        print(f"Accuracy on monk {monk_id}")
        print_report_score(y_test, svm_test_pred)


In [246]:
# Run the SVC grid search on the three MONK problems; the best parameters and
# the test-set classification reports are printed below.
grid_search_svm_monk()

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best grid parameters for monk 1
best parameters {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
Accuracy on monk 1
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       216
           1       1.00      1.00      1.00       216

    accuracy                           1.00       432
   macro avg       1.00      1.00      1.00       432
weighted avg       1.00      1.00      1.00       432

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best grid parameters for monk 2
best parameters {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
Accuracy on monk 2
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       290
           1       1.00      1.00      1.00       142

    accuracy                           1.00       432
   macro avg       1.00      1.00      1.00       432
weighted avg       1.00 