Notebook for comparing the models using cross validation
1. Load dataset
2. Split data into folds of training and test subsets.
3. For each fold, for each model:
    - Preprocess training data.
    - Fit model to training data.
    - Process test data based on training data learned parameters.
    - Test trainig data.

In [1]:
import pandas as pd
import settings
from HelperFunctions import *
from MixedNB import MixedNB
from MyKNN import MyKNN
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

import time



In [2]:
#loading data
fullTrainData = pd.read_excel(settings.labelledDatapath)

  warn("Workbook contains no default style, apply openpyxl's default")


In [3]:
#these steps are independent of actual data content, so can be done on training and test data together.
dtData = preprocessDT(fullTrainData)
nbData = preprocessNB(fullTrainData)
knnData = preprocessKNN(fullTrainData)

In [4]:
naiveBayes = MixedNB()
knn = MyKNN()
dt = tree.DecisionTreeClassifier()
rf = RandomForestClassifier()

In [5]:
#dictionaries of parameters to be tested for each classifier
nbParam = {
    "alpha": [0.2, 0.4, 0.6, 0.8, 1]
}

knnParam = {
    "n_neighbors": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    "weights": ["uniform", "distance"]
}

dtParam = {
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["best", "random"],
    "max_depth": [5, 6, 7, 8, None],
    "min_samples_split": [2, 3, 4, 5, 6],

    "max_features": [None, "sqrt", "log2"],
    "class_weight": [None, "balanced"],
    "ccp_alpha": [0, 0.2, 0.4, 0,6]
}

rfParam = {
    "n_estimators": [30, 90, 100, 110, 170],
    "max_depth": [5, 6, 7, 8, None],
    "min_samples_split": [3, 4, 5, 6],
    "max_features": [None, "sqrt", "log2"],
    "bootstrap": [True, False],
    "oob_score": [True, False],
    "class_weight": ["balanced", "balanced_subsample", None]
}

In [6]:
searchCV = GridSearchCV(estimator = dt,
                       param_grid = dtParam,
                       scoring = make_scorer(customScorer, greater_is_better=True),
                       cv = 5,
                       verbose = 1,
                       n_jobs = 1
                       )

In [7]:
start = time.time()
features = dtData.drop(labels=["class"], axis=1)
labels = dtData["class"]
searchCV.fit(features, labels)
dtBestScore = searchCV.best_score_
dtBestParams = searchCV.best_params_
end = time.time()
print(f"took {end-start}s")
#searchCV.cv_results_

Fitting 5 folds for each of 4500 candidates, totalling 22500 fits
took 1552.4210844039917s


In [8]:
dtBestScore

91416.3

In [9]:
dtBestParams

{'ccp_alpha': 0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 8,
 'max_features': None,
 'min_samples_split': 5,
 'splitter': 'best'}

In [10]:
searchCV = GridSearchCV(estimator = naiveBayes,
                       param_grid = nbParam,
                       scoring = make_scorer(customScorer, greater_is_better=True),
                       cv = 5,
                       verbose = 2,
                       n_jobs = 1
                       )

In [11]:
features = nbData.drop(labels=["class"], axis=1)
labels = nbData["class"]
searchCV.fit(features, labels)
nbBestScore = searchCV.best_score_
nbBestParams = searchCV.best_params_
#searchCV.cv_results_

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ..........................................alpha=0.2; total time=   0.0s
[CV] END ..........................................alpha=0.2; total time=   0.0s
[CV] END ..........................................alpha=0.2; total time=   0.0s
[CV] END ..........................................alpha=0.2; total time=   0.0s
[CV] END ..........................................alpha=0.2; total time=   0.0s
[CV] END ..........................................alpha=0.4; total time=   0.0s
[CV] END ..........................................alpha=0.4; total time=   0.0s
[CV] END ..........................................alpha=0.4; total time=   0.0s
[CV] END ..........................................alpha=0.4; total time=   0.0s
[CV] END ..........................................alpha=0.4; total time=   0.0s
[CV] END ..........................................alpha=0.6; total time=   0.0s
[CV] END ........................................

In [12]:
nbBestScore

54810.0

In [13]:
nbBestParams

{'alpha': 0.2}

In [14]:
searchCV = GridSearchCV(estimator = knn,
                       param_grid = knnParam,
                       scoring = make_scorer(customScorer, greater_is_better=True),
                       cv = 5,
                       verbose = 2,
                       n_jobs = 1
                       )

In [15]:
features = knnData.drop(labels=["class"], axis=1)
labels = knnData["class"]
searchCV.fit(features, labels)
knnBestScore = searchCV.best_score_
knnBestParams = searchCV.best_params_
#searchCV.cv_results_

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END .....................n_neighbors=3, weights=uniform; total time=   0.6s
[CV] END .....................n_neighbors=3, weights=uniform; total time=   0.6s
[CV] END .....................n_neighbors=3, weights=uniform; total time=   0.6s
[CV] END .....................n_neighbors=3, weights=uniform; total time=   0.6s
[CV] END .....................n_neighbors=3, weights=uniform; total time=   0.6s
[CV] END ....................n_neighbors=3, weights=distance; total time=   0.6s
[CV] END ....................n_neighbors=3, weights=distance; total time=   0.6s
[CV] END ....................n_neighbors=3, weights=distance; total time=   0.6s
[CV] END ....................n_neighbors=3, weights=distance; total time=   0.6s
[CV] END ....................n_neighbors=3, weights=distance; total time=   0.6s
[CV] END .....................n_neighbors=4, weights=uniform; total time=   0.6s
[CV] END .....................n_neighbors=4, we

[CV] END ....................n_neighbors=13, weights=uniform; total time=   0.7s
[CV] END ....................n_neighbors=13, weights=uniform; total time=   0.7s
[CV] END ....................n_neighbors=13, weights=uniform; total time=   0.7s
[CV] END ....................n_neighbors=13, weights=uniform; total time=   0.7s
[CV] END ...................n_neighbors=13, weights=distance; total time=   0.7s
[CV] END ...................n_neighbors=13, weights=distance; total time=   0.7s
[CV] END ...................n_neighbors=13, weights=distance; total time=   0.7s
[CV] END ...................n_neighbors=13, weights=distance; total time=   0.7s
[CV] END ...................n_neighbors=13, weights=distance; total time=   0.7s
[CV] END ....................n_neighbors=14, weights=uniform; total time=   0.7s
[CV] END ....................n_neighbors=14, weights=uniform; total time=   0.7s
[CV] END ....................n_neighbors=14, weights=uniform; total time=   0.7s
[CV] END ...................

In [16]:
knnBestScore

70327.8

In [17]:
knnBestParams

{'n_neighbors': 17, 'weights': 'uniform'}

In [19]:
searchCV = GridSearchCV(estimator = rf,
                       param_grid = rfParam,
                       scoring = make_scorer(customScorer, greater_is_better=True),
                       cv = 5,
                       verbose = 1,
                       n_jobs = 1
                       )

In [21]:
start = time.time()
features = dtData.drop(labels=["class"], axis=1)
labels = dtData["class"]
searchCV.fit(features, labels)
rfBestScore = searchCV.best_score_
rfBestParams = searchCV.best_params_
end = time.time()
print(f"took {end-start}s")
#searchCV.cv_results_
print(rfBestScore)
print(rfBestParams)

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
4500 fits failed out of a total of 18000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4500 fits failed with the following error:
Traceback (most recent call last):
  File "/home/peter/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/peter/.local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 434, in fit
    raise ValueError(

took 46824.35323858261s
92973.0
{'bootstrap': True, 'class_weight': 'balanced_subsample', 'max_depth': 8, 'max_features': None, 'min_samples_split': 5, 'n_estimators': 90, 'oob_score': False}


In [22]:
rfBestScore

92973.0

In [24]:
#another grid search to finer tune parameters (fixing some hyperparameters as previous search took 13 hours)
rfParamRefined = {
    "n_estimators": [89, 90, 91],
    "max_depth": [7, 8, 9],
    "min_samples_split": [3, 4, 5, 6],
    "max_features": [None],
    "class_weight": ["balanced_subsample"]
}

In [25]:
searchCV = GridSearchCV(estimator = rf,
                       param_grid = rfParamRefined,
                       scoring = make_scorer(customScorer, greater_is_better=True),
                       cv = 5,
                       verbose = 1,
                       n_jobs = 1
                       )

In [26]:
start = time.time()
features = dtData.drop(labels=["class"], axis=1)
labels = dtData["class"]
searchCV.fit(features, labels)
rfBestScoreRefined = searchCV.best_score_
rfBestParamsRefined = searchCV.best_params_
end = time.time()
print(f"took {end-start}s")
#searchCV.cv_results_
print(rfBestScoreRefined)
print(rfBestParamsRefined)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
took 1083.1545114517212s
92884.9
{'class_weight': 'balanced_subsample', 'max_depth': 9, 'max_features': None, 'min_samples_split': 6, 'n_estimators': 90}


In [27]:
#even more refined grid search
rfParamRefined = {
    "n_estimators": [90],
    "max_depth": [8, 9, 10, 11, 12],
    "min_samples_split": [5, 6, 7, 8, 9],
    "max_features": [None],
    "class_weight": ["balanced_subsample"]
}

In [28]:
searchCV = GridSearchCV(estimator = rf,
                       param_grid = rfParamRefined,
                       scoring = make_scorer(customScorer, greater_is_better=True),
                       cv = 5,
                       verbose = 1,
                       n_jobs = 1
                       )

In [29]:
start = time.time()
features = dtData.drop(labels=["class"], axis=1)
labels = dtData["class"]
searchCV.fit(features, labels)
rfBestScoreRefined = searchCV.best_score_
rfBestParamsRefined = searchCV.best_params_
end = time.time()
print(f"took {end-start}s")
#searchCV.cv_results_
print(rfBestScoreRefined)
print(rfBestParamsRefined)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
took 906.6505987644196s
93469.5
{'class_weight': 'balanced_subsample', 'max_depth': 10, 'max_features': None, 'min_samples_split': 7, 'n_estimators': 90}
