## Importing Libraries and Setting Up Environment

In [1]:
import numpy as np
import pickle
import os
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

In [2]:
models_dir = '../models'
model_files = {
    'log': 'log.pkl',
    'DT_model': 'DT_model.pkl',
    'RF_model': 'RF_model.pkl',
    'SVM_model': 'SVM_model.pkl',
    'final_model_XGBoost_model': 'final_model_XGBoost_model.pkl',
    'Voting_model': 'Voting_model.pkl'
}

models = {}

for name, fname in model_files.items():
    path = os.path.join(models_dir, fname)
    with open(path, 'rb') as file:
        models[name] = pickle.load(file)

with open('../.pkl/X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
with open('../.pkl/y_train.pkl', 'rb') as f:
    y_train = pickle.load(f)

## Hyperparameter Tuning

In [3]:
param_grid_log = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'solver': ['lbfgs', 'liblinear', 'saga'],
    'max_iter': [100, 500, 1000]
}

param_grid_DT = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

param_dist_RF = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

param_dist_SVM = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 5],
    'gamma': ['scale', 'auto', 0.1, 1, 10]
}

param_dist_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1]
}

param_dist_voting = {
    'log_model__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'log_model__penalty': ['l1', 'l2', 'elasticnet', None],
    'log_model__solver': ['lbfgs', 'liblinear', 'saga'],
    'log_model__max_iter': [100, 500, 1000],

    'RF_model__n_estimators': [100, 200, 300],
    'RF_model__max_depth': [None, 10, 20, 30],
    'RF_model__min_samples_split': [2, 5, 10],
    'RF_model__min_samples_leaf': [1, 2, 4],
    'RF_model__bootstrap': [True, False],

    'xgb_model__n_estimators': [100, 200, 300],
    'xgb_model__max_depth': [3, 5, 7],
    'xgb_model__learning_rate': [0.01, 0.1, 0.2]
}


In [4]:
grid_log = GridSearchCV(models['log'], param_grid=param_grid_log, cv=5, scoring='accuracy', n_jobs=-1)
grid_log.fit(X_train, y_train)
print('Best Score log:', grid_log.best_score_)
print('Best Params log:', grid_log.best_params_)

Best Score log: 0.8345522898154478
Best Params log: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


In [5]:
grid_DT = GridSearchCV(models['DT_model'], param_grid=param_grid_DT, cv=5, scoring='accuracy', n_jobs=-1)
grid_DT.fit(X_train, y_train)
print('Best Score DT:', grid_DT.best_score_)
print('Best Params DT:', grid_DT.best_params_)

Best Score DT: 0.7847573479152427
Best Params DT: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'splitter': 'random'}


In [6]:
grid_RF = RandomizedSearchCV(models['RF_model'], param_distributions=param_dist_RF, cv=3, scoring='accuracy', n_jobs=-1)
grid_RF.fit(X_train, y_train)
print('Best Score RF:', grid_RF.best_score_)
print('Best Params RF:', grid_RF.best_params_)

Best Score RF: 0.8215223097112861
Best Params RF: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}


In [7]:
grid_SVM = RandomizedSearchCV(models['SVM_model'], param_distributions=param_dist_SVM, cv=5, scoring='accuracy', n_jobs=-1)
grid_SVM.fit(X_train, y_train)
print('Best Score SVM:', grid_SVM.best_score_)
print('Best Params SVM:', grid_SVM.best_params_)

Best Score SVM: 0.8240601503759398
Best Params SVM: {'kernel': 'rbf', 'gamma': 0.1, 'degree': 2, 'C': 1}


In [8]:
grid_xgb = RandomizedSearchCV(models['final_model_XGBoost_model'], param_distributions=param_dist_xgb, n_iter=20, cv=3, scoring='accuracy', n_jobs=-1)
grid_xgb.fit(X_train, y_train)
print('Best Score xgb:', grid_xgb.best_score_)
print('Best Params xgb:', grid_xgb.best_params_)

Best Score xgb: 0.8215223097112861
Best Params xgb: {'subsample': 0.6, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.8}


In [9]:
grid_voting = RandomizedSearchCV(models['Voting_model'], param_distributions=param_dist_voting, n_iter=20, cv=3, scoring='accuracy', n_jobs=-1)
grid_voting.fit(X_train, y_train)
print('Best Score voting:', grid_voting.best_score_)
print('Best Params voting:', grid_voting.best_params_)

Best Score voting: 0.8188976377952756
Best Params voting: {'xgb_model__n_estimators': 300, 'xgb_model__max_depth': 7, 'xgb_model__learning_rate': 0.01, 'log_model__solver': 'liblinear', 'log_model__penalty': 'l2', 'log_model__max_iter': 1000, 'log_model__C': 0.01, 'RF_model__n_estimators': 300, 'RF_model__min_samples_split': 10, 'RF_model__min_samples_leaf': 1, 'RF_model__max_depth': 20, 'RF_model__bootstrap': False}
