In [4]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    GridSearchCV
)
from sklearn.metrics import(
    confusion_matrix
) 
warnings.filterwarnings('ignore')

In [3]:
def _load_npz_array(path, key="arr_0"):
    """Read one array out of an ``.npz`` archive and close the archive.

    Parameters
    ----------
    path : str
        Location of the ``.npz`` file.
    key : str, default "arr_0"
        Name of the array inside the archive.

    Returns
    -------
    The array stored under ``key``.
    """
    # np.load on an .npz returns a lazy, file-backed archive object; the
    # `with` block closes the underlying file handle once the array is read.
    with np.load(path) as archive:
        return archive[key]


# Pre-split train/test arrays are read from the local "artifacts" directory.
X_train = _load_npz_array("artifacts/X_train.npz")
X_test = _load_npz_array("artifacts/X_test.npz")
y_train = _load_npz_array("artifacts/y_train.npz")
y_test = _load_npz_array("artifacts/y_test.npz")

### Define Models with hyperparameters

In [None]:
# Logistic regression: only the iteration cap is varied.
lr_param_grid = dict(max_iter=[1000, 5000, 10000])

# Decision tree: depth limit crossed with the split criterion.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# Shannon-information-gain criterion - confirm whether both are needed here.
dt_param_grid = dict(
    max_depth=[8, 12, 16, 20],
    criterion=['gini', 'entropy', 'log_loss'],
)

# Random forest: deliberately small grid because it takes a long time to train.
rf_param_grid = dict(
    n_estimators=[100],
    max_depth=[8, 12],
    criterion=['gini', 'entropy', 'log_loss'],
)

# Map each model's display name to its parameter grid.
param_grids = dict(
    zip(
        ("Logistic Regression", "Decision Tree", "Random Forest"),
        (lr_param_grid, dt_param_grid, rf_param_grid),
    )
)

In [10]:
# Estimators to tune, keyed by the display name that is also used to look up
# each model's parameter grid.
# The tree-based estimators are randomised, so they are given a fixed seed
# (the same value the CV splitter uses) to make the search scores repeatable.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

### Configure K-fold CV

In [11]:
# Shuffled, stratified 6-fold splitter used by the grid searches; the fixed
# seed keeps the fold assignment identical from run to run.
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)


### Model training with GridSearchCV

In [12]:
# Run a grid search for every model and keep the fitted search object,
# keyed by model name, in `grid_search_results`.
grid_search_results = {}
# `models` maps model names to estimator instances; `param_grids` uses the same keys.
for model_name, model in models.items():
    print(f"Tuning {model_name}...")
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='f1',  # NOTE(review): plain 'f1' presumes binary labels - confirm for y_train
        cv=cv,
        n_jobs=-1,  # the candidate/fold fits are independent, so run them on all cores
        verbose=1,
        return_train_score=False,
    )

    print(f"fitting gridSearchCV for {model_name} .")
    grid_search.fit(X_train, y_train)

    grid_search_results[model_name] = grid_search
    print(f"GridSearchCV for {model_name} completed.")

    # best_score_ is the mean cross-validated score of the best candidate.
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_:.4f}\n")

Tuning Logistic Regression...
fitting gridSearchCV for Logistic Regression .
Fitting 6 folds for each of 3 candidates, totalling 18 fits
GridSearchCV for Logistic Regression completed.
Best parameters for Logistic Regression: {'max_iter': 1000}
Best score for Logistic Regression: 0.7156

Tuning Decision Tree...
fitting gridSearchCV for Decision Tree .
Fitting 6 folds for each of 12 candidates, totalling 72 fits
GridSearchCV for Decision Tree completed.
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 20}
Best score for Decision Tree: 0.8373

Tuning Random Forest...
fitting gridSearchCV for Random Forest .
Fitting 6 folds for each of 6 candidates, totalling 36 fits
GridSearchCV for Random Forest completed.
Best parameters for Random Forest: {'criterion': 'gini', 'max_depth': 12, 'n_estimators': 100}
Best score for Random Forest: 0.8726

