In [1]:
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import(
                                    StratifiedKFold,
                                    cross_validate,
                                    GridSearchCV
)
from sklearn.metrics import (
                            accuracy_score,
                            precision_score,
                            confusion_matrix
)

Based on the results of the previous test, we select XGBoost, LightGBM, Bagging Classifier, and Random Forest as the best-suited models for hyperparameter tuning.

### 1. Load the data

In [2]:
def _load_npz_array(path, key='arr_0'):
    """Read a single array out of a ``.npz`` archive and close the archive.

    ``np.load`` on a ``.npz`` file returns a lazy ``NpzFile`` that holds the
    underlying file open until it is closed. Indexing inside the ``with``
    block reads the array into memory, and leaving the block releases the
    file handle.

    Parameters
    ----------
    path : str
        Location of the ``.npz`` archive.
    key : str, default 'arr_0'
        Name of the array inside the archive.

    Returns
    -------
    numpy.ndarray
        The array stored under ``key``.
    """
    with np.load(path) as archive:
        return archive[key]


# Train/test splits stored as .npz archives under artifacts/
X_train = _load_npz_array('artifacts/X_train.npz')
Y_train = _load_npz_array('artifacts/y_train.npz')
X_test = _load_npz_array('artifacts/X_test.npz')
Y_test = _load_npz_array('artifacts/y_test.npz')

# Fail fast on misaligned artifacts instead of deep inside the grid search
assert X_train.shape[0] == Y_train.shape[0], "X_train / Y_train row counts differ"
assert X_test.shape[0] == Y_test.shape[0], "X_test / Y_test row counts differ"

### 2. Define Parameter Grids


In [3]:
# Search space for Random Forest
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_leaf=[1, 5, 10],
    criterion=["gini", "entropy"],
)

# Search space for Bagging Classifier
bagging_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_samples=[0.8, 1.0],   # share of the data given to each model
    max_features=[0.8, 1.0],  # share of the features given to each model
)

# Search space for XGBoost
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],  # share of the data given to each tree
)

# Search space for LightGBM
lgbm_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    learning_rate=[0.01, 0.1, 0.2],
    # Intended to stay below 2^max_depth.
    # NOTE(review): 40 is above 2^5 = 32 for the max_depth=5 candidates - confirm this is intended.
    num_leaves=[20, 31, 40],
)

# One grid per model, looked up by the model's display name
param_grids = {
    "Random Forest": rf_param_grid, "Bagging Classifier": bagging_param_grid,
    "XGBoost": xgb_param_grid, "LightGBM": lgbm_param_grid,
}

In [4]:
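# NOTE: inactive alternative search space, left commented out. Uncommenting this
# cell would rebind `rf_param_grid` and `param_grids` from the previous cell.
# lr_param_grid = {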
#                 'max_iter' : [1000, 5000, 10000],
# }

# dt_param_grid = {
#                 'max_depth' : [8, 12, 16, 20],
#                 'criterion' : ["gini", "entropy", "log_loss"]
# }

# rf_param_grid = {
#                 'n_estimators' : [50, 100, 150, 200],
#                 'max_depth' : [8, 12, 16, 20],
#                 'criterion' : ["gini", "entropy", "log_loss"]
# }

# param_grids = {
#             'Logistic Regression' : lr_param_grid,
#             'Decision Tree' : dt_param_grid,
#             'Random Forest' : rf_param_grid
# }

### 3. Define Multi Models

In [5]:
# Seed for the estimators' internal randomness, so that repeated runs of the
# search give the same scores. 42 is the same value the StratifiedKFold
# splitter in this notebook uses.
MODEL_SEED = 42

# Candidate estimators keyed by display name; the keys are used to look up
# the matching entry in `param_grids`.
models = {
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "Bagging Classifier": BaggingClassifier(random_state=MODEL_SEED),
    "XGBoost": XGBClassifier(random_state=MODEL_SEED),
    # verbose=-1 is carried over unchanged; presumably it quiets LightGBM's logging
    "LightGBM": LGBMClassifier(random_state=MODEL_SEED, verbose=-1),
}


### 4. Configure K-Fold CV

In [6]:
# Six stratified folds, shuffled with a fixed seed so the split is repeatable
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

### 5. Multi-Model Hyperparameter Tuning (Grid Search)

In [7]:
# Options shared by every search: the same CV splitter, F1 as the selection
# metric, no progress logging, and no train-score bookkeeping.
search_options = dict(cv=cv, scoring='f1', verbose=0, return_train_score=False)

# Fitted GridSearchCV objects, keyed by model display name
grid_search_results = {}

for model_name in models:
    model = models[model_name]
    print(f"\n----Tuning {model_name} ----")

    # Each model is searched over the grid registered under the same name
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, **search_options)

    print(f"Fitting gridSearchCV for {model_name}")

    # fit() returns the fitted search object itself, so it can be stored directly
    grid_search_results[model_name] = grid_search.fit(X_train, Y_train)

    print(f"{model_name} gridSearchCV completed ...")
    print(f"Best paramaters: {grid_search.best_params_}")
    print(f"Best CV Score: {grid_search.best_score_}")


----Tuning Random Forest ----
Fitting gridSearchCV for Random Forest
Random Forest gridSearchCV completed ...
Best paramaters: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 100}
Best CV Score: 0.8456850239119807

----Tuning Bagging Classifier ----
Fitting gridSearchCV for Bagging Classifier
Bagging Classifier gridSearchCV completed ...
Best paramaters: {'max_features': 0.8, 'max_samples': 1.0, 'n_estimators': 200}
Best CV Score: 0.8421076732332153

----Tuning XGBoost ----
Fitting gridSearchCV for XGBoost
XGBoost gridSearchCV completed ...
Best paramaters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best CV Score: 0.8456106522210054

----Tuning LightGBM ----
Fitting gridSearchCV for LightGBM


[WinError 2] The system cannot find the file specified
  File "c:\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Python313\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python313\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python313\Lib\subp

LightGBM gridSearchCV completed ...
Best paramaters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'num_leaves': 20}
Best CV Score: 0.843657250696216


