In [1]:
#Imports
import json
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import (
                            mean_absolute_error, mean_squared_error, r2_score,
                            accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
                            )
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier

# Install a global "ignore" filter: every Python warning raised after this line is suppressed,
# so library warnings will not show up in cell outputs.
# NOTE(review): this also hides deprecation/convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')
# Marker printed once the import cell has run to completion.
print("Import loaded")

Import loaded


In [2]:
# Seed reused as `random_state` for the CV splitters and models in this notebook;
# it also seeds NumPy's global RNG.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Number of folds handed to the cross-validation splitters.
N_SPLITS = 5

# Filesystem locations; the figures folder sits inside the artifacts folder.
ARTIFACTS_DIR = Path("artifacts")
FIGURES_DIR = ARTIFACTS_DIR / "figures"

#### 01) Load Preprocessed Arrays

In [3]:
def _load_npz_array(stem: str) -> np.ndarray:
    """Return the array stored under the 'data' key of ``ARTIFACTS_DIR/<stem>.npz``.

    The archive is opened in a ``with`` block so the underlying file handle is
    closed as soon as the array has been read, instead of being left open.

    Parameters
    ----------
    stem : str
        File name without the ``.npz`` extension, e.g. ``'X_train_reg'``.

    Returns
    -------
    np.ndarray
        The array saved under the key ``'data'``.

    Raises
    ------
    FileNotFoundError
        If the ``.npz`` file does not exist.
    KeyError
        If the archive has no ``'data'`` entry.
    """
    with np.load(ARTIFACTS_DIR / f"{stem}.npz") as archive:
        # Indexing reads the array into memory, so it remains valid after the archive closes.
        return archive['data']


# Regression split.
X_train_reg = _load_npz_array('X_train_reg')
X_test_reg = _load_npz_array('X_test_reg')
y_train_reg = _load_npz_array('y_train_reg')
y_test_reg = _load_npz_array('y_test_reg')

# Classification split.
X_train_cls = _load_npz_array('X_train_cls')
X_test_cls = _load_npz_array('X_test_cls')
y_train_cls = _load_npz_array('y_train_cls')
y_test_cls = _load_npz_array('y_test_cls')

# Print shapes as a quick sanity check that features and targets line up.
print("Regression arrays loaded:")
print(f"   X_train : {X_train_reg.shape}, X_test : {X_test_reg.shape}")
print(f"   y_train : {y_train_reg.shape}, y_test : {y_test_reg.shape}")

print("\nClassification arrays loaded:")
print(f"   X_train : {X_train_cls.shape}, X_test : {X_test_cls.shape}")
print(f"   y_train : {y_train_cls.shape}, y_test : {y_test_cls.shape}")

Regression arrays loaded:
   X_train : (36735, 39), X_test : (9177, 39)
   y_train : (36735,), y_test : (9177,)

Classification arrays loaded:
   X_train : (36735, 39), X_test : (9177, 39)
   y_train : (36735,), y_test : (9177,)


In [4]:
"""
01. What is KFold ?

|-------------------------------------------------------------------------------|
|       F1      |       F2      |       F3      |       F4      |       F5      |
|-------------------------------------------------------------------------------|

Train - Train the Model
Test- Test model performance
Validation - Hyper Parameter Tuning

F1, F2, F3, F4 - Train, F5 - Validation
F1, F2, F3, F5 - Train, F4 - Validation
F1, F2, F4, F5 - Train, F3 - Validation 
....

02. What is StratifiedKFold ?

This comes in handy when your dataset is class-imbalanced ...
Total Sample Count - 16
Two Classes : A & B , A - 12 , B - 4
N_SPLITS = 4 (each fold consists of 4 samples)
The problem with KFold is that it does not account for class proportions. StratifiedKFold solves this issue.

A : B = 3 : 1

With this, every fold it creates preserves the A : B ratio (3 samples of A and 1 of B per fold).


"""

"\n01. What is KFold ?\n\n|-------------------------------------------------------------------------------|\n|       F1      |       F2      |       F3      |       F4      |       F5      |\n|-------------------------------------------------------------------------------|\n\nTrain - Train the Model\nTest- Test model performance\nValidation - Hyper Parameter Tuning\n\nF1, F2, F3, F4 - Train, F5 - Validation\nF1, F2, F3, F5 - Train, F4 - Validation\nF1, F2, F4, F5 - Train, F3 - Validation \n....\n\n02. What is StratifiedKFold ?\n\nThis comes in Handy when your dataset is Class Imbalanced ...\nTotal Sample Count - 16\nTwo Classses : A & B , A - 12 , B -4\nN_SPLITS = 4 (Each split consist of 4)\nKfold the problem is it won't account pproportions. But StratifiedKFold solve this issue.\n\nA : B = 3 : 1\n\nwith this everytime it create a split which satisfy A : B\n\n\n"

#### 02) Regression Hparam Tuning

In [5]:
# Candidate regressors: each entry pairs a base estimator with the hyperparameter
# grid that GridSearchCV will enumerate exhaustively for it.
param_grids_regg = {
    'RF': {
        'model': RandomForestRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [100, 200],
            'min_samples_leaf': [1, 5],
            'max_depth': [6, 8, 12],
        },
    },
    'XGB': {
        'model': XGBRegressor(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [8, 10, 12],
        },
    },
}

# Shuffled K-fold splitter; the fixed seed keeps the folds identical across runs and models.
kf = KFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Per-model tuning outcome, keyed by the short model name ('RF', 'XGB').
reg_tuning_results = {}
for name, config in param_grids_regg.items():
    print(f"\n Training {name} model...")

    # Grid search with cross-validation. The scorer is the *negated* RMSE
    # (scikit-learn maximises scores), so values are <= 0 and closer to 0 is better.
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=kf,
        scoring='neg_root_mean_squared_error',
        verbose=3,
    )
    grid_search.fit(X_train_reg, y_train_reg)

    reg_tuning_results[name] = {
        'best_params': grid_search.best_params_,
        # Stored as returned by the scorer, i.e. negative RMSE.
        'best_cv_score': grid_search.best_score_,
        'best_estimator': grid_search.best_estimator_,
    }

    print(f"    Best Params : {grid_search.best_params_}")
    print(f"    Best Estimators : {grid_search.best_estimator_}")

print("\n regression Tuning Summary")
for name, result in reg_tuning_results.items():
    # 'best_cv_score' holds negative RMSE; flip the sign so the value printed
    # under the "RMSE" label is the actual (positive) error.
    print(f"    {name} : RMSE=${-result['best_cv_score']:.2f} |  params={result['best_params']}")




 Training RF model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=-45.677 total time=   7.3s
[CV 2/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=-46.547 total time=   7.3s
[CV 3/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=-46.037 total time=   7.0s
[CV 4/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=-45.985 total time=   7.1s
[CV 5/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=-45.391 total time=   7.8s
[CV 1/5] END max_depth=6, min_samples_leaf=1, n_estimators=200;, score=-45.673 total time=  12.9s
[CV 2/5] END max_depth=6, min_samples_leaf=1, n_estimators=200;, score=-46.528 total time=  13.4s
[CV 3/5] END max_depth=6, min_samples_leaf=1, n_estimators=200;, score=-46.037 total time=  12.2s
[CV 4/5] END max_depth=6, min_samples_leaf=1, n_estimators=200;, score=-45.989 total time=  12.4s
[CV 5/5] END max_depth=6, min_samp

#### 03) Classification Hparam Tuning


In [6]:
# Search space per classifier: the base estimator plus the grid explored for it.
rf_cls_entry = {
    'model': RandomForestClassifier(random_state=RANDOM_STATE),
    'params': {
        'n_estimators': [100, 200],
        'min_samples_leaf': [1, 5],
        'max_depth': [6, 8, 12],
    },
}
xgb_cls_entry = {
    'model': XGBClassifier(random_state=RANDOM_STATE),
    'params': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [8, 12, 20],
    },
}
param_grids_cls = {'RF': rf_cls_entry, 'XGB': xgb_cls_entry}

# Stratified, shuffled folds with a fixed seed.
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Tuning outcome for each classifier, keyed by its short name.
cls_tuning_results = {}
for name, config in param_grids_cls.items():
    print(f"\n Training {name} model...")

    # Exhaustive grid search, scored by weighted F1 on the stratified folds.
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        scoring='f1_weighted',
        cv=kf,
        verbose=3,
    )
    grid_search.fit(X_train_cls, y_train_cls)

    # Keep the winning configuration, its mean CV score and the refitted estimator.
    cls_tuning_results[name] = dict(
        best_params=grid_search.best_params_,
        best_cv_score=grid_search.best_score_,
        best_estimator=grid_search.best_estimator_,
    )

    print(f"    Best Params : {grid_search.best_params_}")
    print(f"    Best Estimators : {grid_search.best_estimator_}")

# One-line recap per classifier.
print("\n classification Tuning Summary")
for name, result in cls_tuning_results.items():
    print(f"    {name} : F1= {result['best_cv_score']:.2f} |  params={result['best_params']}")
    


 Training RF model...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=0.668 total time=   1.3s
[CV 2/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=0.655 total time=   1.3s
[CV 3/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=0.666 total time=   1.3s
[CV 4/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=0.673 total time=   1.3s
[CV 5/5] END max_depth=6, min_samples_leaf=1, n_estimators=100;, score=0.667 total time=   1.4s
[CV 1/5] END max_depth=6, min_samples_leaf=1, n_estimators=200;, score=0.670 total time=   2.8s
[CV 2/5] END max_depth=6, min_samples_leaf=1, n_estimators=200;, score=0.655 total time=   2.8s
[CV 3/5] END max_depth=6, min_samples_leaf=1, n_estimators=200;, score=0.668 total time=   2.7s
[CV 4/5] END max_depth=6, min_samples_leaf=1, n_estimators=200;, score=0.673 total time=   2.7s
[CV 5/5] END max_depth=6, min_samples_leaf=1, n_esti