In [None]:
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

In [None]:
# One-time kernel bootstrap. Probing `first_run` makes the cell idempotent:
# on a re-run the name already exists, so the working directory is not moved
# up a second time.
try:
    _ = first_run
except NameError:
    first_run = True
    # Step up to the parent directory. os.path.dirname() respects the
    # platform's path separator and maps a top-level directory to the root,
    # whereas splitting on a literal "/" leaves a backslash-separated
    # (Windows) path unchanged and yields "" for a top-level directory.
    os.chdir(os.path.dirname(os.getcwd()))
    # Imported only after the chdir -- presumably `_aux` is resolved relative
    # to the new working directory; confirm against the project layout.
    from _aux import ml

# Load data

In [None]:
# Locations of the serialized splits. Each archive unpacks into a
# (features, labels) pair.
# NOTE: joblib.load unpickles its input -- only point these at trusted files.
train_split_path = (
    "../data/train/preprocessed/undersampled_train_features_labels.joblib.gz"
)
validation_split_path = (
    "../data/train/preprocessed/validation_features_labels.joblib.gz"
)

X_train, y_train = joblib.load(train_split_path)
X_validation, y_validation = joblib.load(validation_split_path)

# Estimators

In [None]:
# Fixed seed so that repeated runs of the search produce comparable scores;
# without it the forest's bootstrap/feature sampling differs on every run.
RANDOM_STATE = 42

# Candidate models, keyed by class name. The keys mirror those of the `params`
# search-space dict; both dicts are handed to ml.EstimatorSelector together.
estimators = {
    "RandomForestClassifier": RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBClassifier": XGBClassifier(random_state=RANDOM_STATE),
}

# Parameters - broad search space

In [None]:
# Search space per estimator, keyed by the same names as `estimators`.
# Every combination of the listed values is one candidate configuration.
params = {
    "RandomForestClassifier": {
        "n_estimators": [100, 500],
        "max_depth": [100, 500, None],
        "min_samples_split": [2, 10, 25, 100],
        "min_samples_leaf": [1, 5, 10],
        "bootstrap": [True],
        # oob_score only attaches an out-of-bag estimate to the fitted forest;
        # it does not change the trees, their predictions or the CV score.
        # Searching over [True, False] therefore doubled the grid (288 -> 144
        # candidates) without adding information, so it is pinned to the
        # scikit-learn default.
        "oob_score": [False],
        "max_samples": [None, 0.75],
    },
    "XGBClassifier": {
        "n_estimators": [250, 500],
        "max_depth": [100, 500, None],
        "learning_rate": [0.001, 0.01],
        # NOTE(review): presumably single-threaded per model so the search's
        # own worker pool is not oversubscribed -- confirm.
        "n_jobs": [1],
        "gamma": [0, 1, 5],
        "reg_alpha": [0],
        "reg_lambda": [1, 5],
    },
}

# Model selection - Grid Search

In [None]:
# Destination of the ranked search results.
results_path = "../ml_artifacts/gridsearch_results/param_search_result.joblib.gz"
# Create the output folder before starting the search, so a missing or
# unwritable directory surfaces immediately rather than after the fit.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Fit every estimator over its parameter grid on the training split.
# NOTE(review): scoring/cv/n_jobs are presumably forwarded to scikit-learn's
# grid search by the project-local EstimatorSelector -- confirm in _aux.ml.
selector = ml.EstimatorSelector(estimators, params)
selector.fit(X_train, y_train, scoring="f1", cv=5, n_jobs=11)

# Persist the summary sorted by mean score. joblib.dump returns the list of
# written file names, which the notebook displays as the cell output.
joblib.dump(selector.score_summary(sort_by="mean_score"), results_path)

In [None]:
# Allow up to 500 characters per cell so long values in the summary table are
# not truncated when rendered.
pd.options.display.max_colwidth = 500

# Trailing bare expression -> rendered as the cell's rich output.
ranked_scores = selector.score_summary(sort_by="mean_score")
ranked_scores