In [None]:
import datetime as dt
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from xgboost import XGBRegressor

In [None]:
# One-time notebook bootstrap: move the working directory up one level and
# import the project helper module. Guarded by `first_run` so that re-running
# this cell in a live kernel does not climb another directory.
if "first_run" not in globals():
    _notebook_dir = os.getcwd()
    # os.path.dirname handles the platform's path separator and a cwd that sits
    # directly under the filesystem root, unlike splitting on "/".
    os.chdir(os.path.dirname(_notebook_dir))
    try:
        # NOTE(review): presumably `_aux` lives in the parent directory and is
        # importable only once that directory is the cwd — confirm.
        from _aux import ml
    except ImportError:
        # Undo the chdir so re-running the cell retries from the same place
        # instead of moving up a second level.
        os.chdir(_notebook_dir)
        raise
    # Only mark the bootstrap as done once it has fully succeeded; otherwise a
    # failed import would leave `ml` undefined with no way to retry.
    first_run = True

# Load data

In [None]:
# Locations of the preprocessed splits (relative to the current working directory).
train_path = "../data/train/preprocessed/undersampled_train_features_labels.joblib.gz"
validation_path = "../data/train/preprocessed/validation_features_labels.joblib.gz"

# Each archive is unpacked into two objects: the feature matrix and its labels.
# joblib.load unpickles the file, so only point these paths at trusted artifacts.
X_train, y_train = joblib.load(train_path)
X_validation, y_validation = joblib.load(validation_path)

# Estimators

In [None]:
# Candidate regressors for the search, keyed by the same names used in `params`.
#
# A random forest left at random_state=None draws a fresh seed on every fit, so
# scores (and therefore the ranking that gets saved) would differ between runs.
# Pin the seed on both estimators so the search is reproducible.
RANDOM_STATE = 42

estimators = {
    "RandomForestRegressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
}

# Parameters - broad search space

In [None]:
# Hyper-parameter grids, keyed by estimator name (must match the keys of
# `estimators`). Every list is one axis of an exhaustive grid, so each extra
# value multiplies the number of fits.
params = {
    "RandomForestRegressor": {
        "n_estimators": [100, 500],
        # NOTE: scikit-learn 1.0 renamed these criteria to "squared_error" /
        # "absolute_error" and 1.2 removed the old spellings; update the names
        # when running on a newer scikit-learn.
        "criterion": ["mse", "mae"],
        "max_depth": [100, 500, None],
        "min_samples_split": [2, 10, 25, 100],
        "min_samples_leaf": [1, 5, 10],
        "bootstrap": [True],
        # oob_score only toggles computing an extra out-of-bag estimate after
        # fitting; it does not change the fitted trees or their predictions.
        # Searching over it doubled the grid for no information, so it is fixed.
        "oob_score": [False],
        "max_samples": [None, .75],
    },
    "XGBRegressor": {
        "objective": ["reg:squarederror"],
        "n_estimators": [250, 500],
        "max_depth": [100, 500, None],
        "learning_rate": [.01],
        # Single-threaded boosters; parallelism is requested at the search level.
        "n_jobs": [1],
        "gamma": [0, 1, 3, 5],
        "subsample": [1],
        "colsample_bytree": [0.5, 1.0],
        "reg_alpha": [0],
        "reg_lambda": [1, 3, 5],
        "base_score": [0, 1],
    },
}

# Model selection - Grid Search

In [None]:
# Run the search over every estimator/grid pair and persist the ranked summary.
# NOTE(review): `ml.EstimatorSelector` is a project helper that is not visible
# here; presumably it runs a cross-validated grid search per estimator — confirm.
results_dir = "../ml_artifacts/gridsearch_results"
# Create the output directory before the (long) fit so a missing folder fails
# fast instead of after all the compute has been spent.
os.makedirs(results_dir, exist_ok=True)

selector = ml.EstimatorSelector(estimators, params)
selector.fit(X_train, y_train, scoring="neg_root_mean_squared_error", cv=5, n_jobs=11)

# isoformat(timespec="seconds") produces the same YYYY-MM-DDTHH:MM:SS text as
# strftime('%FT%T') without depending on the platform-specific %F / %T directives.
timestamp = dt.datetime.now().isoformat(timespec="seconds")
joblib.dump(
    selector.score_summary(sort_by="mean_score"),
    f"{results_dir}/{timestamp}broad_param_search_result.joblib.gz",
)

In [None]:
# Widen pandas' column display limit to 500 characters for this session, then
# show the search results ordered by mean score as the cell output.
pd.options.display.max_colwidth = 500

selector.score_summary(sort_by="mean_score")