In [1]:
import datetime as dt
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from xgboost import XGBRegressor

In [2]:
try:
    _ = first_run
except NameError:
    first_run = True
    os.chdir(os.getcwd().rsplit("/", 1)[0])
    from _aux import ml

# Load data

In [3]:
X_train, y_train = joblib.load(
    "../data/train/preprocessed/undersampled_train_features_labels.joblib.gz"
)

X_validation, y_validation = joblib.load(
    "../data/train/preprocessed/validation_features_labels.joblib.gz"
)

# Estimators

In [4]:
estimators = {
    "RandomForestRegressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
}

# Parameters - broad search space

In [5]:
params = {
    "RandomForestRegressor": {
        "n_estimators": [100, 500],
        "criterion": ["mse", "mae"],
        "max_depth": [100, 500, None],
        "min_samples_split": [2, 10, 25, 100],
        "min_samples_leaf": [1, 5, 10],
        "bootstrap": [True],
        "oob_score": [True, False],
        "max_samples": [None, .75]
    },
    "XGBRegressor": {
        "n_estimators": [100, 500],
        "max_depth": [100, 500, None],
        "learning_rate": [.001, .01, .1],
        "n_jobs": [1],
        "gamma": [.01, .05, .1],
        "subsample": [0.6, 0.8, 1],
        "colsample_bytree": [0.5, 1.0],
        "reg_alpha": [0],
        "reg_lambda": [.05, .5, 1., 3],
        "base_score": [0, 1]
    },
}

# Model selection - Grid Search

In [6]:
selector = ml.EstimatorSelector(estimators, params)
selector.fit(X_train, y_train, scoring="neg_root_mean_squared_error", cv=5, n_jobs=11)
joblib.dump(
    selector.score_summary(sort_by="mean_score"),
    f"../ml_artifacts/gridsearch_results/{dt.datetime.now().strftime('%FT%T')}broad_param_search_result.joblib.gz"
)

Running GridSearchCV for RandomForestRegressor.
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Running GridSearchCV for XGBRegressor.
Fitting 5 folds for each of 2592 candidates, totalling 12960 fits


['../ml_artifacts/gridsearch_results/2021-05-23T14:13:24broad_param_search_result.joblib.gz']

In [7]:
selector.score_summary(sort_by="mean_score")

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,params
1151,XGBRegressor,-0.496387,-0.459071,-0.393981,0.038312,"{'base_score': 0, 'colsample_bytree': 0.5, 'ga..."
935,XGBRegressor,-0.497128,-0.459259,-0.393674,0.038427,"{'base_score': 0, 'colsample_bytree': 0.5, 'ga..."
719,XGBRegressor,-0.497488,-0.459333,-0.394033,0.038324,"{'base_score': 0, 'colsample_bytree': 0.5, 'ga..."
2445,XGBRegressor,-0.504987,-0.459993,-0.395838,0.040205,"{'base_score': 1, 'colsample_bytree': 0.5, 'ga..."
1150,XGBRegressor,-0.496427,-0.46002,-0.395293,0.038006,"{'base_score': 0, 'colsample_bytree': 0.5, 'ga..."
...,...,...,...,...,...,...
1394,XGBRegressor,-0.59695,-0.546465,-0.484602,0.039803,"{'base_score': 0, 'colsample_bytree': 1.0, 'ga..."
1382,XGBRegressor,-0.596962,-0.546466,-0.484601,0.039807,"{'base_score': 0, 'colsample_bytree': 1.0, 'ga..."
1406,XGBRegressor,-0.596962,-0.546466,-0.484601,0.039807,"{'base_score': 0, 'colsample_bytree': 1.0, 'ga..."
1380,XGBRegressor,-0.594731,-0.547704,-0.483443,0.041296,"{'base_score': 0, 'colsample_bytree': 1.0, 'ga..."
