In [27]:
import datetime as dt
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from xgboost import XGBRegressor

In [2]:
try:
    _ = first_run
except NameError:
    first_run = True
    os.chdir(os.getcwd().rsplit("/", 1)[0])
    from _aux import ml

# Load data

In [3]:
X_train, y_train = joblib.load(
    "../data/train/preprocessed/undersampled_train_features_labels.joblib.gz"
)

X_validation, y_validation = joblib.load(
    "../data/train/preprocessed/validation_features_labels.joblib.gz"
)

# Estimators

In [4]:
estimators = {
    "RandomForestRegressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
}

# Parameters - broad search space

In [36]:
params = {
    "RandomForestRegressor": {
        "n_estimators": [100, 500],
        "criterion": ["mse", "mae"],
        "max_depth": [100, 500, None],
        "min_samples_split": [2, 10, 25, 100],
        "min_samples_leaf": [1, 5, 10],
        "bootstrap": [True],
        "oob_score": [True, False],
        "max_samples": [None, .75]
    },
    "XGBRegressor": {
        "n_estimators": [100, 500],
        "max_depth": [100, 500, None],
        "learning_rate": [.001, .01, .1],
        "n_jobs": [1],
        "gamma": [.01, .05, .1],
        "subsample": [0.6, 0.8, 1],
        "colsample_bytree": [0.5, 1.0],
        "reg_alpha": [0],
        "reg_lambda": [.05, .5, 1., 3],
        "base_score": [0, 1]
    },
}

# Model selection - Grid Search

In [38]:
selector = ml.EstimatorSelector(estimators, params)
selector.fit(X_train, y_train, scoring="neg_root_mean_squared_error", cv=5, n_jobs=11)
joblib.dump(
    selector.score_summary(sort_by="mean_score"),
    f"../ml_artifacts/gridsearch_results/{dt.datetime.now().strftime('%FT%T')}broad_param_search_result.joblib.gz"
)

Running GridSearchCV for RandomForestRegressor.
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Running GridSearchCV for Ridge.
Fitting 5 folds for each of 90 candidates, totalling 450 fits
Running GridSearchCV for SVR.
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Running GridSearchCV for XGBRegressor.
Fitting 5 folds for each of 972 candidates, totalling 4860 fits


['../ml_artifacts/gridsearch_results/2021-05-23T12:50:40broad_param_search_result.joblib.gz']

In [39]:
selector.score_summary(sort_by="mean_score")

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,params
775,SVR,-0.504235,-0.434606,-0.356607,0.05104,"{'C': 1.0, 'epsilon': 0.10772173450159415, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 100}"
769,SVR,-0.586565,-0.45974,-0.356607,0.080333,"{'C': 1.0, 'epsilon': 0.023207944168063883, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 100}"
763,SVR,-0.586565,-0.45974,-0.356607,0.080333,"{'C': 1.0, 'epsilon': 0.005, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 100}"
1156,XGBRegressor,-0.496709,-0.460911,-0.39614,0.037912,"{'base_score': 0, 'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 500, 'n_jobs': 1, 'reg_alpha': 0, 'reg_lambda': 1.0, 'subsample': 0.6}"
1154,XGBRegressor,-0.497554,-0.461246,-0.395876,0.038057,"{'base_score': 0, 'colsample_bytree': 0.5, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 500, 'n_jobs': 1, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 0.6}"
...,...,...,...,...,...,...
767,SVR,-4.820495,-4.107473,-3.308229,0.492584,"{'C': 1.0, 'epsilon': 0.005, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': 1000}"
773,SVR,-4.830982,-4.112015,-3.350359,0.479077,"{'C': 1.0, 'epsilon': 0.023207944168063883, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': 1000}"
772,SVR,-7.81331,-5.387465,-3.738398,1.684899,"{'C': 1.0, 'epsilon': 0.023207944168063883, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': 100}"
766,SVR,-7.81331,-5.399477,-3.738398,1.701322,"{'C': 1.0, 'epsilon': 0.005, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': 100}"
