In [1]:
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [2]:
# One-time bootstrap: move the working directory one level up (presumably so
# that the project-local `_aux` package becomes importable - confirm).
# The `first_run` flag keeps re-executions of this cell from climbing further.
if "first_run" not in globals():
    # os.path.dirname copes with any path separator and maps "/foo" to "/"
    # instead of the empty string that rsplit("/", 1)[0] would produce.
    os.chdir(os.path.dirname(os.getcwd()))
    # Set the flag only after chdir succeeded, so a failure can be retried.
    first_run = True

# Importing is idempotent; keeping it outside the guard means a re-run still
# binds `ml` even if the very first attempt failed part-way through.
from _aux import ml

# Load data

In [3]:
# Each artifact is unpacked into a (features, labels) pair: first the
# undersampled training split, then the validation split.
# NOTE(review): the paths are relative to the working directory, which the
# bootstrap cell moves one level up - confirm `../data` is intended from there.
# CAUTION: joblib.load unpickles its input; only point it at trusted files.
(X_train, y_train), (X_validation, y_validation) = map(
    joblib.load,
    (
        "../data/train/preprocessed/undersampled_train_features_labels.joblib.gz",
        "../data/train/preprocessed/validation_features_labels.joblib.gz",
    ),
)

# Estimators

In [4]:
# Seed shared by every estimator that accepts one, so that repeated runs of
# the search are comparable instead of differing by random initialisation.
RANDOM_STATE = 42

# Candidate regressors, keyed by class name; the same keys index `params`.
estimators = {
    "RandomForestRegressor": RandomForestRegressor(random_state=RANDOM_STATE),
    # Ridge only consumes the seed for its stochastic solvers (e.g. "saga").
    "Ridge": Ridge(random_state=RANDOM_STATE),
    # SVR exposes no random_state parameter.
    "SVR": SVR(),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
}

# Parameters - broad search space

In [7]:
# Broad hyper-parameter grids, one per estimator, keyed by the same names as
# the `estimators` mapping. Every value is the list/array of candidates that
# the search enumerates for that argument.
# Grid sizes: RandomForestRegressor 972, Ridge 90, SVR 120, XGBRegressor 972.
params = dict(
    # NOTE(review): "mse"/"mae" look like the older scikit-learn criterion
    # spellings - confirm they are accepted by the installed release.
    RandomForestRegressor=dict(
        n_estimators=[100, 500, 1500],
        criterion=["mse", "mae"],
        max_depth=[100, 500, None],
        min_samples_split=[2, 4, 12],
        min_samples_leaf=[1, 5, 15],
        max_features=[0.5, None],
        bootstrap=[True],
        oob_score=[True],
        max_samples=[None, 0.3, 0.7],
    ),
    Ridge=dict(
        alpha=np.geomspace(5e-2, 5, num=5),
        fit_intercept=[True, False],
        max_iter=[None, 100, 1000],
        solver=["saga", "cholesky", "lsqr"],
    ),
    SVR=dict(
        kernel=["rbf", "sigmoid"],
        gamma=["scale"],
        C=np.geomspace(5e-1, 5, num=5),
        epsilon=np.geomspace(5e-3, 5e-1, num=4),
        max_iter=[-1, 100, 1000],
    ),
    # The booster is not part of the search; it stays at the estimator default.
    XGBRegressor=dict(
        n_estimators=[100, 500, 1500],
        max_depth=[100, 500, None],
        learning_rate=[0.01, 0.05, 0.1],
        gamma=[0.01, 0.05, 0.1],
        subsample=[0.6, 0.8],
        colsample_bytree=[0.5, 1.0],
        reg_alpha=[0],
        reg_lambda=[0.05, 0.5, 1.0],
        base_score=[0],
    ),
)

# Model selection - Grid Search

In [13]:
# Pair every estimator with its grid and search them all. Per the recorded
# output the helper runs a GridSearchCV for each estimator in turn.
selector = ml.EstimatorSelector(estimators, params)

# Candidates are ranked by negated mean absolute error (higher is better),
# using 3 cross-validation folds and n_jobs=-1.
# NOTE(review): the recorded output mentions 5 folds although cv=3 is passed
# here - the output may be stale, or the helper may not forward `cv`; confirm.
selector.fit(
    X_train,
    y_train,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
)

Running GridSearchCV for RandomForestRegressor.
Fitting 5 folds for each of 972 candidates, totalling 4860 fits


KeyboardInterrupt: 

In [None]:
# Display the search results ordered by "mean_score".
# NOTE(review): presumably this needs a completed `selector.fit(...)`; the
# recorded fit was interrupted and this cell has no execution count - re-run
# the search before relying on this summary.
selector.score_summary(sort_by="mean_score")

In [12]:
# Scratch lookup: list every name accepted by a `scoring=` argument.
# Import the submodule explicitly; a bare `import sklearn` does not guarantee
# that `sklearn.metrics` has been loaded.
import sklearn.metrics

# Newer scikit-learn exposes get_scorer_names() and drops the SCORERS mapping;
# older releases only have SCORERS, so fall back to it when needed.
sorted(
    sklearn.metrics.get_scorer_names()
    if hasattr(sklearn.metrics, "get_scorer_names")
    else sklearn.metrics.SCORERS.keys()
)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we