In [2]:
# Run configuration: model label, RNG seed and the switches that select
# which hyper-parameter search strategy the cells below execute.
model_name = "xgboost"
random_state = 42
randomized_grid_search = False
canonical_grid_search = False
bayesian_search = True

# Echo the configuration, one setting per line.
print(
    f"model_name: {model_name}",
    f"randomized_grid_search: {randomized_grid_search}",
    f"canonical_grid_search: {canonical_grid_search}",
    f"bayesian_search: {bayesian_search}",
    sep="\n",
)

model_name: xgboost
randomized_grid_search: True
canonical_grid_search: False


In [None]:
# ONLY FOR COLAB. Reinstall default XGBoost
from packaging import version
import xgboost as xgb

if version.parse(xgb.__version__) < version.parse("1.3.1"):
  print(f"XGBoost: {xgb.__version__}. Try to re-install.")
  
  try:
    !pip uninstall xgboost
    !pip install xgboost
    print(f"XGBoost: {xgb.__version__}.\nYou need to restart kernel.")
  except Exception as e:
    print(f"Re-install XGBoost error: {e}")

else:
  print(f"XGBoost: {xgb.__version__}. Ok.")

In [None]:
# Choose the data directory: Google Drive when running on Colab,
# the local "/storage/" directory otherwise.
# NOTE: `dir` shadows the built-in of the same name; the name is kept because
# the data-loading cell reads this variable.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so we are not running on Colab.
    dir = "/storage/"
else:
    # A failed mount is surfaced here instead of silently falling back to a
    # directory that does not hold the data.
    drive.mount("/content/drive")
    dir = "/content/drive/MyDrive/Colab/gscreen/"

print(f"Data dir: {dir}")

In [None]:
try:
  from skopt import BayesSearchCV
  print(f"The scikit-optimize package exist.")
except Exception as e:
  print(f"Re-install XGBoost error: {e}")
  print(f"Try to re-install scikit-optimize package")
  ! pip install scikit-optimize

In [1]:
# Standard library
import sys
import os
import time

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
# Imported here as well so that `xgb` is defined even when the
# Colab-only re-install cell is skipped.
import xgboost as xgb

from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Record the library versions this run was produced with.
print(f"Numpy: {np.__version__}")
print(f"Pandas: {pd.__version__}")
print(f"XGBoost: {xgb.__version__}")

In [7]:
#%% Load data ------------------------------------------------------------------
# Feature matrices (*.joblib.compressed files) are read through explicit
# binary file handles.
with open(dir + "X_fit.joblib.compressed", "rb") as f:
    X_fit = joblib.load(f)
with open(dir + "X_train.joblib.compressed", "rb") as f:
    X_train = joblib.load(f)
with open(dir + "X_val.joblib.compressed", "rb") as f:
    X_val = joblib.load(f)

# Shape and container type of every feature matrix.
print(
    f"X_fit: {X_fit.shape}, {type(X_fit)}"
    f"\nX_train: {X_train.shape}, {type(X_train)}"
    f"\nX_val: {X_val.shape}, {type(X_val)}\n"
)

# Targets (*.joblib files) are loaded straight from their paths.
y_fit = joblib.load(dir + "y_fit.joblib")
y_train = joblib.load(dir + "y_train.joblib")
y_val = joblib.load(dir + "y_val.joblib")

# Element count and container type of every target.
print(
    f"y_fit: {y_fit.size}, {type(y_fit)}"
    f"\ny_train: {y_train.size}, {type(y_train)}"
    f"\ny_val: {y_val.size}, {type(y_val)}"
)

X_fit: (296721, 279)
X_train: (296727, 279)
X_val: (5000, 279)

y_fit: 296721
y_train: 296727
y_val: 5000


In [3]:
#%% Define model parameters for starting tuning
# Starting-point settings for the XGBoost regressor.
# Disabled alternatives kept for reference:
#   "tree_method": "gpu_hist", "gpu_id": 0
#   "objective": "reg:pseudohubererror"
#   "objective": "reg:gamma"
model_params = dict(
    booster="gbtree",
    n_estimators=2500,
    objective="reg:squarederror",
    n_jobs=None,
    random_state=random_state,
)
model = xgb.XGBRegressor(**model_params)

In [4]:
#%% ---------------------- RandomizedSearchCV ----------------------------------
# Candidate values for the RANDOMIZED search.
# Keys are parameter names (str); values are the lists of settings to try.
# A list is sampled uniformly.
param_dist = {
    "n_estimators": list(range(2250, 3001, 250)),
    # subsample: default=1. Lower ratios avoid over-fitting.
    "subsample": [tenths / 10 for tenths in range(5, 11)],
    # colsample_bytree: default=1. Lower ratios avoid over-fitting.
    "colsample_bytree": [tenths / 10 for tenths in range(6, 11)],
    # max_depth: default=6. Lower values avoid over-fitting.
    "max_depth": list(range(6, 51, 4)),
    # min_child_weight: default=1. Larger values avoid over-fitting.
    "min_child_weight": [1, *range(2, 11, 2)],
    # eta (learning rate): default=0.3. Lower values avoid over-fitting.
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3],
    # lambda: default=1. Larger values avoid over-fitting.
    # (disabled extension: + [x for x in range(2, 11, 1)])
    "reg_lambda": [1],
    # gamma: default=0. Larger values avoid over-fitting.
    # (disabled extension: + [x/10 for x in range(5, 60, 5)])
    "gamma": [0.0],
}

In [None]:
#%% --------------------------- GridSearchCV -----------------------------------
# Exhaustive grid for the SIMPLE grid search.
# Keys are parameter names (str); values are the lists of settings to try.
param_grid = dict(
    n_estimators=[2400, 2500, 2600],
    # subsample: default=1. Lower ratios avoid over-fitting.
    subsample=[0.6, 0.8],
    # colsample_bytree: default=1. Lower ratios avoid over-fitting.
    colsample_bytree=[0.6, 0.8],
    # max_depth: default=6. Lower values avoid over-fitting.
    max_depth=[6, 12, 24],
    # min_child_weight: default=1. Larger values avoid over-fitting.
    min_child_weight=[1, *range(2, 11, 2)],
    # eta (learning rate): default=0.3. Lower values avoid over-fitting.
    learning_rate=[0.01, 0.1, 0.3],
    # lambda: default=1. Larger values avoid over-fitting.
    reg_lambda=[0.5, 1, 2],
    # gamma: default=0. Larger values avoid over-fitting.
    gamma=[0, 1, 2, 5],
)

In [None]:
#%% ----------------------- Bayesian Optimization ------------------------------
# The core idea of Bayesian Optimization is simple:
# when a region of the space turns out to be good, it should be explored more.
# Dimension types:
#   Real        - continuous hyperparameter space
#   Integer     - discrete hyperparameter space
#   Categorical - categorical hyperparameter space
# Disabled dimension kept for reference: "n_estimators": Integer(2000, 3000)
bayes_space = dict(
    subsample=Real(0.6, 1.0),
    colsample_bytree=Real(0.7, 1.0),
    max_depth=Integer(3, 20),
    min_child_weight=Integer(1, 20),
    learning_rate=Real(0.01, 0.4),
    reg_lambda=Real(0.5, 5),
    gamma=Real(0.0, 5),
)

In [5]:
def accuracy(real_rates, predicted_rates):
    """Project's accuracy value estimator.

    Computes the mean absolute ratio error in percent:
    ``mean(|real / predicted - 1|) * 100``.
    This is an error measure: 0 means every prediction equals its real
    value, and lower is better.

    Args:
        real_rates: array-like of observed values (list, ndarray, Series...).
        predicted_rates: array-like of predicted values; used as the
            denominator and paired with ``real_rates`` by position.

    Returns:
        The error in percent, as a NumPy float.
    """
    # Convert up front so plain Python sequences are accepted too.
    real = np.asarray(real_rates, dtype=float)
    predicted = np.asarray(predicted_rates, dtype=float)
    return np.mean(np.abs(real / predicted - 1.0)) * 100.0

def calc_metrics(model, X, y):
    """Calculates result metrics and prints them.

    Predicts on ``X`` with ``model.predict`` and compares the predictions
    with ``y`` using four measures: max error, mean absolute error,
    root mean squared error and the project's mean absolute ratio error
    (see ``accuracy``).

    Args:
        model: fitted estimator (or fitted search object) exposing ``predict``.
        X: feature matrix to predict on.
        y: true target values for ``X``.

    Returns:
        None. The metrics are printed, one per line.
    """

    from sklearn.metrics import max_error
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error

    y_true = y
    y_pred = model.predict(X)

    me = max_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
    # deprecated and absent from recent scikit-learn releases, while this form
    # works on every version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mare = accuracy(y_true, y_pred)

    print(f"\tMax Error: {me}")
    print(f"\tMean Absolute Error: {mae}")
    print(f"\tRoot Mean Squared Error: {rmse}")
    print(f"\tMean Absolute Ratio Error: {mare}")

In [6]:
#%% Define custom scorer to evaluate basic models
# `accuracy` is an error measure (0 is a perfect fit, larger is worse), so it
# is declared as a loss: with greater_is_better=False the scorer sign-flips
# the value and the searches, which maximize the score, select the parameters
# with the SMALLEST error. Scores reported by the searches are therefore
# negative errors.
scorer = make_scorer(
    score_func=accuracy,
    greater_is_better=False,
)
# Specific fitting parameters. #!Set early stopping to avoid overfitting
# The evaluation set is the validation split.
# Alternative metrics kept for reference: "mape", "rmse".
early_stopping_params = dict(
    early_stopping_rounds=20,
    eval_metric="mae",
    eval_set=[(X_val, y_val)],
)

#%% Define Cross Validation parameters for grid search
# Classic cross validation splitter: 3 folds, a single repetition, seeded
# with the notebook-wide random state.
#! Dont forget to check n_splits
cv = model_selection.RepeatedKFold(n_splits=3, n_repeats=1, random_state=random_state)

In [None]:
#%% ---------------------- RandomizedSearchCV ----------------------------------
# Best params / best estimator; both stay empty dicts when the search is disabled.
rs_bp, rs_be = {}, {}
if randomized_grid_search:
    tic = time.time()
    # Define RANDOMIZED grid search
    random_search = model_selection.RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,  #! default=10 Number of parameter settings that are sampled.
        scoring=scorer,
        n_jobs=None,
        cv=cv,
        refit=True,
        return_train_score=True,
        verbose=8,
        random_state=random_state,
    )
    # Make RANDOMIZED grid search
    model_random_search = random_search.fit(
        X_fit,
        y_fit,
    )
    # Print out best parameters
    rs_bp = model_random_search.best_params_
    rs_be = model_random_search.best_estimator_
    print(f"\nRandomized search:\nBest params are:\n {rs_bp}")
    print(f"\nBest estimator is:\n {rs_be}")

    # Named `minutes`/`seconds`: assigning to `min` at notebook scope would
    # shadow the built-in `min()` for every cell executed afterwards.
    minutes, seconds = divmod(time.time() - tic, 60)
    print(f"\nRandomized grid search taken: {int(minutes)}min {int(seconds)}sec")

    # Print out results
    print(f"{model_name.title()} random search:")
    print("TRAIN set:")
    calc_metrics(model_random_search, X=X_train, y=y_train)
    print("VALIDATION set:")
    calc_metrics(model_random_search, X=X_val, y=y_val)

    # The refitted best estimator becomes the working model for later cells.
    model = model_random_search.best_estimator_

Fitting 4 folds for each of 30 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
#%% --------------------------- GridSearchCV -----------------------------------
#! Don't forget to fine-tune the Grid Search parameters in the algo_*.py module
#! and reload that module
# Best params / best estimator; both stay empty dicts when the search is disabled.
gs_bp, gs_be = {}, {}
if canonical_grid_search:
    tic = time.time()
    # Define SIMPLE grid search
    grid_search = model_selection.GridSearchCV(
        model,
        param_grid=param_grid,
        scoring=scorer,
        n_jobs=None,  # Number of jobs to run in parallel. -1 = using all processors
        cv=cv,  # Determines the cross-validation splitting strategy.
        refit=True,  # Refit model with the best found parameters on the whole dataset.
        return_train_score=True,
        verbose=8,  # Controls the verbosity: the higher, the more messages.
    )
    # Make SIMPLE grid search
    model_grid_search = grid_search.fit(
        X_fit,
        y_fit,
    )
    # Print out best parameters
    gs_bp = model_grid_search.best_params_
    print(f"Grid search:\nBest params are:\n {gs_bp}")

    # Named `minutes`/`seconds`: assigning to `min` at notebook scope would
    # shadow the built-in `min()` for every cell executed afterwards.
    minutes, seconds = divmod(time.time() - tic, 60)
    print(f"\nGrid search taken: {int(minutes)}min {int(seconds)}sec")

    # Print out results
    print(f"{model_name.title()} grid search:")
    print("TRAIN set:")
    calc_metrics(model_grid_search, X=X_train, y=y_train)
    print("VALIDATION set:")
    calc_metrics(model_grid_search, X=X_val, y=y_val)

    # The refitted best estimator becomes the working model for later cells.
    model = model_grid_search.best_estimator_
    print(f"{model}")

In [None]:
#%% ----------------------- Bayesian Optimization ------------------------------
# Best params / best estimator; both stay empty dicts when the search is disabled.
bs_bp, bs_be = {}, {}
if bayesian_search:
    tic = time.time()
    # Define bayesian search
    bayes_search = BayesSearchCV(
        model,
        search_spaces=bayes_space,
        n_iter=50,  #! default=50 Number of parameter settings that are sampled.
        scoring=scorer,
        n_jobs=None,  # Number of jobs to run in parallel. -1 = using all processors
        # Same seeded 3-fold splitter as the randomized and grid searches, so
        # all three strategies are evaluated on identical folds.
        cv=cv,
        refit=True,
        verbose=3,
        return_train_score=True,
        random_state=random_state,
    )
    # Make search
    model_bayes_search = bayes_search.fit(
        X_fit,
        y_fit,
    )
    # Print out best parameters
    bs_bp = model_bayes_search.best_params_
    print(f"Bayesian search:\nBest params are:\n {bs_bp}")

    # Named `minutes`/`seconds`: assigning to `min` at notebook scope would
    # shadow the built-in `min()` for every cell executed afterwards.
    minutes, seconds = divmod(time.time() - tic, 60)
    print(f"\nBayesian search taken: {int(minutes)}min {int(seconds)}sec")

    # Print out results
    print(f"{model_name.title()} Bayesian search:")
    print("TRAIN set:")
    calc_metrics(model_bayes_search, X=X_train, y=y_train)
    print("VALIDATION set:")
    calc_metrics(model_bayes_search, X=X_val, y=y_val)

    # The refitted best estimator becomes the working model for later cells.
    model = model_bayes_search.best_estimator_
    print(f"{model}")

In [None]:
#%% Fit the model w\o parameters searching -------------------------------------
# Runs only when every search strategy is switched off: the model is then
# fitted once with its starting parameters.
if not randomized_grid_search and not canonical_grid_search and not bayesian_search:
    # Train the model
    tic = time.time()
    model.fit(X_fit, y_fit)
    # Evaluate time spent.
    # Named `minutes`/`seconds`: assigning to `min` at notebook scope would
    # shadow the built-in `min()` for every cell executed afterwards.
    minutes, seconds = divmod(time.time() - tic, 60)
    print(f"Time taken: {int(minutes)}min {int(seconds)}sec")
    print(f"{model}\n")

    # Print out results
    print(f"{model_name.title()} model:")
    print("TRAIN set:")
    calc_metrics(model, X=X_train, y=y_train)
    print("VALIDATION set:")
    calc_metrics(model, X=X_val, y=y_val)