In [9]:
from Master import MasterDataframes, ModelTrainer


# Data handling
import pickle
import pandas as pd

# Types handling
import numpy as np

# Data science
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.metrics import mean_absolute_error, accuracy_score, make_scorer

from feature_engine.selection import DropCorrelatedFeatures, DropConstantFeatures


# Machine learning tool
import xgboost as xgb
# Optimization / feature engineering tools
import optuna

# Plotting
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

# Smart options
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings('ignore')

In [2]:
X, Y = MasterDataframes().prep_dataset_x_y("A", drop_features=True)

In [3]:

X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=10, test_size=0.20)

objective_list_reg = ["reg:squarederror"]
tree_method = ["approx", "hist"]
metric_list = ["mae"]

def objective(trial):
    param = {
        "objective": trial.suggest_categorical("objective", objective_list_reg),
        "eval_metric": trial.suggest_categorical("eval_metric", metric_list),
        "tree_method": trial.suggest_categorical("tree_method", tree_method),
        "max_depth": trial.suggest_int("max_depth", 3, 10),  # Adjust the range
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),  # Increase the range
        "gamma": trial.suggest_float("gamma", 0.1, 1.0),  # Increase the lower bound
        "subsample": trial.suggest_discrete_uniform("subsample", 0.6, 1.0, 0.05),  # Reduce the range
        "colsample_bytree": trial.suggest_discrete_uniform(
            "colsample_bytree", 0.6, 1.0, 0.05
        ),  # Reduce the range
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1.0),
        "random_state": trial.suggest_int("random_state", 1, 1000),
    }
    model = xgb.XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study = optuna.create_study(direction="minimize", study_name="regression")
study.optimize(objective, n_trials=10, n_jobs=6)

model = xgb.XGBRegressor(**study.best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = mean_absolute_error(y_test, y_pred)

print("graded! MAE: ", accuracy)

[I 2023-09-30 13:15:57,504] A new study created in memory with name: regression
[I 2023-09-30 13:16:01,105] Trial 1 finished with value: 261.30172238437825 and parameters: {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'tree_method': 'approx', 'max_depth': 5, 'learning_rate': 0.5886752298472008, 'n_estimators': 220, 'min_child_weight': 3, 'gamma': 0.6511788493066277, 'subsample': 0.75, 'colsample_bytree': 0.65, 'reg_alpha': 0.06299001856365276, 'reg_lambda': 0.18632148280412503, 'random_state': 940}. Best is trial 1 with value: 261.30172238437825.
[I 2023-09-30 13:16:06,772] Trial 3 finished with value: 223.84076433020405 and parameters: {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'tree_method': 'approx', 'max_depth': 3, 'learning_rate': 0.24705687394975925, 'n_estimators': 931, 'min_child_weight': 5, 'gamma': 0.816413296125129, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_alpha': 0.7032252522593979, 'reg_lambda': 0.5989324072105601, 'random_state': 521}. Best i

graded! MAE:  193.52535243614156


In [4]:
thresholds = np.sort(model.feature_importances_)
best = accuracy
for thresh in thresholds[int(len(thresholds)/5) : ]:
    
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = xgb.XGBRegressor(**study.best_params)
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    
    accuracy = mean_absolute_error(y_test, y_pred)
    if best > accuracy:
        best = accuracy
        print(f"New best: {best}")
    print(f"Thresh={str(thresh)}, n={select_X_train.shape[1]}, MAE: {accuracy}, Best: {best}")

Thresh=0.011258452, n=16, MAE: 199.5134638976881, Best: 193.52535243614156
Thresh=0.011336514, n=15, MAE: 204.50074784170437, Best: 193.52535243614156
Thresh=0.0121909045, n=14, MAE: 207.6331154631172, Best: 193.52535243614156
Thresh=0.0130120935, n=13, MAE: 205.03324156038414, Best: 193.52535243614156
Thresh=0.013582612, n=12, MAE: 209.55253885387592, Best: 193.52535243614156
Thresh=0.0146718575, n=11, MAE: 222.00346962568662, Best: 193.52535243614156
Thresh=0.015248544, n=10, MAE: 229.84831619234893, Best: 193.52535243614156
Thresh=0.016560035, n=9, MAE: 238.06364985298603, Best: 193.52535243614156
Thresh=0.018180085, n=8, MAE: 241.5697962951548, Best: 193.52535243614156
Thresh=0.018819869, n=7, MAE: 244.48936988104126, Best: 193.52535243614156
Thresh=0.01899978, n=6, MAE: 243.4444711440046, Best: 193.52535243614156
Thresh=0.022283502, n=5, MAE: 246.25233251750342, Best: 193.52535243614156
Thresh=0.02579325, n=4, MAE: 245.00379403394993, Best: 193.52535243614156
Thresh=0.02656203, n=

In [11]:
rfecv = RFECV(estimator=model, verbose=1, step=1, cv=KFold(5), scoring=make_scorer(mean_absolute_error))
rfecv.fit(X_train, y_train)

X_train_selected_CV = rfecv.transform(X_train)
X_test_selected_CV = rfecv.transform(X_test)

Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.


In [None]:
rf_selected = xgb.XGBRegressor(**study.best_params)
rf_selected.fit(X_train_selected_CV, y_train)

print(f"Train MAE score: {rf_selected.score(X_train_selected_CV, y_train):.2f}")
print(f"Test MAE score: {rf_selected.score(X_test_selected_CV, y_test):.2f}")

Train MAE score: 0.79
Test MAE score: 0.79


In [None]:
model_a = ModelTrainer().train_model(location="A", trials=10)

[I 2023-09-30 13:09:02,804] A new study created in memory with name: regression
[I 2023-09-30 13:09:03,718] Trial 0 finished with value: 223.1582632716738 and parameters: {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'tree_method': 'approx', 'max_depth': 3, 'learning_rate': 0.07559332505875452, 'n_estimators': 96, 'min_child_weight': 4, 'gamma': 0.15468354063481535, 'subsample': 0.6, 'colsample_bytree': 0.7, 'reg_alpha': 0.24018418337506595, 'reg_lambda': 0.1606536939781901, 'random_state': 358}. Best is trial 0 with value: 223.1582632716738.
[I 2023-09-30 13:09:10,386] Trial 6 finished with value: 237.28175825278004 and parameters: {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'tree_method': 'approx', 'max_depth': 4, 'learning_rate': 0.40754522586537706, 'n_estimators': 528, 'min_child_weight': 7, 'gamma': 0.8816892093404158, 'subsample': 0.7, 'colsample_bytree': 0.65, 'reg_alpha': 0.9254798027618522, 'reg_lambda': 0.46185980190964643, 'random_state': 684}. Best is

R2:  0.8843697816454247
RMSE:  402.45292853211225
graded! MAE:  187.65976903393374
Best params: {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
    "tree_method": "hist",
    "max_depth": 8,
    "learning_rate": 0.01865443598282598,
    "n_estimators": 689,
    "min_child_weight": 4,
    "gamma": 0.2907210433473007,
    "subsample": 0.9,
    "colsample_bytree": 0.6,
    "reg_alpha": 0.9186744956996298,
    "reg_lambda": 0.6151767547850471,
    "random_state": 967
}
