In [29]:
from Master import MasterDataframes


# Data handling
import pickle
import pandas as pd

# Helper functions
from functions import get_days_sinse_beginning_of_year, get_seconds_of_day

# Types handling
import numpy as np

# Data science
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.discriminant_analysis import StandardScaler

# Machine learning tool
import xgboost as xgb
# Optimization / feature engineering tools
import optuna

# Plotting
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

# Smart options
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings('ignore')

In [10]:
X, Y = MasterDataframes().prep_dataset_x_y("A")

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=10, test_size=0.20, groups=None)

def objective(trial):
    param = {
        "objective": "reg:linear",
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.01, 1.0),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.01, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 1.0),
        "random_state": trial.suggest_int("random_state", 1, 1000),
    }
    model = xgb.XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

study = optuna.create_study(direction="minimize", study_name="regression")
study.optimize(objective, n_trials=10)

model = xgb.XGBRegressor(**study.best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

predictions = [round(value) for value in y_pred]

accuracy = mean_absolute_error(y_test, y_pred)
print("MAE: %.2f%%" % (accuracy))

[I 2023-09-29 11:54:06,253] A new study created in memory with name: regression
[I 2023-09-29 11:54:08,921] Trial 0 finished with value: 552225973.8416195 and parameters: {'max_depth': 10, 'learning_rate': 0.6403698866093191, 'n_estimators': 729, 'min_child_weight': 10, 'gamma': 0.10379618775652646, 'subsample': 0.11332722970223123, 'colsample_bytree': 0.21140713077007542, 'reg_alpha': 0.3082468104568156, 'reg_lambda': 0.8284468801827363, 'random_state': 471}. Best is trial 0 with value: 552225973.8416195.
[I 2023-09-29 11:54:10,691] Trial 1 finished with value: 483.1535115596932 and parameters: {'max_depth': 5, 'learning_rate': 0.6903092636296524, 'n_estimators': 689, 'min_child_weight': 5, 'gamma': 0.3606264981914766, 'subsample': 0.3481328409035814, 'colsample_bytree': 0.48115730538457707, 'reg_alpha': 0.8477706285988587, 'reg_lambda': 0.6234714224662702, 'random_state': 350}. Best is trial 1 with value: 483.1535115596932.
[I 2023-09-29 11:54:12,254] Trial 2 finished with value: 232

MAE: 17827.29%


In [24]:
thresholds = np.sort(model.feature_importances_)
best = accuracy
for thresh in thresholds[int(len(thresholds)/4) : ]:
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model
    selection_model = xgb.XGBRegressor(**study.best_params)
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    
    accuracy = mean_absolute_error(y_test, y_pred)
    if best > accuracy:
        best = accuracy
        print(f"New best: {best}")
    print(f"Thresh={str(thresh)}, n={select_X_train.shape[1]}, MAE: {accuracy}, Best: {best}")

New best: 180.09733287984827
Thresh=0.0016273983, n=73, MAE: 180.09733287984827, Best: 180.09733287984827
Thresh=0.0019661684, n=72, MAE: 181.56535625984182, Best: 180.09733287984827
Thresh=0.0023614739, n=71, MAE: 180.76675521200988, Best: 180.09733287984827
Thresh=0.002474928, n=70, MAE: 180.99984651742383, Best: 180.09733287984827
Thresh=0.0025197142, n=69, MAE: 182.10846923070955, Best: 180.09733287984827
Thresh=0.0026132972, n=68, MAE: 183.51906247248445, Best: 180.09733287984827
Thresh=0.0027077012, n=67, MAE: 181.77093489279997, Best: 180.09733287984827
New best: 178.9970749403461
Thresh=0.0029031972, n=66, MAE: 178.9970749403461, Best: 178.9970749403461
Thresh=0.0030150632, n=65, MAE: 180.3222760848616, Best: 178.9970749403461
New best: 178.20908320870808
Thresh=0.0031922213, n=64, MAE: 178.20908320870808, Best: 178.20908320870808
New best: 178.06597252851486
Thresh=0.0033383314, n=63, MAE: 178.06597252851486, Best: 178.06597252851486
Thresh=0.0033546565, n=62, MAE: 182.1830375