In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import (
                                    StratifiedKFold, 
                                    GridSearchCV
                                    )
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit

In [2]:
X_train = np.load('artifacts/X_train.npz')['arr_0']
y_train = np.load('artifacts/Y_train.npz')['arr_0']
X_test = np.load('artifacts/X_test.npz')['arr_0']
y_test = np.load('artifacts/Y_test.npz')['arr_0']

In [3]:
# Function for Optuna objective (example for XGBoost; repeat for others)
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': 42
    }
    model = xgb.XGBRegressor(**params)
    
    # CV on logged y, but score on inverse for MAE
    def custom_scorer(estimator, X, y):
        preds_log = estimator.predict(X)
        preds = np.expm1(preds_log)
        y_orig = np.expm1(y)
        return -mean_absolute_error(y_orig, preds)  # Neg for minimization
    
    # Use TimeSeriesSplit instead of random cv=5
    tscv = TimeSeriesSplit(n_splits=5)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring=custom_scorer)
    return score.mean()

# Run Optuna 
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=50)
best_params_xgb = study_xgb.best_params
best_value_xgb = study_xgb.best_value

print("Best XGBoost Params:", best_params_xgb)
print("Best XGBoost MAE Score:", -best_value_xgb)  # Convert back to positive MAE
print(f"Best XGBoost MAE: {-best_value_xgb:.4f}")

[I 2025-08-22 17:53:09,906] A new study created in memory with name: no-name-bbf3add3-0edc-489c-ae5d-7b2de5b8cb0f
[I 2025-08-22 17:53:11,281] Trial 0 finished with value: -8.508890804741783 and parameters: {'n_estimators': 178, 'max_depth': 4, 'learning_rate': 0.12600118497502277, 'subsample': 0.6014207832470704, 'colsample_bytree': 0.8256211576723352}. Best is trial 0 with value: -8.508890804741783.
[I 2025-08-22 17:53:13,199] Trial 1 finished with value: -8.504567928715218 and parameters: {'n_estimators': 260, 'max_depth': 4, 'learning_rate': 0.09541773291283905, 'subsample': 0.7024559403399236, 'colsample_bytree': 0.7214183159517761}. Best is trial 1 with value: -8.504567928715218.
[I 2025-08-22 17:53:15,757] Trial 2 finished with value: -8.545705876177818 and parameters: {'n_estimators': 177, 'max_depth': 8, 'learning_rate': 0.03494297104433743, 'subsample': 0.725948692154791, 'colsample_bytree': 0.8007372620870656}. Best is trial 1 with value: -8.504567928715218.
[I 2025-08-22 17:

Best XGBoost Params: {'n_estimators': 211, 'max_depth': 3, 'learning_rate': 0.16679759530113814, 'subsample': 0.8336117918923477, 'colsample_bytree': 0.6560630106568957}
Best XGBoost MAE Score: 8.485918114388033
Best XGBoost MAE: 8.4859


In [4]:
# Repeat for LightGBM
def objective_lgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': 42,
        'verbose': -1
    }
    model = lgb.LGBMRegressor(**params)
    
    # CV on logged y, but score on inverse for MAE
    def custom_scorer(estimator, X, y):
        preds_log = estimator.predict(X)
        preds = np.expm1(preds_log)
        y_orig = np.expm1(y)
        return -mean_absolute_error(y_orig, preds)  # Neg for minimization
        
    # Use TimeSeriesSplit instead of random cv=5
    tscv = TimeSeriesSplit(n_splits=5)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring=custom_scorer)
    return score.mean()

study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=50)
best_params_lgb = study_lgb.best_params
best_value_lgb = study_lgb.best_value

print("Best LightGBM Params:", best_params_lgb)
print("Best LightGBM MAE Score:", -best_value_lgb)  # Convert back to positive MAE
print(f"Best LightGBM MAE: {-best_value_lgb:.4f}")

[I 2025-08-22 17:55:46,331] A new study created in memory with name: no-name-4f21f1ba-ec33-4671-a889-25aaaa0be7ab
[I 2025-08-22 17:55:51,396] Trial 0 finished with value: -8.537307855641306 and parameters: {'n_estimators': 214, 'max_depth': 13, 'learning_rate': 0.1393200759788335, 'subsample': 0.8636713719050699, 'colsample_bytree': 0.7985614409374419}. Best is trial 0 with value: -8.537307855641306.
[I 2025-08-22 17:55:52,933] Trial 1 finished with value: -8.507422820191909 and parameters: {'n_estimators': 59, 'max_depth': 16, 'learning_rate': 0.2442208652950465, 'subsample': 0.6954738042456109, 'colsample_bytree': 0.871999169191161}. Best is trial 1 with value: -8.507422820191909.
[I 2025-08-22 17:55:54,873] Trial 2 finished with value: -8.537148813621888 and parameters: {'n_estimators': 183, 'max_depth': 4, 'learning_rate': 0.25520487643937934, 'subsample': 0.6826826780052776, 'colsample_bytree': 0.9648767932595352}. Best is trial 1 with value: -8.507422820191909.
[I 2025-08-22 17:5

Best LightGBM Params: {'n_estimators': 245, 'max_depth': 3, 'learning_rate': 0.13760524557337434, 'subsample': 0.9269820004647961, 'colsample_bytree': 0.8027916618753622}
Best LightGBM MAE Score: 8.490643069355562
Best LightGBM MAE: 8.4906


In [5]:
# Repeat for CatBoost (similar, params: iterations, depth, learning_rate, subsample, colsample_bylevel)
def objective_cat(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 3, 16),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
        'random_state': 42,
        'verbose': 0
    }
    model = CatBoostRegressor(**params)

    # CV on logged y, but score on inverse for MAE
    def custom_scorer(estimator, X, y):
        preds_log = estimator.predict(X)
        preds = np.expm1(preds_log)
        y_orig = np.expm1(y)
        return -mean_absolute_error(y_orig, preds)  # Neg for minimization
        
    # Use TimeSeriesSplit instead of random cv=5
    tscv = TimeSeriesSplit(n_splits=5)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring=custom_scorer)
    return score.mean()

study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=50)
best_params_cat = study_cat.best_params
best_value_cat = study_cat.best_value

print("Best CatBoost Params:", best_params_cat)
print("Best CatBoost MAE Score:", -best_value_cat)  # Convert back to positive MAE
print(f"Best CatBoost MAE: {-best_value_cat:.4f}")

[I 2025-08-22 17:58:22,531] A new study created in memory with name: no-name-e6931551-107b-4873-8da2-e8235f77ec88
[I 2025-08-22 17:58:26,938] Trial 0 finished with value: -8.492702940667602 and parameters: {'iterations': 183, 'depth': 3, 'learning_rate': 0.23624250001062497, 'subsample': 0.6335058056584243, 'colsample_bylevel': 0.7988558823464064}. Best is trial 0 with value: -8.492702940667602.
[I 2025-08-22 17:59:21,551] Trial 1 finished with value: -8.726298339753555 and parameters: {'iterations': 160, 'depth': 16, 'learning_rate': 0.17347421905403854, 'subsample': 0.6287911850153914, 'colsample_bylevel': 0.718676282944387}. Best is trial 0 with value: -8.492702940667602.
[I 2025-08-22 17:59:44,648] Trial 2 finished with value: -8.775657365880994 and parameters: {'iterations': 228, 'depth': 14, 'learning_rate': 0.248548917717266, 'subsample': 0.9857958825229934, 'colsample_bylevel': 0.733195233694835}. Best is trial 0 with value: -8.492702940667602.
[I 2025-08-22 17:59:49,489] Trial

Best CatBoost Params: {'iterations': 231, 'depth': 3, 'learning_rate': 0.23543055114146785, 'subsample': 0.8059294532139416, 'colsample_bylevel': 0.7429421871899814}
Best CatBoost MAE Score: 8.489460181841231
Best CatBoost MAE: 8.4895


In [6]:
# Step: Retrain tuned models and evaluate
tuned_models = {
    'Tuned XGBoost': xgb.XGBRegressor(**best_params_xgb),
    'Tuned LightGBM': lgb.LGBMRegressor(**best_params_lgb, verbose=-1),
    'Tuned CatBoost': CatBoostRegressor(**best_params_cat, verbose=0)
}

In [7]:
# Define function to train and evaluate a model
def train_evaluate(model, model_name):
    # Train
    model.fit(X_train, y_train)
    
    # Predict (logged scale)
    preds_log = model.predict(X_test)
    
    # Inverse transform to original scale for metrics
    preds = np.expm1(preds_log)
    y_test_orig = np.expm1(y_test)
    
    # Metrics
    mae = mean_absolute_error(y_test_orig, preds)
    rmse = np.sqrt(mean_squared_error(y_test_orig, preds))
    r2 = r2_score(y_test_orig, preds)
    
    # Feature importance (for tree-based)
    fi = None
    if hasattr(model, 'feature_importances_'):
        fi = model.feature_importances_
    elif hasattr(model, 'get_feature_importance'):
        fi = model.get_feature_importance()

    if fi is not None:
        feature_names = X_train.columns if isinstance(X_train, pd.DataFrame) else [f"f{i}" for i in range(X_train.shape[1])]
        importances = pd.Series(fi, index=feature_names).sort_values(ascending=False)
        print(f"\nTop 5 Features for {model_name}:\n{importances.head()}")
    
    return {'Model': model_name, 'MAE': mae, 'RMSE': rmse, 'R2': r2}

In [8]:
tuned_results = []
for name, model in tuned_models.items():
    tuned_results.append(train_evaluate(model, name))  # Reuse function from base

tuned_df = pd.DataFrame(tuned_results)
print("\nTuned Model Comparison:")
print(tuned_df)


Top 5 Features for Tuned XGBoost:
f32    0.118967
f38    0.117628
f40    0.113651
f42    0.110664
f29    0.082768
dtype: float32

Top 5 Features for Tuned LightGBM:
f0     338
f20    112
f21     86
f22     80
f1      70
dtype: int32

Top 5 Features for Tuned CatBoost:
f38    14.217139
f32    12.393683
f40    11.045087
f42    10.293143
f0      7.796659
dtype: float64

Tuned Model Comparison:
            Model       MAE       RMSE        R2
0   Tuned XGBoost  8.030974  11.145810  0.684334
1  Tuned LightGBM  8.032828  11.138757  0.684734
2  Tuned CatBoost  8.010300  11.136969  0.684835
