In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import (
                                    StratifiedKFold, 
                                    GridSearchCV
                                    )
from sklearn.metrics import confusion_matrix
warnings.filterwarnings('ignore')
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit

In [2]:
X_train = np.load('artifacts/X_train.npz')['arr_0']
y_train = np.load('artifacts/Y_train.npz')['arr_0']
X_test = np.load('artifacts/X_test.npz')['arr_0']
y_test = np.load('artifacts/Y_test.npz')['arr_0']

In [3]:
# Function for Optuna objective (example for XGBoost; repeat for others)
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': 42
    }
    model = xgb.XGBRegressor(**params)
    
    # CV on logged y, but score on inverse for MAE
    def custom_scorer(estimator, X, y):
        preds_log = estimator.predict(X)
        preds = np.expm1(preds_log)
        y_orig = np.expm1(y)
        return -mean_absolute_error(y_orig, preds)  # Neg for minimization
    
    # Use TimeSeriesSplit instead of random cv=5
    tscv = TimeSeriesSplit(n_splits=5)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring=custom_scorer)
    return score.mean()

# Run Optuna 
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=50)
best_params_xgb = study_xgb.best_params
best_value_xgb = study_xgb.best_value

print("Best XGBoost Params:", best_params_xgb)
print("Best XGBoost MAE Score:", -best_value_xgb)  # Convert back to positive MAE
print(f"Best XGBoost MAE: {-best_value_xgb:.4f}")

[I 2025-08-22 18:46:32,134] A new study created in memory with name: no-name-12fd8c3b-c0f2-464b-8e67-9de960dc60c5
[I 2025-08-22 18:46:33,283] Trial 0 finished with value: -8.59860913474613 and parameters: {'n_estimators': 137, 'max_depth': 4, 'learning_rate': 0.05736471246712876, 'subsample': 0.6251576614466313, 'colsample_bytree': 0.8675635519038313}. Best is trial 0 with value: -8.59860913474613.
[I 2025-08-22 18:46:36,228] Trial 1 finished with value: -8.615926880076085 and parameters: {'n_estimators': 163, 'max_depth': 9, 'learning_rate': 0.021202518837642337, 'subsample': 0.7325469665025984, 'colsample_bytree': 0.9688243176182938}. Best is trial 0 with value: -8.59860913474613.
[I 2025-08-22 18:46:37,015] Trial 2 finished with value: -8.554287588370297 and parameters: {'n_estimators': 81, 'max_depth': 5, 'learning_rate': 0.08605418679923313, 'subsample': 0.8097779517945465, 'colsample_bytree': 0.812217570389743}. Best is trial 2 with value: -8.554287588370297.
[I 2025-08-22 18:46:

Best XGBoost Params: {'n_estimators': 217, 'max_depth': 3, 'learning_rate': 0.15414785835129766, 'subsample': 0.8831953859498567, 'colsample_bytree': 0.8022172245813096}
Best XGBoost MAE Score: 8.452786374061379
Best XGBoost MAE: 8.4528


In [4]:
# Repeat for LightGBM
def objective_lgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': 42,
        'verbose': -1
    }
    model = lgb.LGBMRegressor(**params)
    
    # CV on logged y, but score on inverse for MAE
    def custom_scorer(estimator, X, y):
        preds_log = estimator.predict(X)
        preds = np.expm1(preds_log)
        y_orig = np.expm1(y)
        return -mean_absolute_error(y_orig, preds)  # Neg for minimization
        
    # Use TimeSeriesSplit instead of random cv=5
    tscv = TimeSeriesSplit(n_splits=5)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring=custom_scorer)
    return score.mean()

study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=50)
best_params_lgb = study_lgb.best_params
best_value_lgb = study_lgb.best_value

print("Best LightGBM Params:", best_params_lgb)
print("Best LightGBM MAE Score:", -best_value_lgb)  # Convert back to positive MAE
print(f"Best LightGBM MAE: {-best_value_lgb:.4f}")

[I 2025-08-22 18:48:45,302] A new study created in memory with name: no-name-e780bd33-1169-427f-a510-a3bd2c1a654c
[I 2025-08-22 18:48:47,203] Trial 0 finished with value: -8.485991628177407 and parameters: {'n_estimators': 69, 'max_depth': 16, 'learning_rate': 0.16679793316993355, 'subsample': 0.6075999142415109, 'colsample_bytree': 0.759698502317446}. Best is trial 0 with value: -8.485991628177407.
[I 2025-08-22 18:48:49,168] Trial 1 finished with value: -8.520525816040927 and parameters: {'n_estimators': 79, 'max_depth': 14, 'learning_rate': 0.2606850437964723, 'subsample': 0.67964193199874, 'colsample_bytree': 0.7326456362879937}. Best is trial 0 with value: -8.485991628177407.
[I 2025-08-22 18:48:55,187] Trial 2 finished with value: -8.576176683220869 and parameters: {'n_estimators': 265, 'max_depth': 9, 'learning_rate': 0.23436145912913464, 'subsample': 0.921602482086893, 'colsample_bytree': 0.7994724943460797}. Best is trial 0 with value: -8.485991628177407.
[I 2025-08-22 18:48:5

Best LightGBM Params: {'n_estimators': 140, 'max_depth': 3, 'learning_rate': 0.2599855943525703, 'subsample': 0.8653157639528063, 'colsample_bytree': 0.6996016591440931}
Best LightGBM MAE Score: 8.463817999518277
Best LightGBM MAE: 8.4638


In [None]:
# Repeat for CatBoost (similar, params: iterations, depth, learning_rate, subsample, colsample_bylevel)
def objective_cat(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 3, 16),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
        'random_state': 42,
        'verbose': 0
    }
    model = CatBoostRegressor(**params)

    # CV on logged y, but score on inverse for MAE
    def custom_scorer(estimator, X, y):
        preds_log = estimator.predict(X)
        preds = np.expm1(preds_log)
        y_orig = np.expm1(y)s
        return -mean_absolute_error(y_orig, preds)  # Neg for minimization
        
    tscv = TimeSeriesSplit(n_splits=5)
    score = cross_val_score(model, X_train, y_train, cv=tscv, scoring=custom_scorer)
    return score.mean()

study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=50)
best_params_cat = study_cat.best_params
best_value_cat = study_cat.best_value

print("Best CatBoost Params:", best_params_cat)
print("Best CatBoost MAE Score:", -best_value_cat)  # Convert back to positive MAE
print(f"Best CatBoost MAE: {-best_value_cat:.4f}")

[I 2025-08-22 18:50:48,516] A new study created in memory with name: no-name-d58858b6-ca7e-456c-8d78-c183447cc8ab
[I 2025-08-22 18:52:54,968] Trial 0 finished with value: -8.7498539576803 and parameters: {'iterations': 213, 'depth': 16, 'learning_rate': 0.20139574540297275, 'subsample': 0.9722403483970333, 'colsample_bylevel': 0.6063832525712383}. Best is trial 0 with value: -8.7498539576803.
[I 2025-08-22 18:54:20,866] Trial 1 finished with value: -8.732905072668425 and parameters: {'iterations': 272, 'depth': 15, 'learning_rate': 0.18605769092902186, 'subsample': 0.982025879617662, 'colsample_bylevel': 0.8758347896652905}. Best is trial 1 with value: -8.732905072668425.
[I 2025-08-22 18:57:11,639] Trial 2 finished with value: -8.701123265458506 and parameters: {'iterations': 296, 'depth': 16, 'learning_rate': 0.1260929193302688, 'subsample': 0.6070362005425208, 'colsample_bylevel': 0.6949846678138509}. Best is trial 2 with value: -8.701123265458506.
[I 2025-08-22 18:57:39,471] Trial 

Best CatBoost Params: {'iterations': 213, 'depth': 4, 'learning_rate': 0.2099163579513301, 'subsample': 0.7929183907449346, 'colsample_bylevel': 0.9192274439171659}
Best CatBoost MAE Score: 8.458653857443931
Best CatBoost MAE: 8.4587


In [6]:
# Step: Retrain tuned models and evaluate
tuned_models = {
    'Tuned XGBoost': xgb.XGBRegressor(**best_params_xgb),
    'Tuned LightGBM': lgb.LGBMRegressor(**best_params_lgb, verbose=-1),
    'Tuned CatBoost': CatBoostRegressor(**best_params_cat, verbose=0)
}

In [7]:
# Define function to train and evaluate a model
def train_evaluate(model, model_name):
    # Train
    model.fit(X_train, y_train)
    
    # Predict (logged scale)
    preds_log = model.predict(X_test)
    
    # Inverse transform to original scale for metrics
    preds = np.expm1(preds_log)
    y_test_orig = np.expm1(y_test)
    
    # Metrics
    mae = mean_absolute_error(y_test_orig, preds)
    rmse = np.sqrt(mean_squared_error(y_test_orig, preds))
    r2 = r2_score(y_test_orig, preds)
    
    # Feature importance (for tree-based)
    fi = None
    if hasattr(model, 'feature_importances_'):
        fi = model.feature_importances_
    elif hasattr(model, 'get_feature_importance'):
        fi = model.get_feature_importance()

    if fi is not None:
        feature_names = X_train.columns if isinstance(X_train, pd.DataFrame) else [f"f{i}" for i in range(X_train.shape[1])]
        importances = pd.Series(fi, index=feature_names).sort_values(ascending=False)
        print(f"\nTop 5 Features for {model_name}:\n{importances.head()}")
    
    return {'Model': model_name, 'MAE': mae, 'RMSE': rmse, 'R2': r2}

In [8]:
tuned_results = []
for name, model in tuned_models.items():
    tuned_results.append(train_evaluate(model, name))  # Reuse function from base

tuned_df = pd.DataFrame(tuned_results)
print("\nTuned Model Comparison:")
print(tuned_df)


Top 5 Features for Tuned XGBoost:
f38    0.154879
f32    0.116662
f42    0.111875
f40    0.109052
f29    0.081925
dtype: float32



Top 5 Features for Tuned LightGBM:
f0     203
f20     65
f21     45
f1      44
f22     43
dtype: int32

Top 5 Features for Tuned CatBoost:
f38    13.037677
f32    12.575601
f42    10.292195
f40    10.023203
f29     8.121683
dtype: float64

Tuned Model Comparison:
            Model       MAE       RMSE        R2
0   Tuned XGBoost  7.982738  11.081593  0.687961
1  Tuned LightGBM  8.005724  11.090762  0.687445
2  Tuned CatBoost  7.974690  11.076789  0.688232


In [9]:
# Save the best tuned CatBoost model to pickle file
import pickle

# Get the best CatBoost model (which is already trained from the evaluation loop)
best_catboost_model = tuned_models['Tuned CatBoost']

# Save the model to a pickle file
with open('best_catboost_model.pkl', 'wb') as f:
    pickle.dump(best_catboost_model, f)

print("Best CatBoost model saved to 'best_catboost_model.pkl'")

# Also save the best parameters for reference
with open('best_catboost_params.pkl', 'wb') as f:
    pickle.dump(best_params_cat, f)

print("Best CatBoost parameters saved to 'best_catboost_params.pkl'")

# Display the saved model info
print(f"\nSaved model: {type(best_catboost_model).__name__}")
print(f"Model parameters: {best_params_cat}")
print(f"Model performance - MAE: {7.974690:.6f}, RMSE: {11.076789:.6f}, R2: {0.688232:.6f}")

Best CatBoost model saved to 'best_catboost_model.pkl'
Best CatBoost parameters saved to 'best_catboost_params.pkl'

Saved model: CatBoostRegressor
Model parameters: {'iterations': 213, 'depth': 4, 'learning_rate': 0.2099163579513301, 'subsample': 0.7929183907449346, 'colsample_bylevel': 0.9192274439171659}
Model performance - MAE: 7.974690, RMSE: 11.076789, R2: 0.688232


In [11]:
# ... existing code ...

# Load the saved best CatBoost model and compare predictions with y_test
import pickle
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the saved model
with open('best_catboost_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Make predictions
predictions_log = loaded_model.predict(X_test)

# Convert predictions back to original scale (since y_train was log-transformed)
predictions = np.expm1(predictions_log)
y_test_orig = np.expm1(y_test)

# Calculate metrics
mae = mean_absolute_error(y_test_orig, predictions)
rmse = np.sqrt(mean_squared_error(y_test_orig, predictions))
r2 = r2_score(y_test_orig, predictions)

print("Loaded Model Performance Comparison:")
print(f"MAE: {mae:.6f}")
print(f"RMSE: {rmse:.6f}")
print(f"R2: {r2:.6f}")

# Compare first 10 predictions vs actual values
print(f"\nFirst 10 Predictions vs Actual Values:")
print("Predicted\tActual\t\tDifference")
print("-" * 50)
for i in range(min(10, len(predictions))):
    pred = predictions[i]
    actual = y_test_orig[i]
    diff = pred - actual
    print(f"{pred:.4f}\t\t{actual:.4f}\t\t{diff:+.4f}")

# Create a comparison DataFrame
comparison_df = pd.DataFrame({
    'Actual': y_test_orig,
    'Predicted': predictions,
    'Difference': predictions - y_test_orig,
    'Absolute_Error': np.abs(predictions - y_test_orig)
})

print(f"\nComparison Summary:")
print(f"Total samples: {len(comparison_df)}")
print(f"Mean absolute error: {comparison_df['Absolute_Error'].mean():.6f}")
print(f"Standard deviation of errors: {comparison_df['Difference'].std():.6f}")
print(f"Min error: {comparison_df['Difference'].min():.6f}")
print(f"Max error: {comparison_df['Difference'].max():.6f}")

# Show the first few rows of the comparison
print(f"\nFirst 5 rows of detailed comparison:")
print(comparison_df.head())

Loaded Model Performance Comparison:
MAE: 7.974690
RMSE: 11.076789
R2: 0.688232

First 10 Predictions vs Actual Values:
Predicted	Actual		Difference
--------------------------------------------------
25.2032		21.0000		+4.2032
25.2032		21.0000		+4.2032
26.8380		23.0000		+3.8380
34.6309		33.0000		+1.6309
31.6658		27.0000		+4.6658
36.9934		27.0000		+9.9934
31.6658		32.0000		-0.3342
36.9934		48.0000		-11.0066
27.8805		21.0000		+6.8805
70.0051		90.0000		-19.9949

Comparison Summary:
Total samples: 40739
Mean absolute error: 7.974690
Standard deviation of errors: 11.076896
Min error: -60.203295
Max error: 51.751484

First 5 rows of detailed comparison:
   Actual  Predicted  Difference  Absolute_Error
0    21.0  25.203200    4.203200        4.203200
1    21.0  25.203200    4.203200        4.203200
2    23.0  26.837983    3.837983        3.837983
3    33.0  34.630884    1.630884        1.630884
4    27.0  31.665830    4.665830        4.665830
