# ⚡ Day 4 — Classical ML Models
## Energy Consumption Forecasting | Claysys AI Hackathon 2026

**Date:** February 22, 2026  
**Objective:** Train Random Forest, XGBoost, and LightGBM regressors using engineered lag features.

---

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from src.models.ml_models import RandomForestForecaster, XGBoostForecaster, LightGBMForecaster
from src.evaluation import compute_metrics, plot_predictions, compare_models, plot_model_comparison

# Apply one shared look to every figure produced by this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.dpi': 120})  # render figures at 120 dpi
# The status marker was mojibake (UTF-8 bytes decoded as cp1252); restored to
# the intended check-mark character.
print('✅ Day 4 Setup complete')

## 1. Load Processed Features

In [None]:
# Load the processed train/test tables and split them into features and target.
TARGET = 'Global_active_power'
drop_cols = ['season']  # removed from both splits when present


def _read_split(csv_path):
    """Read one processed CSV indexed by 'Datetime' and strip ``drop_cols``."""
    frame = pd.read_csv(csv_path, index_col='Datetime', parse_dates=True)
    # errors='ignore' skips any listed column that this file does not contain.
    return frame.drop(columns=drop_cols, errors='ignore')


train_df = _read_split('../data/processed/train.csv')
test_df = _read_split('../data/processed/test.csv')

# Predictors are every numeric training column other than the target.
# NOTE(review): if the processed CSVs still hold measurements taken at the same
# timestamp as the target, they would leak into the features — confirm that
# only lagged/engineered columns remain.
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_cols if col != TARGET]

# The test split is indexed with the training feature list, so a column missing
# from test.csv raises a KeyError here rather than being silently skipped.
X_train = train_df[feature_cols]
y_train = train_df[TARGET]
X_test = test_df[feature_cols]
y_test = test_df[TARGET]

print(f'X_train: {X_train.shape},  y_train: {y_train.shape}')
print(f'X_test : {X_test.shape },  y_test : {y_test.shape }')
print(f'Number of features: {X_train.shape[1]}')

## 2. Model 1 — Random Forest

In [None]:
import os

# Fit the Random Forest wrapper on the full training split, then score it on
# the held-out test split.
rf_model = RandomForestForecaster(n_estimators=200, max_depth=None)
rf_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_test)
metrics_rf = compute_metrics(y_test.values, rf_preds, model_name='Random Forest')

# Feature importance: keep the first 15 rows returned by feature_importance()
# (presumably sorted most-important first — confirm in src.models.ml_models).
fi = rf_model.feature_importance().head(15)
fig, ax = plt.subplots(figsize=(10, 6))
# barh draws its first entry at the bottom, so reverse the rows to put the
# first-listed feature at the top of the chart.
ax.barh(fi['feature'][::-1], fi['importance'][::-1], color='steelblue', alpha=0.85)
# Title dash repaired: it was a mojibake sequence instead of an em dash.
ax.set_title('Random Forest — Top 15 Feature Importances', fontweight='bold')
ax.set_xlabel('Importance')
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists so
# a fresh checkout does not fail after the (slow) model fit.
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/rf_feature_importance.png', bbox_inches='tight')
plt.show()

rf_model.save('random_forest.pkl')

In [None]:
# Actual vs. Random Forest predictions over the first 168 test rows, which the
# plot title labels as the first week of the test split.
_rf_week = slice(0, 168)
plot_predictions(
    y_test.values[_rf_week],
    rf_preds[_rf_week],
    index=test_df.index[_rf_week],
    model_name='Random Forest (First Week of Test)',
    filename='rf_predictions.png',
)

## 3. Model 2 — XGBoost

In [None]:
# Validation split for early stopping: hold out the last 10% of the training
# rows (the most recent ones, provided the CSV is stored in time order).
val_size = int(len(X_train) * 0.1)
if val_size == 0:
    # With fewer than 10 training rows the hold-out would be empty; fail with a
    # clear message instead of fitting on a degenerate split.
    raise ValueError(
        f'Need at least 10 training rows for a 10% validation split, got {len(X_train)}'
    )
# Slice on an explicit positive boundary. The previous `iloc[:-val_size]` /
# `iloc[-val_size:]` form silently produced an EMPTY training set and a
# validation set containing every row whenever val_size was 0.
split_at = len(X_train) - val_size
X_tr, X_val = X_train.iloc[:split_at], X_train.iloc[split_at:]
y_tr, y_val = y_train.iloc[:split_at], y_train.iloc[split_at:]

# X_tr / X_val / y_tr / y_val are reused by the LightGBM cell below.
xgb_model = XGBoostForecaster(n_estimators=500, learning_rate=0.05, max_depth=6)
xgb_model.fit(X_tr, y_tr, X_val=X_val, y_val=y_val)

xgb_preds = xgb_model.predict(X_test)
metrics_xgb = compute_metrics(y_test.values, xgb_preds, model_name='XGBoost')

xgb_model.save('xgboost.pkl')

# Actual vs. predicted over the first 168 test rows (labelled as one week).
plot_predictions(y_test.values[:168], xgb_preds[:168],
                 index=test_df.index[:168],
                 model_name='XGBoost (First Week of Test)',
                 filename='xgb_predictions.png')

## 4. Model 3 — LightGBM

In [None]:
# LightGBM is fitted on the same X_tr / X_val split that the XGBoost cell
# created, so that cell must be run first.
lgbm_params = dict(n_estimators=500, learning_rate=0.05, num_leaves=63)
lgbm_model = LightGBMForecaster(**lgbm_params)
lgbm_model.fit(X_tr, y_tr, X_val=X_val, y_val=y_val)

# Score on the held-out test split.
lgbm_preds = lgbm_model.predict(X_test)
metrics_lgbm = compute_metrics(
    y_test.values,
    lgbm_preds,
    model_name='LightGBM',
)

lgbm_model.save('lightgbm.pkl')

# Actual vs. predicted over the first 168 test rows (labelled as one week).
_lgbm_week = slice(0, 168)
plot_predictions(
    y_test.values[_lgbm_week],
    lgbm_preds[_lgbm_week],
    index=test_df.index[_lgbm_week],
    model_name='LightGBM (First Week of Test)',
    filename='lgbm_predictions.png',
)

## 5. Day 4 Model Comparison

In [None]:
import os

# Collect the per-model metrics computed in the cells above into one table.
results_day4 = [metrics_rf, metrics_xgb, metrics_lgbm]
comparison_df = compare_models(results_day4)

# Neither to_csv nor savefig creates missing directories; this call creates
# both ../reports and ../reports/figures when they are absent.
os.makedirs('../reports/figures', exist_ok=True)
comparison_df.to_csv('../reports/ml_results.csv')
plot_model_comparison(comparison_df, metric='RMSE')

# Plot all models side-by-side on a test week
fig, ax = plt.subplots(figsize=(15, 5))
n = 168  # 1 week
# zorder=5 keeps the actual series drawn on top of the dashed model lines.
ax.plot(test_df.index[:n], y_test.values[:n], label='Actual', color='black', linewidth=1.5, zorder=5)
# One dashed line per model, drawn in the same order as before.
for _label, _preds, _color in (
    ('Random Forest', rf_preds, '#2196F3'),
    ('XGBoost', xgb_preds, '#FF9800'),
    ('LightGBM', lgbm_preds, '#4CAF50'),
):
    ax.plot(test_df.index[:n], _preds[:n], label=_label, color=_color, linewidth=1, linestyle='--')
# Title dash repaired: it was a mojibake sequence instead of an em dash.
ax.set_title('ML Models — 1 Week Forecast Comparison', fontweight='bold')
ax.set_ylabel('Global Active Power (kW)')
ax.legend(loc='upper right')
plt.tight_layout()
plt.savefig('../reports/figures/ml_comparison_week.png', bbox_inches='tight')
plt.show()

# Closing messages: the emoji and dash were mojibake and are restored here.
print('\n🎉 Day 4 Complete! ML models trained and saved.')
print('   Ready for Day 5: Deep Learning — LSTM & GRU with PyTorch')