# Model Training: XGBoost Price Forecasting

This notebook demonstrates how to use the feature pipeline to train an XGBoost model for electricity price forecasting.

In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Add src to path
sys.path.append('../src')

from utils.preprocessing import (
    create_merged_dataset,
    split_train_val_test,
    handle_missing_values
)
from features.pipeline import TimeSeriesFeatureEngine
from models.train import (
    train_xgboost_model,
    train_baseline_model,
    evaluate_model,
    get_feature_importance,
    save_model
)
from models.baselines import NaivePersistence

# Plotly for interactive visualizations
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

## 1. Load and Prepare Data

In [2]:
# Read the merged dataset from the cache database.
df = create_merged_dataset(db_path="../data/cache.db")

# Quick orientation: table size, covered period, and available columns.
first_ts, last_ts = df.index.min(), df.index.max()
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"Date range: {first_ts} to {last_ts}")
print(f"\nColumns: {column_names}")

# Keep the preview as the cell's last expression so it renders as a table.
df.head()

Dataset shape: (26305, 6)
Date range: 2022-01-01 00:00:00+01:00 to 2025-01-01 00:00:00+01:00

Columns: ['price', 'load_forecast', 'actual_load', 'wind_onshore', 'wind_offshore', 'solar']


Unnamed: 0_level_0,price,load_forecast,actual_load,wind_onshore,wind_offshore,solar
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 00:00:00+01:00,124.7,11532.1925,10374.1425,2646.25,1352.75,0.0
2022-01-01 01:00:00+01:00,124.7,11085.105,10249.9175,2419.75,1307.5,0.0
2022-01-01 02:00:00+01:00,134.0,10805.7575,9907.035,2161.75,1234.75,0.0
2022-01-01 03:00:00+01:00,58.8,10676.2475,9782.305,1954.75,1129.25,0.0
2022-01-01 04:00:00+01:00,37.67,10643.9175,9589.525,1706.75,1044.0,0.0


In [3]:
# Count NaNs per column and report only the columns that actually have gaps.
missing = df.isna().sum()
print("Missing values per column:")
print(missing.loc[missing.gt(0)])

Missing values per column:
load_forecast     1
actual_load       1
wind_onshore     25
wind_offshore    25
solar            25
dtype: int64


In [4]:
# Forward-fill gaps via the project helper (limit=24), then discard any row
# that still contains a NaN so later steps only see complete records.
df_clean = handle_missing_values(df, strategy='forward_fill', limit=24).dropna()

print(f"Clean dataset shape: {df_clean.shape}")

Clean dataset shape: (26305, 6)


## 2. Split Data (Time-based)

In [5]:
# Chronological split of the cleaned data — rows are never shuffled.
# NOTE(review): the ranges this helper prints end at 00:00 on each cut-off date,
# while the feature split later in the notebook cuts at 23:00 on the same dates;
# confirm which boundary is intended.
split_frames = split_train_val_test(
    df_clean,
    train_end="2023-12-31",
    val_end="2024-06-30",
)
train_df, val_df, test_df = split_frames

Train: 2022-01-01 00:00:00+01:00 to 2023-12-31 00:00:00+01:00 (17497 samples)
Val:   2023-12-31 01:00:00+01:00 to 2024-06-30 00:00:00+02:00 (4367 samples)
Test:  2024-06-30 01:00:00+02:00 to 2025-01-01 00:00:00+01:00 (4441 samples)


## 3. Create Features

In [6]:
# Settings for the feature engine, collected in one place so they are easy to tune.
engine_settings = dict(
    target_col='price',
    forecast_horizon=24,
    feature_config={'scaler_type': 'robust'},
    # Lags 0 and 144 are included for the persistence baselines; the rest cover
    # recent and weekly history.
    price_lags=[0, 1, 2, 3, 6, 12, 24, 48, 144, 168],
    # Short-term and long-term rolling windows.
    price_windows=[6, 12, 24, 168],
    # Recent and seasonal load lags.
    load_lags=[1, 24, 168],
)
feature_engine = TimeSeriesFeatureEngine(**engine_settings)

print("Feature engine initialized")

Feature engine initialized


In [7]:
# Build features on the FULL cleaned dataset first and split afterwards, so the
# lag features of rows just after a split boundary can still look back into the
# preceding period.
print("Creating features on full dataset...")
X_full, y_full = feature_engine.prepare_data(df_clean, create_target=True)

print(f"Full dataset: X={X_full.shape}, y={y_full.shape}")

# Inclusive, timezone-aware upper bounds of the training and validation periods.
train_end_date = pd.Timestamp("2023-12-31 23:00:00+01:00")
val_end_date = pd.Timestamp("2024-06-30 23:00:00+02:00")


def split_by_time(data, train_end, val_end):
    """Partition a time-indexed DataFrame/Series into (train, val, test).

    Rows with index <= ``train_end`` go to train, rows in
    (``train_end``, ``val_end``] go to validation, and rows after ``val_end``
    go to test. Each part is returned as an independent copy so that later
    in-place edits (e.g. NaN filling) neither touch ``data`` nor raise
    pandas' SettingWithCopyWarning.
    """
    idx = data.index
    in_train = idx <= train_end
    in_val = (idx > train_end) & (idx <= val_end)
    in_test = idx > val_end
    return data[in_train].copy(), data[in_val].copy(), data[in_test].copy()


X_train, X_val, X_test = split_by_time(X_full, train_end_date, val_end_date)
y_train, y_val, y_test = split_by_time(y_full, train_end_date, val_end_date)

# Sanity checks: the three parts must cover every row exactly once, and the
# features and targets of each part must stay aligned in length.
assert len(X_train) + len(X_val) + len(X_test) == len(X_full), "split lost or duplicated feature rows"
assert len(y_train) + len(y_val) + len(y_test) == len(y_full), "split lost or duplicated target rows"
assert (len(X_train), len(X_val), len(X_test)) == (len(y_train), len(y_val), len(y_test)), \
    "feature/target splits have different lengths"

print(f"\nTrain: X={X_train.shape}, y={y_train.shape}")
print(f"Val:   X={X_val.shape}, y={y_val.shape}")
print(f"Test:  X={X_test.shape}, y={y_test.shape}")

Creating features on full dataset...
Full dataset: X=(26281, 67), y=(26281,)

Train: X=(17520, 67), y=(17520,)
Val:   X=(4367, 67), y=(4367,)
Test:  X=(4394, 67), y=(4394,)


In [8]:
# NaN handling for the engineered features:
#   * columns whose name contains "lag" are forward-filled, with 0 as the
#     fallback for leading NaNs that have nothing to carry forward;
#   * every other column is filled with 0.
lag_cols = [col for col in X_train.columns if 'lag' in col.lower()]
_lag_col_set = set(lag_cols)
other_cols = [col for col in X_train.columns if col not in _lag_col_set]


def fill_feature_gaps(features, lag_columns, other_columns):
    """Return a NaN-free copy of ``features``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix of one split; it is not modified.
    lag_columns : list of str
        Columns to forward-fill, then fill any remaining NaN with 0.
    other_columns : list of str
        Columns to fill with 0.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``features``.
    """
    # Work on a copy so the caller's frame (possibly a slice of a larger one)
    # is never written through; this also avoids SettingWithCopyWarning.
    filled = features.copy()
    if lag_columns:
        filled[lag_columns] = filled[lag_columns].ffill().fillna(0)
    if other_columns:
        filled[other_columns] = filled[other_columns].fillna(0)
    return filled


# Each split is filled independently, using only its own rows.
X_train = fill_feature_gaps(X_train, lag_cols, other_cols)
X_val = fill_feature_gaps(X_val, lag_cols, other_cols)
X_test = fill_feature_gaps(X_test, lag_cols, other_cols)

print(f"Total features created: {X_train.shape[1]}")
print(f"\nFeature names (first 20):")
print(X_train.columns.tolist()[:20])

Total features created: 67

Feature names (first 20):
['load_forecast', 'actual_load', 'wind_onshore', 'wind_offshore', 'solar', 'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos', 'is_weekend', 'total_renewable', 'total_wind', 'residual_load', 'residual_load_pct', 'renewable_penetration', 'wind_onshore_ratio', 'wind_offshore_ratio', 'solar_ratio']


## 4. Train XGBoost Model

In [9]:
# Fit the feature engine on the training split only, then apply that same
# fitted transform to all three splits (train, validation, test — in that order).
feature_engine.fit(X_train, y_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    feature_engine.transform(split_features)
    for split_features in (X_train, X_val, X_test)
)

In [10]:
# Hyperparameters handed to the XGBoost training helper.
xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

# Train on the scaled training split; the scaled validation split is passed
# alongside so the helper can apply early stopping.
model = train_xgboost_model(
    X_train_scaled,
    y_train,
    X_val_scaled,
    y_val,
    params=xgb_params,
    n_estimators=1000,
    early_stopping_rounds=50,
    verbose=True,
)

[0]	validation_0-rmse:118.27024	validation_1-rmse:103.65035
[1]	validation_0-rmse:113.46519	validation_1-rmse:99.23352
[2]	validation_0-rmse:109.00440	validation_1-rmse:95.62522
[3]	validation_0-rmse:104.84554	validation_1-rmse:91.66255
[4]	validation_0-rmse:100.85155	validation_1-rmse:88.06930
[5]	validation_0-rmse:97.12713	validation_1-rmse:84.42352
[6]	validation_0-rmse:93.49271	validation_1-rmse:80.82915
[7]	validation_0-rmse:90.05641	validation_1-rmse:77.84960
[8]	validation_0-rmse:86.81071	validation_1-rmse:74.58469
[9]	validation_0-rmse:83.83056	validation_1-rmse:71.70913
[10]	validation_0-rmse:80.98599	validation_1-rmse:69.24115
[11]	validation_0-rmse:78.32550	validation_1-rmse:66.53434
[12]	validation_0-rmse:75.76822	validation_1-rmse:64.13778
[13]	validation_0-rmse:73.45132	validation_1-rmse:61.78048
[14]	validation_0-rmse:71.25861	validation_1-rmse:60.03690
[15]	validation_0-rmse:69.12983	validation_1-rmse:58.21390
[16]	validation_0-rmse:67.12553	validation_1-rmse:56.28722
[

## 5. Evaluate Model

In [11]:
# Score the trained model on every split, in train → validation → test order.
evaluation_splits = {
    "Train": (X_train_scaled, y_train),
    "Validation": (X_val_scaled, y_val),
    "Test": (X_test_scaled, y_test),
}
split_metrics = {
    split_name: evaluate_model(model, split_X, split_y, set_name=split_name)
    for split_name, (split_X, split_y) in evaluation_splits.items()
}

train_metrics = split_metrics["Train"]
val_metrics = split_metrics["Validation"]
test_metrics = split_metrics["Test"]

2025-12-17 20:45:11 - train_model - INFO - Train Set Performance:
2025-12-17 20:45:11 - train_model - INFO -   MAE:  15.26 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   RMSE: 20.60 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   MAPE: inf%
2025-12-17 20:45:11 - train_model - INFO -   R²:   0.9721
2025-12-17 20:45:11 - train_model - INFO - Validation Set Performance:
2025-12-17 20:45:11 - train_model - INFO -   MAE:  20.70 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   RMSE: 28.37 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   MAPE: inf%
2025-12-17 20:45:11 - train_model - INFO -   R²:   0.4043
2025-12-17 20:45:11 - train_model - INFO - Test Set Performance:
2025-12-17 20:45:11 - train_model - INFO -   MAE:  27.05 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   RMSE: 41.30 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   MAPE: inf%
2025-12-17 20:45:11 - train_model - INFO -   R²:   0.4839


## 6. Baseline Models (Naive Forecasters)

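Let's compare the XGBoost model against simple persistence baselines to understand the value added by machine learning. Because the model forecasts 24 hours ahead (`forecast_horizon=24`), the 24h baseline reuses the current price (`price_lag_0`) and the 168h baseline reuses `price_lag_144`, i.e. the price one week before the target hour (144 + 24 = 168).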

In [12]:
# Two naive persistence baselines: the '24h' strategy and the '168h' strategy.
baseline_24h = NaivePersistence(strategy='24h')
baseline_168h = NaivePersistence(strategy='168h')

# Both follow the same fit interface as the other models in this notebook.
for naive_model in (baseline_24h, baseline_168h):
    naive_model.fit(X_train, y_train)

print("Baseline models initialized successfully!")
print(f"  - 24h Persistence: Uses price_lag_0 (current price)")
print(f"  - 168h Persistence: Uses price_lag_144 (price from 6 days ago)")

Baseline models initialized successfully!
  - 24h Persistence: Uses price_lag_0 (current price)
  - 168h Persistence: Uses price_lag_144 (price from 6 days ago)


In [13]:
# Score both persistence baselines on the (unscaled) test features, then repeat
# the XGBoost test metrics underneath for a side-by-side read.
print("Evaluating Baseline Models on Test Set:")
print("=" * 60)

baseline_24h_metrics = evaluate_model(baseline_24h, X_test, y_test, set_name="Baseline 24h")
print()
baseline_168h_metrics = evaluate_model(baseline_168h, X_test, y_test, set_name="Baseline 168h")
print()

xgb_summary_lines = [
    "XGBoost Model (for comparison):",
    f"  MAE:  {test_metrics['MAE']:.2f} €/MWh",
    f"  RMSE: {test_metrics['RMSE']:.2f} €/MWh",
    f"  R²:   {test_metrics['R2']:.4f}",
]
print("\n".join(xgb_summary_lines))

Evaluating Baseline Models on Test Set:
2025-12-17 20:45:11 - train_model - INFO - Baseline 24h Set Performance:
2025-12-17 20:45:11 - train_model - INFO -   MAE:  29.82 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   RMSE: 47.21 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   MAPE: inf%
2025-12-17 20:45:11 - train_model - INFO -   R²:   0.3256

2025-12-17 20:45:11 - train_model - INFO - Baseline 168h Set Performance:
2025-12-17 20:45:11 - train_model - INFO -   MAE:  35.55 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   RMSE: 59.39 €/MWh
2025-12-17 20:45:11 - train_model - INFO -   MAPE: inf%
2025-12-17 20:45:11 - train_model - INFO -   R²:   -0.0675

XGBoost Model (for comparison):
  MAE:  27.05 €/MWh
  RMSE: 41.30 €/MWh
  R²:   0.4839


In [14]:
# Test-set predictions of every model; the baselines read the unscaled
# features, XGBoost reads the scaled ones.
y_pred_24h, y_pred_168h = (
    naive_model.predict(X_test) for naive_model in (baseline_24h, baseline_168h)
)
y_pred_xgb = model.predict(X_test_scaled)

# One column per model, one row per metric.
test_set_metrics = {
    'Baseline 24h': baseline_24h_metrics,
    'Baseline 168h': baseline_168h_metrics,
    'XGBoost': test_metrics,
}
comparison_df = pd.DataFrame(test_set_metrics)

print("\nModel Comparison (Test Set):")
print(comparison_df.round(2))


Model Comparison (Test Set):
      Baseline 24h  Baseline 168h  XGBoost
MAE          29.82          35.55    27.05
RMSE         47.21          59.39    41.30
MAPE           inf            inf      inf
R2            0.33          -0.07     0.48


In [15]:
# Grouped bar chart of the error metrics for each model on the test set.
fig = go.Figure()

metrics = ['MAE', 'RMSE', 'R2']
models = ['Baseline 24h', 'Baseline 168h', 'XGBoost']

# Only MAE and RMSE are drawn: the y-axis is in €/MWh, so R² is left off this chart.
plotted_metrics = [metric_name for metric_name in metrics if metric_name != 'R2']
for metric_name in plotted_metrics:
    bar_heights = [comparison_df.loc[metric_name, model_name] for model_name in models]
    fig.add_trace(go.Bar(name=metric_name, x=models, y=bar_heights))

chart_layout = dict(
    title='Model Performance Comparison (Test Set)',
    xaxis_title='Model',
    yaxis_title='Error (€/MWh)',
    barmode='group',
    height=500,
    showlegend=True,
)
fig.update_layout(**chart_layout)

fig.show()

In [16]:
# Overlay actual prices and each model's predictions for the first 7 days of
# the test set.
n_hours = 7 * 24  # 7 days
plot_start = 0
plot_end = min(n_hours, len(y_test))
window = slice(plot_start, plot_end)
window_timestamps = y_test.index[window]

# (legend label, full-length series, line style) — drawn in this order.
series_to_plot = [
    ('Actual', y_test.values, dict(color='black', width=2)),
    ('XGBoost', y_pred_xgb, dict(color='green', width=1.5)),
    ('Baseline 24h', y_pred_24h, dict(color='blue', width=1.5, dash='dash')),
    ('Baseline 168h', y_pred_168h, dict(color='red', width=1.5, dash='dot')),
]

fig = go.Figure()
for trace_label, trace_values, trace_line in series_to_plot:
    fig.add_trace(go.Scatter(
        x=window_timestamps,
        y=trace_values[window],
        name=trace_label,
        line=trace_line,
        mode='lines',
    ))

fig.update_layout(
    title='Model Predictions Comparison (First Week of Test Set)',
    xaxis_title='Date',
    yaxis_title='Price (€/MWh)',
    height=500,
    hovermode='x unified',
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
)

fig.show()

In [17]:
def pct_improvement(baseline_value, model_value):
    """Relative reduction of ``model_value`` versus ``baseline_value``, in percent."""
    return ((baseline_value - model_value) / baseline_value) * 100


# XGBoost test-set error expressed as a percentage improvement over each baseline.
mae_improvement_24h = pct_improvement(baseline_24h_metrics['MAE'], test_metrics['MAE'])
mae_improvement_168h = pct_improvement(baseline_168h_metrics['MAE'], test_metrics['MAE'])

rmse_improvement_24h = pct_improvement(baseline_24h_metrics['RMSE'], test_metrics['RMSE'])
rmse_improvement_168h = pct_improvement(baseline_168h_metrics['RMSE'], test_metrics['RMSE'])

banner = "=" * 60
print("\n" + banner)
print("XGBoost IMPROVEMENT OVER BASELINES")
print(banner)

print(f"\nVs. 24h Persistence:")
print(f"  MAE improvement:  {mae_improvement_24h:.1f}%")
print(f"  RMSE improvement: {rmse_improvement_24h:.1f}%")

print(f"\nVs. 168h Persistence:")
print(f"  MAE improvement:  {mae_improvement_168h:.1f}%")
print(f"  RMSE improvement: {rmse_improvement_168h:.1f}%")

print(f"\nAbsolute Performance:")
print(f"  XGBoost MAE:      {test_metrics['MAE']:.2f} €/MWh")
print(f"  24h Baseline MAE: {baseline_24h_metrics['MAE']:.2f} €/MWh")
print(f"  168h Baseline MAE: {baseline_168h_metrics['MAE']:.2f} €/MWh")


XGBoost IMPROVEMENT OVER BASELINES

Vs. 24h Persistence:
  MAE improvement:  9.3%
  RMSE improvement: 12.5%

Vs. 168h Persistence:
  MAE improvement:  23.9%
  RMSE improvement: 30.5%

Absolute Performance:
  XGBoost MAE:      27.05 €/MWh
  24h Baseline MAE: 29.82 €/MWh
  168h Baseline MAE: 35.55 €/MWh


## 6.5. ARIMA Statistical Model

Let's also train an ARIMA/SARIMAX model to see how statistical time series methods compare.

In [18]:
# ARIMA estimator and its training helper from the project package.
from models.statistical import ArimaEstimator
from models.train import train_statistical_model

# A deliberately small set of exogenous regressors (avoids overfitting with too
# many features); only names actually present in the feature matrix are kept.
basic_features = [
    'load_forecast', 'wind_onshore', 'wind_offshore', 'solar',
    'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos',
]
available_columns = set(X_train.columns)
exog_cols = [name for name in basic_features if name in available_columns]

print(f"Using {len(exog_cols)} exogenous features for ARIMA:")
print(exog_cols)

Using 8 exogenous features for ARIMA:
['load_forecast', 'wind_onshore', 'wind_offshore', 'solar', 'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos']


In [19]:
# Fit an ARIMA(1,1,1) with a manually chosen order, no seasonal component, and
# the selected exogenous columns. Note: this will take 1-2 minutes.
print("Training ARIMA(1,1,1) model with exogenous variables...")

arima_settings = dict(
    order=(1, 1, 1),
    seasonal_order=(0, 0, 0, 0),
    use_auto_arima=False,
    exog_cols=exog_cols,
    scale_exog=True,
)
arima_model = ArimaEstimator(**arima_settings)

# Trained on the same (unscaled) training split the baselines use.
arima_model.fit(X_train, y_train)

print("ARIMA model training complete!")

Training ARIMA(1,1,1) model with exogenous variables...
ARIMA model training complete!


In [20]:
# Test-set metrics for ARIMA, plus its raw predictions for the plots below.
arima_metrics = evaluate_model(arima_model, X_test, y_test, set_name="ARIMA")
y_pred_arima = arima_model.predict(X_test)

# Add ARIMA as a new column of the existing comparison table.
comparison_df['ARIMA'] = arima_metrics

column_order = ['Baseline 24h', 'Baseline 168h', 'XGBoost', 'ARIMA']
print("\nUpdated Model Comparison (Test Set):")
print(comparison_df[column_order].round(2))

2025-12-17 20:45:17 - train_model - INFO - ARIMA Set Performance:
2025-12-17 20:45:17 - train_model - INFO -   MAE:  44.14 €/MWh
2025-12-17 20:45:17 - train_model - INFO -   RMSE: 57.34 €/MWh
2025-12-17 20:45:17 - train_model - INFO -   MAPE: inf%
2025-12-17 20:45:17 - train_model - INFO -   R²:   0.0052

Updated Model Comparison (Test Set):
      Baseline 24h  Baseline 168h  XGBoost  ARIMA
MAE          29.82          35.55    27.05  44.14
RMSE         47.21          59.39    41.30  57.34
MAPE           inf            inf      inf    inf
R2            0.33          -0.07     0.48   0.01


In [21]:
# Overlay actual prices and the predictions of all four models for the first
# 7 days of the test set.
n_hours = 7 * 24  # 7 days
plot_start = 0
plot_end = min(n_hours, len(y_test))
window = slice(plot_start, plot_end)
window_timestamps = y_test.index[window]

# (legend label, full-length series, line style) — drawn in this order.
series_to_plot = [
    ('Actual', y_test.values, dict(color='black', width=2)),
    ('XGBoost', y_pred_xgb, dict(color='green', width=1.5)),
    ('ARIMA', y_pred_arima, dict(color='purple', width=1.5)),
    ('Baseline 24h', y_pred_24h, dict(color='blue', width=1.5, dash='dash')),
    ('Baseline 168h', y_pred_168h, dict(color='red', width=1.5, dash='dot')),
]

fig = go.Figure()
for trace_label, trace_values, trace_line in series_to_plot:
    fig.add_trace(go.Scatter(
        x=window_timestamps,
        y=trace_values[window],
        name=trace_label,
        line=trace_line,
        mode='lines',
    ))

fig.update_layout(
    title='All Models Comparison (First Week of Test Set)',
    xaxis_title='Date',
    yaxis_title='Price (€/MWh)',
    height=500,
    hovermode='x unified',
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
)

fig.show()

In [22]:
# Grouped bar chart of MAE and RMSE for all four models on the test set.
models_list = ['Baseline 24h', 'Baseline 168h', 'ARIMA', 'XGBoost']
metrics_to_plot = ['MAE', 'RMSE']

fig = go.Figure()
for metric_name in metrics_to_plot:
    bar_heights = [comparison_df.loc[metric_name, model_name] for model_name in models_list]
    fig.add_trace(go.Bar(name=metric_name, x=models_list, y=bar_heights))

chart_layout = dict(
    title='Model Performance Comparison (Test Set) - All Models',
    xaxis_title='Model',
    yaxis_title='Error (€/MWh)',
    barmode='group',
    height=500,
    showlegend=True,
)
fig.update_layout(**chart_layout)

fig.show()

In [23]:
# XGBoost test-set predictions, shown two ways: as a time series next to the
# actual prices (top panel) and as a predicted-vs-actual scatter (bottom panel).
y_test_pred = model.predict(X_test_scaled)

fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('Test Set: Actual vs Predicted Prices', 'Predicted vs Actual Scatter Plot'),
    vertical_spacing=0.12,
)

# Endpoints of the diagonal "perfect prediction" reference line.
min_val, max_val = y_test.min(), y_test.max()

# (trace, subplot row) pairs, added in this order.
panel_traces = [
    (go.Scatter(x=y_test.index, y=y_test.values, name='Actual',
                line=dict(color='blue', width=1.5), opacity=0.7), 1),
    (go.Scatter(x=y_test.index, y=y_test_pred, name='Predicted',
                line=dict(color='red', width=1.5), opacity=0.7), 1),
    (go.Scatter(x=y_test.values, y=y_test_pred, mode='markers',
                marker=dict(color='blue', size=4, opacity=0.5),
                name='Predictions', showlegend=False), 2),
    (go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                mode='lines', line=dict(color='red', dash='dash', width=2),
                name='Perfect Prediction', showlegend=False), 2),
]
for panel_trace, panel_row in panel_traces:
    fig.add_trace(panel_trace, row=panel_row, col=1)

# Axis titles per panel: (row, x-axis title, y-axis title).
axis_titles = [
    (1, "Date", "Price (€/MWh)"),
    (2, "Actual Price (€/MWh)", "Predicted Price (€/MWh)"),
]
for panel_row, x_title, y_title in axis_titles:
    fig.update_xaxes(title_text=x_title, row=panel_row, col=1)
    fig.update_yaxes(title_text=y_title, row=panel_row, col=1)

fig.update_layout(height=800, showlegend=True)
fig.show()

In [24]:
# Residual diagnostics for XGBoost on the test set: a histogram of the errors
# (left) and errors plotted against the predicted price (right).
errors = y_test.values - y_test_pred

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Error Distribution', 'Residual Plot'),
    horizontal_spacing=0.12,
)

error_histogram = go.Histogram(
    x=errors, nbinsx=50, name='Errors',
    marker=dict(color='steelblue', line=dict(color='black', width=1)),
)
residual_scatter = go.Scatter(
    x=y_test_pred, y=errors, mode='markers',
    marker=dict(color='steelblue', size=4, opacity=0.5),
    name='Residuals', showlegend=False,
)
fig.add_trace(error_histogram, row=1, col=1)
fig.add_trace(residual_scatter, row=1, col=2)

# Zero-error reference lines, added only after each subplot already holds its trace.
zero_line_style = dict(line_dash="dash", line_color="red", line_width=2)
fig.add_vline(x=0, row=1, col=1, **zero_line_style)
fig.add_hline(y=0, row=1, col=2, **zero_line_style)

# Axis titles per panel: (column, x-axis title, y-axis title).
axis_titles = [
    (1, "Prediction Error (€/MWh)", "Frequency"),
    (2, "Predicted Price (€/MWh)", "Prediction Error (€/MWh)"),
]
for panel_col, x_title, y_title in axis_titles:
    fig.update_xaxes(title_text=x_title, row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)

fig.update_layout(height=400, showlegend=False)
fig.show()

## 7. Feature Importance

In [25]:
# Ask the project helper for the 20 highest-ranked features of the trained
# model, labelled with the training matrix's column names.
feature_names = X_train.columns.tolist()
feature_importance = get_feature_importance(model, feature_names, top_n=20)

print("Top 20 Most Important Features:")
print(feature_importance)

Top 20 Most Important Features:
                   feature  importance
24             price_lag_0    0.317366
25             price_lag_1    0.295739
26             price_lag_2    0.094244
41    price_rolling_12_max    0.030291
32           price_lag_144    0.021378
37     price_rolling_6_max    0.020968
8          day_of_week_cos    0.014651
31            price_lag_48    0.013952
30            price_lag_24    0.012896
46  price_rolling_168_mean    0.011167
23    load_forecast_future    0.008545
10               month_cos    0.008380
45    price_rolling_24_max    0.005527
0            load_forecast    0.004888
38   price_rolling_12_mean    0.004700
7          day_of_week_sin    0.004418
48   price_rolling_168_min    0.004268
42   price_rolling_24_mean    0.004257
50     load_forecast_lag_1    0.004015
6                 hour_cos    0.004007


In [26]:
# Horizontal bar chart of the feature-importance table.
importance_bars = go.Bar(
    x=feature_importance['importance'].values,
    y=feature_importance['feature'].values,
    orientation='h',
    marker=dict(color='steelblue'),
)

fig = go.Figure()
fig.add_trace(importance_bars)

fig.update_layout(
    title='Top 20 Feature Importance',
    xaxis_title='Importance',
    yaxis_title='Feature',
    height=600,
    # Reverse the y-axis so the first row of the table is drawn at the top.
    yaxis=dict(autorange='reversed'),
)

fig.show()

## 8. Save Model

In [27]:
# Persist the trained model together with the fitted feature engine.
artifact_location = dict(
    save_dir="../models",
    model_name="xgboost_24h_forecast",
)
save_model(model, feature_engine, **artifact_location)

2025-12-17 20:45:17 - train_model - INFO - Model saved to: ../models/xgboost_24h_forecast.json
2025-12-17 20:45:17 - train_model - INFO - Feature engine saved to: ../models/xgboost_24h_forecast_feature_engine.pkl
2025-12-17 20:45:17 - train_model - INFO - Feature names saved to: ../models/xgboost_24h_forecast_features.json


## 9. Model Insights

In [28]:
# Mean absolute XGBoost test error, grouped by the hour of day of each timestamp.
hourly_abs_errors = pd.DataFrame({
    'hour': y_test.index.hour,
    'error': np.abs(errors),
})
error_by_hour = hourly_abs_errors.groupby('hour')['error'].mean()

hourly_bars = go.Bar(
    x=error_by_hour.index,
    y=error_by_hour.values,
    marker=dict(color='steelblue'),
)

fig = go.Figure()
fig.add_trace(hourly_bars)

fig.update_layout(
    title='Forecast Error by Hour of Day',
    xaxis_title='Hour of Day',
    yaxis_title='Mean Absolute Error (€/MWh)',
    height=400,
    showlegend=False,
)

fig.show()

In [29]:
# XGBoost metrics side by side for the train, validation and test splits:
# one column per split, one row per metric.
per_split_metrics = {
    'Train': train_metrics,
    'Validation': val_metrics,
    'Test': test_metrics,
}
metrics_df = pd.DataFrame(per_split_metrics)

# Grouped bars: one bar group per split, one bar per error metric.
split_labels = ['Train', 'Validation', 'Test']
fig = go.Figure()
for metric_name in ('MAE', 'RMSE'):
    fig.add_trace(go.Bar(
        name=metric_name,
        x=split_labels,
        y=metrics_df.loc[metric_name],
    ))

fig.update_layout(
    title='Model Performance Across Sets',
    xaxis_title='Dataset',
    yaxis_title='Error (€/MWh)',
    barmode='group',
    height=400,
)

fig.show()