# Model Training: XGBoost Price Forecasting

This notebook demonstrates how to use the feature pipeline to train an XGBoost model for electricity price forecasting.

In [24]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Add src to path
sys.path.append('../src')

from utils.preprocessing import (
    create_merged_dataset,
    split_train_val_test,
    handle_missing_values
)
from features.pipeline import TimeSeriesFeatureEngine
from models.train import (
    train_xgboost_model,
    evaluate_model,
    get_feature_importance,
    save_model
)

# Plotly for interactive visualizations
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

## 1. Load and Prepare Data

In [25]:
# Load merged dataset from cache
df = create_merged_dataset(db_path="../data/cache.db")

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"\nColumns: {df.columns.tolist()}")

df.head()

Dataset shape: (26305, 6)
Date range: 2022-01-01 00:00:00+01:00 to 2025-01-01 00:00:00+01:00

Columns: ['price', 'load_forecast', 'actual_load', 'wind_onshore', 'wind_offshore', 'solar']


Unnamed: 0_level_0,price,load_forecast,actual_load,wind_onshore,wind_offshore,solar
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-01 00:00:00+01:00,124.7,11532.1925,10374.1425,2646.25,1352.75,0.0
2022-01-01 01:00:00+01:00,124.7,11085.105,10249.9175,2419.75,1307.5,0.0
2022-01-01 02:00:00+01:00,134.0,10805.7575,9907.035,2161.75,1234.75,0.0
2022-01-01 03:00:00+01:00,58.8,10676.2475,9782.305,1954.75,1129.25,0.0
2022-01-01 04:00:00+01:00,37.67,10643.9175,9589.525,1706.75,1044.0,0.0


In [26]:
# Check for missing values
missing = df.isnull().sum()
print("Missing values per column:")
print(missing[missing > 0])

Missing values per column:
load_forecast     1
actual_load       1
wind_onshore     25
wind_offshore    25
solar            25
dtype: int64


In [27]:
# Handle missing values (forward fill with limit)
df_clean = handle_missing_values(df, strategy='forward_fill', limit=24)

# Drop any remaining NaN rows
df_clean = df_clean.dropna()

print(f"Clean dataset shape: {df_clean.shape}")

Clean dataset shape: (26305, 6)


## 2. Split Data (Time-based)

In [28]:
# Time-based split (no shuffling!)
train_df, val_df, test_df = split_train_val_test(
    df_clean,
    train_end="2023-12-31",
    val_end="2024-06-30"
)

Train: 2022-01-01 00:00:00+01:00 to 2023-12-31 00:00:00+01:00 (17497 samples)
Val:   2023-12-31 01:00:00+01:00 to 2024-06-30 00:00:00+02:00 (4367 samples)
Test:  2024-06-30 01:00:00+02:00 to 2025-01-01 00:00:00+01:00 (4441 samples)


## 3. Create Features

In [29]:
# Initialize feature engine
feature_engine = TimeSeriesFeatureEngine(
    target_col='price',
    forecast_horizon=24,
    feature_config={'scaler_type': 'robust'},
    price_lags=[1, 2, 3, 6, 12, 24, 48, 168],  # Recent hours + same-hour lags
    price_windows=[6, 12, 24, 168],             # Short-term + long-term windows
    load_lags=[1, 24, 168]                      # Recent + seasonal
)

print("Feature engine initialized")

Feature engine initialized


In [30]:
# Create features for train, val, test sets
print("Creating training features...")
X_train, y_train = feature_engine.prepare_data(train_df, create_target=True)

print("Creating validation features...")
X_val, y_val = feature_engine.prepare_data(val_df, create_target=True)

print("Creating test features...")
X_test, y_test = feature_engine.prepare_data(test_df, create_target=True)

print(f"\nTrain: X={X_train.shape}, y={y_train.shape}")
print(f"Val:   X={X_val.shape}, y={y_val.shape}")
print(f"Test:  X={X_test.shape}, y={y_test.shape}")

Creating training features...
Creating validation features...
Creating test features...

Train: X=(17473, 65), y=(17473,)
Val:   X=(4343, 65), y=(4343,)
Test:  X=(4417, 65), y=(4417,)


In [31]:
# Fill any remaining NaN values with 0
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)
X_test = X_test.fillna(0)

print(f"Total features created: {X_train.shape[1]}")
print(f"\nFeature names (first 20):")
print(X_train.columns.tolist()[:20])

Total features created: 65

Feature names (first 20):
['load_forecast', 'actual_load', 'wind_onshore', 'wind_offshore', 'solar', 'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos', 'is_weekend', 'total_renewable', 'total_wind', 'residual_load', 'residual_load_pct', 'renewable_penetration', 'wind_onshore_ratio', 'wind_offshore_ratio', 'solar_ratio']


## 4. Train XGBoost Model

In [32]:
# Fit scaler on training data
feature_engine.fit(X_train, y_train)

# Transform features
X_train_scaled = feature_engine.transform(X_train)
X_val_scaled = feature_engine.transform(X_val)
X_test_scaled = feature_engine.transform(X_test)

In [33]:
# Train model with early stopping
model = train_xgboost_model(
    X_train_scaled,
    y_train,
    X_val_scaled,
    y_val,
    params={
        'learning_rate': 0.05,
        'max_depth': 6,
        'min_child_weight': 3,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
    },
    n_estimators=1000,
    early_stopping_rounds=50,
    verbose=True
)

[0]	validation_0-rmse:118.31753	validation_1-rmse:104.82109
[1]	validation_0-rmse:113.61268	validation_1-rmse:100.69558
[2]	validation_0-rmse:109.25523	validation_1-rmse:97.60646
[3]	validation_0-rmse:105.07267	validation_1-rmse:93.80094
[4]	validation_0-rmse:101.13739	validation_1-rmse:90.24394
[5]	validation_0-rmse:97.40336	validation_1-rmse:86.91937
[6]	validation_0-rmse:93.91515	validation_1-rmse:83.57511
[7]	validation_0-rmse:90.57611	validation_1-rmse:80.88942
[8]	validation_0-rmse:87.42139	validation_1-rmse:78.14343
[9]	validation_0-rmse:84.51459	validation_1-rmse:75.55706
[10]	validation_0-rmse:81.73585	validation_1-rmse:72.71189
[11]	validation_0-rmse:79.07602	validation_1-rmse:69.99320
[12]	validation_0-rmse:76.57363	validation_1-rmse:67.64985
[13]	validation_0-rmse:74.26485	validation_1-rmse:65.74744
[14]	validation_0-rmse:72.06383	validation_1-rmse:63.34083
[15]	validation_0-rmse:70.03004	validation_1-rmse:61.95636
[16]	validation_0-rmse:68.09164	validation_1-rmse:60.25923


## 5. Evaluate Model

In [34]:
# Evaluate on all sets
train_metrics = evaluate_model(model, X_train_scaled, y_train, set_name="Train")
val_metrics = evaluate_model(model, X_val_scaled, y_val, set_name="Validation")
test_metrics = evaluate_model(model, X_test_scaled, y_test, set_name="Test")

2025-12-15 15:24:23 - train_model - INFO - Train Set Performance:
2025-12-15 15:24:23 - train_model - INFO -   MAE:  16.38 €/MWh
2025-12-15 15:24:23 - train_model - INFO -   RMSE: 22.09 €/MWh
2025-12-15 15:24:23 - train_model - INFO -   MAPE: inf%
2025-12-15 15:24:23 - train_model - INFO -   R²:   0.9679
2025-12-15 15:24:23 - train_model - INFO - Validation Set Performance:
2025-12-15 15:24:23 - train_model - INFO -   MAE:  22.71 €/MWh
2025-12-15 15:24:23 - train_model - INFO -   RMSE: 31.41 €/MWh
2025-12-15 15:24:23 - train_model - INFO -   MAPE: inf%
2025-12-15 15:24:23 - train_model - INFO -   R²:   0.2713
2025-12-15 15:24:23 - train_model - INFO - Test Set Performance:
2025-12-15 15:24:23 - train_model - INFO -   MAE:  28.79 €/MWh
2025-12-15 15:24:23 - train_model - INFO -   RMSE: 43.53 €/MWh
2025-12-15 15:24:23 - train_model - INFO -   MAPE: inf%
2025-12-15 15:24:23 - train_model - INFO -   R²:   0.4245


In [35]:
# Plot predictions vs actual
y_test_pred = model.predict(X_test_scaled)

# Create subplots
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=('Test Set: Actual vs Predicted Prices', 'Predicted vs Actual Scatter Plot'),
    vertical_spacing=0.12
)

# Time series plot
fig.add_trace(
    go.Scatter(x=y_test.index, y=y_test.values, name='Actual', 
               line=dict(color='blue', width=1.5), opacity=0.7),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=y_test.index, y=y_test_pred, name='Predicted', 
               line=dict(color='red', width=1.5), opacity=0.7),
    row=1, col=1
)

# Scatter plot
fig.add_trace(
    go.Scatter(x=y_test.values, y=y_test_pred, mode='markers',
               marker=dict(color='blue', size=4, opacity=0.5),
               name='Predictions', showlegend=False),
    row=2, col=1
)

# Add perfect prediction line
min_val, max_val = y_test.min(), y_test.max()
fig.add_trace(
    go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
               mode='lines', line=dict(color='red', dash='dash', width=2),
               name='Perfect Prediction', showlegend=False),
    row=2, col=1
)

# Update axes
fig.update_xaxes(title_text="Date", row=1, col=1)
fig.update_yaxes(title_text="Price (€/MWh)", row=1, col=1)
fig.update_xaxes(title_text="Actual Price (€/MWh)", row=2, col=1)
fig.update_yaxes(title_text="Predicted Price (€/MWh)", row=2, col=1)

fig.update_layout(height=800, showlegend=True)
fig.show()

In [36]:
# Error distribution and residual plot
errors = y_test.values - y_test_pred

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Error Distribution', 'Residual Plot'),
    horizontal_spacing=0.12
)

# Histogram of errors
fig.add_trace(
    go.Histogram(x=errors, nbinsx=50, name='Errors',
                 marker=dict(color='steelblue', line=dict(color='black', width=1))),
    row=1, col=1
)

# Add vertical line at 0
fig.add_vline(x=0, line_dash="dash", line_color="red", line_width=2, row=1, col=1)

# Residual scatter plot
fig.add_trace(
    go.Scatter(x=y_test_pred, y=errors, mode='markers',
               marker=dict(color='steelblue', size=4, opacity=0.5),
               name='Residuals', showlegend=False),
    row=1, col=2
)

# Add horizontal line at 0
fig.add_hline(y=0, line_dash="dash", line_color="red", line_width=2, row=1, col=2)

# Update axes
fig.update_xaxes(title_text="Prediction Error (€/MWh)", row=1, col=1)
fig.update_yaxes(title_text="Frequency", row=1, col=1)
fig.update_xaxes(title_text="Predicted Price (€/MWh)", row=1, col=2)
fig.update_yaxes(title_text="Prediction Error (€/MWh)", row=1, col=2)

fig.update_layout(height=400, showlegend=False)
fig.show()

## 6. Feature Importance

In [37]:
# Get top features
feature_importance = get_feature_importance(
    model,
    X_train.columns.tolist(),
    top_n=20
)

print("Top 20 Most Important Features:")
print(feature_importance)

Top 20 Most Important Features:
                   feature  importance
24             price_lag_1    0.329219
35     price_rolling_6_max    0.207341
25             price_lag_2    0.090214
32    price_rolling_6_mean    0.083647
29            price_lag_24    0.035629
30            price_lag_48    0.027029
44  price_rolling_168_mean    0.016180
8          day_of_week_cos    0.015884
43    price_rolling_24_max    0.013094
23    load_forecast_future    0.010674
31           price_lag_168    0.007301
10               month_cos    0.006823
34     price_rolling_6_min    0.006760
57          price_diff_24h    0.006218
5                 hour_sin    0.006128
6                 hour_cos    0.005402
20    residual_load_x_hour    0.005304
47   price_rolling_168_max    0.004765
7          day_of_week_sin    0.004658
4                    solar    0.004607


In [38]:
# Plot feature importance
fig = go.Figure()

fig.add_trace(go.Bar(
    x=feature_importance['importance'].values,
    y=feature_importance['feature'].values,
    orientation='h',
    marker=dict(color='steelblue')
))

fig.update_layout(
    title='Top 20 Feature Importance',
    xaxis_title='Importance',
    yaxis_title='Feature',
    height=600,
    yaxis=dict(autorange='reversed')  # Highest importance at top
)

fig.show()

## 7. Save Model

In [39]:
# Save trained model and feature engine
save_model(
    model,
    feature_engine,
    save_dir="../models",
    model_name="xgboost_24h_forecast"
)

TypeError: `_estimator_type` undefined.  Please use appropriate mixin to define estimator type.

## 8. Model Insights

In [None]:
# Analyze errors by hour of day
error_by_hour = pd.DataFrame({
    'hour': y_test.index.hour,
    'error': np.abs(errors)
}).groupby('hour')['error'].mean()

fig = go.Figure()

fig.add_trace(go.Bar(
    x=error_by_hour.index,
    y=error_by_hour.values,
    marker=dict(color='steelblue')
))

fig.update_layout(
    title='Forecast Error by Hour of Day',
    xaxis_title='Hour of Day',
    yaxis_title='Mean Absolute Error (€/MWh)',
    height=400,
    showlegend=False
)

fig.show()

In [None]:
# Metrics comparison across train/val/test sets
metrics_df = pd.DataFrame({
    'Train': train_metrics,
    'Validation': val_metrics,
    'Test': test_metrics
})

# Plot metrics comparison
fig = go.Figure()

for metric in ['MAE', 'RMSE']:
    fig.add_trace(go.Bar(
        name=metric,
        x=['Train', 'Validation', 'Test'],
        y=metrics_df.loc[metric],
    ))

fig.update_layout(
    title='Model Performance Across Sets',
    xaxis_title='Dataset',
    yaxis_title='Error (€/MWh)',
    barmode='group',
    height=400
)

fig.show()

In [None]:
# Summary statistics
print("\n" + "="*60)
print("MODEL SUMMARY")
print("="*60)
print(f"\nForecast Horizon: 24 hours")
print(f"Training Samples: {len(X_train)}")
print(f"Validation Samples: {len(X_val)}")
print(f"Test Samples: {len(X_test)}")
print(f"\nNumber of Features: {X_train.shape[1]}")
print(f"Best Iteration: {model.best_iteration}")
print(f"\nTest Set Performance:")
print(f"  MAE:  {test_metrics['MAE']:.2f} €/MWh")
print(f"  RMSE: {test_metrics['RMSE']:.2f} €/MWh")
print(f"  MAPE: {test_metrics['MAPE']:.2f}%")
print(f"  R²:   {test_metrics['R2']:.4f}")