# Hyperparameter Tuning with Optuna

This notebook uses Optuna to find optimal hyperparameters for XGBoost.

In [1]:
import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go

sys.path.append('../src')

from utils.preprocessing import (
    create_merged_dataset,
    split_train_val_test,
    handle_missing_values
)
from features.pipeline import TimeSeriesFeatureEngine
from models.hyperparameter_tuning import (
    tune_xgboost,
    quick_tune,
    extensive_tune,
    plot_optimization_history,
    plot_param_importance
)
from models.train import train_xgboost_model, evaluate_model, save_model

## 1. Prepare Data (Same as Training Notebook)

In [2]:
# Load the merged dataset from the local cache database.
df = create_merged_dataset(db_path="../data/cache.db")

# Clean in two explicit steps: apply the project's missing-value handler
# (strategy='forward_fill', limit=24), then drop every row that still has a NaN.
df_filled = handle_missing_values(df, strategy='forward_fill', limit=24)
df_clean = df_filled.dropna()

# Split into train / validation / test using the two boundary dates.
split_bounds = {"train_end": "2023-12-31", "val_end": "2024-06-30"}
train_df, val_df, test_df = split_train_val_test(df_clean, **split_bounds)

n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print(f"Train: {n_train}, Val: {n_val}, Test: {n_test}")

Train: 2022-01-01 00:00:00+01:00 to 2023-12-31 00:00:00+01:00 (17497 samples)
Val:   2023-12-31 01:00:00+01:00 to 2024-06-30 00:00:00+02:00 (4367 samples)
Test:  2024-06-30 01:00:00+02:00 to 2025-01-01 00:00:00+01:00 (4441 samples)
Train: 17497, Val: 4367, Test: 4441


In [3]:
# Create features
# Engine configured for the 'price' target with forecast_horizon=24 and a
# 'robust' scaler type.
feature_engine = TimeSeriesFeatureEngine(
    target_col='price',
    forecast_horizon=24,
    feature_config={'scaler_type': 'robust'},
)

# Build features and targets for each split independently (train -> val -> test).
X_train, y_train = feature_engine.prepare_data(train_df, create_target=True)
X_val, y_val = feature_engine.prepare_data(val_df, create_target=True)
X_test, y_test = feature_engine.prepare_data(test_df, create_target=True)

# Replace any remaining NaN feature values with 0.
# NOTE(review): this happens before scaling -- confirm that 0 is a sensible
# fill value for every feature column.
X_train, X_val, X_test = X_train.fillna(0), X_val.fillna(0), X_test.fillna(0)

# Fit the engine on the training split only, then transform every split and
# re-wrap the result as a DataFrame with the original columns and index
# (the tuning helpers below are given DataFrames).
feature_engine.fit(X_train, y_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    pd.DataFrame(feature_engine.transform(split), columns=split.columns, index=split.index)
    for split in (X_train, X_val, X_test)
)

n_features = X_train.shape[1]
n_train_samples = len(X_train)
print(f"Features: {n_features}")
print(f"Training samples: {n_train_samples}")

Features: 60
Training samples: 17473


## 2. Quick Tuning (30 trials; well under a minute in the run recorded below)

Good for fast iteration. Use this first to get reasonable hyperparameters.

In [4]:
# Quick tuning - good starting point
# The tuner receives the scaled train/validation features and their targets
# positionally, in this order; only the trial budget is passed by keyword.
quick_tune_inputs = (X_train_scaled, y_train, X_val_scaled, y_val)
best_params_quick = quick_tune(*quick_tune_inputs, n_trials=30)

[I 2025-12-15 15:35:37,922] A new study created in memory with name: xgboost_price_forecasting


Running quick tuning with 30 trials...
HYPERPARAMETER TUNING WITH OPTUNA
Number of trials: 30



  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-12-15 15:35:39,381] Trial 0 finished with value: 29.70154098715024 and parameters: {'learning_rate': 0.023688639503640783, 'max_depth': 7, 'min_child_weight': 76, 'subsample': 0.7394633936788146, 'colsample_bytree': 0.5624074561769746, 'colsample_bylevel': 0.562397808134481, 'gamma': 0.3846096996241774, 'reg_alpha': 3.9676050770529874, 'reg_lambda': 0.6358358856676253, 'n_estimators': 737}. Best is trial 0 with value: 29.70154098715024.
[I 2025-12-15 15:35:40,714] Trial 1 finished with value: 31.43698255498961 and parameters: {'learning_rate': 0.010485387725194618, 'max_depth': 7, 'min_child_weight': 85, 'subsample': 0.5849356442713105, 'colsample_bytree': 0.5727299868828403, 'colsample_bylevel': 0.5733618039413735, 'gamma': 1.5907869905017349, 'reg_alpha': 0.37520558551242816, 'reg_lambda': 0.19762189340280073, 'n_estimators': 362}. Best is trial 0 with value: 29.70154098715024.
[I 2025-12-15 15:35:41,002] Trial 2 finished with value: 31.63555845600283 and parameters: {'learni

## 3. Train Model with Best Parameters

In [5]:
# Train with optimized parameters
# n_estimators is passed to the trainer as its own keyword argument, so take it
# out of a copy of the tuned parameters (falling back to 500 if it is absent).
params_without_n_est = dict(best_params_quick)
n_est = params_without_n_est.pop('n_estimators', 500)

# The validation split is supplied alongside the training split, with early
# stopping after 30 rounds and per-round logging enabled.
model_tuned = train_xgboost_model(
    X_train_scaled, y_train,
    X_val_scaled, y_val,
    params=params_without_n_est,
    n_estimators=n_est,
    early_stopping_rounds=30,
    verbose=True,
)

[0]	validation_0-rmse:119.83088	validation_1-rmse:105.91495
[1]	validation_0-rmse:116.70109	validation_1-rmse:103.12766
[2]	validation_0-rmse:113.71891	validation_1-rmse:100.39755
[3]	validation_0-rmse:110.76890	validation_1-rmse:97.61563
[4]	validation_0-rmse:107.87652	validation_1-rmse:94.78009
[5]	validation_0-rmse:105.12662	validation_1-rmse:92.21946
[6]	validation_0-rmse:102.56798	validation_1-rmse:90.18956
[7]	validation_0-rmse:100.02324	validation_1-rmse:87.64769
[8]	validation_0-rmse:97.64633	validation_1-rmse:85.24280
[9]	validation_0-rmse:95.42231	validation_1-rmse:83.05116
[10]	validation_0-rmse:93.29294	validation_1-rmse:81.10587
[11]	validation_0-rmse:91.29667	validation_1-rmse:78.87882
[12]	validation_0-rmse:89.29760	validation_1-rmse:76.79826
[13]	validation_0-rmse:87.46050	validation_1-rmse:74.76329
[14]	validation_0-rmse:85.72254	validation_1-rmse:72.83885
[15]	validation_0-rmse:83.99268	validation_1-rmse:71.01970
[16]	validation_0-rmse:82.35558	validation_1-rmse:69.67

In [6]:
# Evaluate tuned model on every split, in train -> validation -> test order.
eval_sets = {
    "Train": (X_train_scaled, y_train),
    "Validation": (X_val_scaled, y_val),
    "Test": (X_test_scaled, y_test),
}
metrics_by_set = {
    label: evaluate_model(model_tuned, features, target, set_name=label)
    for label, (features, target) in eval_sets.items()
}
train_metrics = metrics_by_set["Train"]
val_metrics = metrics_by_set["Validation"]
test_metrics = metrics_by_set["Test"]

# Headline numbers for the quick-tuned model.
banner = "=" * 60
print("\n" + banner)
print("TUNED MODEL PERFORMANCE")
print(banner)
print(f"Validation R²: {val_metrics['R2']:.4f}")
print(f"Test R²:       {test_metrics['R2']:.4f}")
print(f"Test MAE:      {test_metrics['MAE']:.2f} €/MWh")
print(f"Test RMSE:     {test_metrics['RMSE']:.2f} €/MWh")

2025-12-15 15:36:09 - train_model - INFO - Train Set Performance:
2025-12-15 15:36:09 - train_model - INFO -   MAE:  26.81 €/MWh
2025-12-15 15:36:09 - train_model - INFO -   RMSE: 38.03 €/MWh
2025-12-15 15:36:09 - train_model - INFO -   MAPE: inf%
2025-12-15 15:36:09 - train_model - INFO -   R²:   0.9048
2025-12-15 15:36:09 - train_model - INFO - Validation Set Performance:
2025-12-15 15:36:09 - train_model - INFO -   MAE:  19.97 €/MWh
2025-12-15 15:36:09 - train_model - INFO -   RMSE: 28.76 €/MWh
2025-12-15 15:36:09 - train_model - INFO -   MAPE: inf%
2025-12-15 15:36:09 - train_model - INFO -   R²:   0.3895
2025-12-15 15:36:09 - train_model - INFO - Test Set Performance:
2025-12-15 15:36:09 - train_model - INFO -   MAE:  26.28 €/MWh
2025-12-15 15:36:09 - train_model - INFO -   RMSE: 40.63 €/MWh
2025-12-15 15:36:09 - train_model - INFO -   MAPE: inf%
2025-12-15 15:36:09 - train_model - INFO -   R²:   0.4988

TUNED MODEL PERFORMANCE
Validation R²: 0.3895
Test R²:       0.4988
Test MAE:

## 4. Extensive Tuning (Optional - for best results)

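Run this if you want a more thorough search than the quick run above. The time budget is set through the `timeout_minutes` argument (30 in the example below); allow roughly 30-60 minutes depending on the value you choose.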

In [7]:
# Optional cell: intentionally left commented out so that "Run All" skips it.
# Uncomment to run extensive tuning (takes longer)
# NOTE: when enabled, this binds `best_params_extensive` and `study`. The
# "Full Tuning" cell below rebinds `study`, and the later training cells read
# `best_params`, not `best_params_extensive` -- adapt those cells if you want
# to use the result of this search.
# best_params_extensive, study = extensive_tune(
#     X_train_scaled,
#     y_train,
#     X_val_scaled,
#     y_val,
#     timeout_minutes=30  # Adjust based on available time
# )

## 5. Full Tuning with Visualization

In [8]:
# Run tuning with more trials for better results
# Returns both the best parameter dict and the study object; the study is used
# by the plotting cells that follow.
tuning_options = {
    "n_trials": 100,  # Increase for better results
    "verbose": True,
}
best_params, study = tune_xgboost(
    X_train_scaled, y_train, X_val_scaled, y_val, **tuning_options
)

[I 2025-12-15 15:36:09,114] A new study created in memory with name: xgboost_price_forecasting


HYPERPARAMETER TUNING WITH OPTUNA
Number of trials: 100



  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-12-15 15:36:10,606] Trial 0 finished with value: 29.70154098715024 and parameters: {'learning_rate': 0.023688639503640783, 'max_depth': 7, 'min_child_weight': 76, 'subsample': 0.7394633936788146, 'colsample_bytree': 0.5624074561769746, 'colsample_bylevel': 0.562397808134481, 'gamma': 0.3846096996241774, 'reg_alpha': 3.9676050770529874, 'reg_lambda': 0.6358358856676253, 'n_estimators': 737}. Best is trial 0 with value: 29.70154098715024.
[I 2025-12-15 15:36:11,967] Trial 1 finished with value: 31.43698255498961 and parameters: {'learning_rate': 0.010485387725194618, 'max_depth': 7, 'min_child_weight': 85, 'subsample': 0.5849356442713105, 'colsample_bytree': 0.5727299868828403, 'colsample_bylevel': 0.5733618039413735, 'gamma': 1.5907869905017349, 'reg_alpha': 0.37520558551242816, 'reg_lambda': 0.19762189340280073, 'n_estimators': 362}. Best is trial 0 with value: 29.70154098715024.
[I 2025-12-15 15:36:12,253] Trial 2 finished with value: 31.63555845600283 and parameters: {'learni

In [9]:
# Plot optimization history
# The helper's return value is checked for truthiness before showing it, so a
# falsy result (e.g. None) is skipped silently.
history_fig = plot_optimization_history(study)
if history_fig:
    history_fig.show()

In [10]:
# Plot parameter importance
# The helper's return value is checked for truthiness before showing it, so a
# falsy result (e.g. None) is skipped silently.
importance_fig = plot_param_importance(study)
if importance_fig:
    importance_fig.show()

## 6. Train Final Model with Best Parameters

In [11]:
# Train final model
# n_estimators is handed to the trainer as a separate keyword argument, so split
# it off from a copy of the best parameters (default 500 when it is missing).
params_without_n_est = dict(best_params)
n_est = params_without_n_est.pop('n_estimators', 500)

# Same training call as for the quick-tuned model, but with the parameters from
# the 100-trial study and early stopping after 50 rounds.
final_training_options = dict(
    params=params_without_n_est,
    n_estimators=n_est,
    early_stopping_rounds=50,
    verbose=True,
)
model_final = train_xgboost_model(
    X_train_scaled, y_train, X_val_scaled, y_val, **final_training_options
)

[0]	validation_0-rmse:117.10377	validation_1-rmse:103.54749
[1]	validation_0-rmse:111.50256	validation_1-rmse:98.26481
[2]	validation_0-rmse:106.26459	validation_1-rmse:93.01837
[3]	validation_0-rmse:101.57184	validation_1-rmse:88.79208
[4]	validation_0-rmse:97.18099	validation_1-rmse:84.76953
[5]	validation_0-rmse:93.16263	validation_1-rmse:81.09563
[6]	validation_0-rmse:89.73386	validation_1-rmse:78.19956
[7]	validation_0-rmse:86.20228	validation_1-rmse:74.42999
[8]	validation_0-rmse:83.19187	validation_1-rmse:70.92418
[9]	validation_0-rmse:80.26421	validation_1-rmse:67.42878
[10]	validation_0-rmse:77.60481	validation_1-rmse:65.18190
[11]	validation_0-rmse:75.23878	validation_1-rmse:62.36013
[12]	validation_0-rmse:72.99328	validation_1-rmse:59.62059
[13]	validation_0-rmse:71.00865	validation_1-rmse:57.16772
[14]	validation_0-rmse:69.23601	validation_1-rmse:55.26808
[15]	validation_0-rmse:67.47682	validation_1-rmse:53.23879
[16]	validation_0-rmse:65.91910	validation_1-rmse:52.07952
[1

In [12]:
# Final evaluation
# Score the final model on each split in train -> validation -> test order;
# evaluate_model returns one metrics mapping per call.
train_metrics_final, val_metrics_final, test_metrics_final = [
    evaluate_model(model_final, features, target, set_name=label)
    for features, target, label in (
        (X_train_scaled, y_train, "Train"),
        (X_val_scaled, y_val, "Validation"),
        (X_test_scaled, y_test, "Test"),
    )
]

2025-12-15 15:37:33 - train_model - INFO - Train Set Performance:
2025-12-15 15:37:33 - train_model - INFO -   MAE:  26.85 €/MWh
2025-12-15 15:37:33 - train_model - INFO -   RMSE: 38.14 €/MWh
2025-12-15 15:37:33 - train_model - INFO -   MAPE: inf%
2025-12-15 15:37:33 - train_model - INFO -   R²:   0.9042
2025-12-15 15:37:33 - train_model - INFO - Validation Set Performance:
2025-12-15 15:37:33 - train_model - INFO -   MAE:  19.74 €/MWh
2025-12-15 15:37:33 - train_model - INFO -   RMSE: 28.20 €/MWh
2025-12-15 15:37:33 - train_model - INFO -   MAPE: inf%
2025-12-15 15:37:33 - train_model - INFO -   R²:   0.4126
2025-12-15 15:37:33 - train_model - INFO - Test Set Performance:
2025-12-15 15:37:33 - train_model - INFO -   MAE:  26.53 €/MWh
2025-12-15 15:37:33 - train_model - INFO -   RMSE: 41.02 €/MWh
2025-12-15 15:37:33 - train_model - INFO -   MAPE: inf%
2025-12-15 15:37:33 - train_model - INFO -   R²:   0.4891


In [13]:
# Summary
# Prints the winning hyperparameters followed by the headline validation/test
# metrics of the final model.
banner = "=" * 60
print("\n" + banner)
print("FINAL MODEL SUMMARY")
print(banner)

print("\nBest Hyperparameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

print("\nPerformance:")
print(f"  Validation R²: {val_metrics_final['R2']:.4f}")
print(f"  Test R²:       {test_metrics_final['R2']:.4f}")
print(f"  Test MAE:      {test_metrics_final['MAE']:.2f} €/MWh")
print(f"  Test RMSE:     {test_metrics_final['RMSE']:.2f} €/MWh")

# The evaluation log reports MAPE as inf for every split, so printing it
# verbatim puts "inf%" into the headline summary. Only show it when finite.
# NOTE(review): presumably the target contains zero values, which makes a
# percentage error undefined -- confirm in evaluate_model.
test_mape = test_metrics_final['MAPE']
if np.isfinite(test_mape):
    print(f"  Test MAPE:     {test_mape:.2f}%")
else:
    print("  Test MAPE:     n/a (not finite; rely on MAE/RMSE)")


FINAL MODEL SUMMARY

Best Hyperparameters:
  learning_rate: 0.06755681854433829
  max_depth: 3
  min_child_weight: 92
  subsample: 0.5587256345153042
  colsample_bytree: 0.8452945667640633
  colsample_bylevel: 0.7005121122227415
  gamma: 4.993337785531988
  reg_alpha: 0.14448599849632637
  reg_lambda: 0.9162580648195305
  n_estimators: 789

Performance:
  Validation R²: 0.4126
  Test R²:       0.4891
  Test MAE:      26.53 €/MWh
  Test RMSE:     41.02 €/MWh
  Test MAPE:     inf%


## 7. Save Best Model

In [14]:
# Persist the final model, the fitted feature engine and the best
# hyperparameters into one directory.
# `json`, `Path` and `save_model` are imported in the notebook's first cell.
model_dir = Path("../models")
model_dir.mkdir(parents=True, exist_ok=True)  # no-op when the directory already exists

# Save the tuned model
save_model(
    model_final,
    feature_engine,
    save_dir=model_dir.as_posix(),  # same "../models" string as before
    model_name="xgboost_24h_tuned"
)

# Also save the best parameters
params_path = model_dir / "best_params.json"
with params_path.open("w", encoding="utf-8") as f:
    json.dump(best_params, f, indent=2)

print("Model and parameters saved!")

2025-12-15 15:37:34 - train_model - INFO - Model saved to: ../models/xgboost_24h_tuned.json
2025-12-15 15:37:34 - train_model - INFO - Feature engine saved to: ../models/xgboost_24h_tuned_feature_engine.pkl
2025-12-15 15:37:34 - train_model - INFO - Feature names saved to: ../models/xgboost_24h_tuned_features.json
Model and parameters saved!
