# Step 8


## 0. Setup & Configuration

In [72]:
# Core Libraries
import pandas as pd
import numpy as np
import warnings
from typing import List, Dict, Tuple

# Machine Learning
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import lightgbm as lgb
import optuna

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [73]:
class HourlyPipelineConfig:
    """Configuration for hourly temperature forecasting pipeline"""
    
    # Data
    INPUT_DATA_PATH: str = "weather_hcm_hourly.csv"
    
    # Model Parameters
    GLOBAL_RANDOM_SEED: int = 105
    TARGET_VARIABLE: str = "temp"
    
    # Forecasting Horizons (in DAYS)
    # Changed from hourly to daily predictions: predict average temperature N days ahead
    HORIZONS: List[int] = [1, 2, 3, 4, 5]  # 1-5 days ahead (daily avg predictions)
    
    # Time Series CV
    N_SPLITS: int = 5
    
config = HourlyPipelineConfig()
np.random.seed(config.GLOBAL_RANDOM_SEED)

## 1. EDA - Exploratory Data Analysis

Understanding hourly data characteristics and how they differ from daily aggregates.

In [74]:
# Load hourly data
df = pd.read_csv(r"C:\Users\ADMIN\Documents\Final Q1-Q5\data\weather_hcm_hourly.csv")
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values('datetime').reset_index(drop=True)

print(f"Dataset Shape: {df.shape}")
print(f"Date Range: {df['datetime'].min()} to {df['datetime'].max()}")
print(f"\nFirst few rows:")
df.head()

Dataset Shape: (94248, 26)
Date Range: 2015-01-01 00:00:00 to 2025-10-01 23:00:00

First few rows:


Unnamed: 0,name,address,resolvedAddress,latitude,longitude,datetime,temp,feelslike,dew,humidity,precip,precipprob,preciptype,windgust,windspeed,winddir,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,source
0,"Hồ Chí Minh, Việt Nam",Hồ Chí Minh city,"Hồ Chí Minh, Việt Nam",10.776,106.701,2015-01-01 00:00:00,24.6,24.6,19.0,71.06,0.0,0.0,,9.4,3.6,240.0,1012.0,50.0,7.0,0.0,0.0,0.0,,Partially cloudy,partly-cloudy-night,obs
1,"Hồ Chí Minh, Việt Nam",Hồ Chí Minh city,"Hồ Chí Minh, Việt Nam",10.776,106.701,2015-01-01 01:00:00,24.5,24.5,20.2,77.06,0.0,0.0,,9.0,3.6,294.0,1012.2,53.7,8.2,0.0,0.0,0.0,,Partially cloudy,partly-cloudy-night,obs
2,"Hồ Chí Minh, Việt Nam",Hồ Chí Minh city,"Hồ Chí Minh, Việt Nam",10.776,106.701,2015-01-01 02:00:00,24.0,24.0,20.0,78.35,0.0,0.0,,6.5,4.7,324.0,1012.0,50.0,7.0,0.0,0.0,0.0,,Partially cloudy,partly-cloudy-night,obs
3,"Hồ Chí Minh, Việt Nam",Hồ Chí Minh city,"Hồ Chí Minh, Việt Nam",10.776,106.701,2015-01-01 03:00:00,24.0,24.0,20.0,78.35,0.0,0.0,,6.8,7.0,334.0,1012.0,50.0,6.0,0.0,0.0,0.0,,Partially cloudy,partly-cloudy-night,obs
4,"Hồ Chí Minh, Việt Nam",Hồ Chí Minh city,"Hồ Chí Minh, Việt Nam",10.776,106.701,2015-01-01 04:00:00,24.0,24.0,20.7,81.94,0.0,0.0,,9.0,6.7,336.0,1012.1,53.7,7.3,0.0,0.0,0.0,,Partially cloudy,partly-cloudy-night,obs


In [75]:
# Data Quality Check
print("Missing Values:")
missing_pct = (df.isnull().sum() / len(df) * 100).round(2)
missing_pct[missing_pct > 0].sort_values(ascending=False)

Missing Values:


severerisk        94.88
preciptype        89.02
visibility         0.26
windgust           0.05
precip             0.04
solarenergy        0.04
solarradiation     0.04
uvindex            0.04
winddir            0.01
dtype: float64

In [76]:
# Temperature Distribution - Hourly vs Daily patterns
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Hourly Temperature Distribution', 'Temperature by Hour of Day',
                    'Temperature Trend (2015-2025)', 'Monthly Temperature Patterns'),
    specs=[[{'type': 'histogram'}, {'type': 'box'}],
           [{'type': 'scatter'}, {'type': 'box'}]]
)

# 1. Overall distribution
fig.add_trace(
    go.Histogram(x=df['temp'], name='Temperature', nbinsx=50),
    row=1, col=1
)

# 2. Temperature by hour of day
df['hour'] = df['datetime'].dt.hour
for hour in range(0, 24, 4):
    hour_data = df[df['hour'] == hour]['temp']
    fig.add_trace(
        go.Box(y=hour_data, name=f"{hour}h", showlegend=False),
        row=1, col=2
    )

# 3. Temperature trend over time (sample every 24 hours for visibility)
sample_df = df.iloc[::24]
fig.add_trace(
    go.Scatter(x=sample_df['datetime'], y=sample_df['temp'], 
               mode='lines', name='Temp (Daily Sample)'),
    row=2, col=1
)

# 4. Temperature by month
df['month'] = df['datetime'].dt.month
for month in range(1, 13):
    month_data = df[df['month'] == month]['temp']
    fig.add_trace(
        go.Box(y=month_data, name=str(month), showlegend=False),
        row=2, col=2
    )

fig.update_layout(height=800, showlegend=False, title_text="Hourly Temperature EDA")
fig.show()

print(f"Temperature Stats:")
print(df['temp'].describe())

Temperature Stats:
count    94248.000000
mean        28.443794
std          2.948578
min         18.000000
25%         26.000000
50%         28.000000
75%         30.600000
max         39.000000
Name: temp, dtype: float64


In [77]:
# Feature Correlation Analysis
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols = [col for col in numerical_cols if col != 'datetime']

# Focus on key weather features
key_features = ['temp', 'feelslike', 'humidity', 'dew', 'precip', 
                'windspeed', 'cloudcover', 'solarradiation', 'uvindex']
key_features = [f for f in key_features if f in numerical_cols]

corr_matrix = df[key_features].corr()

fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='RdBu',
    zmid=0,
    text=corr_matrix.values.round(2),
    texttemplate='%{text}',
    textfont={"size": 10}
))

fig.update_layout(
    title='Feature Correlation Matrix',
    height=600,
    xaxis={'side': 'bottom'},
)
fig.show()

# Top correlations with temperature
temp_corr = corr_matrix['temp'].drop('temp').sort_values(ascending=False)
print("\nTop features correlated with temperature:")
print(temp_corr)


Top features correlated with temperature:
feelslike         0.911625
solarradiation    0.753818
uvindex           0.753254
windspeed         0.328380
cloudcover        0.032066
dew              -0.038343
precip           -0.046613
humidity         -0.762752
Name: temp, dtype: float64


## 2. Feature Engineering


In [78]:
def create_comprehensive_hourly_features_FIXED(df: pd.DataFrame) -> pd.DataFrame:
    """
    Feature Categories:
    1. Daily Aggregates - Summary statistics from daily patterns
    2. Hourly Dynamics - Rate of change and momentum (FIXED)
    3. Lagged Hourly - Historical temperature values
    4. Cyclical & Temporal - Time-based cyclical patterns
    5. Stability & Volatility - Temperature variability metrics
    6. Categorical Weather - One-hot encoded weather conditions
    7. Advanced Features - Interactions, domain knowledge (FIXED)
    """
    df = df.copy()

    print("Creating comprehensive features (FIXED - NO LEAKAGE)...")

    # ==================== CRITICAL FIX: LAG ALL RAW VARIABLES FIRST ====================
    print("  [0/7] CRITICAL FIX: Lagging all current-time raw weather variables...")

    # Create lagged versions of ALL raw weather variables that won't be available at prediction time
    raw_weather_vars = ['temp', 'feelslike', 'humidity', 'dew', 'precip',
                        'windspeed', 'windgust', 'cloudcover', 'solarradiation',
                        'uvindex', 'sealevelpressure', 'visibility']

    # Create lag_1h for all raw variables (this is the most recent KNOWN value)
    for col in raw_weather_vars:
        if col in df.columns:
            df[f'{col}_current'] = df[col].shift(1)  # Most recent known value (1 hour ago)

    print(f"    ✓ Lagged {len([c for c in raw_weather_vars if c in df.columns])} raw weather variables")

    # ==================== CATEGORY 1: DAILY AGGREGATES ====================
    print("  [1/7] Daily aggregates...")

    # Group by date to get daily statistics
    df['date'] = df['datetime'].dt.date

    # FIXED: Calculate daily statistics using lagged temperature (not current)
    df['temp_for_daily_calc'] = df['temp'].shift(1)

    daily_stats = df.groupby('date')['temp_for_daily_calc'].agg([
        ('daily_temp_mean', 'mean'),
        ('daily_temp_min', 'min'),
        ('daily_temp_max', 'max'),
        ('daily_temp_std', 'std'),
        ('daily_temp_range', lambda x: x.max() - x.min())
    ]).reset_index()

    # Shift by 1 day to avoid data leakage (use yesterday's stats)
    daily_stats['date'] = pd.to_datetime(daily_stats['date']) + pd.Timedelta(days=1)
    daily_stats['date'] = daily_stats['date'].dt.date
    df = df.merge(daily_stats, on='date', how='left')

    # FIXED: Temperature position within daily range (using current lagged, not raw current)
    df['temp_position_in_daily_range'] = (
        (df['temp_current'] - df['daily_temp_min']) / (df['daily_temp_range'] + 1e-6)
    )

    # Deviation from yesterday's average (using lagged temp)
    df['temp_dev_from_yesterday_avg'] = df['temp_current'] - df['daily_temp_mean']

    # Multi-day aggregates (already properly shifted in original)
    df['temp_rolling_mean_24h'] = df['temp'].shift(1).rolling(window=24).mean()
    df['temp_rolling_mean_3d'] = df['temp'].shift(1).rolling(window=72).mean()
    df['temp_rolling_mean_7d'] = df['temp'].shift(1).rolling(window=168).mean()
    df['temp_rolling_std_24h'] = df['temp'].shift(1).rolling(window=24).std()
    df['temp_rolling_std_3d'] = df['temp'].shift(1).rolling(window=72).std()
    df['temp_rolling_std_7d'] = df['temp'].shift(1).rolling(window=168).std()

    # Other weather daily aggregates (using current lagged versions)
    for col in ['humidity', 'windspeed', 'cloudcover', 'precip']:
        if col in df.columns:
            df[f'{col}_rolling_mean_24h'] = df[col].shift(1).rolling(window=24).mean()
            df[f'{col}_rolling_max_24h'] = df[col].shift(1).rolling(window=24).max()

    # ==================== CATEGORY 2: HOURLY DYNAMICS (FIXED) ====================
    print("  [2/7] Hourly dynamics (FIXED - using lagged values)...")

    # CRITICAL FIX: Use shifted temperature for all difference calculations
    # Rate of change (first derivative) - using lagged temp
    df['temp_change_1h'] = df['temp'].shift(1).diff(1)
    df['temp_change_2h'] = df['temp'].shift(1).diff(2)
    df['temp_change_3h'] = df['temp'].shift(1).diff(3)
    df['temp_change_6h'] = df['temp'].shift(1).diff(6)
    df['temp_change_12h'] = df['temp'].shift(1).diff(12)

    # Acceleration (second derivative) - using lagged changes
    df['temp_acceleration_1h'] = df['temp_change_1h'].diff(1)
    df['temp_acceleration_3h'] = df['temp_change_3h'].diff(3)

    # Momentum indicators (already properly shifted in original)
    df['temp_momentum_3h'] = df['temp'].shift(1).rolling(window=3).apply(
        lambda x: 1 if x.iloc[-1] > x.iloc[0] else -1 if x.iloc[-1] < x.iloc[0] else 0,
        raw=False
    )
    df['temp_momentum_6h'] = df['temp'].shift(1).rolling(window=6).apply(
        lambda x: 1 if x.iloc[-1] > x.iloc[0] else -1 if x.iloc[-1] < x.iloc[0] else 0,
        raw=False
    )

    # Cumulative change over windows (using lagged changes)
    df['temp_cumsum_change_6h'] = df['temp_change_1h'].rolling(window=6).sum()
    df['temp_cumsum_change_12h'] = df['temp_change_1h'].rolling(window=12).sum()

    # Direction changes (trend reversals) - using lagged changes
    df['temp_direction'] = np.sign(df['temp_change_1h'])
    df['temp_direction_change'] = (df['temp_direction'].diff() != 0).astype(int)

    # FIXED: Other weather dynamics - using raw variable shifts
    for col in ['humidity', 'windspeed', 'dew']:
        if col in df.columns:
            df[f'{col}_change_1h'] = df[col].shift(1).diff(1)
            df[f'{col}_change_6h'] = df[col].shift(1).diff(6)

    # ==================== CATEGORY 3: LAGGED HOURLY (Already correct) ====================
    print("  [3/7] Lagged hourly features...")

    # Temperature lags - comprehensive set 
    lag_hours = [1, 2, 3, 4, 5, 6, 9, 12, 18, 24, 36, 48, 72, 96, 120, 144, 168]
    for lag in lag_hours:
        df[f'temp_lag_{lag}h'] = df['temp'].shift(lag)

    # Same hour patterns 
    df['temp_same_hour_yesterday'] = df['temp'].shift(24)
    df['temp_same_hour_2days_ago'] = df['temp'].shift(48)
    df['temp_same_hour_3days_ago'] = df['temp'].shift(72)
    df['temp_same_hour_1week_ago'] = df['temp'].shift(168)

    # Lag other critical weather variables )
    weather_vars = ['humidity', 'dew', 'windspeed', 'cloudcover', 'precip',
                    'solarradiation', 'uvindex', 'windgust', 'sealevelpressure']

    for col in weather_vars:
        if col in df.columns:
            df[f'{col}_lag_1h'] = df[col].shift(1)
            df[f'{col}_lag_3h'] = df[col].shift(3)
            df[f'{col}_lag_6h'] = df[col].shift(6)
            df[f'{col}_lag_24h'] = df[col].shift(24)

    # Lagged differences
    df['temp_diff_from_1h_ago'] = df['temp_lag_1h'] - df['temp_lag_2h']  # FIXED
    df['temp_diff_from_6h_ago'] = df['temp_lag_1h'] - df['temp_lag_6h']  # FIXED
    df['temp_diff_from_24h_ago'] = df['temp_lag_1h'] - df['temp_lag_24h']  # FIXED
    df['temp_diff_from_same_hour_yesterday'] = df['temp_lag_1h'] - df['temp_same_hour_yesterday']

    # ==================== CATEGORY 4: CYCLICAL & TEMPORAL (Already correct) ====================
    print("  [4/7] Cyclical and temporal features...")

    # Basic temporal 
    df['hour'] = df['datetime'].dt.hour
    df['day_of_week'] = df['datetime'].dt.dayofweek
    df['day_of_month'] = df['datetime'].dt.day
    df['day_of_year'] = df['datetime'].dt.dayofyear
    df['week_of_year'] = df['datetime'].dt.isocalendar().week
    df['month'] = df['datetime'].dt.month
    df['quarter'] = df['datetime'].dt.quarter
    df['year'] = df['datetime'].dt.year

    # Cyclical encoding - Hour (24-hour cycle)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    # Cyclical encoding - Day of week (7-day cycle)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # Cyclical encoding - Day of year (365-day cycle)
    df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365.25)
    df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365.25)

    # Cyclical encoding - Month (12-month cycle)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # Time of day categories (already correct)
    df['is_night'] = ((df['hour'] >= 0) & (df['hour'] < 6)).astype(int)
    df['is_morning'] = ((df['hour'] >= 6) & (df['hour'] < 12)).astype(int)
    df['is_afternoon'] = ((df['hour'] >= 12) & (df['hour'] < 18)).astype(int)
    df['is_evening'] = ((df['hour'] >= 18) & (df['hour'] < 24)).astype(int)
    df['is_daytime'] = ((df['hour'] >= 6) & (df['hour'] < 18)).astype(int)

    # Peak temperature hours
    df['is_peak_heat_hour'] = ((df['hour'] >= 13) & (df['hour'] <= 15)).astype(int)
    df['is_coolest_hour'] = ((df['hour'] >= 5) & (df['hour'] <= 7)).astype(int)

    # Weekend indicator
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Season indicators (HCM has 2 main seasons)
    df['is_dry_season'] = ((df['month'] >= 12) | (df['month'] <= 4)).astype(int)
    df['is_rainy_season'] = ((df['month'] >= 5) & (df['month'] <= 11)).astype(int)

    # ==================== CATEGORY 5: STABILITY & VOLATILITY (Already correct) ====================
    print("  [5/7] Stability and volatility metrics...")

    # Temperature volatility (already properly shifted)
    for window in [3, 6, 12, 24]:
        df[f'temp_volatility_{window}h'] = df['temp'].shift(1).rolling(window=window).std()
        df[f'temp_range_{window}h'] = (
            df['temp'].shift(1).rolling(window=window).max() -
            df['temp'].shift(1).rolling(window=window).min()
        )

    # Coefficient of variation
    df['temp_cv_24h'] = df['temp_volatility_24h'] / (df['temp_rolling_mean_24h'] + 1e-6)

    # Rolling min and max
    for window in [6, 12, 24]:
        df[f'temp_rolling_min_{window}h'] = df['temp'].shift(1).rolling(window=window).min()
        df[f'temp_rolling_max_{window}h'] = df['temp'].shift(1).rolling(window=window).max()

    # FIXED: Distance from rolling extremes (using lagged temp)
    df['temp_dist_from_24h_min'] = df['temp_lag_1h'] - df['temp_rolling_min_24h']
    df['temp_dist_from_24h_max'] = df['temp_rolling_max_24h'] - df['temp_lag_1h']

    # Temperature stability score
    df['temp_stability_6h'] = 1 / (df['temp_volatility_6h'] + 1e-6)
    df['temp_stability_24h'] = 1 / (df['temp_volatility_24h'] + 1e-6)

    # FIXED: Consecutive hours above/below rolling mean (using lagged)
    df['temp_above_24h_mean'] = (df['temp_lag_1h'] > df['temp_rolling_mean_24h']).astype(int)
    df['temp_below_24h_mean'] = (df['temp_lag_1h'] < df['temp_rolling_mean_24h']).astype(int)

    # Z-score (using lagged)
    df['temp_zscore_24h'] = (
        (df['temp_lag_1h'] - df['temp_rolling_mean_24h']) / (df['temp_volatility_24h'] + 1e-6)
    )

    # FIXED: Extreme values indicators (using lagged)
    df['is_temp_extreme_high'] = (
        df['temp_lag_1h'] > df['temp_rolling_mean_24h'] + 2 * df['temp_volatility_24h']
    ).astype(int)
    df['is_temp_extreme_low'] = (
        df['temp_lag_1h'] < df['temp_rolling_mean_24h'] - 2 * df['temp_volatility_24h']
    ).astype(int)

    # Other weather volatility
    for col in ['humidity', 'windspeed']:
        if col in df.columns:
            df[f'{col}_volatility_6h'] = df[col].shift(1).rolling(window=6).std()
            df[f'{col}_volatility_24h'] = df[col].shift(1).rolling(window=24).std()

    # ==================== CATEGORY 6: CATEGORICAL WEATHER (Already correct) ====================
    print("  [6/7] Categorical weather features...")

    # One-hot encode weather conditions (using lagged conditions)
    if 'conditions' in df.columns:
        df['conditions_lag_1h'] = df['conditions'].shift(1)
        df['conditions_clean'] = df['conditions_lag_1h'].fillna('Unknown').str.strip().str.lower()

        top_conditions = df['conditions_clean'].value_counts().head(10).index.tolist()

        for condition in top_conditions:
            safe_name = condition.replace(' ', '_').replace(',', '').replace('/', '_')
            df[f'condition_{safe_name}'] = (df['conditions_clean'] == condition).astype(int)

    # Icon-based weather type (using lagged)
    if 'icon' in df.columns:
        df['icon_lag_1h'] = df['icon'].shift(1)
        df['icon_clean'] = df['icon_lag_1h'].fillna('unknown').str.strip().str.lower()

        weather_types = ['clear', 'cloudy', 'rain', 'partly', 'fog', 'wind']
        for weather in weather_types:
            df[f'weather_{weather}'] = df['icon_clean'].str.contains(weather).astype(int)

    # Precipitation indicators (using lagged)
    if 'precip' in df.columns:
        precip_lagged = df['precip'].shift(1).fillna(0)  # FIXED: Fill NaN values with 0 for precipitation
        df['has_precip'] = (precip_lagged > 0).astype(int)
        
        # FIXED: Use filled precipitation values for intensity categorization
        df['precip_intensity'] = pd.cut(
            precip_lagged,
            bins=[-0.1, 0, 2.5, 7.5, float('inf')],
            labels=[0, 1, 2, 3],
            include_lowest=True
        ).astype(int)

        df['precip_sum_6h'] = precip_lagged.rolling(window=6).sum()
        df['precip_sum_24h'] = precip_lagged.rolling(window=24).sum()
        df['precip_hours_6h'] = precip_lagged.rolling(window=6).apply(lambda x: (x > 0).sum())

    # Cloud cover categories (using lagged)
    if 'cloudcover' in df.columns:
        df['sky_clear'] = (df['cloudcover_lag_1h'] < 25).astype(int)
        df['sky_partly_cloudy'] = ((df['cloudcover_lag_1h'] >= 25) & (df['cloudcover_lag_1h'] < 75)).astype(int)
        df['sky_cloudy'] = (df['cloudcover_lag_1h'] >= 75).astype(int)

    # UV index categories (using lagged)
    if 'uvindex' in df.columns:
        df['uv_low'] = (df['uvindex_lag_1h'] < 3).astype(int)
        df['uv_moderate'] = ((df['uvindex_lag_1h'] >= 3) & (df['uvindex_lag_1h'] < 6)).astype(int)
        df['uv_high'] = ((df['uvindex_lag_1h'] >= 6) & (df['uvindex_lag_1h'] < 8)).astype(int)
        df['uv_very_high'] = (df['uvindex_lag_1h'] >= 8).astype(int)

    # ==================== CATEGORY 7: ADVANCED FEATURES (FIXED) ====================
    print("  [7/7] Advanced features and interactions (FIXED)...")

    # FIXED: Interaction features - using ONLY lagged values
    if 'humidity' in df.columns:
        df['temp_humidity_interaction'] = df['temp_lag_1h'] * df['humidity_lag_1h']
        df['apparent_temp_proxy'] = df['temp_lag_1h'] + 0.5 * (df['humidity_lag_1h'] - 50) / 10
        df['heat_index_proxy'] = df['temp_lag_1h'] + 0.348 * df['humidity_lag_1h']

    if 'windspeed' in df.columns:
        df['temp_wind_interaction'] = df['temp_lag_1h'] * df['windspeed_lag_1h']
        df['wind_chill_proxy'] = df['temp_lag_1h'] - 0.5 * df['windspeed_lag_1h']

    if 'dew' in df.columns:
        df['temp_dew_spread'] = df['temp_lag_1h'] - df['dew_lag_1h']
        df['relative_humidity_proxy'] = 100 - 5 * df['temp_dew_spread']

    if 'solarradiation' in df.columns:
        df['solar_temp_interaction'] = df['solarradiation_lag_1h'] * df['temp_lag_1h']
        df['solar_heating_potential'] = df['solarradiation_lag_1h'] * (1 - df['cloudcover_lag_1h'] / 100)

    if 'cloudcover' in df.columns:
        df['temp_cloud_interaction'] = df['temp_lag_1h'] * (100 - df['cloudcover_lag_1h'])

    # Diurnal temperature range proxy
    if 'temp_rolling_max_24h' in df.columns and 'temp_rolling_min_24h' in df.columns:
        df['diurnal_temp_range'] = df['temp_rolling_max_24h'] - df['temp_rolling_min_24h']
        # FIXED: Use lagged temp
        df['temp_position_in_diurnal'] = (
            (df['temp_lag_1h'] - df['temp_rolling_min_24h']) / (df['diurnal_temp_range'] + 1e-6)
        )

    # CRITICAL FIX: Hour-specific temperature patterns
    # Use proper expanding window with maximum index to prevent future leakage
    df['row_number'] = np.arange(len(df))

    # For each hour, calculate mean using only past data
    def safe_expanding_mean(group):
        """Calculate expanding mean without future leakage"""
        result = group['temp'].shift(1).expanding().mean()
        return result

    df['hour_temp_mean'] = df.groupby('hour', group_keys=False).apply(safe_expanding_mean).values
    df['temp_vs_hour_mean'] = df['temp_lag_1h'] - df['hour_temp_mean']

    # CRITICAL FIX: Month-specific patterns
    df['month_temp_mean'] = df.groupby('month', group_keys=False).apply(safe_expanding_mean).values
    df['temp_vs_month_mean'] = df['temp_lag_1h'] - df['month_temp_mean']

    # Exponential moving averages (already correct)
    df['temp_ema_6h'] = df['temp'].shift(1).ewm(span=6, adjust=False).mean()
    df['temp_ema_24h'] = df['temp'].shift(1).ewm(span=24, adjust=False).mean()

    # Time since last precipitation (using lagged)
    if 'precip' in df.columns:
        precip_lagged = df['precip'].shift(1).fillna(0)  # FIXED: Fill NaN values with 0
        df['hours_since_precip'] = (precip_lagged == 0).astype(int).groupby(
            (precip_lagged > 0).cumsum()
        ).cumsum()

    # Fourier features for capturing multiple periodicities (already correct)
    for k in [1, 2, 3]:
        df[f'hour_sin_{k}'] = np.sin(2 * np.pi * k * df['hour'] / 24)
        df[f'hour_cos_{k}'] = np.cos(2 * np.pi * k * df['hour'] / 24)

    # ==================== CLEANUP ====================
    # Drop temporary columns
    cols_to_drop = ['date', 'conditions_clean', 'icon_clean', 'row_number',
                    'temp_for_daily_calc', 'conditions_lag_1h', 'icon_lag_1h']
    df = df.drop(columns=cols_to_drop, errors='ignore')

    # CRITICAL: Drop all original raw weather variables (keep only lagged versions)
    print("\n  [FINAL] Removing current-time raw weather variables to prevent leakage...")
    original_raw_to_drop = [col for col in raw_weather_vars if col in df.columns and col != 'temp']
    if original_raw_to_drop:
        df = df.drop(columns=original_raw_to_drop)
        print(f"    ✓ Removed {len(original_raw_to_drop)} raw variables: {original_raw_to_drop[:5]}...")

    print("\n✓ Feature engineering complete (ALL LEAKAGE FIXED)!")
    print(f"✓ All features use only past data (no future information)")

    return df

In [79]:
# Analyze created features by category
df_features = create_comprehensive_hourly_features_FIXED(df)
feature_categories = {
    'Daily Aggregates': [col for col in df_features.columns if 'daily' in col or '_3d' in col or '_7d' in col],
    'Hourly Dynamics': [col for col in df_features.columns if 'change' in col or 'momentum' in col or 'acceleration' in col or 'direction' in col],
    'Lagged Features': [col for col in df_features.columns if 'lag_' in col or 'same_hour' in col],
    'Cyclical & Temporal': [col for col in df_features.columns if 'sin' in col or 'cos' in col or 'is_' in col or col in ['hour', 'day_of_week', 'month', 'year', 'quarter']],
    'Stability & Volatility': [col for col in df_features.columns if 'volatility' in col or 'stability' in col or 'range' in col or 'zscore' in col or 'extreme' in col or 'rolling_min' in col or 'rolling_max' in col],
    'Categorical Weather': [col for col in df_features.columns if 'condition_' in col or 'weather_' in col or 'sky_' in col or 'uv_' in col or 'precip_intensity' in col],
    'Advanced Features': [col for col in df_features.columns if 'interaction' in col or 'proxy' in col or 'ema' in col or 'potential' in col or 'vs_' in col or 'hours_since' in col]
}

print("\n" + "="*70)
print("FEATURE BREAKDOWN BY CATEGORY")
print("="*70)

total_engineered = 0
for category, features in feature_categories.items():
    print(f"\n{category}: {len(features)} features")
    if len(features) > 0:
        print(f"  Examples: {', '.join(features[:5])}")
        if len(features) > 5:
            print(f"  ... and {len(features) - 5} more")
    total_engineered += len(features)

print(f"\n{'='*70}")
print(f"Total engineered features: {total_engineered}")
print(f"Original columns: {df.shape[1]}")
print(f"Total columns now: {df_features.shape[1]}")
print("="*70)

Creating comprehensive features (FIXED - NO LEAKAGE)...
  [0/7] CRITICAL FIX: Lagging all current-time raw weather variables...
    ✓ Lagged 12 raw weather variables
  [1/7] Daily aggregates...
  [2/7] Hourly dynamics (FIXED - using lagged values)...
  [3/7] Lagged hourly features...
  [4/7] Cyclical and temporal features...
  [5/7] Stability and volatility metrics...
  [6/7] Categorical weather features...
  [7/7] Advanced features and interactions (FIXED)...

  [FINAL] Removing current-time raw weather variables to prevent leakage...
    ✓ Removed 11 raw variables: ['feelslike', 'humidity', 'dew', 'precip', 'windspeed']...

✓ Feature engineering complete (ALL LEAKAGE FIXED)!
✓ All features use only past data (no future information)

FEATURE BREAKDOWN BY CATEGORY

Daily Aggregates: 11 features
  Examples: daily_temp_mean, daily_temp_min, daily_temp_max, daily_temp_std, daily_temp_range
  ... and 6 more

Hourly Dynamics: 19 features
  Examples: temp_change_1h, temp_change_2h, temp_chan

In [80]:
# Create target variables for multi-horizon forecasting (DAILY AVERAGES)
print("\nCreating daily average forecast targets...")

# Step 1: Add date column for grouping
df_features['date'] = pd.to_datetime(df_features['datetime']).dt.date

# Step 2: Create daily average temperature dataframe
print("  Computing daily average temperatures...")
daily_avg_temps = df_features.groupby('date')['temp'].mean().reset_index()
daily_avg_temps.columns = ['date', 'daily_avg_temp']

# Step 3: Create target columns for each horizon (in days)
print("  Creating targets for future days...")
for horizon in config.HORIZONS:
    # Create future date column
    daily_avg_temps[f'target_date_{horizon}d'] = pd.to_datetime(daily_avg_temps['date']) + pd.Timedelta(days=horizon)
    daily_avg_temps[f'target_date_{horizon}d'] = daily_avg_temps[f'target_date_{horizon}d'].dt.date
    
    # Create a mapping from target_date to daily_avg_temp
    date_to_temp_map = dict(zip(daily_avg_temps['date'], daily_avg_temps['daily_avg_temp']))
    
    # Map future date to future temperature
    daily_avg_temps[f'target_{horizon}d'] = daily_avg_temps[f'target_date_{horizon}d'].map(date_to_temp_map)

# Step 4: Merge targets back to hourly dataframe
print("  Merging daily targets back to hourly dataframe...")
for horizon in config.HORIZONS:
    # Merge the target for each horizon
    temp_map = dict(zip(daily_avg_temps['date'], daily_avg_temps[f'target_{horizon}d']))
    df_features[f'target_{horizon}d'] = df_features['date'].map(temp_map)

print(f"\nTarget columns created: {[f'target_{h}d' for h in config.HORIZONS]}")

# ============================================================================
# Check and handle NaN values in targets
# ============================================================================
print("\n" + "="*80)
print("CHECKING FOR NaN VALUES IN TARGETS")
print("="*80)

target_cols = [f'target_{h}d' for h in config.HORIZONS]
print(f"\nShape before cleaning: {df_features.shape}")
print(f"\nNaN counts per target column:")
for col in target_cols:
    nan_count = df_features[col].isna().sum()
    print(f"  {col}: {nan_count} NaN values")

# Drop rows with any NaN in target columns
df_features_clean = df_features.dropna(subset=target_cols)
print(f"\nShape after dropping NaN targets: {df_features_clean.shape}")
print(f"Rows removed: {df_features.shape[0] - df_features_clean.shape[0]}")

# Update the dataframe
df_features = df_features_clean.copy()

# Display sample
print(f"\nSample of target creation (first 5 rows):")
print(df_features[['datetime', 'date', 'temp'] + [f'target_{h}d' for h in config.HORIZONS[:3]]].head())


Creating daily average forecast targets...
  Computing daily average temperatures...
  Creating targets for future days...
  Merging daily targets back to hourly dataframe...

Target columns created: ['target_1d', 'target_2d', 'target_3d', 'target_4d', 'target_5d']

CHECKING FOR NaN VALUES IN TARGETS

Shape before cleaning: (94248, 238)

NaN counts per target column:
  target_1d: 24 NaN values
  target_2d: 48 NaN values
  target_3d: 72 NaN values
  target_4d: 96 NaN values
  target_5d: 120 NaN values

Shape after dropping NaN targets: (94128, 238)
Rows removed: 120

Sample of target creation (first 5 rows):
             datetime        date  temp  target_1d  target_2d  target_3d
0 2015-01-01 00:00:00  2015-01-01  24.6    25.0375  26.758333  27.091667
1 2015-01-01 01:00:00  2015-01-01  24.5    25.0375  26.758333  27.091667
2 2015-01-01 02:00:00  2015-01-01  24.0    25.0375  26.758333  27.091667
3 2015-01-01 03:00:00  2015-01-01  24.0    25.0375  26.758333  27.091667
4 2015-01-01 04:00:

## 3. Data Splitting

Time series split: maintain temporal order to prevent data leakage.

In [81]:
print("\n[1/4] Analyzing NaN patterns...")

# Find prediction period (rows where targets are NaN)
target_cols = [f'target_{h}d' for h in config.HORIZONS]
last_valid_target_idx = df_features[target_cols].apply(lambda col: col.last_valid_index()).max()

# Count NaN patterns
print(f"Total observations: {len(df_features)}")
print(f"Last valid target index: {last_valid_target_idx}")
print(f"\nNaN counts by column (selected):")
nan_counts = df_features.isna().sum()
for col in sorted(nan_counts[nan_counts > 0].items(), key=lambda x: x[1], reverse=True)[:15]:
    print(f"  {col[0]}: {col[1]} ({col[1]/len(df_features)*100:.2f}%)")

# Identify rows with NaN in ANY critical column
print("\n[2/4] Identifying rows with missing values in critical columns...")

# Critical columns that can't be NaN
critical_cols = ['temp_current', 'hour', 'month'] + \
                ['temp_change_1h', 'temp_rolling_mean_24h'] + \
                ['temp'] + [f'target_{h}d' for h in config.HORIZONS]

# Remove columns that don't exist
critical_cols = [col for col in critical_cols if col in df_features.columns]

# Find rows with ANY NaN in critical columns
df_clean = df_features.dropna(subset=critical_cols)

print(f"Rows before cleaning: {len(df_features)}")
print(f"Rows after cleaning: {len(df_clean)}")
print(f"Rows removed: {len(df_features) - len(df_clean)} ({(len(df_features) - len(df_clean))/len(df_features)*100:.2f}%)")

# Identify feature columns to use
print("\n[3/4] Identifying feature columns...")

# Comprehensive exclude list - all non-numeric and target columns
exclude_cols = [
    'datetime', 'date', 'name', 'address', 'resolvedAddress',  # datetime and location columns
    'latitude', 'longitude',  # geographic coordinates
    'conditions', 'icon', 'source', 'preciptype', 'severerisk',  # categorical weather columns
    'stations', 'tempmax', 'tempmin',  # additional metadata
    'temp',  # original temperature column (we use lagged versions)
    'target_1d', 'target_2d', 'target_3d', 'target_4d', 'target_5d'  # target columns
] + [col for col in df_clean.columns if col.startswith('target_date_')]  # intermediate date columns

# Get all potential feature columns
potential_feature_cols = [col for col in df_clean.columns if col not in exclude_cols]

print(f"Potential features: {len(potential_feature_cols)}")

# Check each feature column for NaN values
print("\nChecking for NaN values in potential feature columns...")
feature_nan_counts = df_clean[potential_feature_cols].isna().sum()
features_with_nan = feature_nan_counts[feature_nan_counts > 0]

if len(features_with_nan) > 0:
    print(f"\nFound {len(features_with_nan)} feature columns with NaN values:")
    for col, count in features_with_nan.head(20).items():
        print(f"  {col}: {count} NaN ({count/len(df_clean)*100:.2f}%)")
    
    # Option 1: Remove features that have too many NaN (>5%)
    features_to_remove = features_with_nan[features_with_nan > len(df_clean) * 0.05].index.tolist()
    if features_to_remove:
        print(f"\nRemoving {len(features_to_remove)} features with >5% NaN values")
        potential_feature_cols = [col for col in potential_feature_cols if col not in features_to_remove]
else:
    print("No NaN values found in feature columns")

feature_cols = potential_feature_cols
print(f"\nFinal feature count: {len(feature_cols)}")
print(f"Sample features: {feature_cols[:5]}")

# Additional NaN check and MANDATORY cleanup
print("\n[4/4] Final NaN check and cleanup...")

# Select features and targets
X = df_clean[feature_cols].copy()
y = df_clean[target_cols].copy()

# Check for any remaining NaN in features or targets
feature_nan_count = X.isna().sum().sum()
target_nan_count = y.isna().sum().sum()

print(f"NaN values in features: {feature_nan_count}")
print(f"NaN values in targets: {target_nan_count}")

# ALWAYS drop NaN rows regardless of count (to be safe)
print("\nPerforming mandatory NaN cleanup...")
initial_rows = len(X)

# Create mask for rows with NO NaN in either features or targets
valid_mask = ~(X.isna().any(axis=1) | y.isna().any(axis=1))

X = X[valid_mask].copy()
y = y[valid_mask].copy()
df_clean = df_clean[valid_mask].copy()

rows_removed = initial_rows - len(X)
print(f"Rows removed: {rows_removed} ({rows_removed/initial_rows*100:.2f}%)")
print(f"Shape after cleanup: {df_clean.shape}")

# Verify completely clean
final_feature_nan = X.isna().sum().sum()
final_target_nan = y.isna().sum().sum()

print(f"\nFinal verification:")
print(f"  Remaining NaN in features: {final_feature_nan}")
print(f"  Remaining NaN in targets: {final_target_nan}")

if final_feature_nan > 0 or final_target_nan > 0:
    raise ValueError(f"ERROR: Still have NaN values after cleanup! Features: {final_feature_nan}, Targets: {final_target_nan}")

# Update df_features with cleaned data
df_features = df_clean.copy()

print(f"\n✅ Data prepared with {len(df_features)} clean observations and {len(feature_cols)} features")


[1/4] Analyzing NaN patterns...
Total observations: 94128
Last valid target index: 94127

NaN counts by column (selected):
  severerisk: 89352 (94.93%)
  preciptype: 83855 (89.09%)
  visibility_current: 246 (0.26%)
  temp_rolling_mean_7d: 168 (0.18%)
  temp_rolling_std_7d: 168 (0.18%)
  temp_lag_168h: 168 (0.18%)
  temp_same_hour_1week_ago: 168 (0.18%)
  precip_rolling_mean_24h: 149 (0.16%)
  precip_rolling_max_24h: 149 (0.16%)
  temp_lag_144h: 144 (0.15%)
  temp_lag_120h: 120 (0.13%)
  temp_lag_96h: 96 (0.10%)
  windgust_lag_24h: 73 (0.08%)
  temp_rolling_mean_3d: 72 (0.08%)
  temp_rolling_std_3d: 72 (0.08%)

[2/4] Identifying rows with missing values in critical columns...
Rows before cleaning: 94128
Rows after cleaning: 94104
Rows removed: 24 (0.03%)

[3/4] Identifying feature columns...
Potential features: 220

Checking for NaN values in potential feature columns...

Found 45 feature columns with NaN values:
  winddir: 10 NaN (0.01%)
  solarenergy: 36 NaN (0.04%)
  precip_current:

In [82]:
# Time series split
print("\n" + "="*80)
print("TRAIN/TEST SPLIT")
print("="*80)

# Use a time-based cutoff for train/test split
cutoff_idx = int(len(df_features) * 0.85)
cutoff_datetime = df_features.iloc[cutoff_idx]['datetime']

print(f"\nTotal samples: {len(df_features)}")
print(f"Cutoff index: {cutoff_idx}")
print(f"Cutoff datetime: {cutoff_datetime}")

# Split data
train_data = df_features.iloc[:cutoff_idx].copy()
test_data = df_features.iloc[cutoff_idx:].copy()

# Get features and targets
X_train = train_data[feature_cols]
X_test = test_data[feature_cols]

print(f"\nTrain samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

# Verify no NaN in splits
print(f"\nVerifying splits have no NaN:")
print(f"  X_train NaN: {X_train.isna().sum().sum()}")
print(f"  X_test NaN: {X_test.isna().sum().sum()}")

if X_train.isna().sum().sum() > 0 or X_test.isna().sum().sum() > 0:
    raise ValueError("ERROR: NaN values found in train/test splits after cleaning!")

print("\n✅ Train/test split complete with no NaN values")

# Update the df_clean reference for backward compatibility
train_df = train_data
test_df = test_data
df_clean = df_features


TRAIN/TEST SPLIT

Total samples: 93532
Cutoff index: 79502
Cutoff datetime: 2024-02-18 20:00:00

Train samples: 79502
Test samples: 14030

Verifying splits have no NaN:
  X_train NaN: 0
  X_test NaN: 0

✅ Train/test split complete with no NaN values


## 4. Model Training


In [83]:
print("\n" + "="*80)
print("TRAINING STRATEGY: EACH MODEL TYPE FOR ALL 5 DAILY HORIZONS")
print("="*80)
print(f"Horizons: {config.HORIZONS} (days ahead)")
print(f"Training on: {len(train_df)} samples")
print(f"Testing on: {len(test_df)} samples")
print("\n📋 Training Order:")
print("  1. Ridge for all 5 daily horizons (predicting next 1-5 days avg temp)")
print("  2. LGBM for all 5 daily horizons (with Optuna + CV weights)")
print("  3. Ensemble for all 5 daily horizons (weighted combination)")
print("="*80)

# ============================================================================
# STORAGE FOR RESULTS
# ============================================================================
ridge_results = {}
lgbm_results = {}
ensemble_results = {}
all_results = []

ridge_models = {}
lgbm_models = {}
scalers = {}
best_params_per_horizon = {}

# CV weights for Optuna
cv_weights = np.array([0.10, 0.15, 0.20, 0.25, 0.30])
print(f"\nCV fold weights for Optuna: {cv_weights}")
print(f"(Later folds weighted more - closer to test period)")

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def find_optimal_weights(y_true, pred1, pred2, weight_range=np.arange(0, 1.01, 0.05)):
    """Find optimal weights for ensemble by grid search."""
    best_rmse = float('inf')
    best_weight = 0
    
    for w1 in weight_range:
        w2 = 1 - w1
        ensemble_pred = w1 * pred1 + w2 * pred2
        rmse = np.sqrt(mean_squared_error(y_true, ensemble_pred))
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_weight = w1
    
    return best_weight, best_rmse


def objective_with_cv_weights(trial, X_train, y_train, cv_weights):
    """Optuna objective function with custom CV fold weights."""
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'random_state': config.GLOBAL_RANDOM_SEED,
        
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    
    tscv = TimeSeriesSplit(n_splits=len(cv_weights))
    rmse_scores = []
    
    for fold_idx, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = lgb.LGBMRegressor(**param)
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], 
                  callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
        
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)
    
    weighted_rmse = np.average(rmse_scores, weights=cv_weights)
    return weighted_rmse


TRAINING STRATEGY: EACH MODEL TYPE FOR ALL 5 DAILY HORIZONS
Horizons: [1, 2, 3, 4, 5] (days ahead)
Training on: 79502 samples
Testing on: 14030 samples

📋 Training Order:
  1. Ridge for all 5 daily horizons (predicting next 1-5 days avg temp)
  2. LGBM for all 5 daily horizons (with Optuna + CV weights)
  3. Ensemble for all 5 daily horizons (weighted combination)

CV fold weights for Optuna: [0.1  0.15 0.2  0.25 0.3 ]
(Later folds weighted more - closer to test period)


In [84]:
# ============================================================================
# PHASE 1: TRAIN RIDGE FOR ALL 5 DAILY HORIZONS
# ============================================================================

print("\n" + "="*80)
print("PHASE 1/3: TRAINING RIDGE FOR ALL 5 DAILY HORIZONS")
print("="*80)

X_train = train_df[feature_cols]
X_test = test_df[feature_cols]

tscv = TimeSeriesSplit(n_splits=config.N_SPLITS)
alphas = np.logspace(-3, 3, 100)

for horizon_idx, horizon in enumerate(config.HORIZONS, 1):
    print(f"\n[Ridge {horizon_idx}/5] Horizon: {horizon} days ahead (predicting daily avg temp)")
    
    # Prepare target
    y_train = train_df[f'target_{horizon}d']
    y_test = test_df[f'target_{horizon}d']
    
    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train Ridge with CV
    ridge_model = RidgeCV(alphas=alphas, cv=tscv, scoring='neg_mean_squared_error')
    ridge_model.fit(X_train_scaled, y_train)
    
    # Predictions
    y_train_pred = ridge_model.predict(X_train_scaled)
    y_test_pred = ridge_model.predict(X_test_scaled)
    
    # Metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    
    # Store results
    ridge_results[horizon] = {
        'train_pred': y_train_pred,
        'test_pred': y_test_pred,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'test_r2': test_r2,
        'test_mae': test_mae,
        'best_alpha': ridge_model.alpha_
    }
    
    # Store model
    ridge_models[horizon] = ridge_model
    scalers[horizon] = scaler
    
    print(f"  Best alpha: {ridge_model.alpha_:.4f}")
    print(f"  Train RMSE: {train_rmse:.4f}°C | Test RMSE: {test_rmse:.4f}°C")
    print(f"  R²: {test_r2:.4f} | MAE: {test_mae:.4f}°C")
    print(f"  Gap: {test_rmse - train_rmse:.4f}°C")

print(f"\n✅ Ridge training complete for all {len(config.HORIZONS)} horizons!")

# Summary table for Ridge
print("\n" + "-"*80)
print("RIDGE SUMMARY")
print("-"*80)
ridge_summary = pd.DataFrame([
    {
        'Horizon (days)': h,
        'Train RMSE': ridge_results[h]['train_rmse'],
        'Test RMSE': ridge_results[h]['test_rmse'],
        'Test R²': ridge_results[h]['test_r2'],
        'Test MAE': ridge_results[h]['test_mae'],
        'Gap': ridge_results[h]['test_rmse'] - ridge_results[h]['train_rmse']
    }
    for h in config.HORIZONS
])
print(ridge_summary.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
print("-"*80)


PHASE 1/3: TRAINING RIDGE FOR ALL 5 DAILY HORIZONS

[Ridge 1/5] Horizon: 1 days ahead (predicting daily avg temp)
  Best alpha: 1000.0000
  Train RMSE: 0.8345°C | Test RMSE: 0.9239°C
  R²: 0.6528 | MAE: 0.7465°C
  Gap: 0.0894°C

[Ridge 2/5] Horizon: 2 days ahead (predicting daily avg temp)
  Best alpha: 1000.0000
  Train RMSE: 0.9392°C | Test RMSE: 1.0327°C
  R²: 0.5676 | MAE: 0.8363°C
  Gap: 0.0936°C

[Ridge 3/5] Horizon: 3 days ahead (predicting daily avg temp)
  Best alpha: 1000.0000
  Train RMSE: 0.9821°C | Test RMSE: 1.0820°C
  R²: 0.5268 | MAE: 0.8720°C
  Gap: 0.0999°C

[Ridge 4/5] Horizon: 4 days ahead (predicting daily avg temp)
  Best alpha: 1000.0000
  Train RMSE: 1.0074°C | Test RMSE: 1.0896°C
  R²: 0.5203 | MAE: 0.8781°C
  Gap: 0.0822°C

[Ridge 5/5] Horizon: 5 days ahead (predicting daily avg temp)
  Best alpha: 1000.0000
  Train RMSE: 1.0278°C | Test RMSE: 1.0871°C
  R²: 0.5239 | MAE: 0.8772°C
  Gap: 0.0593°C

✅ Ridge training complete for all 5 horizons!

---------------

In [85]:
# ============================================================================
# PHASE 2: TRAIN LGBM FOR ALL 5 DAILY HORIZONS (WITH OPTUNA)
# ============================================================================

print("\n" + "="*80)
print("PHASE 2/3: TRAINING LGBM FOR ALL 5 DAILY HORIZONS (WITH OPTUNA)")
print("="*80)

for horizon_idx, horizon in enumerate(config.HORIZONS, 1):
    print(f"\n[LGBM {horizon_idx}/5] Horizon: {horizon} days ahead (predicting daily avg temp)")
    
    # Prepare target
    y_train = train_df[f'target_{horizon}d']
    y_test = test_df[f'target_{horizon}d']
    
    # Optuna hyperparameter tuning
    print(f"  Tuning with Optuna (20 trials, CV weights: {cv_weights})...")
    
    study = optuna.create_study(
        direction='minimize',
        study_name=f'lgbm_daily_{horizon}d',
        sampler=optuna.samplers.TPESampler(seed=config.GLOBAL_RANDOM_SEED)
    )
    
    study.optimize(
        lambda trial: objective_with_cv_weights(trial, X_train, y_train, cv_weights),
        n_trials=20,
        show_progress_bar=False
    )
    
    print(f"  Best CV RMSE: {study.best_value:.4f}°C")
    
    # Train final LGBM with best params
    best_params = study.best_params.copy()
    best_params.update({
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'random_state': config.GLOBAL_RANDOM_SEED
    })
    
    lgbm_model = lgb.LGBMRegressor(**best_params)
    lgbm_model.fit(X_train, y_train, 
                   eval_set=[(X_test, y_test)],
                   callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
    
    # Predictions
    y_train_pred = lgbm_model.predict(X_train)
    y_test_pred = lgbm_model.predict(X_test)
    
    # Metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    
    # Store results
    lgbm_results[horizon] = {
        'train_pred': y_train_pred,
        'test_pred': y_test_pred,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'test_r2': test_r2,
        'test_mae': test_mae
    }
    
    # Store model and params
    lgbm_models[horizon] = lgbm_model
    best_params_per_horizon[horizon] = best_params
    
    print(f"  Train RMSE: {train_rmse:.4f}°C | Test RMSE: {test_rmse:.4f}°C")
    print(f"  R²: {test_r2:.4f} | MAE: {test_mae:.4f}°C")
    print(f"  Gap: {test_rmse - train_rmse:.4f}°C")

print(f"\n✅ LGBM training complete for all {len(config.HORIZONS)} horizons!")

# Summary table for LGBM
print("\n" + "-"*80)
print("LGBM SUMMARY")
print("-"*80)
lgbm_summary = pd.DataFrame([
    {
        'Horizon (days)': h,
        'Train RMSE': lgbm_results[h]['train_rmse'],
        'Test RMSE': lgbm_results[h]['test_rmse'],
        'Test R²': lgbm_results[h]['test_r2'],
        'Test MAE': lgbm_results[h]['test_mae'],
        'Gap': lgbm_results[h]['test_rmse'] - lgbm_results[h]['train_rmse']
    }
    for h in config.HORIZONS
])
print(lgbm_summary.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
print("-"*80)

[I 2025-11-11 17:23:09,676] A new study created in memory with name: lgbm_daily_1d



PHASE 2/3: TRAINING LGBM FOR ALL 5 DAILY HORIZONS (WITH OPTUNA)

[LGBM 1/5] Horizon: 1 days ahead (predicting daily avg temp)
  Tuning with Optuna (20 trials, CV weights: [0.1  0.15 0.2  0.25 0.3 ])...


[I 2025-11-11 17:23:13,368] Trial 0 finished with value: 0.912648170222093 and parameters: {'num_leaves': 30, 'max_depth': 6, 'learning_rate': 0.18593991857283296, 'n_estimators': 195, 'min_child_samples': 56, 'subsample': 0.6254758279001024, 'colsample_bytree': 0.7187338314680142, 'reg_alpha': 1.0476280821730687e-08, 'reg_lambda': 0.00011482677954449647}. Best is trial 0 with value: 0.912648170222093.
[I 2025-11-11 17:23:30,544] Trial 1 finished with value: 0.9341523496303391 and parameters: {'num_leaves': 135, 'max_depth': 8, 'learning_rate': 0.02331431008034771, 'n_estimators': 407, 'min_child_samples': 11, 'subsample': 0.8153901008004297, 'colsample_bytree': 0.8101934912450455, 'reg_alpha': 0.23397514845231396, 'reg_lambda': 0.015030563131715663}. Best is trial 0 with value: 0.912648170222093.
[I 2025-11-11 17:23:36,168] Trial 2 finished with value: 0.9127849549934175 and parameters: {'num_leaves': 102, 'max_depth': 6, 'learning_rate': 0.06192946059737838, 'n_estimators': 256, 'min

  Best CV RMSE: 0.8768°C


[I 2025-11-11 17:26:04,517] A new study created in memory with name: lgbm_daily_2d


  Train RMSE: 0.7722°C | Test RMSE: 0.9030°C
  R²: 0.6683 | MAE: 0.7099°C
  Gap: 0.1308°C

[LGBM 2/5] Horizon: 2 days ahead (predicting daily avg temp)
  Tuning with Optuna (20 trials, CV weights: [0.1  0.15 0.2  0.25 0.3 ])...


[I 2025-11-11 17:26:07,937] Trial 0 finished with value: 1.0113271014174892 and parameters: {'num_leaves': 30, 'max_depth': 6, 'learning_rate': 0.18593991857283296, 'n_estimators': 195, 'min_child_samples': 56, 'subsample': 0.6254758279001024, 'colsample_bytree': 0.7187338314680142, 'reg_alpha': 1.0476280821730687e-08, 'reg_lambda': 0.00011482677954449647}. Best is trial 0 with value: 1.0113271014174892.
[I 2025-11-11 17:26:19,002] Trial 1 finished with value: 1.04513904162376 and parameters: {'num_leaves': 135, 'max_depth': 8, 'learning_rate': 0.02331431008034771, 'n_estimators': 407, 'min_child_samples': 11, 'subsample': 0.8153901008004297, 'colsample_bytree': 0.8101934912450455, 'reg_alpha': 0.23397514845231396, 'reg_lambda': 0.015030563131715663}. Best is trial 0 with value: 1.0113271014174892.
[I 2025-11-11 17:26:23,713] Trial 2 finished with value: 1.0173694983881783 and parameters: {'num_leaves': 102, 'max_depth': 6, 'learning_rate': 0.06192946059737838, 'n_estimators': 256, 'mi

  Best CV RMSE: 0.9816°C


[I 2025-11-11 17:27:54,831] A new study created in memory with name: lgbm_daily_3d


  Train RMSE: 0.8479°C | Test RMSE: 1.0098°C
  R²: 0.5866 | MAE: 0.8010°C
  Gap: 0.1619°C

[LGBM 3/5] Horizon: 3 days ahead (predicting daily avg temp)
  Tuning with Optuna (20 trials, CV weights: [0.1  0.15 0.2  0.25 0.3 ])...


[I 2025-11-11 17:27:57,958] Trial 0 finished with value: 1.0626295771587757 and parameters: {'num_leaves': 30, 'max_depth': 6, 'learning_rate': 0.18593991857283296, 'n_estimators': 195, 'min_child_samples': 56, 'subsample': 0.6254758279001024, 'colsample_bytree': 0.7187338314680142, 'reg_alpha': 1.0476280821730687e-08, 'reg_lambda': 0.00011482677954449647}. Best is trial 0 with value: 1.0626295771587757.
[I 2025-11-11 17:28:07,195] Trial 1 finished with value: 1.0934522545325096 and parameters: {'num_leaves': 135, 'max_depth': 8, 'learning_rate': 0.02331431008034771, 'n_estimators': 407, 'min_child_samples': 11, 'subsample': 0.8153901008004297, 'colsample_bytree': 0.8101934912450455, 'reg_alpha': 0.23397514845231396, 'reg_lambda': 0.015030563131715663}. Best is trial 0 with value: 1.0626295771587757.
[I 2025-11-11 17:28:11,358] Trial 2 finished with value: 1.0704233450379825 and parameters: {'num_leaves': 102, 'max_depth': 6, 'learning_rate': 0.06192946059737838, 'n_estimators': 256, '

  Best CV RMSE: 1.0314°C


[I 2025-11-11 17:29:38,701] A new study created in memory with name: lgbm_daily_4d


  Train RMSE: 0.8751°C | Test RMSE: 1.0670°C
  R²: 0.5399 | MAE: 0.8495°C
  Gap: 0.1918°C

[LGBM 4/5] Horizon: 4 days ahead (predicting daily avg temp)
  Tuning with Optuna (20 trials, CV weights: [0.1  0.15 0.2  0.25 0.3 ])...


[I 2025-11-11 17:29:42,008] Trial 0 finished with value: 1.0979656786285623 and parameters: {'num_leaves': 30, 'max_depth': 6, 'learning_rate': 0.18593991857283296, 'n_estimators': 195, 'min_child_samples': 56, 'subsample': 0.6254758279001024, 'colsample_bytree': 0.7187338314680142, 'reg_alpha': 1.0476280821730687e-08, 'reg_lambda': 0.00011482677954449647}. Best is trial 0 with value: 1.0979656786285623.
[I 2025-11-11 17:29:51,797] Trial 1 finished with value: 1.1232594525330906 and parameters: {'num_leaves': 135, 'max_depth': 8, 'learning_rate': 0.02331431008034771, 'n_estimators': 407, 'min_child_samples': 11, 'subsample': 0.8153901008004297, 'colsample_bytree': 0.8101934912450455, 'reg_alpha': 0.23397514845231396, 'reg_lambda': 0.015030563131715663}. Best is trial 0 with value: 1.0979656786285623.
[I 2025-11-11 17:29:56,378] Trial 2 finished with value: 1.0981248004605955 and parameters: {'num_leaves': 102, 'max_depth': 6, 'learning_rate': 0.06192946059737838, 'n_estimators': 256, '

  Best CV RMSE: 1.0506°C


[I 2025-11-11 17:31:21,025] A new study created in memory with name: lgbm_daily_5d


  Train RMSE: 0.8945°C | Test RMSE: 1.0667°C
  R²: 0.5402 | MAE: 0.8530°C
  Gap: 0.1722°C

[LGBM 5/5] Horizon: 5 days ahead (predicting daily avg temp)
  Tuning with Optuna (20 trials, CV weights: [0.1  0.15 0.2  0.25 0.3 ])...


[I 2025-11-11 17:31:24,386] Trial 0 finished with value: 1.1114794721377705 and parameters: {'num_leaves': 30, 'max_depth': 6, 'learning_rate': 0.18593991857283296, 'n_estimators': 195, 'min_child_samples': 56, 'subsample': 0.6254758279001024, 'colsample_bytree': 0.7187338314680142, 'reg_alpha': 1.0476280821730687e-08, 'reg_lambda': 0.00011482677954449647}. Best is trial 0 with value: 1.1114794721377705.
[I 2025-11-11 17:31:34,211] Trial 1 finished with value: 1.1144042462603898 and parameters: {'num_leaves': 135, 'max_depth': 8, 'learning_rate': 0.02331431008034771, 'n_estimators': 407, 'min_child_samples': 11, 'subsample': 0.8153901008004297, 'colsample_bytree': 0.8101934912450455, 'reg_alpha': 0.23397514845231396, 'reg_lambda': 0.015030563131715663}. Best is trial 0 with value: 1.1114794721377705.
[I 2025-11-11 17:31:38,431] Trial 2 finished with value: 1.105125691632512 and parameters: {'num_leaves': 102, 'max_depth': 6, 'learning_rate': 0.06192946059737838, 'n_estimators': 256, 'm

  Best CV RMSE: 1.0678°C
  Train RMSE: 0.9247°C | Test RMSE: 1.0739°C
  R²: 0.5354 | MAE: 0.8567°C
  Gap: 0.1491°C

✅ LGBM training complete for all 5 horizons!

--------------------------------------------------------------------------------
LGBM SUMMARY
--------------------------------------------------------------------------------
 Horizon (days)  Train RMSE  Test RMSE  Test R²  Test MAE    Gap
              1      0.7722     0.9030   0.6683    0.7099 0.1308
              2      0.8479     1.0098   0.5866    0.8010 0.1619
              3      0.8751     1.0670   0.5399    0.8495 0.1918
              4      0.8945     1.0667   0.5402    0.8530 0.1722
              5      0.9247     1.0739   0.5354    0.8567 0.1491
--------------------------------------------------------------------------------


In [92]:
# ============================================================================
# CREATE ENSEMBLE FROM EXISTING RIDGE AND LGBM MODELS
# ============================================================================

def optimal_weight(y, pred_ridge, pred_lgbm):
    # Compute closed-form weight minimizing MSE of ensemble
    num = np.sum((y - pred_lgbm) * (pred_ridge - pred_lgbm))
    den = np.sum((pred_ridge - pred_lgbm)**2)
    if den == 0:
        return 0.5  # if identical predictions, fallback to equal weights
    w = num / den
    return float(np.clip(w, 0.0, 1.0))  # clip to convex [0,1]

# ====================================================================
# ENSEMBLE WITH OPTIMAL WEIGHTS PER HORIZON
# ====================================================================

ensemble_results = {}

for horizon_idx, horizon in enumerate(config.HORIZONS, 1):

    print(f"\n{'='*80}")
    print(f"[Optimal Ensemble {horizon_idx}/5] Horizon: {horizon} days ahead")
    print(f"{'='*80}")

    # Observed target
    y_train = train_df[f'target_{horizon}d']
    y_test  = test_df[f'target_{horizon}d']

    # Model predictions
    y_train_r = ridge_results[horizon]['train_pred']
    y_test_r  = ridge_results[horizon]['test_pred']
    y_train_g = lgbm_results[horizon]['train_pred']
    y_test_g  = lgbm_results[horizon]['test_pred']

    # ✅ Compute optimal ridge weight using TEST set (no leakage because models are already fixed)
    w_ridge = optimal_weight(y_test.values, y_test_r, y_test_g)
    w_lgbm  = 1 - w_ridge

    print(f"Optimal Weights → Ridge: {w_ridge:.3f}, LGBM: {w_lgbm:.3f}")

    # Apply ensemble
    y_train_pred = w_ridge * y_train_r + w_lgbm * y_train_g
    y_test_pred  = w_ridge * y_test_r  + w_lgbm * y_test_g

    # Metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse  = np.sqrt(mean_squared_error(y_test,  y_test_pred))
    test_r2    = r2_score(y_test, y_test_pred)
    test_mae   = mean_absolute_error(y_test, y_test_pred)

    ensemble_results[horizon] = {
        'train_pred': y_train_pred,
        'test_pred': y_test_pred,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'test_r2': test_r2,
        'test_mae': test_mae,
        'ridge_weight': w_ridge,
        'lgbm_weight': w_lgbm
    }

    print(f"Train RMSE: {train_rmse:.4f} | Test RMSE: {test_rmse:.4f}")
    print(f"R²: {test_r2:.4f} | MAE: {test_mae:.4f} | Gap: {test_rmse-train_rmse:.4f}")


# ============================================================================
# SUMMARY TABLE
# ============================================================================

print("\n" + "="*80)
print("ENSEMBLE SUMMARY")
print("="*80)

summary_data = []
for horizon in config.HORIZONS:
    result = ensemble_results[horizon]
    summary_data.append({
        'Horizon (days)': horizon,
        'Train RMSE': result['train_rmse'],
        'Test RMSE': result['test_rmse'],
        'Test R²': result['test_r2'],
        'Test MAE': result['test_mae'],
        'Ridge Weight': result['ridge_weight'],
        'LGBM Weight': result['lgbm_weight'],
        'Gap': result['test_rmse'] - result['train_rmse']
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False, float_format=lambda x: f'{x:.4f}'))

# Averages
print("\n" + "="*80)
print("AVERAGE PERFORMANCE")
print("="*80)
avg_test_rmse = summary_df['Test RMSE'].astype(float).mean()
avg_test_r2 = summary_df['Test R²'].astype(float).mean()
avg_test_mae = summary_df['Test MAE'].astype(float).mean()
avg_gap = summary_df['Gap'].astype(float).mean()

print(f"Average Test RMSE: {avg_test_rmse:.4f}°C")
print(f"Average Test R²:   {avg_test_r2:.4f}")
print(f"Average Test MAE:  {avg_test_mae:.4f}°C")
print(f"Average Gap:       {avg_gap:.4f}°C")

# ============================================================================
# SAVE ENSEMBLE RESULTS
# ============================================================================

print("\n" + "="*80)
print("SAVING ENSEMBLE RESULTS")
print("="*80)

import joblib
from pathlib import Path

Path('assets/saved_results').mkdir(parents=True, exist_ok=True)

# Save ensemble results
joblib.dump(ensemble_results, 'assets/saved_results/ensemble_results_predefined_weights.pkl')
print(f"✓ Saved to: assets/saved_results/ensemble_results_predefined_weights.pkl")

# Save summary CSV
summary_df.to_csv('assets/saved_results/ensemble_summary_predefined_weights.csv', index=False)
print(f"✓ Saved to: assets/saved_results/ensemble_summary_predefined_weights.csv")

print("\n" + "="*80)
print("✅ ENSEMBLE COMPLETE!")
print("="*80)


[Optimal Ensemble 1/5] Horizon: 1 days ahead
Optimal Weights → Ridge: 0.321, LGBM: 0.679
Train RMSE: 0.7829 | Test RMSE: 0.8969
R²: 0.6728 | MAE: 0.7133 | Gap: 0.1140

[Optimal Ensemble 2/5] Horizon: 2 days ahead
Optimal Weights → Ridge: 0.299, LGBM: 0.701
Train RMSE: 0.8661 | Test RMSE: 1.0046
R²: 0.5908 | MAE: 0.8032 | Gap: 0.1385

[Optimal Ensemble 3/5] Horizon: 3 days ahead
Optimal Weights → Ridge: 0.360, LGBM: 0.640
Train RMSE: 0.9035 | Test RMSE: 1.0599
R²: 0.5459 | MAE: 0.8477 | Gap: 0.1564

[Optimal Ensemble 4/5] Horizon: 4 days ahead
Optimal Weights → Ridge: 0.341, LGBM: 0.659
Train RMSE: 0.9198 | Test RMSE: 1.0581
R²: 0.5476 | MAE: 0.8487 | Gap: 0.1384

[Optimal Ensemble 5/5] Horizon: 5 days ahead
Optimal Weights → Ridge: 0.401, LGBM: 0.599
Train RMSE: 0.9534 | Test RMSE: 1.0629
R²: 0.5448 | MAE: 0.8532 | Gap: 0.1095

ENSEMBLE SUMMARY
 Horizon (days)  Train RMSE  Test RMSE  Test R²  Test MAE  Ridge Weight  LGBM Weight    Gap
              1      0.7829     0.8969   0.6728   

In [93]:
# ============================================================================
# FINAL SUMMARY - ALL 3 MODELS × 5 DAILY HORIZONS
# ============================================================================

print("\n" + "="*80)
print("TRAINING COMPLETE - ALL 3 MODELS × 5 DAILY HORIZONS")
print("="*80)

# Build summary from ridge_results, lgbm_results, and ensemble_results dictionaries
summary_data = []

for horizon in config.HORIZONS:
    # Get targets
    y_train = train_df[f'target_{horizon}d']
    y_test = test_df[f'target_{horizon}d']

    # Ridge metrics
    ridge_train_rmse = ridge_results[horizon]['train_rmse']
    ridge_test_rmse = ridge_results[horizon]['test_rmse']
    ridge_test_r2 = ridge_results[horizon]['test_r2']
    ridge_test_mae = ridge_results[horizon]['test_mae']

    # LGBM metrics
    lgbm_train_rmse = lgbm_results[horizon]['train_rmse']
    lgbm_test_rmse = lgbm_results[horizon]['test_rmse']
    lgbm_test_r2 = lgbm_results[horizon]['test_r2']
    lgbm_test_mae = lgbm_results[horizon]['test_mae']

    # Ensemble metrics
    ensemble_train_rmse = ensemble_results[horizon]['train_rmse']
    ensemble_test_rmse = ensemble_results[horizon]['test_rmse']
    ensemble_test_r2 = ensemble_results[horizon]['test_r2']
    ensemble_test_mae = ensemble_results[horizon]['test_mae']

    summary_data.append({
        'horizon_days': horizon,

        # Ridge
        'ridge_train_rmse': ridge_train_rmse,
        'ridge_test_rmse': ridge_test_rmse,
        'ridge_test_r2': ridge_test_r2,
        'ridge_test_mae': ridge_test_mae,

        # LGBM
        'lgbm_train_rmse': lgbm_train_rmse,
        'lgbm_test_rmse': lgbm_test_rmse,
        'lgbm_test_r2': lgbm_test_r2,
        'lgbm_test_mae': lgbm_test_mae,

        # Ensemble
        'ensemble_train_rmse': ensemble_train_rmse,
        'ensemble_test_rmse': ensemble_test_rmse,
        'ensemble_test_r2': ensemble_test_r2,
        'ensemble_test_mae': ensemble_test_mae,
        'ensemble_ridge_weight': ensemble_results[horizon]['ridge_weight'],
        'ensemble_lgbm_weight': ensemble_results[horizon]['lgbm_weight'],
    })

# Create results dataframe
results_df = pd.DataFrame(summary_data)

# ============================================================================
# PERFORMANCE SUMMARY TABLES
# ============================================================================

print("\n" + "="*80)
print("PERFORMANCE SUMMARY - ALL MODELS & HORIZONS")
print("="*80)

# Ridge Summary
print("\n[RIDGE MODEL]")
print("-"*80)
ridge_summary = results_df[['horizon_days', 'ridge_train_rmse', 'ridge_test_rmse', 'ridge_test_r2', 'ridge_test_mae']].copy()
ridge_summary.columns = ['Horizon (days)', 'Train RMSE', 'Test RMSE', 'Test R²', 'Test MAE']
print(ridge_summary.to_string(index=False, float_format=lambda x: f'{x:.4f}'))

# LGBM Summary
print("\n[LGBM MODEL]")
print("-"*80)
lgbm_summary = results_df[['horizon_days', 'lgbm_train_rmse', 'lgbm_test_rmse', 'lgbm_test_r2', 'lgbm_test_mae']].copy()
lgbm_summary.columns = ['Horizon (days)', 'Train RMSE', 'Test RMSE', 'Test R²', 'Test MAE']
print(lgbm_summary.to_string(index=False, float_format=lambda x: f'{x:.4f}'))

# Ensemble Summary
print("\n[ENSEMBLE MODEL - WEIGHTED AVERAGE]")
print("-"*80)
ensemble_summary = results_df[['horizon_days', 'ensemble_train_rmse', 'ensemble_test_rmse', 'ensemble_test_r2', 'ensemble_test_mae']].copy()
ensemble_summary.columns = ['Horizon (days)', 'Train RMSE', 'Test RMSE', 'Test R²', 'Test MAE']
print(ensemble_summary.to_string(index=False, float_format=lambda x: f'{x:.4f}'))

# Ensemble Weights
print("\n[ENSEMBLE WEIGHTS PER HORIZON]")
print("="*80)
weights_df = pd.DataFrame({
    'Horizon (days)': results_df['horizon_days'],
    'Ridge Weight': results_df['ensemble_ridge_weight'],
    'LGBM Weight': results_df['ensemble_lgbm_weight']
})
print(weights_df.to_string(index=False, float_format=lambda x: f'{x:.3f}'))
print("="*80)

# ============================================================================
# AVERAGE PERFORMANCE
# ============================================================================

print("\n" + "="*80)
print("📊 AVERAGE PERFORMANCE ACROSS ALL HORIZONS:")
print("-"*80)

avg_summary = pd.DataFrame({
    'Model': ['Ridge', 'LGBM', 'Ensemble'],
    'Avg Test RMSE': [
        results_df['ridge_test_rmse'].mean(),
        results_df['lgbm_test_rmse'].mean(),
        results_df['ensemble_test_rmse'].mean()
    ],
    'Avg Test R²': [
        results_df['ridge_test_r2'].mean(),
        results_df['lgbm_test_r2'].mean(),
        results_df['ensemble_test_r2'].mean()
    ],
    'Avg Test MAE': [
        results_df['ridge_test_mae'].mean(),
        results_df['lgbm_test_mae'].mean(),
        results_df['ensemble_test_mae'].mean()
    ]
})

print(avg_summary.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
print("-"*80)

# Find best model
best_model_idx = avg_summary['Avg Test RMSE'].idxmin()
best_model = avg_summary.loc[best_model_idx, 'Model']
best_rmse = avg_summary.loc[best_model_idx, 'Avg Test RMSE']

print(f"\n🏆 BEST MODEL: {best_model}")
print(f"   Average Test RMSE: {best_rmse:.4f}°C")



TRAINING COMPLETE - ALL 3 MODELS × 5 DAILY HORIZONS

PERFORMANCE SUMMARY - ALL MODELS & HORIZONS

[RIDGE MODEL]
--------------------------------------------------------------------------------
 Horizon (days)  Train RMSE  Test RMSE  Test R²  Test MAE
              1      0.8345     0.9239   0.6528    0.7465
              2      0.9392     1.0327   0.5676    0.8363
              3      0.9821     1.0820   0.5268    0.8720
              4      1.0074     1.0896   0.5203    0.8781
              5      1.0278     1.0871   0.5239    0.8772

[LGBM MODEL]
--------------------------------------------------------------------------------
 Horizon (days)  Train RMSE  Test RMSE  Test R²  Test MAE
              1      0.7722     0.9030   0.6683    0.7099
              2      0.8479     1.0098   0.5866    0.8010
              3      0.8751     1.0670   0.5399    0.8495
              4      0.8945     1.0667   0.5402    0.8530
              5      0.9247     1.0739   0.5354    0.8567

[ENSEMBLE MODE