In [1]:
%pip install holidays
%pip install lightgbm
%pip install optuna

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import holidays
import lightgbm as lgb
import numpy as np
import optuna
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [3]:
# --- Configuration ---

DATA_PATH = "merged.csv"

TARGET_COLUMN = 'consumption'
GROUP_COLUMN = 'group_id'
TIME_COLUMN = 'measured_at'

# Define all your categorical features here
CATEGORICAL_FEATURES = [
    'group_id', 
    'm_area', 
    'region', 
    'municipality', 
    'segment', 
    'p_type', 
    'c_bucket'
]

In [4]:
df = pd.read_csv(DATA_PATH) 

In [5]:
df.dtypes

measured_at      object
group_id          int64
consumption     float64
eur_per_mwh     float64
m_area           object
region           object
municipality     object
segment          object
p_type           object
c_bucket         object
avg_temp        float64
avg_hum         float64
wind            float64
rain            float64
air_pressure    float64
dtype: object

In [6]:
def load_and_prep_data(data_path):
    """
    Loads the data, converts time column, and sets categorical dtypes.
    """
    print(f"Loading data from {data_path}...")
    df = pd.read_csv(data_path) 
    
    # We must do this per group to avoid data leaking between customers.
    # 'transform' applies the interpolation within each group and returns a Series
    # aligned with the original dataframe's index.
    # 'limit_direction="both"' fills NaNs at the very start or end of a group
    # by forward-filling or back-filling from the nearest valid price.
    df['eur_per_mwh'] = df.groupby(GROUP_COLUMN)['eur_per_mwh'].transform(
        lambda x: x.interpolate(method='linear', limit_direction='both')
    )

    df[TIME_COLUMN] = pd.to_datetime(df[TIME_COLUMN])

    for col in CATEGORICAL_FEATURES:
        if col in df.columns:
            df[col] = df[col].astype('category')
            
    # Sort data to ensure time-based features are correct
    df = df.sort_values(by=[GROUP_COLUMN, TIME_COLUMN])
    
    print("Data loaded and dtypes set.")
    return df

def create_time_features(df):
    """
    Creates new features from the main datetime column.
    INCLUDES CYCLIC FEATURES.
    """
    print("Creating time features (with cyclic)...")
    df_copy = df.copy()
    
    # Original features (we'll use them to create cyclic ones)
    hour = df_copy[TIME_COLUMN].dt.hour
    day_of_week = df_copy[TIME_COLUMN].dt.dayofweek
    day_of_month = df_copy[TIME_COLUMN].dt.day
    day_of_year = df_copy[TIME_COLUMN].dt.dayofyear
    month = df_copy[TIME_COLUMN].dt.month
    
    df_copy['year'] = df_copy[TIME_COLUMN].dt.year
    df_copy['is_weekend'] = (day_of_week >= 5).astype(int)
    
    fin_holidays = holidays.Finland()
    df_copy['is_holiday'] = df_copy[TIME_COLUMN].dt.date.apply(lambda x: x in fin_holidays).astype(int)
    
    # Hour of day (0-23)
    df_copy['hour_sin'] = np.sin(2 * np.pi * hour / 24.0)
    df_copy['hour_cos'] = np.cos(2 * np.pi * hour / 24.0)
    
    # Day of week (0-6)
    df_copy['day_of_week_sin'] = np.sin(2 * np.pi * day_of_week / 7.0)
    df_copy['day_of_week_cos'] = np.cos(2 * np.pi * day_of_week / 7.0)

    # Day of month (1-31)
    df_copy['day_of_month_sin'] = np.sin(2 * np.pi * (day_of_month - 1) / 31.0)
    df_copy['day_of_month_cos'] = np.cos(2 * np.pi * (day_of_month - 1) / 31.0)
    
    # Day of year (1-366)
    df_copy['day_of_year_sin'] = np.sin(2 * np.pi * (day_of_year - 1) / 366.0)
    df_copy['day_of_year_cos'] = np.cos(2 * np.pi * (day_of_year - 1) / 366.0)

    # Month (1-12)
    df_copy['month_sin'] = np.sin(2 * np.pi * (month - 1) / 12.0)
    df_copy['month_cos'] = np.cos(2 * np.pi * (month - 1) / 12.0)
    
    return df_copy

def create_lag_features(df, group_col, target, lags):
    """
    Creates lag features, grouped by each customer.
    """
    print(f"Creating lag features for lags: {lags}...")
    df_copy = df.copy()
    df_grouped = df_copy.groupby(group_col)
    
    for lag in lags:
        df_copy[f'{target}_lag_{lag}h'] = df_grouped[target].shift(lag)
        
    return df_copy

def create_rolling_features(df, group_col, target, windows, aggregations):
    """
    Creates rolling window features, grouped by each customer.
    """
    print(f"Creating rolling features for windows: {windows}...")
    df_copy = df.copy()
    
    for window in windows:
        for agg in aggregations:
            # The groupby operation is chained inside the rolling call
            # This is a more efficient way to do grouped rolling operations
            # We use reset_index() to align the new feature with the original dataframe
            rolling_feat = df_copy.groupby(group_col)[target] \
                                  .rolling(window=window, min_periods=1) \
                                  .agg(agg) \
                                  .reset_index(level=0, drop=True)
            
            df_copy[f'{target}_roll_{agg}_{window}h'] = rolling_feat
            
    return df_copy

In [7]:
# 1. Load and Prep
data = load_and_prep_data(DATA_PATH)

# Check the dtypes and for any missing values
data.info()

Loading data from merged.csv...
Data loaded and dtypes set.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3679872 entries, 0 to 3679871
Data columns (total 15 columns):
 #   Column        Dtype              
---  ------        -----              
 0   measured_at   datetime64[ns, UTC]
 1   group_id      category           
 2   consumption   float64            
 3   eur_per_mwh   float64            
 4   m_area        category           
 5   region        category           
 6   municipality  category           
 7   segment       category           
 8   p_type        category           
 9   c_bucket      category           
 10  avg_temp      float64            
 11  avg_hum       float64            
 12  wind          float64            
 13  rain          float64            
 14  air_pressure  float64            
dtypes: category(7), datetime64[ns, UTC](1), float64(7)
memory usage: 249.2 MB


In [8]:
# 2. Create Time Features
data = create_time_features(data)

# Display the new time features
print(data.columns[data.columns.str.contains('hour|day|month|year|week')])
data.head()

Creating time features (with cyclic)...
Index(['year', 'is_weekend', 'is_holiday', 'hour_sin', 'hour_cos',
       'day_of_week_sin', 'day_of_week_cos', 'day_of_month_sin',
       'day_of_month_cos', 'day_of_year_sin', 'day_of_year_cos', 'month_sin',
       'month_cos'],
      dtype='object')


Unnamed: 0,measured_at,group_id,consumption,eur_per_mwh,m_area,region,municipality,segment,p_type,c_bucket,...,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,day_of_month_sin,day_of_month_cos,day_of_year_sin,day_of_year_cos,month_sin,month_cos
0,2021-01-01 00:00:00+00:00,28,7.010736,24.35,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,0.0,1.0,-0.433884,-0.900969,0.0,1.0,0.0,1.0,0.0,1.0
1,2021-01-01 01:00:00+00:00,28,6.600845,23.98,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,0.258819,0.965926,-0.433884,-0.900969,0.0,1.0,0.0,1.0,0.0,1.0
2,2021-01-01 02:00:00+00:00,28,6.468329,23.72,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,0.5,0.866025,-0.433884,-0.900969,0.0,1.0,0.0,1.0,0.0,1.0
3,2021-01-01 03:00:00+00:00,28,6.712097,23.73,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,0.707107,0.707107,-0.433884,-0.900969,0.0,1.0,0.0,1.0,0.0,1.0
4,2021-01-01 04:00:00+00:00,28,6.418159,24.06,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,0.866025,0.5,-0.433884,-0.900969,0.0,1.0,0.0,1.0,0.0,1.0


In [9]:
# 3. Create Lag Features
lag_list = [1, 2, 3, 6, 12, 24, 48, 168]
data = create_lag_features(data, GROUP_COLUMN, TARGET_COLUMN, lag_list)

# Check the end of a single group's data to see the lags
data[data[GROUP_COLUMN] == data[GROUP_COLUMN].unique()[0]].tail()

Creating lag features for lags: [1, 2, 3, 6, 12, 24, 48, 168]...


  df_grouped = df_copy.groupby(group_col)


Unnamed: 0,measured_at,group_id,consumption,eur_per_mwh,m_area,region,municipality,segment,p_type,c_bucket,...,month_sin,month_cos,consumption_lag_1h,consumption_lag_2h,consumption_lag_3h,consumption_lag_6h,consumption_lag_12h,consumption_lag_24h,consumption_lag_48h,consumption_lag_168h
32851,2024-09-30 19:00:00+00:00,28,6.234852,4.19,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,-0.866025,-0.5,5.625254,5.3512,4.611995,3.531831,3.507821,5.559985,4.625679,5.347109
32852,2024-09-30 20:00:00+00:00,28,5.925762,2.03,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,-0.866025,-0.5,6.234852,5.625254,5.3512,3.679196,3.353279,5.239241,4.431405,5.051069
32853,2024-09-30 21:00:00+00:00,28,5.41912,0.0,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,-0.866025,-0.5,5.925762,6.234852,5.625254,4.109394,3.208446,4.554825,3.846092,4.464691
32854,2024-09-30 22:00:00+00:00,28,5.177524,0.01,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,-0.866025,-0.5,5.41912,5.925762,6.234852,4.611995,3.191845,4.059698,3.491137,4.050641
32855,2024-09-30 23:00:00+00:00,28,4.944352,0.01,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,-0.866025,-0.5,5.177524,5.41912,5.925762,5.3512,3.154596,3.979015,3.436034,3.86406


In [10]:
# 4. Create Rolling Features (Daily and Weekly trends)
window_list = [24, 168]
agg_list = ['mean', 'std']
data = create_rolling_features(data, GROUP_COLUMN, TARGET_COLUMN, window_list, agg_list)

# Check the new rolling features
data.head()

Creating rolling features for windows: [24, 168]...


  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \


Unnamed: 0,measured_at,group_id,consumption,eur_per_mwh,m_area,region,municipality,segment,p_type,c_bucket,...,consumption_lag_3h,consumption_lag_6h,consumption_lag_12h,consumption_lag_24h,consumption_lag_48h,consumption_lag_168h,consumption_roll_mean_24h,consumption_roll_std_24h,consumption_roll_mean_168h,consumption_roll_std_168h
0,2021-01-01 00:00:00+00:00,28,7.010736,24.35,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,,,,,,,7.010736,,7.010736,
1,2021-01-01 01:00:00+00:00,28,6.600845,23.98,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,,,,,,,6.80579,0.289837,6.80579,0.289837
2,2021-01-01 02:00:00+00:00,28,6.468329,23.72,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,,,,,,,6.693303,0.282776,6.693303,0.282776
3,2021-01-01 03:00:00+00:00,28,6.712097,23.73,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,7.010736,,,,,,6.698002,0.231077,6.698002,0.231077
4,2021-01-01 04:00:00+00:00,28,6.418159,24.06,Eastern Finland,Etelä-Savo,Etelä-Savo,Private,Spot Price,High,...,6.600845,,,,,,6.642033,0.236029,6.642033,0.236029


In [11]:
print("--- Creating Price Features ---")

# 1. Define price lag list
price_lag_list = [1, 24, 168]

print(f"Creating price lag features for lags: {price_lag_list}...")
data = create_lag_features(data, GROUP_COLUMN, 'eur_per_mwh', price_lag_list)


# 2. Define price rolling windows
price_window_list = [24, 168]
price_agg_list = ['mean', 'std']

print(f"Creating price rolling features for windows: {price_window_list}...")
data = create_rolling_features(data, GROUP_COLUMN, 'eur_per_mwh', price_window_list, price_agg_list)

print("Price features created.")

--- Creating Price Features ---
Creating price lag features for lags: [1, 24, 168]...
Creating lag features for lags: [1, 24, 168]...


  df_grouped = df_copy.groupby(group_col)


Creating price rolling features for windows: [24, 168]...
Creating rolling features for windows: [24, 168]...


  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \


Price features created.


In [12]:
print("--- Creating Weather Features ---")

WEATHER_COLUMNS = ['avg_temp', 'avg_hum', 'wind', 'rain', 'air_pressure']
weather_lag_list = [1, 24, 168]
weather_window_list = [24, 168]
weather_agg_list = ['mean', 'std']

for col in WEATHER_COLUMNS:
    if col not in data.columns:
        print(f"Warning: Weather column '{col}' not found. Skipping.")
        continue
    
    print(f"Creating features for: {col}")
    # Create Lags
    data = create_lag_features(data, GROUP_COLUMN, col, weather_lag_list)
    
    # Create Rolling Windows
    data = create_rolling_features(data, GROUP_COLUMN, col, weather_window_list, weather_agg_list)

print("Weather features created.")

--- Creating Weather Features ---
Creating features for: avg_temp
Creating lag features for lags: [1, 24, 168]...


  df_grouped = df_copy.groupby(group_col)


Creating rolling features for windows: [24, 168]...


  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \


Creating features for: avg_hum
Creating lag features for lags: [1, 24, 168]...


  df_grouped = df_copy.groupby(group_col)


Creating rolling features for windows: [24, 168]...


  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \


Creating features for: wind
Creating lag features for lags: [1, 24, 168]...


  df_grouped = df_copy.groupby(group_col)


Creating rolling features for windows: [24, 168]...


  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \


Creating features for: rain
Creating lag features for lags: [1, 24, 168]...


  df_grouped = df_copy.groupby(group_col)


Creating rolling features for windows: [24, 168]...


  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \


Creating features for: air_pressure
Creating lag features for lags: [1, 24, 168]...


  df_grouped = df_copy.groupby(group_col)


Creating rolling features for windows: [24, 168]...


  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \
  rolling_feat = df_copy.groupby(group_col)[target] \


Weather features created.


In [13]:
# 5. Drop rows with NaNs created by lags/rolling windows
original_rows = data.shape[0]
data = data.dropna()
new_rows = data.shape[0]

print(f"Dropped {original_rows - new_rows} rows due to NaN values.")
print(f"Original rows: {original_rows}, New rows: {new_rows}")

Dropped 18816 rows due to NaN values.
Original rows: 3679872, New rows: 3661056


In [14]:
print("--- Final DataFrame Info ---")
data.info()

print("\n--- Final DataFrame (Sample) ---")
# Show a random sample to see different groups and times
data.sample(10)

--- Final DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 3661056 entries, 168 to 3679871
Data columns (total 82 columns):
 #   Column                       Dtype              
---  ------                       -----              
 0   measured_at                  datetime64[ns, UTC]
 1   group_id                     category           
 2   consumption                  float64            
 3   eur_per_mwh                  float64            
 4   m_area                       category           
 5   region                       category           
 6   municipality                 category           
 7   segment                      category           
 8   p_type                       category           
 9   c_bucket                     category           
 10  avg_temp                     float64            
 11  avg_hum                      float64            
 12  wind                         float64            
 13  rain                         float64          

Unnamed: 0,measured_at,group_id,consumption,eur_per_mwh,m_area,region,municipality,segment,p_type,c_bucket,...,rain_roll_std_24h,rain_roll_mean_168h,rain_roll_std_168h,air_pressure_lag_1h,air_pressure_lag_24h,air_pressure_lag_168h,air_pressure_roll_mean_24h,air_pressure_roll_std_24h,air_pressure_roll_mean_168h,air_pressure_roll_std_168h
2175444,2021-10-17 12:00:00+00:00,404,0.28053,56.75,Southern Finland,Uusimaa,Espoo,Private,Variable Price,Low,...,0.083297,0.11369,0.476128,1003.7,999.0,1025.7,1000.475,1.744121,1006.991071,8.056298
187827,2023-09-09 03:00:00+00:00,38,0.350873,0.0,Eastern Finland,Pohjois-Karjala,Joensuu,Private,Fixed Price,Medium,...,0.0,0.052976,0.288071,1020.7,1019.8,1011.9,1019.579167,0.521686,1016.554762,4.485637
2774714,2022-09-10 02:00:00+00:00,582,0.127706,95.06,Western and Inland Finland,Etelä-Pohjanmaa,Etelä-Pohjanmaa,Private,Spot Price,Low,...,0.0,0.0,0.0,1024.2,1026.5,1022.9,1025.320833,0.974447,1025.074405,3.422749
1529849,2023-02-09 17:00:00+00:00,346,0.327415,26.17,Southern Finland,Päijät-Häme,Lahti,Private,Flat/Hybrid,Low,...,0.0,0.002976,0.031766,1016.6,1031.9,1007.7,1024.141667,5.153126,1026.241667,8.045041
3564831,2022-11-14 15:00:00+00:00,709,0.307033,212.44,Western and Inland Finland,Pirkanmaa,Tampere,Private,Variable Price,Low,...,0.0,0.0,0.0,1024.9,1028.7,1003.6,1028.408333,1.724483,1009.658929,10.506476
3469491,2023-03-29 03:00:00+00:00,706,0.408933,65.35,Western and Inland Finland,Pirkanmaa,Tampere,Private,Flat/Hybrid,Medium,...,0.0,0.0,0.0,1010.7,1005.9,1007.4,1009.091667,1.493731,997.614881,7.702513
2728313,2021-02-22 17:00:00+00:00,581,5.346575,60.02,Western and Inland Finland,Etelä-Pohjanmaa,Etelä-Pohjanmaa,Private,Spot Price,High,...,0.0,0.0,0.0,1028.9,1017.5,1024.9,1021.025,4.892519,1019.869048,5.235804
1335190,2023-05-23 22:00:00+00:00,302,0.325737,2.43,Southern Finland,Kanta-Häme,Kanta-Häme,Private,Flat/Hybrid,Medium,...,0.065801,0.004167,0.029677,1017.0,1019.0,1007.6,1017.433333,0.663762,1023.540476,7.890533
2736600,2022-02-03 00:00:00+00:00,581,5.143495,135.89,Western and Inland Finland,Etelä-Pohjanmaa,Etelä-Pohjanmaa,Private,Spot Price,High,...,0.0,0.0,0.0,1007.1,997.0,999.7,1001.966667,3.51304,995.976786,9.094145
119136,2023-05-08 00:00:00+00:00,36,0.368352,56.68,Eastern Finland,Pohjois-Karjala,Joensuu,Enterprise,Variable Price,Low,...,0.0,0.122024,0.378846,1026.6,1028.0,1008.3,1027.704167,0.991915,1019.20119,9.622311


In [15]:
# Save the processed data to a new file (parquet is fast and efficient)
PROCESSED_DATA_PATH = "processed_energy_data.parquet"
data.to_parquet(PROCESSED_DATA_PATH)

print(f"Processed data saved to {PROCESSED_DATA_PATH}")

Processed data saved to processed_energy_data.parquet


In [17]:
# 1. Define the Cutoff Date (Last 14 days of the dataset)
# This dynamic approach works regardless of what specific year your data ends in
max_date = data[TIME_COLUMN].max()
cutoff_date = max_date - pd.Timedelta(days=14)

print(f"Dataset End Date: {max_date}")
print(f"Validation Split Date: {cutoff_date}")

# 2. Perform the Split
train = data[data[TIME_COLUMN] <= cutoff_date]
val = data[data[TIME_COLUMN] > cutoff_date]

# 3. Define Features (X) and Target (y)
# We exclude the target itself and the raw timestamp
features = [col for col in data.columns if col not in [TARGET_COLUMN, TIME_COLUMN]]

X_train = train[features]
y_train = train[TARGET_COLUMN]

X_val = val[features]
y_val = val[TARGET_COLUMN]

print("\n--- Split Shapes ---")
print(f"Training Data:   {X_train.shape} rows")
print(f"Validation Data: {X_val.shape} rows")
print(f"Features used:   {len(features)}")
print(f"Features list:   {features}")

Dataset End Date: 2024-09-30 23:00:00+00:00
Validation Split Date: 2024-09-16 23:00:00+00:00

--- Split Shapes ---
Training Data:   (3623424, 80) rows
Validation Data: (37632, 80) rows
Features used:   80
Features list:   ['group_id', 'eur_per_mwh', 'm_area', 'region', 'municipality', 'segment', 'p_type', 'c_bucket', 'avg_temp', 'avg_hum', 'wind', 'rain', 'air_pressure', 'year', 'is_weekend', 'is_holiday', 'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos', 'day_of_month_sin', 'day_of_month_cos', 'day_of_year_sin', 'day_of_year_cos', 'month_sin', 'month_cos', 'consumption_lag_1h', 'consumption_lag_2h', 'consumption_lag_3h', 'consumption_lag_6h', 'consumption_lag_12h', 'consumption_lag_24h', 'consumption_lag_48h', 'consumption_lag_168h', 'consumption_roll_mean_24h', 'consumption_roll_std_24h', 'consumption_roll_mean_168h', 'consumption_roll_std_168h', 'eur_per_mwh_lag_1h', 'eur_per_mwh_lag_24h', 'eur_per_mwh_lag_168h', 'eur_per_mwh_roll_mean_24h', 'eur_per_mwh_roll_std_24h', '

In [18]:
# Create the LightGBM Dataset objects
# This effectively "compiles" the data for the model
train_data = lgb.Dataset(
    X_train, 
    label=y_train, 
    categorical_feature=CATEGORICAL_FEATURES,
    free_raw_data=False # Keeps the raw data in memory just in case we need to inspect
)

val_data = lgb.Dataset(
    X_val, 
    label=y_val, 
    categorical_feature=CATEGORICAL_FEATURES,
    reference=train_data, # Validation data must use the same bins/categories as training
    free_raw_data=False
)

print("LightGBM Datasets created successfully.")

LightGBM Datasets created successfully.


In [19]:
def objective(trial):
    # Define the search space for parameters
    params = {
        'objective': 'regression_l1',
        'metric': 'rmse',
        'n_estimators': 1000,
        'n_jobs': -1,
        'seed': 42,
        'verbose': -1, # Suppress logs during trials
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', -1, 50),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0), # 'bagging_fraction'
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0) # 'feature_fraction'
    }

    # Re-run the training with these new params
    model_tune = lgb.train(
        params=params,
        train_set=train_data,
        valid_sets=[val_data], # We only need validation set for scoring
        valid_names=['validation'],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )

    # Return the score to minimize
    return model_tune.best_score['validation']['rmse']

# --- Start the Tuning ---
print("Starting Optuna hyperparameter search...")

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50) # Run 50 different trials

print("\n--- Tuning Complete ---")
print("Best trial:")
print(f"  Value (RMSE): {study.best_value:.4f}")
print("  Best Params: ")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

[I 2025-11-15 22:11:21,233] A new study created in memory with name: no-name-77b96865-e2bc-4173-92fa-46c0410b408b


Starting Optuna hyperparameter search...


[I 2025-11-15 22:14:41,225] Trial 0 finished with value: 0.057906738445651564 and parameters: {'learning_rate': 0.027000792698870114, 'num_leaves': 119, 'max_depth': 32, 'subsample': 0.8979603643198104, 'colsample_bytree': 0.7940417404786313}. Best is trial 0 with value: 0.057906738445651564.
[I 2025-11-15 22:16:07,510] Trial 1 finished with value: 0.09308999172111423 and parameters: {'learning_rate': 0.03064059744920763, 'num_leaves': 43, 'max_depth': 2, 'subsample': 0.661538269513958, 'colsample_bytree': 0.8781813182998747}. Best is trial 0 with value: 0.057906738445651564.
[I 2025-11-15 22:19:33,005] Trial 2 finished with value: 0.052720598242332765 and parameters: {'learning_rate': 0.04458413621309797, 'num_leaves': 149, 'max_depth': 22, 'subsample': 0.6706900172521421, 'colsample_bytree': 0.5967325773976782}. Best is trial 2 with value: 0.052720598242332765.
[I 2025-11-15 22:22:39,941] Trial 3 finished with value: 0.06067745745752501 and parameters: {'learning_rate': 0.02744149526


--- Tuning Complete ---
Best trial:
  Value (RMSE): 0.0486
  Best Params: 
    learning_rate: 0.09844065831999153
    num_leaves: 150
    max_depth: 21
    subsample: 0.6441976583667873
    colsample_bytree: 0.907325498051877


In [None]:
# 1. Get the best parameters from the Optuna study
# This assumes 'study' is in memory from the previous cell

# --- Tuning Complete ---
# Best trial:
#   Value (RMSE): 0.0486
#   Best Params: 
#     learning_rate: 0.09844065831999153
#     num_leaves: 150
#     max_depth: 21
#     subsample: 0.6441976583667873
#     colsample_bytree: 0.907325498051877
best_params = study.best_params

# 2. Set our fixed parameters and combine with the best ones
# We also add the 'l1' (MAE) metric back in for logging
final_params = {
    'objective': 'regression_l1', 
    'metric': ['rmse', 'l1'],
    'n_estimators': 10000, # high number, early stopping will find the best
    'n_jobs': -1,
    'seed': 42,
    'verbose': -1,
    **best_params  # This unpacks 'learning_rate', 'num_leaves', etc.
}

# 3. Define Callbacks
callbacks = [
    lgb.log_evaluation(period=100), 
    lgb.early_stopping(stopping_rounds=50, first_metric_only=True)
]

# 4. Train the *final* model
print("Training final model with best parameters...")
model = lgb.train(
    params=final_params,
    train_set=train_data,
    valid_sets=[train_data, val_data], 
    valid_names=['train', 'validation'],
    callbacks=callbacks
)

print("\n--- Final Model Training Complete ---")
print(f"Best iteration: {model.best_iteration}")
print(f"Best validation score (RMSE): {model.best_score['validation']['rmse']:.4f}")
print(f"Best validation score (MAE/l1): {model.best_score['validation']['l1']:.4f}")

Training final model with best parameters...
Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 0.0989961	train's l1: 0.0356865	validation's rmse: 0.0747737	validation's l1: 0.0307522
[200]	train's rmse: 0.0803788	train's l1: 0.0305159	validation's rmse: 0.0628219	validation's l1: 0.0265726
[300]	train's rmse: 0.0712909	train's l1: 0.0283013	validation's rmse: 0.0573841	validation's l1: 0.0248473
[400]	train's rmse: 0.065517	train's l1: 0.0268175	validation's rmse: 0.0538585	validation's l1: 0.0237482
[500]	train's rmse: 0.0628689	train's l1: 0.0260752	validation's rmse: 0.0525994	validation's l1: 0.0232745
[600]	train's rmse: 0.0593594	train's l1: 0.0252022	validation's rmse: 0.05114	validation's l1: 0.0228151
[700]	train's rmse: 0.0582236	train's l1: 0.0247915	validation's rmse: 0.0503945	validation's l1: 0.022552
[800]	train's rmse: 0.0569004	train's l1: 0.024378	validation's rmse: 0.0496425	validation's l1: 0.0223145
[900]	train's rmse: 0.0558494	train

In [None]:
# --- 9. Load, Concat, and Impute Test Set ---

TEST_DATA_PATH = "df_test.csv"

print(f"Loading complete test data from {TEST_DATA_PATH}...")
test_df = pd.read_csv(TEST_DATA_PATH)

# --- 1. Prep Test Set ---
test_df[TIME_COLUMN] = pd.to_datetime(test_df[TIME_COLUMN])
for col in CATEGORICAL_FEATURES:
    if col in test_df.columns:
        # Use the categories from the training data to ensure consistency
        test_df[col] = pd.Categorical(
            test_df[col], 
            categories=data[col].dtype.categories
        )

# --- 2. Get History & Concatenate ---
min_test_date = test_df[TIME_COLUMN].min()
history_cutoff = min_test_date - pd.Timedelta(days=8) # 8 days to be safe
history_df = data[data[TIME_COLUMN] >= history_cutoff].copy()

print(f"History data shape: {history_df.shape}")
print(f"Test data shape: {test_df.shape}")

combined_df = pd.concat([history_df, test_df], ignore_index=True)
combined_df = combined_df.sort_values(by=[GROUP_COLUMN, TIME_COLUMN])

# --- 3. Impute ALL Missing Future Data ---
print("Imputing missing data...")

# A. Impute 'eur_per_mwh' (for Oct 2nd)
# Shift(24) here, as it's the most logical way to get Oct 2nd's price
print("Imputing prices using 24-hour lag...")
combined_df['eur_per_mwh'] = combined_df.groupby(GROUP_COLUMN)['eur_per_mwh'].transform(
    lambda x: x.fillna(x.shift(24))
)

# B. Impute all weather columns (for Oct 1st & 2nd)
print("Imputing weather using 48-hour rolling mean...")
WEATHER_COLUMNS = ['avg_temp', 'avg_hum', 'wind', 'rain', 'air_pressure']
for col in WEATHER_COLUMNS:
    if col in combined_df.columns:
        combined_df[col] = combined_df.groupby(GROUP_COLUMN)[col].transform(
            # For each group:
            # 1. Calculate a rolling 48h mean (using min_periods=1 for the start)
            # 2. Shift it by 1 so the mean is from the *past* (no data leakage)
            # 3. Use this rolling mean ONLY to fill the NaNs
            lambda x: x.fillna(x.shift(1).rolling(window=48, min_periods=1).mean())
        )
    
# C. Final cleanup (in case any group had no data at all)
combined_df = combined_df.fillna(method='ffill')

print("Imputation complete. 'combined_df' is ready for feature engineering.")

Loading complete test data from df_test.csv...
History data shape: (21504, 82)
Test data shape: (5376, 15)
Imputing missing data...
Imputing prices using 24-hour lag...
Imputing weather using 48-hour rolling mean...
Imputation complete. 'combined_df' is ready for feature engineering.


  combined_df['eur_per_mwh'] = combined_df.groupby(GROUP_COLUMN)['eur_per_mwh'].transform(
  combined_df[col] = combined_df.groupby(GROUP_COLUMN)[col].transform(
  combined_df = combined_df.fillna(method='ffill')


In [33]:
# 1. Get the last 7 days (168h) of training data.
min_test_date = test_df[TIME_COLUMN].min()
history_cutoff = min_test_date - pd.Timedelta(days=8) # 8 days to be safe
history_df = data[data[TIME_COLUMN] >= history_cutoff].copy()

print(f"History data shape: {history_df.shape}")
print(f"Test data shape: {test_df.shape}")

# 2. Combine the history and the test set
combined_df = pd.concat([history_df, test_df], ignore_index=True)

# 3. Re-run all feature engineering functions
print("Creating time features...")
combined_df = create_time_features(combined_df)

print("Creating CONSUMPTION features...")
combined_df = create_lag_features(combined_df, GROUP_COLUMN, TARGET_COLUMN, lag_list)
combined_df = create_rolling_features(combined_df, GROUP_COLUMN, TARGET_COLUMN, window_list, agg_list)

print("Creating PRICE features...")
combined_df = create_lag_features(combined_df, GROUP_COLUMN, 'eur_per_mwh', price_lag_list)
combined_df = create_rolling_features(combined_df, GROUP_COLUMN, 'eur_per_mwh', price_window_list, price_agg_list)

print("Creating WEATHER features...")
WEATHER_COLUMNS = ['avg_temp', 'avg_hum', 'wind', 'rain', 'air_pressure']
weather_lag_list = [1, 24, 168]
weather_window_list = [24, 168]
weather_agg_list = ['mean', 'std']

for col in WEATHER_COLUMNS:
    print(f"Creating test features for: {col}")
    combined_df = create_lag_features(combined_df, GROUP_COLUMN, col, weather_lag_list)
    combined_df = create_rolling_features(combined_df, GROUP_COLUMN, col, weather_window_list, weather_agg_list)

print("Feature engineering complete.")

History data shape: (21504, 82)
Test data shape: (5376, 15)
Creating time features...
Creating time features (with cyclic)...
Creating CONSUMPTION features...
Creating lag features for lags: [1, 2, 3, 6, 12, 24, 48, 168]...
Creating rolling features for windows: [24, 168]...
Creating PRICE features...
Creating lag features for lags: [1, 24, 168]...
Creating rolling features for windows: [24, 168]...
Creating WEATHER features...
Creating test features for: avg_temp
Creating lag features for lags: [1, 24, 168]...
Creating rolling features for windows: [24, 168]...
Creating test features for: avg_hum
Creating lag features for lags: [1, 24, 168]...
Creating rolling features for windows: [24, 168]...
Creating test features for: wind
Creating lag features for lags: [1, 24, 168]...
Creating rolling features for windows: [24, 168]...
Creating test features for: rain
Creating lag features for lags: [1, 24, 168]...
Creating rolling features for windows: [24, 168]...


  df_grouped = df_copy.groupby(group_col)
  rolling_feat = df_copy.groupby(group_col)[target] \


Creating test features for: air_pressure
Creating lag features for lags: [1, 24, 168]...
Creating rolling features for windows: [24, 168]...
Feature engineering complete.


In [None]:
# 1. Isolate the test set rows (they are the last N rows)
final_test_features = combined_df.tail(len(test_df))

# 2. Separate features (X_test) and true target (y_test)
X_test = final_test_features[features]
y_test = final_test_features[TARGET_COLUMN]

# 3. Generate Predictions
print("Generating predictions on test set...")
test_predictions = model.predict(X_test)

# 4. Calculate Final Test Scores
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
test_mae = mean_absolute_error(y_test, test_predictions)
test_mape = mean_absolute_percentage_error(y_test, test_predictions)

print("\n--- Final Test Set Performance ---")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test MAE:  {test_mae:.4f}")
print(f"Test MAPE: {test_mape:.4f}")

Generating predictions on test set...

--- Final Test Set Performance ---
Test RMSE: 1.9063
Test MAE:  1.5665
Test MAPE: 3.6077
