In [1]:
# 📓 Notebook 3C – Hyperparameter Tuning (LightGBM)

# 0️⃣ Load processed data (daylight only)

import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OrdinalEncoder
import joblib

# Load data
# The project root is taken to be two levels above the current working
# directory, so these paths resolve only when the kernel is started from the
# notebook's own folder.
current_dir = Path().resolve()
root = current_dir.parents[1]
data_dir = root / 'data' / 'processed'

df = pd.read_parquet(data_dir / 'merged_openmeteo_pvgis.parquet')

# Filter daylight: keep only rows with a strictly positive target value.
target = 'global_irradiance_W_m2'
# .copy() makes df_daylight an independent frame. Without it, the column
# assignments below write into a slice of df and pandas emits
# SettingWithCopyWarning for each of them.
df_daylight = df[df[target] > 0].copy()

# Add time features (the .dt accessor requires 'time' to be a datetime column)
df_daylight['hour'] = df_daylight['time'].dt.hour
df_daylight['month'] = df_daylight['time'].dt.month

# Encode city as an ordinal code. The fitted encoder is kept in `encoder`
# so the same city -> code mapping can be persisted alongside the model.
encoder = OrdinalEncoder()
df_daylight['city_encoded'] = encoder.fit_transform(df_daylight[['city']])

# 1️⃣ Define features and target

# Model inputs, grouped by origin; the final order of `features` is the
# column order handed to the model.
# NOTE(review): both 'cloudcover' and 'cloud_cover' are listed - confirm these
# are two distinct columns and not the same variable under two names.
weather_features = [
    'temperature_2m', 'cloudcover', 'windspeed_10m', 'winddirection_10m',
    'shortwave_radiation', 'direct_radiation', 'diffuse_radiation',
    'cloud_cover',
]
calendar_features = ['hour', 'month']
location_features = ['city_encoded']

features = weather_features + calendar_features + location_features

# 2️⃣ Split train/validation by city (70-30 per city)

train_list = []
val_list = []

for city in df_daylight['city'].unique():
    df_city = df_daylight[df_daylight['city'] == city]

    # Split by row position rather than by index label. A label-based
    # drop(train_city.index) removes every row sharing a sampled label, so a
    # frame with duplicate index labels would silently lose validation rows.
    # Sampling the positions with the same fraction and seed picks the same
    # rows as sampling df_city directly.
    positions = np.arange(len(df_city))
    train_pos = pd.Series(positions).sample(frac=0.7, random_state=42).to_numpy()
    # setdiff1d returns the remaining positions in ascending order, so the
    # validation rows keep their original relative order.
    val_pos = np.setdiff1d(positions, train_pos)

    train_city = df_city.iloc[train_pos]
    val_city = df_city.iloc[val_pos]

    train_list.append(train_city)
    val_list.append(val_city)

# Rows whose city is missing match no equality filter above and therefore end
# up in neither split.
df_train = pd.concat(train_list)
df_val = pd.concat(val_list)

# 3️⃣ Prepare data for LightGBM

# Feature matrix / target vector for each split.
X_train = df_train[features]
y_train = df_train[target]

X_val = df_val[features]
y_val = df_val[target]

# NOTE(review): 'city_encoded' is passed as an ordinary numeric column. If the
# city code should be treated as a category, pass
# categorical_feature=['city_encoded'] to both Datasets - that changes the
# fitted model, so it is not applied here.
train_data = lgb.Dataset(X_train, label=y_train)
# reference=train_data ties the validation set to the training Dataset so it
# is binned with the training feature discretisation, as the LightGBM docs
# require for validation data.
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# 4️⃣ Set tuned parameters

# Keyword form of the same mapping; insertion order matches the original.
params = dict(
    # Task definition
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    verbosity=-1,
    # Learning rate and tree shape
    learning_rate=0.01,
    num_leaves=100,
    max_depth=12,
    # Column / row subsampling (rows are re-bagged every `bagging_freq` rounds)
    feature_fraction=0.85,
    bagging_fraction=0.8,
    bagging_freq=5,
    min_data_in_leaf=50,
    # Execution
    n_jobs=-1,
    # LightGBM also accepts 'gpu' (OpenCL build) and 'cuda' (CUDA build) here.
    device_type="cpu",
)

# 5️⃣ Train LightGBM model

# Stop once the validation metric has not improved for 50 rounds, and log the
# metrics of every evaluation set each 100 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100),
]

# The training set is listed in valid_sets too, so its RMSE is reported next
# to the validation RMSE in the log.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1500,
    valid_sets=[train_data, val_data],
    callbacks=training_callbacks,
)

# 6️⃣ Evaluate model

# Predict on the held-out validation rows.
y_pred = model.predict(X_val)

# RMSE is derived from the MSE; MAE and R2 come straight from scikit-learn.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

# One line per metric, rounded to two decimals.
for metric_label, metric_value in (("RMSE", rmse), ("MAE", mae), ("R2 Score", r2)):
    print(f"{metric_label}: {metric_value:.2f}")

# 7️⃣ Save model and encoder

# Artifacts are written to <root>/models, which is created on first use.
models_dir = root / 'models'
models_dir.mkdir(parents=True, exist_ok=True)

# File name -> object to persist; the encoder is stored with the model so the
# city encoding used at training time can be reloaded together with it.
artifacts = {
    'lightgbm_irradiance_daylight_tuned.pkl': model,
    'city_encoder_daylight_tuned.pkl': encoder,
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, models_dir / artifact_name)

print("✅ Tuned model and encoder saved.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['hour'] = df_daylight['time'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['month'] = df_daylight['time'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['city_encoded'] = encoder.fit_transform(df_daylight[['city']])


Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 196.603	valid_1's rmse: 196.892
[200]	training's rmse: 164.778	valid_1's rmse: 165.294
[300]	training's rmse: 157.306	valid_1's rmse: 157.913
[400]	training's rmse: 154.354	valid_1's rmse: 155.02
[500]	training's rmse: 152.479	valid_1's rmse: 153.198
[600]	training's rmse: 151.198	valid_1's rmse: 151.971
[700]	training's rmse: 150.273	valid_1's rmse: 151.102
[800]	training's rmse: 149.604	valid_1's rmse: 150.488
[900]	training's rmse: 149.074	valid_1's rmse: 150.014
[1000]	training's rmse: 148.652	valid_1's rmse: 149.654
[1100]	training's rmse: 148.292	valid_1's rmse: 149.356
[1200]	training's rmse: 147.987	valid_1's rmse: 149.114
[1300]	training's rmse: 147.708	valid_1's rmse: 148.895
[1400]	training's rmse: 147.428	valid_1's rmse: 148.676
[1500]	training's rmse: 147.198	valid_1's rmse: 148.507
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 147.198	valid_1's rmse: 148.507
RMSE: