In [1]:
# 📓 Notebook 3D – Temporal Validation + Hyperparameter Tuning (LightGBM)

import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OrdinalEncoder
import joblib

# 0️⃣ Load processed data (daylight only)

current_dir = Path().resolve()
root = current_dir.parents[1]
data_dir = root / 'data' / 'processed'

df = pd.read_parquet(data_dir / 'merged_openmeteo_pvgis.parquet')

# Filter daylight
target = 'global_irradiance_W_m2'
df_daylight = df[df[target] > 0]

# Add time features
df_daylight['hour'] = df_daylight['time'].dt.hour
df_daylight['month'] = df_daylight['time'].dt.month

# Encode city
encoder = OrdinalEncoder()
df_daylight['city_encoded'] = encoder.fit_transform(df_daylight[['city']])

# 1️⃣ Define features and target

features = [
    'temperature_2m',
    'cloudcover',
    'windspeed_10m',
    'winddirection_10m',
    'shortwave_radiation',
    'direct_radiation',
    'diffuse_radiation',
    'cloud_cover',
    'hour',
    'month',
    'city_encoded'
]

# 2️⃣ Split train/validation by city (temporal split)

train_list = []
val_list = []

for city in df_daylight['city'].unique():
    df_city = df_daylight[df_daylight['city'] == city].sort_values('time')
    cut_idx = int(0.7 * len(df_city))
    
    train_city = df_city.iloc[:cut_idx]
    val_city = df_city.iloc[cut_idx:]
    
    train_list.append(train_city)
    val_list.append(val_city)

df_train = pd.concat(train_list)
df_val = pd.concat(val_list)

# 3️⃣ Prepare data for LightGBM

X_train = df_train[features]
y_train = df_train[target]

X_val = df_val[features]
y_val = df_val[target]

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# 4️⃣ Set tuned parameters (refinados)

params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "learning_rate": 0.01,            # Más bajo, más estable
    "num_leaves": 150,                # Aumentado para captar no linealidades
    "max_depth": 10,                  # Ligeramente reducido para evitar overfitting
    "feature_fraction": 0.85,
    "bagging_fraction": 0.7,          # Menos bagging → más precisión pero con regularización
    "bagging_freq": 3,                # Más frecuente
    "min_data_in_leaf": 30,           # Más pequeño para ajustar mejor puntos atípicos
    "lambda_l1": 1.0,                 # Regularización L1 (reduce overfitting en variables irrelevantes)
    "lambda_l2": 1.0,                 # Regularización L2 (reduce overfitting global)
    "n_jobs": -1,
    "device_type": "cpu"              # Cambia a 'gpu' si tienes CUDA
}

# 5️⃣ Train LightGBM model

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=2000,
    callbacks=[
        lgb.early_stopping(100),
        lgb.log_evaluation(100)
    ]
)

# 6️⃣ Evaluate model

y_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.2f}")

# 7️⃣ Save model and encoder

models_dir = root / 'models'
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model, models_dir / 'lightgbm_irradiance_daylight_temporal.pkl')
joblib.dump(encoder, models_dir / 'city_encoder_daylight_temporal.pkl')

print("✅ Model with temporal validation and tuned hyperparameters saved.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['hour'] = df_daylight['time'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['month'] = df_daylight['time'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['city_encoded'] = encoder.fit_transform(df_daylight[['city']])


Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 196.511	valid_1's rmse: 195.033
[200]	training's rmse: 164.851	valid_1's rmse: 162.528
[300]	training's rmse: 157.469	valid_1's rmse: 155.12
[400]	training's rmse: 154.713	valid_1's rmse: 152.514
[500]	training's rmse: 152.906	valid_1's rmse: 150.92
[600]	training's rmse: 151.636	valid_1's rmse: 149.875
[700]	training's rmse: 150.62	valid_1's rmse: 149.12
[800]	training's rmse: 149.804	valid_1's rmse: 148.525
[900]	training's rmse: 149.194	valid_1's rmse: 148.117
[1000]	training's rmse: 148.673	valid_1's rmse: 147.789
[1100]	training's rmse: 148.259	valid_1's rmse: 147.543
[1200]	training's rmse: 147.915	valid_1's rmse: 147.359
[1300]	training's rmse: 147.571	valid_1's rmse: 147.168
[1400]	training's rmse: 147.253	valid_1's rmse: 146.996
[1500]	training's rmse: 146.977	valid_1's rmse: 146.864
[1600]	training's rmse: 146.721	valid_1's rmse: 146.749
[1700]	training's rmse: 146.498	valid_1's rmse: 146.657