In [2]:
# 📓 Notebook 3B – Model Training (Daylight Filtering, versión compatible)

# 0️⃣ Load processed data

import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OrdinalEncoder
import joblib

# Load data
current_dir = Path().resolve()
root = current_dir.parents[1]
data_dir = root / 'data' / 'processed'

df = pd.read_parquet(data_dir / 'merged_openmeteo_pvgis.parquet')

print("✅ Data loaded. Shape:", df.shape)

# 1️⃣ Filter daylight (irradiance > 0)

target = 'global_irradiance_W_m2'
df_daylight = df[df[target] > 0]

print("✅ Daylight data. Shape:", df_daylight.shape)

# 2️⃣ Add temporal features

df_daylight['hour'] = df_daylight['time'].dt.hour
df_daylight['month'] = df_daylight['time'].dt.month

# 3️⃣ Encode city as categorical

encoder = OrdinalEncoder()
df_daylight['city_encoded'] = encoder.fit_transform(df_daylight[['city']])

# 4️⃣ Define features and target

features = [
    'temperature_2m',
    'cloudcover',
    'windspeed_10m',
    'winddirection_10m',
    'shortwave_radiation',
    'direct_radiation',
    'diffuse_radiation',
    'cloud_cover',
    'hour',
    'month',
    'city_encoded'
]

# 5️⃣ Split train/validation by city (70-30 per city)

train_list = []
val_list = []

for city in df_daylight['city'].unique():
    df_city = df_daylight[df_daylight['city'] == city]
    train_city = df_city.sample(frac=0.7, random_state=42)
    val_city = df_city.drop(train_city.index)
    
    train_list.append(train_city)
    val_list.append(val_city)

df_train = pd.concat(train_list)
df_val = pd.concat(val_list)

print("✅ Train shape:", df_train.shape)
print("✅ Validation shape:", df_val.shape)

# 6️⃣ Train LightGBM model (version compatible with callbacks)

X_train = df_train[features]
y_train = df_train[target]

X_val = df_val[features]
y_val = df_val[target]

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": 8,
    "n_jobs": -1
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=500,
    callbacks=[
        lgb.early_stopping(20),
        lgb.log_evaluation(50)
    ]
)

# 7️⃣ Evaluate model

y_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.2f}")

# 8️⃣ Save model and encoder

models_dir = root / 'models'
models_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(model, models_dir / 'lightgbm_irradiance_daylight.pkl')
joblib.dump(encoder, models_dir / 'city_encoder_daylight.pkl')

print("✅ Model and encoder saved.")




✅ Data loaded. Shape: (7293312, 11)
✅ Daylight data. Shape: (3512764, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['hour'] = df_daylight['time'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['month'] = df_daylight['time'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_daylight['city_encoded'] = encoder.fit_transform(df_daylight[['city']])


✅ Train shape: (2458936, 14)
✅ Validation shape: (1053828, 14)
Training until validation scores don't improve for 20 rounds
[50]	training's rmse: 162.382	valid_1's rmse: 162.822
[100]	training's rmse: 156.275	valid_1's rmse: 156.774
[150]	training's rmse: 153.947	valid_1's rmse: 154.495
[200]	training's rmse: 152.772	valid_1's rmse: 153.362
[250]	training's rmse: 151.959	valid_1's rmse: 152.592
[300]	training's rmse: 151.338	valid_1's rmse: 152.012
[350]	training's rmse: 150.825	valid_1's rmse: 151.54
[400]	training's rmse: 150.436	valid_1's rmse: 151.198
[450]	training's rmse: 150.078	valid_1's rmse: 150.881
[500]	training's rmse: 149.731	valid_1's rmse: 150.575
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 149.731	valid_1's rmse: 150.575
RMSE: 150.58
MAE: 98.90
R2 Score: 0.79
✅ Model and encoder saved.
