# Notebook 3 – Model Training

## 1. Load processed data

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
import joblib

# Load data
current_dir = Path().resolve()
root = current_dir.parents[1]

data_dir = root / 'data' / 'processed'
df = pd.read_parquet(data_dir / 'merged_openmeteo_pvgis.parquet')

print("✅ Data loaded. Shape:", df.shape)


✅ Data loaded. Shape: (7293312, 11)


## 1. Define features and target

In [2]:
target = 'global_irradiance_W_m2'

features = [
    'temperature_2m',
    'cloudcover',
    'windspeed_10m',
    'winddirection_10m',
    'shortwave_radiation',
    'direct_radiation',
    'diffuse_radiation',
    'cloud_cover',
    'city'  # Include city as feature
]


## 2. Encode city as categorical feature

In [3]:
encoder = OrdinalEncoder()
df['city_encoded'] = encoder.fit_transform(df[['city']])

# Replace city string by numeric in features list
features = [f for f in features if f != 'city'] + ['city_encoded']

## 3. Split train/validation by city (70-30 per city)

In [4]:
train_list = []
val_list = []

for city in df['city'].unique():
    df_city = df[df['city'] == city]
    train_city = df_city.sample(frac=0.7, random_state=42)
    val_city = df_city.drop(train_city.index)
    
    train_list.append(train_city)
    val_list.append(val_city)

df_train = pd.concat(train_list)
df_val = pd.concat(val_list)

print("✅ Train shape:", df_train.shape)
print("✅ Validation shape:", df_val.shape)

✅ Train shape: (5105308, 12)
✅ Validation shape: (2188004, 12)


## 4.  Prepare LightGBM datasets

In [5]:
X_train = df_train[features]
y_train = df_train[target]

X_val = df_val[features]
y_val = df_val[target]

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

## 5. Train LightGBM model

In [7]:
# params = {
#     "objective": "regression",
#     "metric": "rmse",
#     "boosting_type": "gbdt",
#     "verbosity": -1,
#     "learning_rate": 0.05,
#     "num_leaves": 31,
#     "max_depth": 8,
#     "n_jobs": -1
# }

# model = lgb.train(
#     params,
#     train_data,
#     valid_sets=[train_data, val_data],
#     num_boost_round=500,
#     early_stopping_rounds=20,
#     verbose_eval=50
# )


import lightgbm as lgb
from tqdm import tqdm

params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "verbosity": -1,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": 8,
    "n_jobs": -1
}

num_boost_round = 500

# Crear barra de progreso
pbar = tqdm(total=num_boost_round)

# Callback personalizado para tqdm
def tqdm_callback(env):
    if env.iteration % 10 == 0 or env.iteration == num_boost_round:
        pbar.update(10)

# Entrenamiento con callbacks
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=num_boost_round,
    callbacks=[
        tqdm_callback,
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=50)
    ]
)

pbar.close()



  0%|          | 0/500 [01:24<?, ?it/s]


Training until validation scores don't improve for 20 rounds




[50]	training's rmse: 222.96	valid_1's rmse: 223.001




[100]	training's rmse: 218.259	valid_1's rmse: 218.347




[150]	training's rmse: 216.096	valid_1's rmse: 216.24




[200]	training's rmse: 214.642	valid_1's rmse: 214.834




[250]	training's rmse: 213.438	valid_1's rmse: 213.665




[300]	training's rmse: 212.681	valid_1's rmse: 212.944




[350]	training's rmse: 211.91	valid_1's rmse: 212.207




[400]	training's rmse: 211.136	valid_1's rmse: 211.47




[450]	training's rmse: 210.373	valid_1's rmse: 210.749


100%|██████████| 500/500 [01:10<00:00,  7.14it/s]

[500]	training's rmse: 209.564	valid_1's rmse: 209.973
Did not meet early stopping. Best iteration is:
[500]	training's rmse: 209.564	valid_1's rmse: 209.973





## 6. Evaluate model

In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.2f}")



RMSE: 209.97
MAE: 144.38
R2 Score: 0.59
