# Enfoque 2 (Series Temporales) - Fase 1 Optimización (Modelos LINEALES + Árboles criterio RMSE) + Inputs Tratados_2

1. Evalúa varios modelos
2. Aplica `StandardScaler` cuando es necesario (modelos lineales)
3. Ajusta los hiperparámetros de cada modelo mediante una búsqueda con validación cruzada (CV) y, en cada caso, se queda con el mejor modelo según el CRITERIO RMSE definido ad-hoc
4. Obtiene una matriz de métricas: r2, MSE, RMSE, MAPE y WAPE
 

In [2]:
import pandas as pd
import numpy as np


# Preprocesado y modelado
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error,mean_absolute_percentage_error,make_scorer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold

from sklearn.linear_model import Ridge, Lasso, ElasticNet, SGDRegressor,LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

from tqdm.auto import tqdm
# Para Importar funciones generadas ad-hoc
import importlib

import TFM_errores as tfm_err
importlib.reload(tfm_err)

  from .autonotebook import tqdm as notebook_tqdm


<module 'TFM_errores' from 'g:\\.shortcut-targets-by-id\\1DbNb9MNcjKmtl63ZWnzd5LmBznAULWIf\\TFM Kschool\\02. Modelos\\1. Modelo\\Precios\\Optimizaciones\\TFM_errores.py'>

## Carga DATOS INPUTS TRATADO 2 (con índice temporal)

In [None]:
# Load the treated input set (version 2) from CSV.
df_total = pd.read_csv("SET_INPUTS_TRAT_2.csv", sep=",", encoding="latin-1")

# 1) Put the rows in time_idx order and renumber the index from zero.
df_total = df_total.sort_values(by="time_idx", ignore_index=True)

# 2) Target and features. The calendar columns are dropped together with the
#    target, so only the remaining columns are used as predictors.
y = df_total["Price_Elec_EUR_MWh"]
X = df_total.drop(
    columns=["Price_Elec_EUR_MWh", "Year", "Month", "Day", "DayOfYear"],
)

# División del modelo (Time Series)

In [7]:
# Chronological train/test split.
#
# Derive ONE stable ordering from time_idx and apply it positionally to both
# X and y. Sorting X, resetting its index and then selecting y.loc[X.index]
# never reorders y (X.index is already 0..n-1 after the reset), so any row
# movement in X -- including tied time_idx values being permuted by the
# default, unstable sort -- would silently misalign features and target.
time_order = np.argsort(X["time_idx"].to_numpy(), kind="stable")
X = X.iloc[time_order].reset_index(drop=True)
y = y.iloc[time_order].reset_index(drop=True)

# The first 80% of the ordered rows are used for training and the rest for
# testing; no shuffling, so the test rows come last in time_idx order.
train_size = 0.8
split = int(train_size * len(X))

X_train = X.iloc[:split]
X_test = X.iloc[split:]

y_train = y.iloc[:split]
y_test = y.iloc[split:]


In [None]:
# =========================
# 1) Cross-validation scheme and model catalogue
# =========================
from sklearn.model_selection import TimeSeriesSplit

# X_train is ordered by time_idx, so every validation fold must come AFTER
# the rows it is trained on. A shuffled KFold mixes past and future rows in
# the same training fold, which leaks future information into the
# hyperparameter selection. TimeSeriesSplit uses expanding training windows
# and keeps the temporal order.
cv = TimeSeriesSplit(n_splits=5)

# Per-model configuration consumed by the grid-search loop:
#   "estimator"  -> unfitted regressor placed in the pipeline step "model"
#   "scale"      -> whether a StandardScaler step precedes the model
#   "param_grid" -> GridSearchCV grid (keys use the "model__" step prefix)
models_cfg = {
    "Ridge": {
        "estimator": Ridge(random_state=42),
        "scale": True,
        "param_grid": {
            "model__alpha": [0.1, 1.0, 10.0, 100.0]
        }
    },
    "Lasso": {
        "estimator": Lasso(random_state=42, max_iter=20000),
        "scale": True,
        "param_grid": {
            "model__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1.0]
        }
    },
    "LinearRegression": {
        "estimator": LinearRegression(),
        "scale": True,    # recommended so coefficients are comparable
        "param_grid": {}  # nothing to tune
    },
    "HistGradientBoosting": {
        "estimator": HistGradientBoostingRegressor(
            random_state=42,
            loss="squared_error"
        ),
        "scale": False,  # tree-based -> no scaling
        "param_grid": {
            "model__max_depth": [3, 5, 8, None],
            "model__learning_rate": [0.01, 0.05, 0.1],
            "model__max_iter": [300, 800, 1500],
            "model__min_samples_leaf": [10, 30, 50],
            "model__l2_regularization": [0.0, 1e-3, 1e-2]
        }
    },
    "RandomForest": {
        "estimator": RandomForestRegressor(random_state=42),
        "scale": False,  # tree-based -> no scaling
        "param_grid": {
            "model__n_estimators": [100, 300, 800, 1000],
            "model__max_leaf_nodes": [50, 150, 300],
        }
    }
}


In [None]:
# =========================
# 3) tqdm loop + GridSearchCV + test metrics
# =========================
# Accumulates one pandas Series of metrics per evaluated model.
metrics_analysis = []


def wape(y_true, y_pred):
    """Return the weighted absolute percentage error.

    Computed as ``sum(|y_true - y_pred|) / sum(|y_true|)``. Both inputs are
    converted to NumPy arrays first, so the subtraction is element-wise by
    position rather than aligned on any pandas index.
    """
    actual, predicted = np.asarray(y_true), np.asarray(y_pred)
    total_abs_error = np.abs(actual - predicted).sum()
    total_abs_actual = np.abs(actual).sum()
    return total_abs_error / total_abs_actual

# scikit-learn scorer built from wape(). greater_is_better=False makes
# sklearn negate the value, because model selection always maximises the
# score. The grid search below selects on RMSE, not on this scorer.
wape_scorer = make_scorer(
    wape,
    greater_is_better=False
)

for name, cfg in tqdm(models_cfg.items()):
    # Linear models get a StandardScaler; tree models skip the step.
    scaler_step = StandardScaler() if cfg["scale"] else "passthrough"

    pipe = Pipeline([
        ("scaler", scaler_step),
        ("model", cfg["estimator"])
    ])

    # Exhaustive search over the model's grid; the best configuration is the
    # one with the lowest cross-validated RMSE and is refit on all of X_train.
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=cfg["param_grid"],
        cv=cv,
        scoring="neg_root_mean_squared_error",
        n_jobs=1,
        refit=True
    )

    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Hold-out (test) metrics.
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = root_mean_squared_error(y_test, y_test_pred)
    mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
    wape_test = wape(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # In-sample (train) metrics, kept to compare against the test metrics.
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = root_mean_squared_error(y_train, y_train_pred)
    mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
    wape_train = wape(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)

    metrics_ = pd.Series(
        name=name,
        data={
            "model": name,
            "r2_test": r2_test,
            "mse_test": mse_test,
            "rmse_test": rmse_test,
            "mape_test": mape_test,
            "wape_test": wape_test,
            "r2_train": r2_train,
            "mse_train": mse_train,
            "rmse_train": rmse_train,
            "mape_train": mape_train,
            # Fixed: this key previously stored mape_train by mistake.
            "wape_train": wape_train,
            "best_params": gs.best_params_,
        },
    )
    metrics_analysis.append(metrics_)

# One row per model. Note that the result is a pandas Styler (display
# formatting attached), not a plain DataFrame.
metrics_analysis_df = pd.DataFrame(metrics_analysis).style.format({
    "r2_test":   "{:.4f}",
    "mse_test":  "{:.1f}",
    "rmse_test": "{:.3f}",
    "mape_test": "{:.4f}",
    "wape_test": "{:.4f}",
    "r2_train":   "{:.4f}",
    "mse_train":  "{:.1f}",
    "rmse_train": "{:.3f}",
    "mape_train": "{:.4f}",
    "wape_train": "{:.4f}"
})

100%|██████████| 5/5 [1:01:05<00:00, 733.03s/it]


In [None]:
# Write the metrics table to an Excel workbook, omitting the index column.
metrics_analysis_df.to_excel(
    excel_writer="E2_TEMP_Opt1_RMSE_metrics_analysis.xlsx",
    index=False,
)