# Enfoque 1 (Aleatorio) - Fase 1 Optimización (Modelos LINEALES + RF criterio WAPE) + Inputs Trat 2
### En este Notebook, vamos a crear un BUCLE más detallado que considere:
1. Evalúa varios modelos
2. Aplica StandardScaler cuando es necesario (modelos lineales)
3. Ajusta los hiperparámetros de cada modelo mediante validación cruzada (CV) y selecciona en cada caso el mejor modelo según el criterio WAPE, definido ad hoc
4. Recoge en una matriz las métricas R², MSE, RMSE, MAPE y WAPE
 

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Gráficos
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# Preprocesado y modelado
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error,mean_absolute_percentage_error,make_scorer
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold

from sklearn.linear_model import Ridge, Lasso, ElasticNet, SGDRegressor,LinearRegression
from sklearn.ensemble import RandomForestRegressor

from tqdm.auto import tqdm


  from .autonotebook import tqdm as notebook_tqdm


## Carga DATOS INPUTS TRATADO 2 (con índice temporal)

In [2]:
# Load the pre-processed input set from CSV.
df_total = pd.read_csv('SET_INPUTS_TRAT_2.csv', sep=',', encoding='latin-1')

# 1) Order the rows by the time index, then renumber the index from zero
#    (adjust the column name if the dataset uses a different one).
df_total = df_total.sort_values(["time_idx"])
df_total = df_total.reset_index(drop=True)

# 2) Target is the electricity price column; the features are every other
#    column except the calendar fields dropped here.
y = df_total['Price_Elec_EUR_MWh']
X = df_total.drop(columns=['Price_Elec_EUR_MWh', 'Year', 'Month', 'Day', 'DayOfYear'])

# División de los datos (Shuffle)

In [3]:
# Split the data into train and test sets: 80% of the rows go to training,
# with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [4]:
# =========================
# 1) Cross-validation scheme (swap in another splitter if preferred)
# =========================
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Search configuration per model name:
#   estimator  -> unfitted regressor placed in the pipeline's "model" step
#   scale      -> True to put a StandardScaler in front of the model
#   param_grid -> grid handed to GridSearchCV; keys carry the "model__" prefix
models_cfg = {
    "Ridge": dict(
        estimator=Ridge(random_state=42),
        scale=True,
        param_grid={"model__alpha": [0.1, 1.0, 10.0, 100.0]},
    ),
    "Lasso": dict(
        estimator=Lasso(random_state=42, max_iter=20000),
        scale=True,
        param_grid={"model__alpha": [1e-4, 1e-3, 1e-2, 1e-1, 1.0]},
    ),
    "LinearRegression": dict(
        estimator=LinearRegression(),
        scale=True,      # recommended so coefficients are comparable
        param_grid={},   # no hyperparameters to tune
    ),
    "SGDRegressor": dict(
        estimator=SGDRegressor(random_state=42),
        scale=True,
        param_grid={
            "model__loss": ["squared_error", "huber"],
            "model__penalty": ["l2", "l1", "elasticnet"],
            "model__alpha": [1e-6, 1e-5, 1e-4, 1e-3],
            "model__max_iter": [2000, 5000],
            "model__tol": [1e-3, 1e-4],
        },
    ),
    "RandomForest": dict(
        estimator=RandomForestRegressor(random_state=42),
        scale=False,     # tree ensembles do not need scaling
        param_grid={
            "model__n_estimators": [100, 300, 800],
            "model__max_leaf_nodes": [50, 150, 300],
        },
    ),
}


In [None]:
# =========================
# 3) tqdm loop + GridSearchCV + train/test metrics
# =========================
# Accumulator: one row of metrics is appended per evaluated model.
metrics_analysis = list()


def wape(y_true, y_pred):
    """Return the weighted absolute percentage error (WAPE).

    Computed as ``sum(|y_true - y_pred|) / sum(|y_true|)``.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values.

    Returns:
        The ratio of total absolute error to total absolute observed value.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    total_abs_error = np.abs(actual - predicted).sum()
    total_abs_actual = np.abs(actual).sum()
    return total_abs_error / total_abs_actual

# Expose WAPE as a scikit-learn scorer. WAPE is an error (lower is better) and
# scikit-learn maximises scores, hence greater_is_better=False.
wape_scorer = make_scorer(wape, greater_is_better=False)


# Tune, refit and evaluate every configured model in turn.
for name, cfg in tqdm(models_cfg.items()):
    # Models flagged with "scale" get a StandardScaler; the others skip the step.
    scaler_step = StandardScaler() if cfg["scale"] else "passthrough"

    pipe = Pipeline([
        ("scaler", scaler_step),
        ("model", cfg["estimator"])
    ])

    # Grid search over this model's grid, scored with WAPE on the CV splitter;
    # refit=True retrains the best configuration afterwards.
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=cfg["param_grid"],
        cv=cv,
        scoring=wape_scorer,
        n_jobs=1,
        refit=True
    )

    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Hold-out (test) metrics.
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = root_mean_squared_error(y_test, y_test_pred)
    mape_test = mean_absolute_percentage_error(y_test, y_test_pred)
    wape_test = wape(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # In-sample (train) metrics, reported alongside the test ones.
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = root_mean_squared_error(y_train, y_train_pred)
    mape_train = mean_absolute_percentage_error(y_train, y_train_pred)
    wape_train = wape(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)

    # One row per model; the key order defines the column order of the table.
    metrics_ = pd.Series(
        name=name,
        data={
            "model": name,
            "r2_test": r2_test,
            "mse_test": mse_test,
            "rmse_test": rmse_test,
            "mape_test": mape_test,
            "wape_test": wape_test,
            "r2_train": r2_train,
            "mse_train": mse_train,
            "rmse_train": rmse_train,
            "mape_train": mape_train,
            "wape_train": wape_train,  # train WAPE, not train MAPE
            "best_params": gs.best_params_,
        },
    )
    metrics_analysis.append(metrics_)
  
# Build the metrics table (one row per model) and attach display formats.
# The result is a pandas Styler wrapping the DataFrame. The WAPE columns are
# formatted like MAPE so every metric is shown with a fixed precision.
metrics_analysis_df = pd.DataFrame(metrics_analysis).style.format({
    "r2_test":    "{:.4f}",
    "mse_test":   "{:.1f}",
    "rmse_test":  "{:.3f}",
    "mape_test":  "{:.4f}",
    "wape_test":  "{:.4f}",
    "r2_train":   "{:.4f}",
    "mse_train":  "{:.1f}",
    "rmse_train": "{:.3f}",
    "mape_train": "{:.4f}",
    "wape_train": "{:.4f}",
})

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [08:33<00:00, 102.80s/it]


In [6]:
# Write the metrics table to an Excel workbook, leaving out the index column.
metrics_analysis_df.to_excel(
    excel_writer="metrics_analysis_E1_ALEA_Opt1.xlsx",
    index=False,
)