# Modelo Precios : Generación de Pipeline para Producción

In [3]:
# Cargamos las librerias necesarias
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from joblib import dump
from sklearn.ensemble import HistGradientBoostingRegressor



### Las VARIABLES RAW que alimentan el modelo de Precios son las siguientes (valores diarios)
RangeIndex: 1826 entries, 0 to 1825
Data columns (total 18 columns):

| #  | Columna                          | Non-Null Count | Tipo de dato |
|----|----------------------------------|----------------|--------------|
| 0  | Year                             | 1826           | int64        |
| 1  | Month                            | 1826           | int64        |
| 2  | Day                              | 1826           | int64        |
| 3  | DayOfYear                        | 1826           | int64        |
| 4  | Demanda_GWh                      | 1826           | float64     |
| 5  | Gen_Eolica_GWh                   | 1826           | float64     |
| 6  | Gen_Solar_GWh                    | 1826           | float64     |
| 7  | Pot_Eolica_GW                    | 1826           | float64     |
| 8  | Pot_Solar_GW                     | 1826           | float64     |
| 9  | Price_Elec_EUR_MWh               | 1826           | float64     |
| 10 | Temp_min_C_Nat                   | 1826           | float64     |
| 11 | Temp_max_C_Nat                   | 1826           | float64     |
| 12 | Temp_media_C_Nat                 | 1826           | float64     |
| 13 | RadiacionGlobal_media_Wm2_Nat     | 1826           | float64     |
| 14 | Viento_media_ms_Nat              | 1826           | float64     |
| 15 | Price_Gas_EUR_MWh                | 1826           | float64     |
| 16 | festivo                          | 1826           | int64        |
| 17 | IPC Diario                       | 1826           | float64     |


In [4]:
# Load the raw model inputs; the CSV is read with the latin1 encoding.
df_inputs_precios_raw = pd.read_csv(
    "SET_INPUTS_BASE_2.csv",
    encoding="latin1",
)


## TRANSFORMERS

47.656602739726026

In [None]:
# --- Transformer para eliminar columnas ---
class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a configured set of columns from a DataFrame.

    Parameters
    ----------
    columns
        Column label(s) to remove. Labels that are not present in the
        input are skipped without raising.
    """

    def __init__(self, columns):
        # Stored exactly as given (no copy, no validation).
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame equal to ``X`` minus the configured columns."""
        trimmed = X.drop(labels=self.columns, axis="columns", errors="ignore")
        return trimmed
    
# --- Transformer para feature engineering ---
class FeatureBuilderPriceFixed(BaseEstimator, TransformerMixin):
    """
    Stateless feature builder for the electricity price model.

    Nothing is learned in ``fit``. ``transform`` validates its input and
    returns a copy of it with these columns added:

    - ``Variac_Temp``: absolute distance of ``Temp_media_C_Nat`` from
      ``temp_ref``.
    - ``Gen_E_Relat`` / ``Gen_S_Relat``: wind / solar generation divided by
      ``Demanda_GWh`` (NaN where the demand is 0).
    - ``IndiceGas``: ``Price_Gas_EUR_MWh`` divided by ``precio_base_gas``,
      times 100.
    - ``time_idx``: row position 0..n-1 (after the optional sort).
    - ``sin_doy`` / ``cos_doy``: sine / cosine of ``DayOfYear`` using a
      period of 365.25.

    Parameters
    ----------
    precio_base_gas : float, default 47.66
        Fixed base gas price used as the denominator of ``IndiceGas``.
        Must be finite and non-zero.
    temp_ref : float, default 16.0
        Reference temperature for ``Variac_Temp``.
    sort : bool, default True
        When True, rows are ordered by Year, Month, Day and the index is
        reset before the features are built.

    Notes
    -----
    With ``sort=True`` the output rows can be in a different order than the
    input rows and the original index is discarded. Any ``y`` passed
    alongside ``X`` is NOT reordered, so callers should feed data that is
    already ordered by date (or use ``sort=False``).
    """
    def __init__(
        self,
        precio_base_gas: float = 47.66,
        temp_ref: float = 16.0,
        sort: bool = True
    ):
        # Parameters are stored exactly as received: converting them here
        # (e.g. with float()) makes sklearn.base.clone fail for int or
        # numpy scalar inputs. Conversion/validation happens at use time.
        self.precio_base_gas = precio_base_gas
        self.temp_ref = temp_ref
        self.sort = sort

        # Kept as an instance attribute under its original name for
        # backward compatibility with code that reads it.
        self.required_columns_ = [
            "Year", "Month", "Day", "DayOfYear",
            "Demanda_GWh", "Gen_Eolica_GWh", "Gen_Solar_GWh",
            "Temp_media_C_Nat", "Price_Gas_EUR_MWh",
            "Pot_Eolica_GW", "Pot_Solar_GW",
            "Temp_min_C_Nat", "Temp_max_C_Nat",
            "RadiacionGlobal_media_Wm2_Nat", "Viento_media_ms_Nat",
            "festivo", "IPC Diario",
        ]

    def fit(self, X, y=None):
        """Validate ``precio_base_gas`` and return self; nothing is learned.

        Raises
        ------
        ValueError
            If ``precio_base_gas`` is not finite or equals 0.
        """
        self._validated_base_price()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        ValueError
            If a required column is missing, or if ``precio_base_gas`` is
            not finite or equals 0.
        """
        df = self._ensure_df(X)
        self._check_required_columns(df)
        # Checked here as well as in fit: the transformer is stateless, so
        # transform may be called without fit or after set_params.
        base_price = self._validated_base_price()

        df = df.copy()

        if self.sort:
            df = df.sort_values(["Year", "Month", "Day"]).reset_index(drop=True)

        # Absolute deviation of the mean temperature from the reference.
        df["Variac_Temp"] = (df["Temp_media_C_Nat"] - self.temp_ref).abs()

        # Zero demand becomes NaN so the ratios are NaN instead of +/-inf.
        denom = df["Demanda_GWh"].replace(0, np.nan)
        df["Gen_E_Relat"] = df["Gen_Eolica_GWh"] / denom
        df["Gen_S_Relat"] = df["Gen_Solar_GWh"] / denom

        # Gas price index against the fixed base price.
        df["IndiceGas"] = (df["Price_Gas_EUR_MWh"] / base_price) * 100

        # Positional time index plus annual seasonality terms.
        df["time_idx"] = np.arange(len(df), dtype=int)
        df["sin_doy"] = np.sin(2 * np.pi * df["DayOfYear"] / 365.25)
        df["cos_doy"] = np.cos(2 * np.pi * df["DayOfYear"] / 365.25)

        return df

    # ---------- helpers ----------
    def _validated_base_price(self) -> float:
        """Return ``precio_base_gas`` as a float, rejecting 0 and non-finite values."""
        base_price = float(self.precio_base_gas)
        if not np.isfinite(base_price) or base_price == 0:
            raise ValueError("precio_base_gas debe ser un número finito y distinto de 0.")
        return base_price

    def _check_required_columns(self, X: pd.DataFrame):
        """Raise ValueError listing every required column absent from ``X``."""
        missing = [c for c in self.required_columns_ if c not in X.columns]
        if missing:
            raise ValueError(f"Faltan columnas requeridas en el input: {missing}")

    @staticmethod
    def _ensure_df(X):
        """Return ``X`` unchanged if it is a DataFrame, otherwise raise TypeError."""
        if isinstance(X, pd.DataFrame):
            return X
        raise TypeError("Este transformer espera un pandas.DataFrame como entrada.")
    
  

# Funciones para el entrenamiento del MODELO, y generación de métricas

In [6]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    The percentage error is undefined where the true value is 0, so those
    observations are left out of the average instead of turning the whole
    metric into ``inf``/``nan``. For inputs without zeros the result is the
    plain mean of ``|y_true - y_pred| / |y_true|`` times 100.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values (broadcast against each other).

    Returns
    -------
    float
        MAPE in percent, or ``nan`` when there is no observation with a
        non-zero true value (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # The division is evaluated everywhere and the undefined entries are
    # masked afterwards, so the floating-point warnings are silenced here.
    with np.errstate(divide="ignore", invalid="ignore"):
        abs_pct_error = np.abs((y_true - y_pred) / y_true)

    valid = np.broadcast_to(y_true != 0, abs_pct_error.shape)
    if not valid.any():
        return np.nan
    return np.mean(abs_pct_error[valid]) * 100


def wape(y_true, y_pred):
    """Return the weighted absolute percentage error, in percent.

    Computed as the sum of absolute errors divided by the sum of absolute
    true values, times 100.
    """
    actual = np.asarray(y_true)
    forecast = np.asarray(y_pred)
    total_abs_error = np.abs(actual - forecast).sum()
    total_abs_actual = np.abs(actual).sum()
    return total_abs_error / total_abs_actual * 100


def _regression_metrics(y_true, y_pred):
    """Return the MAPE / WAPE / R2 / RMSE dictionary for one data split."""
    return {
        "MAPE (%)": mape(y_true, y_pred),
        "WAPE (%)": wape(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
    }


def train_price_model(
    df_inputs_price,
    preprocess_pipe,
    idx_train_ini=500,
    train_days=540,
    hgb_params=None
):
    """
    Train the price model on a positional window and evaluate it.

    The inputs are run through ``preprocess_pipe`` and ordered by
    ``time_idx``. Rows ``[idx_train_ini, idx_train_ini + train_days)`` form
    the training split and every row after it forms the test split.

    Parameters
    ----------
    df_inputs_price : pandas.DataFrame
        Raw inputs; must include the target column ``Price_Elec_EUR_MWh``.
    preprocess_pipe
        Object whose ``fit_transform`` returns a DataFrame that contains a
        ``time_idx`` column.
    idx_train_ini : int, default 500
        Position of the first training row.
    train_days : int, default 540
        Number of rows in the training window.
    hgb_params : dict or None
        Keyword arguments for ``HistGradientBoostingRegressor``. When None,
        the defaults defined in this function are used.

    Returns
    -------
    tuple
        ``(final_pipe, X_train, y_train, y_train_pred, y_test_pred, metrics)``
        where ``final_pipe`` is the fitted pipeline (column dropper + model)
        and ``metrics`` is ``{"train": {...}, "test": {...}}`` with the keys
        ``"MAPE (%)"``, ``"WAPE (%)"``, ``"R2"`` and ``"RMSE"``.

    Raises
    ------
    ValueError
        If the requested window leaves the train or the test split empty.
    """
    target_col = "Price_Elec_EUR_MWh"

    # 1) Feature engineering, ordered by the positional time index.
    df_input_preproc = preprocess_pipe.fit_transform(df_inputs_price)
    df_input_preproc = df_input_preproc.sort_values("time_idx").reset_index(drop=True)

    # 2) Split target and predictors.
    y = df_input_preproc[target_col]
    X = df_input_preproc.drop(columns=[target_col])

    # 3) Positional train window; the test split is everything after it.
    idx_train_fin = idx_train_ini + train_days

    X_train = X.iloc[idx_train_ini:idx_train_fin]
    X_test = X.iloc[idx_train_fin:]

    y_train = y.iloc[idx_train_ini:idx_train_fin]
    y_test = y.iloc[idx_train_fin:]

    # Fail early with a readable message instead of an error raised from
    # inside the estimator when one of the splits has no rows.
    if len(X_train) == 0 or len(X_test) == 0:
        raise ValueError(
            "Ventana temporal inválida: "
            f"train={len(X_train)} filas, test={len(X_test)} filas "
            f"(idx_train_ini={idx_train_ini}, train_days={train_days}, "
            f"total={len(X)})."
        )

    # 4) Model pipeline: drop the calendar/helper columns, then fit the model.
    if hgb_params is None:
        hgb_params = {
            "random_state": 42,
            "loss": "squared_error",
            "max_depth": 8,
            "max_iter": 1500,
            "min_samples_leaf": 30,
            "l2_regularization": 1e-4,
            "learning_rate": 0.1,
        }
    else:
        # Work on a copy so the caller's dictionary is never shared.
        hgb_params = dict(hgb_params)

    final_pipe = Pipeline(steps=[
        ("drop_cols", ColumnDropper(columns=[
            "Year", "Month", "Day", "DayOfYear", "time_idx"
        ])),
        ("model", HistGradientBoostingRegressor(**hgb_params)),
    ])

    # 5) Training.
    final_pipe.fit(X_train, y_train)

    # 6) Predictions on both splits.
    y_train_pred = final_pipe.predict(X_train)
    y_test_pred = final_pipe.predict(X_test)

    # 7) Metrics per split.
    metrics = {
        "train": _regression_metrics(y_train, y_train_pred),
        "test": _regression_metrics(y_test, y_test_pred),
    }

    return final_pipe, X_train, y_train, y_train_pred, y_test_pred, metrics

# Pipeline para el preprocesado

In [7]:
# Preprocessing pipeline: a single feature-engineering step with a fixed
# gas base price of 47.66, reference temperature 16.0 and date sorting on.
preprocess_pipe = Pipeline(
    [
        (
            "feat",
            FeatureBuilderPriceFixed(precio_base_gas=47.66, temp_ref=16.0, sort=True),
        ),
    ]
)


# Generar Modelo entrenado y obtener métricas

In [8]:
# Train on positions 830..1459 (630 rows); every later row is the test split.
(
    final_pipe,
    X_train,
    y_train,
    y_train_pred,
    y_test_pred,
    metrics,
) = train_price_model(
    df_inputs_precios_raw,
    preprocess_pipe,
    idx_train_ini=830,
    train_days=630,
)

print(metrics)

{'train': {'MAPE (%)': np.float64(0.1880221356778112), 'WAPE (%)': np.float64(0.16771771486312342), 'R2': 0.9999561786784444, 'RMSE': np.float64(0.4418241322261146)}, 'test': {'MAPE (%)': np.float64(70.95056778840772), 'WAPE (%)': np.float64(21.57780930097696), 'R2': 0.4396110782692445, 'RMSE': np.float64(24.569501724111163)}}


## Generar Pipeline de Producción, entrenarlo y guardarlo
#### Se tiene en cuenta que hay variables que se pasarán en producción y que se usarán para el feature engineering,
#### pero que el modelo no ve, por lo que se eliminan antes de llamar al modelo.


In [9]:
# Pipeline completo: preprocesado + modelo
# Production pipeline: feature engineering -> drop helper columns -> model.
production_pipe = Pipeline(
    [
        (
            "feat",
            # NOTE(review): the base gas price here is 45.73 while
            # preprocess_pipe uses 47.66 - confirm which value is intended.
            FeatureBuilderPriceFixed(precio_base_gas=45.73, temp_ref=16.0, sort=True),
        ),
        (
            "drop_for_model",
            ColumnDropper(columns=["Year", "Month", "Day", "DayOfYear", "time_idx"]),
        ),
        (
            "model",
            HistGradientBoostingRegressor(
                **{
                    "random_state": 42,
                    "loss": "squared_error",
                    "max_depth": 8,
                    "max_iter": 1500,
                    "min_samples_leaf": 30,
                    "l2_regularization": 1e-4,
                    "learning_rate": 0.1,
                }
            ),
        ),
    ]
)


# 1) Fit the whole pipeline. The feature step uses a fixed base price and
#    learns nothing; the estimator being trained is the
#    HistGradientBoostingRegressor. X_train comes from train_price_model, so
#    it already holds the engineered columns; the "feat" step recomputes them.
production_pipe.fit(X_train, y_train)

# 2) Persist the fitted pipeline to disk.
dump(production_pipe, "modelo_precio_electricidad_prod_HGB.joblib")

['modelo_precio_electricidad_prod_HGB.joblib']