
# Coffee Demand Forecasting — End-to-End (v2.5)

**Novedades v2.5**  
- Bugfix en `rolling_origins` (slicing inverso).  
- **Prophet**: regressors futuros correctamente provistos por `product` (sin NaN).  
- **LGBM**: opción para objetivos de conteo (`poisson`) y **Tweedie** (rejilla corta de `variance_power`).  
- **SARIMAX por producto** (m=7, con exógenas).  
- Métricas ampliadas: **MASE** y **cobertura** de intervalos p10–p90.  
- Nuevas features: `totals_day_roll_7` (causal), `competitor_sum_t1`, `weekofyear`, víspera/post-feriado.  
- Ranking por **producto×h** y **ensemble** por producto (mejor de cada familia).

> Ejecuta las celdas en orden. Deja `DATA_PATH` y `INDEX_PATH` apuntando a tus archivos.


In [1]:
#1
import os
os.environ["KERAS_BACKEND"] = "torch"   
import keras, torch
print("Keras:", keras.__version__, "| Backend:", keras.config.backend(), "| Torch:", torch.__version__)

Keras: 3.4.1 | Backend: torch | Torch: 2.2.2+cpu


In [2]:
#2
# !pip install pandas numpy lightgbm prophet scikit-learn statsmodels pmdarima --quiet

from pathlib import Path
import os, json, sys, platform, time, hashlib, warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from IPython.display import display

#Paths
DATA_PATH = "D:\Julian\Estudio\Maestria Inteligencia Artificial\Materias\PROYECTO - DESARROLLO Y DESPLIEGUE DE SOLUCIONES\MICRO - PROYECTOS\REPOSITORIO\coffee-sales-project\data\processed\coffee_ml_features.csv"    # diario×producto
INDEX_PATH = "D:\Julian\Estudio\Maestria Inteligencia Artificial\Materias\PROYECTO - DESARROLLO Y DESPLIEGUE DE SOLUCIONES\MICRO - PROYECTOS\REPOSITORIO\coffee-sales-project\data\clean\index_1.csv"              # transaccional con clima+festivos (opcional)

#Parámetros generales
TARGET = "transactions"
HORIZON = 7
N_ORIGINS = 4
MIN_TRAIN_DAYS = 150                    
TOPK_IMP = 40                           

# Opciones de modelado
USE_LOG1P_TARGET = False
LGBM_OBJECTIVE = "auto"   # "auto" | "poisson" | "tweedie"
TWEEDIE_POWERS = [1.1, 1.3, 1.5]
CAP_OUTLIERS = False
OUTLIER_Q = 0.995

# Calendario y contexto
USE_RICH_CALENDAR = True
UA_HOLIDAYS_PATH = None                 
ADD_BUSINESS_AGGREGATES = True

# Merge desde index_1.csv (clima + festivo)
USE_INDEX_WEATHER_HOLIDAYS = True
WEATHER_AGG = {
    "wx_temperature_2m": "mean",
    "wx_precipitation": "sum",
    "wx_cloudcover": "mean"
}
HOLIDAY_COL = "is_holiday"              

# Prophet regresores
PROPHET_USE_REGRESSORS = True

# Semilla reproducible
RANDOM_STATE = 42

#Directorio de resultados
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

print("CWD:", os.getcwd())
print("DATA_PATH:", Path(DATA_PATH).resolve())
print("INDEX_PATH:", Path(INDEX_PATH).resolve())
print("RESULTS_DIR:", RESULTS_DIR.resolve())


CWD: C:\Users\Julian
DATA_PATH: D:\Julian\Estudio\Maestria Inteligencia Artificial\Materias\PROYECTO - DESARROLLO Y DESPLIEGUE DE SOLUCIONES\MICRO - PROYECTOS\REPOSITORIO\coffee-sales-project\data\processed\coffee_ml_features.csv
INDEX_PATH: D:\Julian\Estudio\Maestria Inteligencia Artificial\Materias\PROYECTO - DESARROLLO Y DESPLIEGUE DE SOLUCIONES\MICRO - PROYECTOS\REPOSITORIO\coffee-sales-project\data\clean\index_1.csv
RESULTS_DIR: C:\Users\Julian\results


In [3]:
#3
def _normalize_for_forecast_save(df_in: pd.DataFrame, target: str) -> pd.DataFrame:
    """Estandariza columnas para el ensemble: date, product, h, y, yhat, (p10, p90), model."""
    df = df_in.copy()
    # h como columna
    if "h" not in df.columns:
        if "index" in df.columns: df = df.rename(columns={"index": "h"})
        elif isinstance(df.index, pd.MultiIndex) and "h" in df.index.names: df = df.reset_index()

    # renombrar verdad a 'y' (admite varios sinónimos)
    y_candidates = ["y", target, "revenue", "units", "qty", "quantity", "sales", "target", "y_true"]
    y_col = next((c for c in y_candidates if c in df.columns), None)
    if y_col is not None and y_col != "y":
        df = df.rename(columns={y_col: "y"})

    cols = [c for c in ["date","product","h","y","yhat","yhat_p10","yhat_p90","model"] if c in df.columns]
    return df[cols]


In [4]:
#4
#Setup Mlflow
import sys, site
"""!{sys.executable} -m pip install --user -U setuptools wheel
!{sys.executable} -m pip install --user "mlflow==2.14.1"
"""
# Asegura que el kernel vea la ruta de user site en esta sesión
import sys, site
sys.path.insert(0, site.getusersitepackages())

import mlflow
print("MLflow OK:", mlflow.__version__)


import mlflow, json, os
from pathlib import Path
import pandas as pd
import numpy as np

# Carpeta local de tracking dentro de results/
MLFLOW_DIR = (RESULTS_DIR / "mlruns").resolve()
mlflow.set_tracking_uri(MLFLOW_DIR.as_uri())     # p.ej. file:///.../results/mlruns
mlflow.set_experiment("CoffeeForecasting")


def _mlflow_log_run(model_name: str,
                    params: dict,
                    overall_csv: Path,
                    by_h_csv: Path,
                    extra_artifacts: list = None,
                    tags: dict = None):
    """
    Lee CSV de métricas (overall y por h) y los registra en MLflow.
    - model_name: nombre del run (ej. 'lgbm_direct')
    - params: dict con hiperparámetros/config
    - overall_csv: ruta al *_metrics_overall.csv
    - by_h_csv: ruta al *_metrics_by_h.csv (si no tiene 'h', se infiere el step)
    - extra_artifacts: rutas extra para adjuntar (forecasts, modelos, etc.)
    - tags: dict de etiquetas (ej. {'stage':'train'})
    """
    with mlflow.start_run(run_name=model_name):
        # parámetros 
        if params:
            clean = {}
            for k, v in params.items():
                if isinstance(v, (int, float, str, bool)) or v is None:
                    clean[k] = v
                else:
                    clean[k] = str(v)
            mlflow.log_params(clean)

        if tags:
            mlflow.set_tags(tags)

        # métricas overall
        try:
            if Path(overall_csv).exists():
                ovr = pd.read_csv(overall_csv)
                if not ovr.empty:
                    row = ovr.iloc[0].to_dict()
                    for key in ["MAE","RMSE","sMAPE","MAPE","COV_p10_p90_%"]:
                        if key in row and pd.notna(row[key]):
                            mlflow.log_metric(key, float(row[key]))
        except Exception as e:
            print(f"[mlflow] warning leyendo overall '{overall_csv}':", e)

        # métricas por horizonte
        try:
            if Path(by_h_csv).exists():
                byh = pd.read_csv(by_h_csv)
                if not byh.empty:
                    if "h" in byh.columns:
                        steps = byh["h"].astype(int).tolist()
                    else:
                        steps = list(range(1, len(byh)+1))
                    for i, (_, r) in enumerate(byh.iterrows()):
                        step = steps[i]
                        for key in ["MAE","RMSE","sMAPE","MAPE","COV_p10_p90_%"]:
                            if key in r and pd.notna(r[key]):
                                mlflow.log_metric(f"{key}_by_h", float(r[key]), step=int(step))
        except Exception as e:
            print(f"[mlflow] warning leyendo by_h '{by_h_csv}':", e)

        # artefactos
        for p in [overall_csv, by_h_csv]:
            p = Path(p)
            if p.exists():
                mlflow.log_artifact(str(p))
        if extra_artifacts:
            for p in extra_artifacts:
                p = Path(p)
                if p.exists():
                    mlflow.log_artifact(str(p))



MLflow OK: 2.14.1


In [5]:
#5
# Cargar features diario×producto
df = pd.read_csv(DATA_PATH)
assert "date" in df.columns, "Se requiere columna 'date'"
df["date"] = pd.to_datetime(df["date"])

# Derivar 'product' desde one-hot product_*
prod_cols = [c for c in df.columns if c.startswith("product_")]
if len(prod_cols) == 0:
    raise ValueError("No hay columnas product_* (one-hot).")
df["product"] = df[prod_cols].idxmax(axis=1).str.replace("product_", "", regex=False)

print("Dimensiones base:", df.shape)
display(df.head(3))


Dimensiones base: (3104, 39)


Unnamed: 0,date,revenue,transactions,avg_price,first_sale_hour,last_sale_hour,avg_sale_hour,year,month,day,...,market_share_transactions,product_Americano,product_Americano with Milk,product_Cappuccino,product_Cocoa,product_Cortado,product_Espresso,product_Hot Chocolate,product_Latte,product
0,2024-03-01,28.9,1.0,28.9,15.0,15.0,15.0,2024,3,1,...,0.090909,True,False,False,False,False,False,False,False,Americano
1,2024-03-02,86.7,3.0,28.9,12.0,19.0,15.666667,2024,3,2,...,0.428571,True,False,False,False,False,False,False,False,Americano
2,2024-03-03,28.9,1.0,28.9,14.0,14.0,14.0,2024,3,3,...,0.1,True,False,False,False,False,False,False,False,Americano


In [6]:
#6
def build_daily_from_index(index_path, holiday_col="is_holiday", weather_agg=None):
    index_path = Path(index_path)
    if not index_path.exists():
        print("[Aviso] INDEX_PATH no existe; se omite merge clima/festivo.")
        return None
    raw = pd.read_csv(index_path)
    if "date" not in raw.columns:
        raise ValueError("index_1.csv debe tener columna 'date'.")
    raw["date"] = pd.to_datetime(raw["date"])
    agg_dict = {}
    if weather_agg:
        for k, v in weather_agg.items():
            if k in raw.columns:
                agg_dict[k] = v
    if holiday_col in raw.columns:
        agg_dict[holiday_col] = "max"
    if not agg_dict:
        print("[Aviso] No se encontraron columnas de clima/festivo en index_1 para agregar.")
        return None
    daily = raw.groupby("date").agg(agg_dict).reset_index()
    return daily

daily_idx = build_daily_from_index(INDEX_PATH, HOLIDAY_COL, WEATHER_AGG)

if daily_idx is not None:
    print("Agregados diarios desde index_1:", daily_idx.columns.tolist())
    df = df.merge(daily_idx, on="date", how="left")
else:
    print("Sin merge desde index_1 (no disponible o sin columnas útiles).")

print("Dimensiones tras merge:", df.shape)
display(df.head(3))


Agregados diarios desde index_1: ['date', 'wx_temperature_2m', 'wx_precipitation', 'wx_cloudcover', 'is_holiday']
Dimensiones tras merge: (3104, 43)


Unnamed: 0,date,revenue,transactions,avg_price,first_sale_hour,last_sale_hour,avg_sale_hour,year,month,day,...,product_Cocoa,product_Cortado,product_Espresso,product_Hot Chocolate,product_Latte,product,wx_temperature_2m,wx_precipitation,wx_cloudcover,is_holiday
0,2024-03-01,28.9,1.0,28.9,15.0,15.0,15.0,2024,3,1,...,False,False,False,False,False,Americano,6.272727,0.0,41.363636,False
1,2024-03-02,86.7,3.0,28.9,12.0,19.0,15.666667,2024,3,2,...,False,False,False,False,False,Americano,5.557143,0.0,12.571429,False
2,2024-03-03,28.9,1.0,28.9,14.0,14.0,14.0,2024,3,3,...,False,False,False,False,False,Americano,3.07,0.1,99.4,False


In [7]:
#7

dups = df.duplicated(subset=["date","product"]).sum()
print(f"Duplicados por (date, product): {dups}")
if dups > 0:
    display(df[df.duplicated(subset=["date","product"], keep=False)].sort_values(["product","date"]).head(10))

# Reindexar calendario por producto
def ensure_complete_calendar(dfin):
    out = []
    for prod, g in dfin.groupby("product"):
        g = g.sort_values("date")
        full_idx = pd.date_range(g["date"].min(), g["date"].max(), freq="D")
        g = g.set_index("date").reindex(full_idx)
        g["product"] = prod
        g.index.name = "date"
        out.append(g.reset_index())
    return pd.concat(out, ignore_index=True)

df = ensure_complete_calendar(df)
print("Nulos por columna (top 15) tras reindex:")
display(df.isna().sum().sort_values(ascending=False).head(15))
print("Preflight OK.")


Duplicados por (date, product): 0
Nulos por columna (top 15) tras reindex:


is_holiday                     56
wx_cloudcover                  56
wx_precipitation               56
wx_temperature_2m              56
product_Americano with Milk     0
transactions_roll_7             0
transactions_roll_30            0
transactions_vol_7              0
total_daily_transactions        0
total_daily_revenue             0
market_share_transactions       0
product_Americano               0
product_Cocoa                   0
product_Cappuccino              0
transactions_lag_14             0
dtype: int64

Preflight OK.


In [8]:
#8
def rebuild_causal_rollings(dfin, ycol=TARGET):
    dfin = dfin.sort_values(["product","date"]).copy()
    cols = [c for c in dfin.columns if c.startswith(f"{ycol}_roll_")]
    if not cols:
        print("No hay columnas roll_* del target para reconstruir.")
        return dfin, pd.DataFrame(columns=["column","window","reconstructed"])
    log = []
    out = []
    for prod, g in dfin.groupby("product"):
        g = g.sort_values("date").copy()
        s = g[ycol].shift(1)  # causal
        for rc in cols:
            try:
                w = int(rc.split("_")[-1])
            except:
                w = None
            if w is None: 
                continue
            g[rc] = s.rolling(w, min_periods=1).mean()
            log.append({"column": rc, "window": w, "reconstructed": True})
        out.append(g)
    log_df = pd.DataFrame(log).drop_duplicates().sort_values(["window","column"])
    return pd.concat(out, ignore_index=True), log_df

df, rebuild_log = rebuild_causal_rollings(df, ycol=TARGET)
print("Reconstrucción causal aplicada a:")
display(rebuild_log)

if CAP_OUTLIERS:
    q_map = df.groupby("product")[TARGET].quantile(OUTLIER_Q).rename("q_hi")
    df = df.merge(q_map, on="product", how="left")
    before = df[TARGET].copy()
    df[TARGET] = np.where(df[TARGET] > df["q_hi"], df["q_hi"], df[TARGET])
    print(f"Capping aplicado: {(before != df[TARGET]).sum()} valores.")
    df.drop(columns=["q_hi"], inplace=True)
else:
    print("CAP_OUTLIERS=False → sin capping.")


Reconstrucción causal aplicada a:


Unnamed: 0,column,window,reconstructed
0,transactions_roll_3,3,True
1,transactions_roll_7,7,True
2,transactions_roll_30,30,True


CAP_OUTLIERS=False → sin capping.


In [9]:
#9
def add_calendar_rich(dfin, holidays_path=None):
    dfo = dfin.copy()
    dt = pd.to_datetime(dfo["date"])
    # básicos
    dfo["dow"] = dt.dt.dayofweek
    dfo["month"] = dt.dt.month
    dfo["weekofyear"] = dt.dt.isocalendar().week.astype(int)
    dfo["is_weekend"] = (dfo["dow"] >= 5).astype(int)
    dfo["is_month_start"] = dt.dt.is_month_start.astype(int)
    dfo["is_month_end"] = dt.dt.is_month_end.astype(int)
    # víspera/post-feriado
    if "is_holiday" not in dfo.columns:
        dfo["is_holiday"] = 0
    dfo = dfo.sort_values("date")
    dfo["is_holiday_prev"] = dfo["is_holiday"].shift(1).fillna(0).astype(int)
    dfo["is_holiday_next"] = dfo["is_holiday"].shift(-1).fillna(0).astype(int)
    # cíclicos
    dfo["dow_sin"] = np.sin(2*np.pi*dt.dt.dayofweek/7)
    dfo["dow_cos"] = np.cos(2*np.pi*dt.dt.dayofweek/7)
    dfo["month_sin"] = np.sin(2*np.pi*(dt.dt.month-1)/12)
    dfo["month_cos"] = np.cos(2*np.pi*(dt.dt.month-1)/12)
    # festivos externos
    if holidays_path:
        h = pd.read_csv(holidays_path)
        h["date"] = pd.to_datetime(h["date"])
        h["is_holiday_ext"] = 1
        dfo = dfo.merge(h[["date","is_holiday_ext"]], on="date", how="left")
        dfo["is_holiday_ext"] = dfo["is_holiday_ext"].fillna(0).astype(int)
    return dfo

def add_business_aggregates_tminus1(dfin, ycol=TARGET):
    dfo = dfin.sort_values(["date","product"]).copy()
    totals = dfo.groupby("date")[ycol].sum().rename("totals_day")
    dfo = dfo.merge(totals, left_on="date", right_index=True, how="left")
    dfo["share_day"] = np.where(dfo["totals_day"]>0, dfo[ycol]/dfo["totals_day"], 0.0)
    dfo["totals_day_t1"] = dfo.groupby("product")["totals_day"].shift(1)
    dfo["share_day_t1"]  = dfo.groupby("product")["share_day"].shift(1)
    # roll_7 causal de totales y "competidor"
    dfo = dfo.sort_values(["product","date"])
    dfo["totals_day_roll_7"] = dfo.groupby("product")["totals_day_t1"].transform(
        lambda s: s.rolling(7, min_periods=1).mean()
    )
    dfo["competitor_sum_t1"] = dfo["totals_day_t1"] - dfo.groupby("product")[ycol].shift(1)
    dfo.drop(columns=["totals_day","share_day"], inplace=True)
    return dfo

if USE_RICH_CALENDAR:
    df = add_calendar_rich(df, UA_HOLIDAYS_PATH)
if ADD_BUSINESS_AGGREGATES:
    df = add_business_aggregates_tminus1(df, ycol=TARGET)

cols_show = [c for c in ["is_holiday","is_holiday_ext","wx_temperature_2m","wx_precipitation","wx_cloudcover",
                         "dow","dow_sin","month_cos","totals_day_t1","share_day_t1","totals_day_roll_7","competitor_sum_t1"] if c in df.columns]
display(df.head(5)[["date","product",TARGET]+cols_show])


Unnamed: 0,date,product,transactions,is_holiday,wx_temperature_2m,wx_precipitation,wx_cloudcover,dow,dow_sin,month_cos,totals_day_t1,share_day_t1,totals_day_roll_7,competitor_sum_t1
0,2024-03-01,Americano,1.0,False,6.272727,0.0,41.363636,4,-0.433884,0.5,,,,
1,2024-03-02,Americano,3.0,False,5.557143,0.0,12.571429,5,-0.974928,0.5,11.0,0.090909,11.0,10.0
2,2024-03-03,Americano,1.0,False,3.07,0.1,99.4,6,-0.781831,0.5,7.0,0.428571,9.0,4.0
3,2024-03-04,Americano,0.0,False,6.3,0.0,99.5,0,0.0,0.5,10.0,0.1,9.333333,9.0
4,2024-03-05,Americano,0.0,False,5.622222,0.0,85.888889,1,0.781831,0.5,4.0,0.0,8.0,4.0


In [10]:
#10
def build_causal_features(dfin, ycol=TARGET):
    dfin = dfin.sort_values(["product","date"]).copy()
    out = []
    for prod, g in dfin.groupby("product"):
        g = g.sort_values("date").copy()
        # lags
        g[f"{ycol}_lag_1"]  = g[ycol].shift(1)
        g[f"{ycol}_lag_7"]  = g[ycol].shift(7)
        g[f"{ycol}_lag_14"] = g[ycol].shift(14)
        # rollings causales
        s = g[ycol].shift(1)
        g[f"{ycol}_roll_3"]  = s.rolling(3,  min_periods=1).mean()
        g[f"{ycol}_roll_7"]  = s.rolling(7,  min_periods=1).mean()
        g[f"{ycol}_roll_30"] = s.rolling(30, min_periods=1).mean()
        g[f"{ycol}_vol_7"]   = s.rolling(7,  min_periods=1).std()
        out.append(g)
    return pd.concat(out, ignore_index=True)

df = build_causal_features(df, ycol=TARGET)
display(df.head(3))


Unnamed: 0,date,revenue,transactions,avg_price,first_sale_hour,last_sale_hour,avg_sale_hour,year,month,day,...,dow,weekofyear,is_holiday_prev,is_holiday_next,dow_sin,dow_cos,totals_day_t1,share_day_t1,totals_day_roll_7,competitor_sum_t1
0,2024-03-01,28.9,1.0,28.9,15.0,15.0,15.0,2024,3,1,...,4,9,0,0,-0.433884,-0.900969,,,,
1,2024-03-02,86.7,3.0,28.9,12.0,19.0,15.666667,2024,3,2,...,5,9,0,0,-0.974928,-0.222521,11.0,0.090909,11.0,10.0
2,2024-03-03,28.9,1.0,28.9,14.0,14.0,14.0,2024,3,3,...,6,9,0,0,-0.781831,0.62349,7.0,0.428571,9.0,4.0


In [11]:
#11
# Persistir dataset final de entrenamiento/inferencia
ARTIFACTS_DIR = RESULTS_DIR / "artifacts"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Copia exacta
df_final = df.copy()

# Dtypes + shape 
schema = pd.Series({col: str(df_final[col].dtype) for col in df_final.columns})
schema.to_csv(ARTIFACTS_DIR / "df_final_schema.csv", header=["dtype"])


#df_final.to_parquet(ARTIFACTS_DIR / "df_final.parquet", index=False)
df_final.to_csv(ARTIFACTS_DIR / "df_final.csv", index=False)

print("[Guardado] df_final -> artifacts/df_final.parquet & df_final.csv")


[Guardado] df_final -> artifacts/df_final.parquet & df_final.csv


In [12]:
#12
#Snapshot de datos y features
ART_DIR = (RESULTS_DIR / "artifacts"); ART_DIR.mkdir(parents=True, exist_ok=True)

# guarda una muestra ligera para inspección rápida
df_final.sample(min(1000, len(df_final))).to_csv(ART_DIR / "df_final_sample.csv", index=False)
df_final.to_parquet(ART_DIR / "df_final.parquet")

# esquema (nombres y dtypes)
pd.DataFrame({
    "column": df_final.columns,
    "dtype": [str(t) for t in df_final.dtypes]
}).to_csv(ART_DIR / "df_final_schema.csv", index=False)

# features candidatos si existen
try:
    pd.Series(candidates, name="feature").to_csv(ART_DIR / "features_selected.csv", index=False)
except Exception:
    pass

# log en mlflow (usamos cualquier par de CSVs de métricas para completar; por ej. baselines)
_mlflow_log_run(
    model_name="data_snapshot",
    params={
        "n_rows": int(len(df_final)),
        "n_products": int(df_final["product"].nunique()),
        "n_days": int(df_final["date"].nunique()),
        "target": TARGET
    },
    overall_csv=RESULTS_DIR / "baselines_metrics_overall.csv",
    by_h_csv=RESULTS_DIR / "baselines_metrics_by_h.csv",
    extra_artifacts=[
        ART_DIR / "df_final.parquet",
        ART_DIR / "df_final_schema.csv",
        ART_DIR / "df_final_sample.csv",
        ART_DIR / "features_selected.csv",
    ],
    tags={"stage": "data"}
)



In [13]:
#13
def mae(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
    if mask.sum() == 0:
        return np.nan
    return float(np.mean(np.abs(y_true[mask] - y_pred[mask])))

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
    if mask.sum() == 0:
        return np.nan
    return float(np.sqrt(np.mean((y_true[mask] - y_pred[mask]) ** 2)))

def mape(y_true, y_pred, epsilon=1e-6, ignore_zeros=True):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if ignore_zeros:
        mask = np.abs(y_true) > epsilon
        if mask.sum() == 0:
            return np.nan
        return float(np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100.0)
    else:
        denom = np.where(np.abs(y_true) < epsilon, epsilon, np.abs(y_true))
        return float(np.mean(np.abs(y_true - y_pred) / denom) * 100.0)

def smape(y_true, y_pred, epsilon=1e-6):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)).clip(min=epsilon)
    return float(np.mean(2.0 * np.abs(y_pred - y_true) / denom) * 100.0)

def mase(y_true, y_pred, y_train, m=7, epsilon=1e-6):
    # Escala por error sNaive en train
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom_series = np.abs(np.asarray(y_train[m:], dtype=float) - np.asarray(y_train[:-m], dtype=float))
    d = np.mean(denom_series) if len(denom_series) > 0 else np.nan
    num = np.mean(np.abs(y_true - y_pred))
    if d is None or np.isnan(d) or d < epsilon:
        return np.nan
    return float(num / d)

def coverage(y_true, y_lo, y_hi):
    y_true = np.asarray(y_true, dtype=float)
    y_lo = np.asarray(y_lo, dtype=float)
    y_hi = np.asarray(y_hi, dtype=float)
    mask = ~np.isnan(y_true) & ~np.isnan(y_lo) & ~np.isnan(y_hi)
    if mask.sum() == 0:
        return np.nan
    inside = (y_true[mask] >= y_lo[mask]) & (y_true[mask] <= y_hi[mask])
    return float(np.mean(inside) * 100.0)

def summarize_metrics(df_eval, target_col="y"):
    out = {}
    for h, g in df_eval.groupby("h"):
        out[h] = {
            "MAE": mae(g[target_col], g["yhat"]),
            "RMSE": rmse(g[target_col], g["yhat"]),
            "MAPE": mape(g[target_col], g["yhat"], ignore_zeros=True),
            "sMAPE": smape(g[target_col], g["yhat"]),
        }
        if {"yhat_p10","yhat_p90"}.issubset(g.columns):
            out[h]["COV_p10_p90_%"] = coverage(g[target_col], g["yhat_p10"], g["yhat_p90"])
    overall = {
        "MAE": mae(df_eval[target_col], df_eval["yhat"]),
        "RMSE": rmse(df_eval[target_col], df_eval["yhat"]),
        "MAPE": mape(df_eval[target_col], df_eval["yhat"], ignore_zeros=True),
        "sMAPE": smape(df_eval[target_col], df_eval["yhat"]),
    }
    if {"yhat_p10","yhat_p90"}.issubset(df_eval.columns):
        overall["COV_p10_p90_%"] = coverage(df_eval[target_col], df_eval["yhat_p10"], df_eval["yhat_p90"])
    return pd.DataFrame(out).T, overall

print("Métricas listas.")


Métricas listas.


In [14]:
#14
from typing import List, Tuple

def rolling_origins(date_index: pd.Series, n_origins: int = 4, horizon: int = 7):
    unique_dates = pd.Series(pd.to_datetime(pd.unique(date_index))).sort_values()
    anchors = [unique_dates.iloc[-(i+1)*horizon] for i in range(n_origins)][::-1]  # bugfix
    splits: List[Tuple[pd.Timestamp, pd.Timestamp, pd.Timestamp]] = []
    for anchor in anchors:
        train_end = anchor - pd.Timedelta(days=1)
        test_start = anchor
        test_end = anchor + pd.Timedelta(days=horizon - 1)
        splits.append((train_end, test_start, test_end))
    return splits

splits_demo = rolling_origins(df["date"], n_origins=N_ORIGINS, horizon=HORIZON)
splits_demo


[(Timestamp('2025-02-23 00:00:00'),
  Timestamp('2025-02-24 00:00:00'),
  Timestamp('2025-03-02 00:00:00')),
 (Timestamp('2025-03-02 00:00:00'),
  Timestamp('2025-03-03 00:00:00'),
  Timestamp('2025-03-09 00:00:00')),
 (Timestamp('2025-03-09 00:00:00'),
  Timestamp('2025-03-10 00:00:00'),
  Timestamp('2025-03-16 00:00:00')),
 (Timestamp('2025-03-16 00:00:00'),
  Timestamp('2025-03-17 00:00:00'),
  Timestamp('2025-03-23 00:00:00'))]

In [15]:
#15
BASELINE_NAMES = ["naive1", "snaive7", "ma7"]

def _baseline_predict(train: pd.DataFrame, test_dates: pd.DatetimeIndex, horizon: int, target: str, kind: str):
    pieces = []
    for prod, g in train.groupby("product"):
        g = g.sort_values("date")
        if len(g) == 0:
            continue
        if kind == "naive1":
            last = g[target].iloc[-1]
            preds = [last] * horizon
        elif kind == "snaive7":
            hist = g[target].iloc[-7:].tolist()
            if len(hist) < 7:
                hist = [g[target].iloc[-1]] * 7
            preds = hist
        elif kind == "ma7":
            window = g[target].iloc[-7:]
            meanv = float(window.mean()) if len(window) > 0 else float(g[target].iloc[-1])
            preds = [meanv] * horizon
        else:
            raise ValueError(kind)
        dfp = pd.DataFrame({
            "date": test_dates,
            "product": prod,
            "h": list(range(1, horizon + 1)),
            "yhat": preds[:horizon],
        })
        pieces.append(dfp)
    return pd.concat(pieces, ignore_index=True)


In [16]:
#16
from lightgbm import LGBMRegressor

def filter_candidates(dfin, target=TARGET, null_frac_max=0.30):
    drop_like = {target, "date", "product"}
    cols = [c for c in dfin.columns if c not in drop_like and not c.startswith("y_")]
    cols = [c for c in cols if pd.api.types.is_numeric_dtype(dfin[c])]
    kept = []
    for c in cols:
        null_frac = dfin[c].isna().mean()
        if null_frac > null_frac_max: 
            continue
        if dfin[c].nunique(dropna=True) <= 1:
            continue
        kept.append(c)
    return kept

candidates = filter_candidates(df, target=TARGET, null_frac_max=0.30)
print("Candidatas post-filtro:", len(candidates))

def build_direct_labels(dfin, target, H=7):
    dfx = dfin.sort_values(["product","date"]).copy()
    for h in range(1, H+1):
        dfx[f"y_{h}"] = dfx.groupby("product")[target].shift(-h)
    return dfx

df_l = build_direct_labels(df, TARGET, H=HORIZON)

def _fit_lgbm(X, y, objective="auto", tweedie_power=None):
    params = dict(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=RANDOM_STATE,
    )
    if objective == "poisson":
        params.update(objective="poisson")
    elif objective == "tweedie":
        params.update(objective="tweedie")
        if tweedie_power is not None:
            params.update(tweedie_variance_power=float(tweedie_power))
    model = LGBMRegressor(**params)
    model.fit(X, y)
    return model

def _fit_lgbm_quantile(X, y, alpha):
    model = LGBMRegressor(
        objective="quantile", alpha=alpha,
        n_estimators=500, learning_rate=0.05,
        num_leaves=31, subsample=0.9, colsample_bytree=0.9,
        random_state=RANDOM_STATE
    )
    model.fit(X, y)
    return model

def lgbm_importance_until(dfin, feats, y_col, cutoff_date):
    data = dfin[(dfin["date"] <= cutoff_date)].dropna(subset=[y_col]).copy()
    if data.empty:
        return pd.DataFrame({"feature":[], "importance":[]})
    X, y = data[feats], data[y_col]
    m = LGBMRegressor(n_estimators=300, learning_rate=0.05, num_leaves=31,
                      subsample=0.9, colsample_bytree=0.9, random_state=RANDOM_STATE)
    m.fit(X, y)
    imp = pd.DataFrame({"feature": feats, "importance": m.feature_importances_})
    return imp.sort_values("importance", ascending=False)


Candidatas post-filtro: 49


In [17]:
#17 
#Inventario de features (candidates) usadas por modelos ML
ARTIFACTS_DIR = RESULTS_DIR / "artifacts"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"[Features seleccionadas - {len(candidates)}]:")
for c in candidates:
    print(" -", c)

pd.Series(candidates, name="feature").to_csv(ARTIFACTS_DIR / "features_selected.csv", index=False)
print("[Guardado] features_selected.csv")


[Features seleccionadas - 49]:
 - revenue
 - avg_price
 - first_sale_hour
 - last_sale_hour
 - avg_sale_hour
 - year
 - month
 - day
 - dayofweek
 - quarter
 - week_of_year
 - is_weekend
 - is_month_start
 - is_month_end
 - month_sin
 - month_cos
 - dayofweek_sin
 - dayofweek_cos
 - transactions_lag_1
 - transactions_lag_7
 - transactions_lag_14
 - transactions_roll_3
 - transactions_roll_7
 - transactions_roll_30
 - transactions_vol_7
 - total_daily_transactions
 - total_daily_revenue
 - market_share_transactions
 - product_Americano
 - product_Americano with Milk
 - product_Cappuccino
 - product_Cocoa
 - product_Cortado
 - product_Espresso
 - product_Hot Chocolate
 - product_Latte
 - wx_temperature_2m
 - wx_precipitation
 - wx_cloudcover
 - dow
 - weekofyear
 - is_holiday_prev
 - is_holiday_next
 - dow_sin
 - dow_cos
 - totals_day_t1
 - share_day_t1
 - totals_day_roll_7
 - competitor_sum_t1
[Guardado] features_selected.csv


## LGBM

In [18]:
#18
lgbm_all = []
splits = rolling_origins(df_l["date"], n_origins=N_ORIGINS, horizon=HORIZON)

for (train_end, test_start, test_end) in splits:
    train = df_l[df_l["date"] <= train_end].copy()
    test  = df_l[(df_l["date"] >= test_start) & (df_l["date"] <= test_end)].copy()

    if train["date"].nunique() < MIN_TRAIN_DAYS:
        print(f"[LGBM] Split saltado: historia insuficiente ({train['date'].nunique()} días).")
        continue

    # Importancias por split (anti-snooping)
    imp_h1 = lgbm_importance_until(train, candidates, "y_1", train_end).head(TOPK_IMP)
    imp_h7 = lgbm_importance_until(train, candidates, "y_7", train_end).head(TOPK_IMP)
    selected_feats = sorted(set(imp_h1["feature"]).union(set(imp_h7["feature"])))

    preds_blocks = []
    for h in range(1, HORIZON + 1):
        y_col = f"y_{h}"
        tr = train.dropna(subset=[y_col]).copy()
        if tr.empty:
            continue

        X_tr, y_tr = tr[selected_feats], tr[y_col]
        if USE_LOG1P_TARGET:
            y_tr = np.log1p(y_tr)

        # Ciclo pequeño de objetivos: auto vs poisson/tweedie (se elige el mejor en train por MAE)
        candidate_models = []
        objectives_to_try = ["auto"]
        if LGBM_OBJECTIVE in ["poisson", "tweedie"]:
            objectives_to_try = [LGBM_OBJECTIVE]
        if "tweedie" in objectives_to_try:
            tweedie_grid = TWEEDIE_POWERS
        else:
            tweedie_grid = [None]

        best_model = None
        best_mae = np.inf
        for obj in objectives_to_try:
            for power in tweedie_grid:
                mtmp = _fit_lgbm(X_tr, y_tr, objective=obj, tweedie_power=power)
                # uso in-sample MAE como heurística rápida (alternativa: CV por fecha)
                y_pred_tr = mtmp.predict(X_tr)
                if USE_LOG1P_TARGET:
                    y_pred_tr = np.expm1(y_pred_tr)
                cur_mae = mae(tr[y_col], y_pred_tr)
                if cur_mae < best_mae:
                    best_mae = cur_mae
                    best_model = (mtmp, obj, power)

        model_c, used_obj, used_power = best_model

        model_p10 = _fit_lgbm_quantile(X_tr, y_tr, alpha=0.10)
        model_p90 = _fit_lgbm_quantile(X_tr, y_tr, alpha=0.90)

        test_block = test.copy()
        test_block["h"] = (test_block["date"] - test_start).dt.days + 1
        mask_h = test_block["h"] == h
        X_te = test_block.loc[mask_h, selected_feats]

        yhat_c   = model_c.predict(X_te)
        yhat_p10 = model_p10.predict(X_te)
        yhat_p90 = model_p90.predict(X_te)

        if USE_LOG1P_TARGET:
            yhat_c   = np.expm1(yhat_c)
            yhat_p10 = np.expm1(yhat_p10)
            yhat_p90 = np.expm1(yhat_p90)

        out = test_block.loc[mask_h, ["date", "product"]].copy()
        out["h"] = h
        out["yhat"] = yhat_c
        out["yhat_p10"] = yhat_p10
        out["yhat_p90"] = yhat_p90
        out["lgbm_objective"] = used_obj
        out["tweedie_power"] = used_power
        preds_blocks.append(out)

    if preds_blocks:
        fold_preds = pd.concat(preds_blocks, ignore_index=True)
        y_true = test[["date", "product", TARGET]].copy()
        merged = y_true.merge(fold_preds, on=["date", "product"], how="left")
        merged["model"] = "lgbm_direct"
        lgbm_all.append(merged)

lgbm_results = pd.concat(lgbm_all, ignore_index=True) if lgbm_all else pd.DataFrame(columns=["date","product",TARGET,"h","yhat","yhat_p10","yhat_p90","model"])
by_h_lgbm, overall_lgbm = summarize_metrics(lgbm_results.rename(columns={TARGET: "y"}))
display(by_h_lgbm)
display(pd.DataFrame([overall_lgbm]))
# Estandarizar by_h: asegurar columna 'h'
if "h" not in by_h_lgbm.columns:
    by_h_lgbm = by_h_lgbm.reset_index()
    if "index" in by_h_lgbm.columns and "h" not in by_h_lgbm.columns:
        by_h_lgbm = by_h_lgbm.rename(columns={"index": "h"})
by_h_lgbm = by_h_lgbm.sort_values("h").reset_index(drop=True)

RESULTS_DIR.mkdir(exist_ok=True, parents=True)
lgbm_results.to_csv(RESULTS_DIR / "lgbm_direct_forecasts.csv", index=False)
lgbm_results_norm = _normalize_for_forecast_save(lgbm_results, TARGET)
lgbm_results_norm.to_csv(RESULTS_DIR / "lgbm_direct_forecasts.csv", index=False)
by_h_lgbm.to_csv(RESULTS_DIR / "lgbm_direct_metrics_by_h.csv", index= False)
pd.DataFrame([overall_lgbm]).to_csv(RESULTS_DIR / "lgbm_direct_metrics_overall.csv", index=False)


_mlflow_log_run(
    model_name="lgbm_direct",
    params={
        "HORIZON": HORIZON,
        "N_ORIGINS": N_ORIGINS,
        "USE_LOG1P_TARGET": USE_LOG1P_TARGET,
        "topK_feats": "auto/según tu código"
    },
    overall_csv=RESULTS_DIR / "lgbm_direct_metrics_overall.csv",
    by_h_csv=RESULTS_DIR / "lgbm_direct_metrics_by_h.csv",
    extra_artifacts=[
        RESULTS_DIR / "lgbm_direct_forecasts.csv",
        RESULTS_DIR / "lgbm_direct_importances.csv",  
    ],
    tags={"family": "lgbm", "stage": "train"}
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000644 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2053
[LightGBM] [Info] Number of data points in the train set: 2880, number of used features: 49
[LightGBM] [Info] Start training from score 1.125000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2053
[LightGBM] [Info] Number of data points in the train set: 2880, number of used features: 49
[LightGBM] [Info] Start training from score 1.138542
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2023
[LightGBM] [Info] Number of data points in the train set: 

Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
1,1.081485,1.476333,41.557781,87.718469,68.75
2,1.086805,1.580462,46.019083,100.198492,65.625
3,1.134352,1.517946,51.032503,83.396717,62.5
4,1.018458,1.286023,73.645991,82.796613,68.75
5,1.21899,1.709245,51.70395,95.033108,56.25
6,1.263186,1.732761,75.91309,123.17993,62.5
7,0.658472,0.793765,56.572403,114.014686,71.875


Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
0,1.065964,1.473067,55.987553,98.048288,65.178571




## PROPHET

In [19]:
#19
try:
    from prophet import Prophet
    PROPHET_OK = True
except Exception as e:
    print("Prophet no disponible, se omitirá. Error:", e)
    PROPHET_OK = False

prophet_all = []

candidate_regressors = [c for c in ["is_holiday","wx_temperature_2m","wx_precipitation","wx_cloudcover",
                                    "is_holiday_ext","is_holiday_prev","is_holiday_next"] if c in df.columns]
print("[Prophet] Regresores detectados:", candidate_regressors)

if PROPHET_OK:
    splits = rolling_origins(df["date"], n_origins=N_ORIGINS, horizon=HORIZON)
    for (train_end, test_start, test_end) in splits:
        train = df[df["date"] <= train_end].copy()
        test  = df[(df["date"] >= test_start) & (df["date"] <= test_end)].copy()
        if train["date"].nunique() < MIN_TRAIN_DAYS:
            print(f"[Prophet] Split saltado por historia insuficiente ({train['date'].nunique()} días).")
            continue

        fold = []
        for prod, g in train.groupby("product"):
            g = g.sort_values("date")
            aux = g.rename(columns={"date":"ds", TARGET:"y"}).copy()

            m = Prophet(interval_width=0.80, weekly_seasonality=True,
                        daily_seasonality=False, yearly_seasonality=False)

            if PROPHET_USE_REGRESSORS and candidate_regressors:
                for reg in candidate_regressors:
                    m.add_regressor(reg)

            cols_fit = ["ds","y"] + (candidate_regressors if PROPHET_USE_REGRESSORS else [])
            aux[cols_fit] = aux[cols_fit].sort_values("ds").ffill().bfill()
            m.fit(aux[cols_fit])

            future = pd.DataFrame({"ds": pd.date_range(test_start, test_end, freq="D")})
            if PROPHET_USE_REGRESSORS and candidate_regressors:
                regs_future = (df[df["product"]==prod]
                               .rename(columns={"date":"ds"})[["ds"] + candidate_regressors]
                               .drop_duplicates(subset=["ds"])
                               .sort_values("ds")
                               .ffill().bfill())
                future = future.merge(regs_future, on="ds", how="left")
                future[candidate_regressors] = future[candidate_regressors].ffill().bfill()

            # >>> CAMBIO CLAVE: traer bandas y renombrar a estándar
            fcst = m.predict(future)[["ds","yhat","yhat_lower","yhat_upper"]]
            fcst = fcst.rename(columns={"yhat_lower":"yhat_p10", "yhat_upper":"yhat_p90"})  # <- bandas p10/p90
            fcst["product"] = prod
            fold.append(fcst.rename(columns={"ds":"date"}))
            # <<<

        fold = pd.concat(fold, ignore_index=True) if fold else pd.DataFrame(columns=["date","yhat","yhat_p10","yhat_p90","product"])
        mask = (fold["date"] >= test_start) & (fold["date"] <= test_end)
        fold = fold.loc[mask].copy()
        fold["h"] = (fold["date"] - test_start).dt.days + 1

        y_true = test[["date","product",TARGET]].copy()
        merged = y_true.merge(fold, on=["date","product"], how="left")
        merged["model"] = "prophet"
        prophet_all.append(merged)

    # Resultados del experimento Prophet
    prophet_results = (pd.concat(prophet_all, ignore_index=True)
                       if prophet_all else
                       pd.DataFrame(columns=["date","product",TARGET,"h","yhat","yhat_p10","yhat_p90","model"]))

    # Por compatibilidad si llegaran a venir columnas originales:
    if "yhat_lower" in prophet_results.columns and "yhat_p10" not in prophet_results.columns:
        prophet_results["yhat_p10"] = prophet_results["yhat_lower"].astype(float)
    if "yhat_upper" in prophet_results.columns and "yhat_p90" not in prophet_results.columns:
        prophet_results["yhat_p90"] = prophet_results["yhat_upper"].astype(float)

    # Asegurar 'h' visible si quedó como índice en algún paso
    if "h" not in prophet_results.columns:
        prophet_results = prophet_results.reset_index()
        if "index" in prophet_results.columns and "h" not in prophet_results.columns:
            prophet_results = prophet_results.rename(columns={"index": "h"})

    # Métricas (usa cobertura si yhat_p10/p90 existen)
    by_h_prophet, overall_prophet = summarize_metrics(
        prophet_results.rename(columns={TARGET: "y"})
    )
    display(by_h_prophet)
    display(pd.DataFrame([overall_prophet]))

    # Guardados: by_h CON columna 'h' y sin índice
    if "h" not in by_h_prophet.columns:
        by_h_prophet = by_h_prophet.reset_index()
        if "index" in by_h_prophet.columns and "h" not in by_h_prophet.columns:
            by_h_prophet = by_h_prophet.rename(columns={"index": "h"})
    by_h_prophet = by_h_prophet.sort_values("h").reset_index(drop=True)

    RESULTS_DIR.mkdir(exist_ok=True, parents=True)
    prophet_results.to_csv(RESULTS_DIR / "prophet_forecasts.csv", index=False)
    prophet_results_norm = _normalize_for_forecast_save(prophet_results, TARGET)
    prophet_results_norm.to_csv(RESULTS_DIR / "prophet_forecasts.csv", index=False)
    by_h_prophet.to_csv(RESULTS_DIR / "prophet_metrics_by_h.csv", index=False)
    pd.DataFrame([overall_prophet]).to_csv(RESULTS_DIR / "prophet_metrics_overall.csv", index=False)
else:
    print("Saltando Prophet.")


_mlflow_log_run(
    model_name="prophet",
    params={"HORIZON": HORIZON, "use_regressors": bool(PROPHET_USE_REGRESSORS)},
    overall_csv=RESULTS_DIR / "prophet_metrics_overall.csv",
    by_h_csv=RESULTS_DIR / "prophet_metrics_by_h.csv",
    extra_artifacts=[RESULTS_DIR / "prophet_forecasts.csv"],
    tags={"family": "prophet", "stage": "train"}
)


Importing plotly failed. Interactive plots will not work.
04:08:56 - cmdstanpy - INFO - Chain [1] start processing


[Prophet] Regresores detectados: ['is_holiday', 'wx_temperature_2m', 'wx_precipitation', 'wx_cloudcover', 'is_holiday_prev', 'is_holiday_next']


04:09:01 - cmdstanpy - INFO - Chain [1] done processing
04:09:01 - cmdstanpy - INFO - Chain [1] start processing
04:09:01 - cmdstanpy - INFO - Chain [1] done processing
04:09:01 - cmdstanpy - INFO - Chain [1] start processing
04:09:01 - cmdstanpy - INFO - Chain [1] done processing
04:09:01 - cmdstanpy - INFO - Chain [1] start processing
04:09:02 - cmdstanpy - INFO - Chain [1] done processing
04:09:02 - cmdstanpy - INFO - Chain [1] start processing
04:09:02 - cmdstanpy - INFO - Chain [1] done processing
04:09:02 - cmdstanpy - INFO - Chain [1] start processing
04:09:02 - cmdstanpy - INFO - Chain [1] done processing
04:09:02 - cmdstanpy - INFO - Chain [1] start processing
04:09:02 - cmdstanpy - INFO - Chain [1] done processing
04:09:02 - cmdstanpy - INFO - Chain [1] start processing
04:09:02 - cmdstanpy - INFO - Chain [1] done processing
04:09:02 - cmdstanpy - INFO - Chain [1] start processing
04:09:02 - cmdstanpy - INFO - Chain [1] done processing
04:09:02 - cmdstanpy - INFO - Chain [1] 

Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
1,1.293938,1.815439,49.425003,97.644091,71.875
2,1.155643,1.642031,52.707088,100.350352,81.25
3,1.228788,1.591474,49.030049,78.305114,75.0
4,0.998436,1.399174,44.401634,78.734236,81.25
5,1.518304,2.157232,51.272682,101.870488,71.875
6,0.969899,1.203281,59.714583,127.236567,84.375
7,0.715869,0.938018,52.070689,109.38843,90.625


Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
0,1.12584,1.579511,50.612771,99.075611,79.464286




## SARIMAX

In [20]:
#20
try:
    import statsmodels.api as sm
    SARIMAX_OK = True
except Exception as e:
    print("statsmodels no disponible para SARIMAX. Error:", e)
    SARIMAX_OK = False

sarimax_all = []
if SARIMAX_OK:
    splits = rolling_origins(df["date"], n_origins=N_ORIGINS, horizon=HORIZON)
    exog_cols = [c for c in ["is_holiday","is_holiday_prev","is_holiday_next",
                             "wx_temperature_2m","wx_precipitation","wx_cloudcover"] if c in df.columns]
    print(f"[SARIMAX] exog_cols: {exog_cols}")

    for (train_end, test_start, test_end) in splits:
        train = df[df["date"] <= train_end].copy()
        test  = df[(df["date"] >= test_start) & (df["date"] <= test_end)].copy()
        n_days = train["date"].nunique()
        print(f"[SARIMAX] Split {train_end.date()} | train_days={n_days} | test_days={test['date'].nunique()}")
        if n_days < MIN_TRAIN_DAYS:
            print(f"[SARIMAX] Split saltado por historia insuficiente ({n_days} días).")
            continue

        fold_parts = []
        for prod, g in train.groupby("product"):
            g = g.sort_values("date").set_index("date")

            # Target train (sin NaNs, frecuencia diaria)
            y_tr = g[TARGET].astype(float).asfreq("D").ffill().bfill()

            # Exógenas train alineadas EXACTAS al índice del target
            ex_tr = None
            if exog_cols:
                ex_tr = g[exog_cols].asfreq("D").ffill().bfill()
                ex_tr = ex_tr.reindex(y_tr.index)

            # Ensayo de órdenes (simple y robusto)
            orders_to_try = [((1,0,1),(1,0,1,7)), ((0,1,1),(0,1,1,7))]
            res = None
            for order, seasonal_order in orders_to_try:
                try:
                    model = sm.tsa.statespace.SARIMAX(
                        y_tr, order=order, seasonal_order=seasonal_order,
                        exog=ex_tr, enforce_stationarity=False, enforce_invertibility=False
                    )
                    res = model.fit(disp=False)
                    chosen = (order, seasonal_order, True)  # con exog
                    break
                except Exception as e1:
                    # Fallback sin exógenas
                    try:
                        model = sm.tsa.statespace.SARIMAX(
                            y_tr, order=order, seasonal_order=seasonal_order,
                            exog=None, enforce_stationarity=False, enforce_invertibility=False
                        )
                        res = model.fit(disp=False)
                        ex_tr = None
                        chosen = (order, seasonal_order, False)  # sin exog
                        break
                    except Exception as e2:
                        continue

            if res is None:
                print(f"[SARIMAX][{prod}] no se pudo ajustar en ningún orden (con/sin exog).")
                continue

            # Exógenas test (alineadas a future_idx)
            future_idx = pd.date_range(test_start, test_end, freq="D")
            ex_te = None
            if exog_cols and chosen[2]:  # solo si el modelo fue con exog
                g_full = df[df["product"]==prod].set_index("date")
                ex_te = g_full[exog_cols].asfreq("D").ffill().bfill().reindex(future_idx)

            try:
                fcst = res.get_forecast(steps=len(future_idx), exog=ex_te)
                yhat = fcst.predicted_mean

                #ntervalos p10–p90 para cobertura 
                conf = fcst.conf_int(alpha=0.20)  # 80% => p10-p90   
                conf_cols = list(conf.columns)                         
                # Detecta automáticamente columnas lower/upper        
                if len(conf_cols) >= 2:                                
                    lower_col = next((c for c in conf_cols if "lower" in c.lower()), conf_cols[0])
                    upper_col = next((c for c in conf_cols if "upper" in c.lower()), conf_cols[1])
                    yhat_p10 = conf[lower_col].values
                    yhat_p90 = conf[upper_col].values
                else:
                    yhat_p10 = None
                    yhat_p90 = None
               
            except Exception as e3:
                print(f"[SARIMAX][{prod}] fallo en forecast: {type(e3).__name__}: {e3}")
                continue

            out = pd.DataFrame({
                "date": future_idx,
                "product": prod,
                "yhat": yhat.values
            })
            
            if yhat_p10 is not None and yhat_p90 is not None:       
                out["yhat_p10"] = yhat_p10                           
                out["yhat_p90"] = yhat_p90                           
           

            fold_parts.append(out)

        if fold_parts:
            fold = pd.concat(fold_parts, ignore_index=True)
            fold["h"] = (fold["date"] - test_start).dt.days + 1
            y_true = test[["date","product",TARGET]].copy()
            merged = y_true.merge(fold, on=["date","product"], how="left")
            merged["model"] = "sarimax"
            sarimax_all.append(merged)
        else:
            print(f"[SARIMAX] Split {train_end.date()} sin predicciones válidas (todos los productos fallaron).")

sarimax_results = (pd.concat(sarimax_all, ignore_index=True)
                   if sarimax_all else
                   pd.DataFrame(columns=["date","product",TARGET,"h","yhat","yhat_p10","yhat_p90","model"]))  

if not sarimax_results.empty:
    # Métricas (si yhat_p10/p90 existen, summarize_metrics calculará COV_p10_p90_%)
    by_h_sarimax, overall_sarimax = summarize_metrics(sarimax_results.rename(columns={TARGET:"y"}))
    display(by_h_sarimax)
    display(pd.DataFrame([overall_sarimax]))

    # Asegurar que 'h' esté como columna
    if "h" not in by_h_sarimax.columns:
        by_h_sarimax = by_h_sarimax.reset_index()
        if "index" in by_h_sarimax.columns and "h" not in by_h_sarimax.columns:
            by_h_sarimax = by_h_sarimax.rename(columns={"index":"h"})
    by_h_sarimax = by_h_sarimax.sort_values("h").reset_index(drop=True)

    sarimax_results.to_csv(RESULTS_DIR / "sarimax_forecasts.csv", index=False)
    sarimax_results_norm = _normalize_for_forecast_save(sarimax_results, TARGET)
    sarimax_results_norm.to_csv(RESULTS_DIR / "sarimax_forecasts.csv", index=False)
    by_h_sarimax.to_csv(RESULTS_DIR / "sarimax_metrics_by_h.csv", index=False) 
    pd.DataFrame([overall_sarimax]).to_csv(RESULTS_DIR / "sarimax_metrics_overall.csv", index=False)
else:
    print("Saltando resumen SARIMAX (sin resultados).")


_mlflow_log_run(
    model_name="sarimax",
    params={"HORIZON": HORIZON, "orders_searched": "[(1,0,1)x(1,0,1,7), (0,1,1)x(0,1,1,7)]"},
    overall_csv=RESULTS_DIR / "sarimax_metrics_overall.csv",
    by_h_csv=RESULTS_DIR / "sarimax_metrics_by_h.csv",
    extra_artifacts=[RESULTS_DIR / "sarimax_forecasts.csv"],
    tags={"family": "sarimax", "stage": "train"}
)


[SARIMAX] exog_cols: ['is_holiday', 'is_holiday_prev', 'is_holiday_next', 'wx_temperature_2m', 'wx_precipitation', 'wx_cloudcover']
[SARIMAX] Split 2025-02-23 | train_days=360 | test_days=7
[SARIMAX] Split 2025-03-02 | train_days=367 | test_days=7
[SARIMAX] Split 2025-03-09 | train_days=374 | test_days=7
[SARIMAX] Split 2025-03-16 | train_days=381 | test_days=7




Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
1,1.239872,1.851112,48.13283,97.270426,68.75
2,0.905599,1.275229,44.474806,95.156047,87.5
3,0.933366,1.234957,43.512605,72.314918,84.375
4,1.052289,1.41991,54.125328,86.94254,81.25
5,1.236522,1.812889,47.647011,95.92812,75.0
6,0.900183,1.225605,66.324917,126.536236,90.625
7,0.878168,1.172474,76.247366,116.867289,84.375


Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
0,1.020857,1.451967,52.824646,98.716511,81.696429




In [None]:
''''# (reemplaza la celda de instalación anterior por esta)
import sys
from pathlib import Path

PKG_DIR = Path("./_keras_torch_pkgs").resolve()  # <--- ANTES decía TARGET
PKG_DIR.mkdir(exist_ok=True)

!{sys.executable} -m pip install -q -t "{PKG_DIR}" "keras==3.4.1" "torch==2.2.2" "numpy<2"

import sys, os
sys.path.insert(0, str(PKG_DIR))
os.environ.setdefault("KERAS_BACKEND", "torch")

import keras, torch
from keras import layers
print("Keras:", keras.__version__, "| Backend:", keras.config.backend(), "| Torch:", torch.__version__)

''''

In [21]:
#21
#LSTM (Keras 3 con backend PyTorch) — Multi-salida directa (vector de HORIZON)

import numpy as np
import pandas as pd
from typing import List, Tuple
from sklearn.preprocessing import StandardScaler

import keras
from keras import layers

# Hiperparámetros 
LSTM_LOOKBACK = 30           # días en la ventana de entrada
LSTM_UNITS    = 64
LSTM_DROPOUT  = 0.2
LSTM_EPOCHS   = 50
LSTM_BATCH    = 256
LSTM_LR       = 1e-3
LSTM_PATIENCE = 5



if "TARGET" not in globals() or not isinstance(TARGET, str) or TARGET not in df.columns:
    print("[LSTM] Aviso: 'TARGET' inválido o ausente; intentando autodetectar columna objetivo.")
    # Heurística: priorizar nombres típicos; si no, toma la primera numérica con pinta de target
    candidatos_target = [c for c in ["revenue","units","qty","quantity","target","y"] if c in df.columns]
    if candidatos_target:
        TARGET = candidatos_target[0]
    else:
        # fallback: primera numérica distinta de id/fecha
        num_cols = [c for c in df.columns
                    if c not in {"date","product"} and pd.api.types.is_numeric_dtype(df[c])]
        if not num_cols:
            raise ValueError("No se pudo determinar la columna objetivo (no hay columnas numéricas).")
        TARGET = num_cols[0]
    print(f"[LSTM] TARGET restaurado → '{TARGET}'")


df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce")



def _numeric_feature_cols(dfin: pd.DataFrame, target: str) -> List[str]:
    """Selecciona columnas numéricas útiles: excluye 'date','product' y futuros (y_*)."""
    cols = [c for c in dfin.columns
            if c not in {"date","product"}
            and not c.startswith("y_")
            and pd.api.types.is_numeric_dtype(dfin[c])]
    # Asegura que el TARGET histórico esté presente como canal
    if target not in cols and target in dfin.columns:
        cols = [target] + cols
    return cols

def _build_lstm_direct_model(n_feats: int, horizon: int):
    inp = layers.Input(shape=(LSTM_LOOKBACK, n_feats))
    x = layers.LSTM(LSTM_UNITS, return_sequences=False)(inp)
    x = layers.Dropout(LSTM_DROPOUT)(x)
    x = layers.Dense(64, activation="relu")(x)
    out = layers.Dense(horizon)(x)   # vector [y+1 .. y+H]
    m = keras.Model(inp, out)
    m.compile(optimizer=keras.optimizers.Adam(learning_rate=LSTM_LR), loss="mse")
    return m

# MC Dropout para intervalos P10–P90 (cobertura)

def _to_numpy(x):
    """Convierte salidas Keras (torch/tf) a numpy de forma robusta."""
    try:
        
        if hasattr(x, "detach"):
            return x.detach().cpu().numpy()
        
        return x.numpy()
    except Exception:
        return np.array(x)

def mc_dropout_predict_vec(model, X_win, n_samples=50):
    """
    Realiza N pases con dropout activo y devuelve:
    mean, p10, p90  (vectores de tamaño 'horizon').
    (soporta backend torch/tf)
    """
    preds = []
    for _ in range(n_samples):
        y = model(X_win, training=True)  # activa dropout en inferencia
        y = _to_numpy(y).ravel()         
        preds.append(y)
    P = np.stack(preds, axis=0)  # (n_samples, HORIZON)
    mean = P.mean(axis=0)
    p10  = np.percentile(P, 10, axis=0)
    p90  = np.percentile(P, 90, axis=0)
    return mean, p10, p90


def _make_supervised_direct(df_train: pd.DataFrame, feats: List[str], target: str, lookback: int, horizon: int
                           ) -> Tuple[np.ndarray, np.ndarray]:
    """
    Construye dataset (X, Y) para entrenamiento directo multi-horizonte.
    X: (n_samples, lookback, n_feats)
    Y: (n_samples, horizon) con el target de t+1..t+H
    """
    Xs, Ys = [], []
    for _, g in df_train.sort_values(["product","date"]).groupby("product"):
        g = g.copy()
        g[feats] = g[feats].ffill().bfill()
        vals = g[feats].values
        tgt  = g[target].values.astype(float)

        # Último índice i que permite horizonte completo: i <= len(g) - horizon - 1
        for i in range(lookback - 1, len(g) - horizon):
            Xs.append(vals[i - lookback + 1 : i + 1, :])
            Ys.append(tgt[i + 1 : i + 1 + horizon])
    if not Xs:
        return np.empty((0, lookback, len(feats))), np.empty((0, horizon))
    return np.stack(Xs), np.stack(Ys)

def _infer_direct_for_split(model, scaler, feats, train_prod, test_prod, target: str, lookback: int, horizon: int):
    """
    Para UN producto y UN split: usa la ÚLTIMA ventana del train para predecir
    los HORIZON días del bloque de test (test_start..test_end).
    """
    gtr = train_prod.sort_values("date").copy()
    gte = test_prod.sort_values("date").copy()

    # Asegura features disponibles
    gtr[feats] = gtr[feats].ffill().bfill()
    gte[feats] = gte[feats].ffill().bfill()

    # Construir la última ventana del train (si falta historia, prepad con la primera fila)
    hist_vals = gtr[feats].values
    if len(hist_vals) == 0:
        # Sin historia rellena con las primeras filas del test (caso extremo)
        hist_vals = gte[feats].values[:1, :]
    if len(hist_vals) < lookback:
        prepad = np.repeat(hist_vals[[0], :], lookback - len(hist_vals), axis=0)
        window = np.vstack([prepad, hist_vals])
    else:
        window = hist_vals[-lookback:, :]

    # Escalar e inferir
    X_win = scaler.transform(window)  # scaler espera 2D; transform por filas
    X_win = X_win.reshape(1, lookback, len(feats)).astype("float32")

    # Predicción con incertidumbre por MC Dropout
    yhat_vec, p10_vec, p90_vec = mc_dropout_predict_vec(model, X_win, n_samples=50)

    # Mapear fechas de test a yhat[h-1]
    out = gte[["date"]].copy()
    out["h"] = (out["date"] - gte["date"].min()).dt.days + 1  # 1..HORIZON
    out["yhat"]     = [yhat_vec[h-1] if 1 <= h <= horizon else np.nan for h in out["h"]]
    out["yhat_p10"] = [p10_vec[h-1]  if 1 <= h <= horizon else np.nan for h in out["h"]]
    out["yhat_p90"] = [p90_vec[h-1]  if 1 <= h <= horizon else np.nan for h in out["h"]]
    out["product"]  = gte["product"].iloc[0]
    return out[["date","product","h","yhat","yhat_p10","yhat_p90"]]

#  Entrenamiento + backtesting 

# Comprobación de backend NN disponible
try:
    _ = keras.config.backend()
    TF_OK = True
except Exception as e:
    print("[LSTM-direct] Backend Keras no disponible, se omite. Error:", e)
    TF_OK = False

lstm_dir_all = []
if TF_OK:
    # Definir features base: usa 'candidates' si existen; si no, toma numéricos razonables
    try:
        base_feats = candidates.copy()
        if TARGET not in base_feats:
            base_feats = [TARGET] + base_feats
    except NameError:
        base_feats = _numeric_feature_cols(df, TARGET)
    #  base_feats: dejar solo nombres de columnas numéricas válidas

    bad_items = [c for c in base_feats if not isinstance(c, str)]
    if bad_items:
        print("[LSTM] Aviso: removiendo items no-string en features:", bad_items)
    

    base_feats = [c for c in base_feats
                  if isinstance(c, str)
                  and c in df.columns
                  and pd.api.types.is_numeric_dtype(df[c])]
    
  
    if TARGET in df.columns and TARGET not in base_feats:
        base_feats = [TARGET] + base_feats
    
    
    if not base_feats:
        print("[LSTM] Aviso: 'base_feats' vacío tras limpiar; usando _numeric_feature_cols(df, TARGET).")
        base_feats = _numeric_feature_cols(df, TARGET)
    
    print(f"[LSTM] #features usadas: {len(base_feats)} → {base_feats[:15]}{' ...' if len(base_feats)>15 else ''}")

    splits = rolling_origins(df["date"], n_origins=N_ORIGINS, horizon=HORIZON)

    for (train_end, test_start, test_end) in splits:
        train = df[df["date"] <= train_end].copy()
        test  = df[(df["date"] >= test_start) & (df["date"] <= test_end)].copy()

        if train["date"].nunique() < MIN_TRAIN_DAYS:
            print(f"[LSTM-direct] Split saltado por historia insuficiente ({train['date'].nunique()} días).")
            continue

        # Preparar scaler sobre TRAIN únicamente
        df_train_ff = train.sort_values(["product","date"]).copy()
        df_train_ff[base_feats] = df_train_ff[base_feats].ffill().bfill()
        scaler = StandardScaler().fit(df_train_ff[base_feats].values)

        # Dataset supervisado directo (solo TRAIN  sin fuga)
        X_tr, Y_tr = _make_supervised_direct(df_train_ff, base_feats, TARGET, LSTM_LOOKBACK, HORIZON)
        if X_tr.shape[0] == 0:
            print("[LSTM-direct] No se pudieron construir secuencias en este split.")
            continue

        # Escalar X  (reshape para usar scaler 2D)
        
        X_tr_2d = X_tr.reshape(-1, X_tr.shape[-1])
        X_tr_scaled = scaler.transform(X_tr_2d).reshape(X_tr.shape).astype("float32")  
        Y_tr_model  = (np.log1p(Y_tr) if USE_LOG1P_TARGET else Y_tr).astype("float32") 


        # Modelo y entrenamiento
        model = _build_lstm_direct_model(n_feats=len(base_feats), horizon=HORIZON)
        es = keras.callbacks.EarlyStopping(monitor="loss", patience=LSTM_PATIENCE, restore_best_weights=True)
        model.fit(X_tr_scaled, Y_tr_model, epochs=LSTM_EPOCHS, batch_size=LSTM_BATCH, verbose=0, callbacks=[es])

        # Inferencia para el bloque de test (H días) por producto 
        fold_parts = []
        for prod, gte in test.groupby("product"):
            gtr = train[train["product"] == prod].copy()
            pred_df = _infer_direct_for_split(model, scaler, base_feats, gtr, gte, TARGET, LSTM_LOOKBACK, HORIZON)

            # Inversión del log si aplica (incluye bandas)
            if USE_LOG1P_TARGET:
                for c in ["yhat","yhat_p10","yhat_p90"]:
                    if c in pred_df.columns:
                        pred_df[c] = np.expm1(pred_df[c])

            # Mezclar con el target real
            merged = gte[["date","product",TARGET]].merge(pred_df, on=["date","product"], how="left")
            fold_parts.append(merged)

        merged_fold = (pd.concat(fold_parts, ignore_index=True) if fold_parts
                       else pd.DataFrame(columns=["date","product",TARGET,"h","yhat","yhat_p10","yhat_p90"]))
        merged_fold["model"] = "lstm_direct"
        lstm_dir_all.append(merged_fold[["date","product",TARGET,"h","yhat","yhat_p10","yhat_p90","model"]])

# Consolidar resultados y métricas
lstm_direct_results = (pd.concat(lstm_dir_all, ignore_index=True)
                       if lstm_dir_all
                       else pd.DataFrame(columns=["date","product",TARGET,"h","yhat","yhat_p10","yhat_p90","model"]))

by_h_lstm_dir, overall_lstm_dir = summarize_metrics(lstm_direct_results.rename(columns={TARGET:"y"}))
display(by_h_lstm_dir)
display(pd.DataFrame([overall_lstm_dir]))

# Guardar resultados
RESULTS_DIR.mkdir(exist_ok=True, parents=True)
lstm_direct_results.to_csv(RESULTS_DIR / "lstm_direct_forecasts.csv", index=False)
lstm_direct_results_norm = _normalize_for_forecast_save(lstm_direct_results, TARGET)
lstm_direct_results_norm.to_csv(RESULTS_DIR / "lstm_direct_forecasts.csv", index=False)
by_h_lstm_dir.to_csv(RESULTS_DIR / "lstm_direct_metrics_by_h.csv", index=False)
pd.DataFrame([overall_lstm_dir]).to_csv(RESULTS_DIR / "lstm_direct_metrics_overall.csv", index=False)


_mlflow_log_run(
    model_name="lstm_direct",
    params={
        "HORIZON": HORIZON,
        "LOOKBACK": LSTM_LOOKBACK,
        "UNITS": LSTM_UNITS,
        "DROPOUT": LSTM_DROPOUT,
        "LR": LSTM_LR,
        "BATCH": LSTM_BATCH,
        "USE_LOG1P_TARGET": USE_LOG1P_TARGET
    },
    overall_csv=RESULTS_DIR / "lstm_direct_metrics_overall.csv",
    by_h_csv=RESULTS_DIR / "lstm_direct_metrics_by_h.csv",
    extra_artifacts=[
        RESULTS_DIR / "lstm_direct_forecasts.csv",
        
    ],
    tags={"family": "lstm", "stage": "train"}
)


[LSTM] #features usadas: 50 → ['transactions', 'revenue', 'avg_price', 'first_sale_hour', 'last_sale_hour', 'avg_sale_hour', 'year', 'month', 'day', 'dayofweek', 'quarter', 'week_of_year', 'is_weekend', 'is_month_start', 'is_month_end'] ...


Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
1,1.266681,1.592759,65.363569,92.152536,9.375
2,1.047509,1.366973,59.251718,92.953566,25.0
3,1.043011,1.345018,63.848086,73.409654,25.0
4,0.988439,1.28989,53.271689,78.685434,21.875
5,1.523825,2.224254,54.315887,102.502781,15.625
6,0.891389,1.191235,50.492631,126.473485,15.625
7,0.754064,0.99793,55.115054,117.563495,37.5


Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
0,1.07356,1.475543,57.877748,97.677279,21.428571




In [22]:
#22 — Baselines

baseline_all = []
splits = rolling_origins(df["date"], n_origins=N_ORIGINS, horizon=HORIZON)

for (train_end, test_start, test_end) in splits:
    train = df[df["date"] <= train_end].copy()
    test  = df[(df["date"] >= test_start) & (df["date"] <= test_end)].copy()

    
    test["date"] = pd.to_datetime(test["date"])

    if train["date"].nunique() < MIN_TRAIN_DAYS:
        print(f"[Baselines] Split saltado por historia insuficiente ({train['date'].nunique()} días).")
        continue

    horizon_idx = pd.date_range(test_start, test_end, freq="D")

    for name in BASELINE_NAMES:
        preds = _baseline_predict(train, horizon_idx, HORIZON, TARGET, name).copy()

        
        if "date" in preds:
            preds["date"] = pd.to_datetime(preds["date"])
        else:
            raise ValueError(f"[Baselines] {_baseline_predict.__name__} debe devolver columna 'date'.")

        if "product" not in preds.columns:
            raise ValueError(f"[Baselines] {_baseline_predict.__name__} debe devolver columna 'product'.")

        if "yhat" not in preds.columns:
            raise ValueError(f"[Baselines] {_baseline_predict.__name__} debe devolver columna 'yhat'.")

        
        if "h" not in preds.columns:
            preds["h"] = (preds["date"] - test_start).dt.days + 1  # HORIZON

        # Merge con el bloque de test
        merged = test.merge(preds[["date","product","h","yhat"]], on=["date","product"], how="left")
        merged["model"] = name

        baseline_all.append(merged[["date","product",TARGET,"h","yhat","model"]])

# Resultados agregados (sin bandas cobertura quedará NaN en summarize_metrics)
baseline_results = (
    pd.concat(baseline_all, ignore_index=True)
    if baseline_all else
    pd.DataFrame(columns=["date","product",TARGET,"h","yhat","model"])
)

# Métricas por horizonte y global
by_h_baseline, overall_baseline = summarize_metrics(baseline_results.rename(columns={TARGET:"y"}))


if "h" not in by_h_baseline.columns:
    by_h_baseline = by_h_baseline.reset_index()
    if "index" in by_h_baseline.columns and "h" not in by_h_baseline.columns:
        by_h_baseline = by_h_baseline.rename(columns={"index": "h"})


if "h" in by_h_baseline.columns:
    by_h_baseline = by_h_baseline.sort_values("h").reset_index(drop=True)

display(by_h_baseline)
display(pd.DataFrame([overall_baseline]))

# Guardar
RESULTS_DIR.mkdir(exist_ok=True, parents=True)
baseline_results.to_csv(RESULTS_DIR / "baselines_forecasts.csv", index=False)
baseline_results_norm = _normalize_for_forecast_save(baseline_results, TARGET)
baseline_results_norm.to_csv(RESULTS_DIR / "baselines_forecasts.csv", index=False)
by_h_baseline.to_csv(RESULTS_DIR / "baselines_metrics_by_h.csv", index=False)   
pd.DataFrame([overall_baseline]).to_csv(RESULTS_DIR / "baselines_metrics_overall.csv", index=False)

_mlflow_log_run(
    model_name="baselines",
    params={"names": ",".join(BASELINE_NAMES), "HORIZON": HORIZON},
    overall_csv=RESULTS_DIR / "baselines_metrics_overall.csv",
    by_h_csv=RESULTS_DIR / "baselines_metrics_by_h.csv",
    extra_artifacts=[RESULTS_DIR / "baselines_forecasts.csv"],
    tags={"family": "baselines", "stage": "train"}
)


Unnamed: 0,h,MAE,RMSE,MAPE,sMAPE
0,1,1.50744,2.1753,65.109435,74.592093
1,2,1.138393,1.72941,61.355219,76.410945
2,3,1.209821,1.719919,60.912698,82.363135
3,4,1.209821,1.747386,62.577839,80.546706
4,5,1.534226,2.322207,61.316672,77.5954
5,6,0.94494,1.385435,74.751984,91.046764
6,7,0.800595,1.127878,74.603175,92.678669


Unnamed: 0,MAE,RMSE,MAPE,sMAPE
0,1.192177,1.785595,64.930023,82.176245


In [23]:
#22

from IPython.display import display, Markdown
def show_df(title: str, df):
    display(Markdown(f"### {title}"))
    try:
        if hasattr(df, "empty") and df.empty:
            display(Markdown("_(sin datos)_"))
        else:
            display(df)
    except Exception:
        display(df)

# Guardar métricas por producto×h para todas las familias disponibles
def save_metrics_by_product(model_name, df_pred, target=TARGET):
    if df_pred is None or df_pred.empty:
        return
    tmp = df_pred.copy()

    
    if "y" not in tmp.columns and target in tmp.columns:
        tmp = tmp.rename(columns={target: "y"})

    
    if "y" not in tmp.columns:
        print(f"[save_metrics_by_product] {model_name}: no tiene columna 'y' ni '{target}'. Se omite.")
        return
    if "h" not in tmp.columns:
        
        if "index" in tmp.columns and "h" not in tmp.columns:
            tmp = tmp.rename(columns={"index": "h"})
        elif isinstance(tmp.index, pd.MultiIndex) and "h" in tmp.index.names:
            tmp = tmp.reset_index()
        if "h" not in tmp.columns:
            print(f"[save_metrics_by_product] {model_name}: no se encontró 'h'. Se omite.")
            return

    prod_h = (tmp.groupby(["product","h"])
                .apply(lambda g: pd.Series({"MAE": mae(g["y"], g["yhat"]),
                                            "sMAPE": smape(g["y"], g["yhat"])}))
                .reset_index())
    prod_h.to_csv(RESULTS_DIR / f"{model_name}_metrics_by_product_h.csv", index=False)

save_metrics_by_product("lgbm_direct", lgbm_results)
save_metrics_by_product("prophet", locals().get("prophet_results", pd.DataFrame()))
save_metrics_by_product("sarimax", sarimax_results)
save_metrics_by_product("baselines", baseline_results)
save_metrics_by_product("lstm_direct", locals().get("lstm_direct_results", pd.DataFrame()))  

# Ranking overall
print("Usando carpeta de resultados:", RESULTS_DIR.resolve())
paths = list(RESULTS_DIR.glob("*_metrics_overall.csv"))
rows = []
for p in paths:
    try:
        dfm = pd.read_csv(p)
        if not dfm.empty:
            model = p.stem.replace("_metrics_overall", "")
            row = dfm.iloc[0].to_dict()
            row["model"] = model
            rows.append(row)
    except Exception as e:
        print("Error leyendo", p.name, e)

if rows:
    cols = ["model","MAE","RMSE","MAPE","sMAPE"]
    rank = pd.DataFrame(rows)[cols].sort_values("sMAPE")
    show_df("Ranking overall de modelos (ordenado por sMAPE; menor = mejor)", rank)
    rank.to_csv(RESULTS_DIR / "_model_ranking.csv", index=False)
else:
    print("No hay métricas overall para comparar.")

# Ensemble por producto: escoger mejor modelo por producto×h según sMAPE del último split (aprox usando todo)
def load_family(name):
    path = RESULTS_DIR / f"{name}_forecasts.csv"
    return pd.read_csv(path) if path.exists() else pd.DataFrame()

families = {
    "lgbm_direct": load_family("lgbm_direct"),
    "prophet": load_family("prophet"),
    "sarimax": load_family("sarimax"),
    "baselines": load_family("baselines"),
    "lstm_direct": load_family("lstm_direct"),   
}
# Filtrar vacíos
families = {k:v for k,v in families.items() if not v.empty}

# Calcular sMAPE por product×h por familia y quedarSE con el ganador.
scores = []
for name, dfp in families.items():
    dfp2 = dfp.copy()

    
    if "y" not in dfp2.columns and TARGET in dfp2.columns:
        dfp2 = dfp2.rename(columns={TARGET:"y"})
    if "y" not in dfp2.columns:
        print(f"[ensemble] {name}: no tiene 'y' ni '{TARGET}', se omite de scores.")
        continue

    
    if "h" not in dfp2.columns:
        if "index" in dfp2.columns and "h" not in dfp2.columns:
            dfp2 = dfp2.rename(columns={"index":"h"})
        elif isinstance(dfp2.index, pd.MultiIndex) and "h" in dfp2.index.names:
            dfp2 = dfp2.reset_index()
        if "h" not in dfp2.columns:
            print(f"[ensemble] {name}: no tiene 'h', se omite de scores.")
            continue

    sm = (dfp2.groupby(["product","h"])
              .apply(lambda g: smape(g["y"], g["yhat"]))
              .reset_index(name="sMAPE"))
    sm["family"] = name
    scores.append(sm)

if scores:
    scores = pd.concat(scores, ignore_index=True)
    winners = scores.sort_values(["product","h","sMAPE"]).groupby(["product","h"]).first().reset_index()
    winners.to_csv(RESULTS_DIR / "winners_by_product_h.csv", index=False)
    show_df("Ganadores por producto×h (según sMAPE)", winners.head())

    # Construir las predicciones del ensemble "winner-takes-all"
    pieces = []
    for _, row in winners.iterrows():
        fam = row["family"]
        prod = row["product"]
        h = row["h"]
        dfp = families[fam]
        sel = dfp[(dfp["product"]==prod) & (dfp["h"]==h)].copy()
        if "model" not in sel.columns:
            sel["model"] = f"ensemble({fam})"
        pieces.append(sel)
    if pieces:
        ensemble_preds = pd.concat(pieces, ignore_index=True)
        by_h_ens, overall_ens = summarize_metrics(
            ensemble_preds.rename(columns={TARGET:"y"}) if TARGET in ensemble_preds.columns
            else ensemble_preds.rename(columns={"y":"y"})
        )
        show_df("Ensemble – métricas por horizonte (h)", by_h_ens)
        show_df("Ensemble – métricas overall", pd.DataFrame([overall_ens]))
        ensemble_preds.to_csv(RESULTS_DIR / "ensemble_winner_takes_all.csv", index=False)
        by_h_ens.to_csv(RESULTS_DIR / "ensemble_metrics_by_h.csv", index=False)
        pd.DataFrame([overall_ens]).to_csv(RESULTS_DIR / "ensemble_metrics_overall.csv", index=False)


        _mlflow_log_run(
    model_name="ensemble",
    params={"strategy": "winner-takes-all by (product,h)"},
    overall_csv=RESULTS_DIR / "ensemble_metrics_overall.csv",
    by_h_csv=RESULTS_DIR / "ensemble_metrics_by_h.csv",
    extra_artifacts=[RESULTS_DIR / "ensemble_winner_takes_all.csv"],
    tags={"family": "ensemble", "stage": "eval"}
)



Usando carpeta de resultados: C:\Users\Julian\results


### Ranking overall de modelos (ordenado por sMAPE; menor = mejor)

Unnamed: 0,model,MAE,RMSE,MAPE,sMAPE
2,ensemble,4.849966,16.537163,47.048229,70.255118
0,all_models,37.739537,56.34504,64.930023,82.176245
1,baselines,1.192177,1.785595,64.930023,82.176245
4,lstm_direct,1.07356,1.475543,57.877748,97.677279
3,lgbm_direct,1.065964,1.473067,55.987553,98.048288
6,sarimax,1.020857,1.451967,52.824646,98.716511
5,prophet,1.12584,1.579511,50.612771,99.075611


### Ganadores por producto×h (según sMAPE)

Unnamed: 0,product,h,sMAPE,family
0,Americano,1,22.865162,prophet
1,Americano,2,30.205933,lstm_direct
2,Americano,3,30.600267,sarimax
3,Americano,4,20.684168,lgbm_direct
4,Americano,5,39.391788,sarimax


### Ensemble – métricas por horizonte (h)

Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
1,0.74336,1.233965,39.207437,52.905313,54.166667
2,0.642904,1.007704,44.140995,68.572092,60.0
3,0.798277,1.072999,47.376331,75.522031,64.285714
4,0.676207,0.997281,48.338805,60.076528,64.285714
5,0.792558,1.363131,38.53963,62.269901,50.0
6,0.622781,0.948375,55.687702,84.970868,43.75
7,0.48696,0.720894,53.101863,78.878425,85.0


### Ensemble – métricas overall

Unnamed: 0,MAE,RMSE,MAPE,sMAPE,COV_p10_p90_%
0,0.669982,1.054812,46.355818,70.021924,60.625




In [24]:
#23
import pandas as pd
from pathlib import Path

results_dir = Path("results")
overall_files = list(results_dir.glob("*_metrics_overall.csv"))

dfs = []
for f in overall_files:
    model_name = f.stem.replace("_metrics_overall","")
    df = pd.read_csv(f)
    df["model"] = model_name
    dfs.append(df)

summary = pd.concat(dfs, ignore_index=True)
summary.to_csv(results_dir / "all_models_metrics_overall.csv", index=False)

print("Guardado:", results_dir / "all_models_metrics_overall.csv")
summary


Guardado: results\all_models_metrics_overall.csv


Unnamed: 0,MAE,RMSE,MAPE,sMAPE,model,COV_p10_p90_%
0,37.739537,56.34504,64.930023,82.176245,all_models,
1,22.418259,36.756103,50.460271,72.140813,all_models,54.487179
2,55.642252,80.267362,96.635683,191.086168,all_models,17.410714
3,33.986557,48.23061,55.77283,98.50643,all_models,7.589286
4,35.251163,49.815489,50.714061,99.912611,all_models,79.910714
5,32.321921,46.300026,53.800282,99.492605,all_models,82.142857
6,1.192177,1.785595,64.930023,82.176245,all_models,
7,4.849966,16.537163,47.048229,70.255118,all_models,58.125
8,1.065964,1.473067,55.987553,98.048288,all_models,65.178571
9,33.986557,48.23061,55.77283,98.50643,all_models,7.589286


In [None]:
# 21.1 — Selección del mejor modelo (overall) y registry
import json
from pathlib import Path

# Configura la métrica de selección (todas se minimizan)
BEST_METRIC = "sMAPE"      # alternativas: "RMSE", "MAE"
INCLUIR_ENSEMBLE = False   # True si permites que el ensemble sea elegido
EXCLUIR = {"all_models", "baselines"} | (set() if INCLUIR_ENSEMBLE else {"ensemble"})  # excluye siempre 'all_models'

def _read_overall_metrics(path: Path):
    """Lee un *_metrics_overall.csv y devuelve dict con métricas principales.
       Tolera variaciones de nombre/caso y presencia/ausencia de coverage.
    """
    try:
        dfm = pd.read_csv(path)
        if dfm.empty:
            return None
        row = dfm.iloc[0]

        def pick(*names):
            # busca exacto (case-insensitive) y luego por substring
            for n in names:
                for c in dfm.columns:
                    if c.strip().lower() == n.lower():
                        return pd.to_numeric(row[c], errors="coerce")
            for n in names:
                cols = [c for c in dfm.columns if n.lower() in c.lower()]
                if cols:
                    return pd.to_numeric(row[cols[0]], errors="coerce")
            return None

        return {
            "MAE":  pick("MAE"),
            "RMSE": pick("RMSE"),
            "MAPE": pick("MAPE"),
            "sMAPE": pick("sMAPE"),
            "COV_p10_p90_%": pick("COV_p10_p90_%", "coverage", "cov")
        }
    except Exception as e:
        print(f"[selector] error leyendo {path.name}: {type(e).__name__}: {e}")
        return None

# Descubrir todas las familias disponibles en RESULTS_DIR
paths = list(RESULTS_DIR.glob("*_metrics_overall.csv"))
rows = []
for p in paths:
    model_name = p.stem.replace("_metrics_overall", "")
    if model_name in EXCLUIR:
        continue
    # (opcional) también puedes excluir baselines aquí si no quieres que compitan:
    # if model_name == "baselines": continue

    m = _read_overall_metrics(p)
    if m:
        m["model"] = model_name
        m["path"] = str(p)
        rows.append(m)

scores = pd.DataFrame(rows)

if not scores.empty:
    # Asegurar numéricos
    for c in ["MAE","RMSE","MAPE","sMAPE","COV_p10_p90_%"]:
        if c in scores.columns:
            scores[c] = pd.to_numeric(scores[c], errors="coerce")

    # Elegir métrica disponible
    if BEST_METRIC not in scores.columns or scores[BEST_METRIC].isna().all():
        for fallback in ["sMAPE","RMSE","MAE"]:
            if fallback in scores.columns and not scores[fallback].isna().all():
                BEST_METRIC = fallback
                break

    # Orden (todas son de minimizar)
    scores_sorted = scores.sort_values(BEST_METRIC, ascending=True).reset_index(drop=True)

    # Mostrar tabla ordenada
    cols_show = [c for c in ["model","MAE","RMSE","MAPE","sMAPE","COV_p10_p90_%"] if c in scores_sorted.columns]
    try:
        show_df(f"📌 Selección de mejor modelo por {BEST_METRIC} (menor = mejor)", scores_sorted[cols_show])
    except Exception:
        display(scores_sorted[cols_show])

    # Mejor
    best_row = scores_sorted.iloc[0].to_dict()
    best = {
        "model": best_row["model"],
        "criteria": f"min {BEST_METRIC}",
        "metrics": {k: best_row.get(k) for k in ["MAE","RMSE","MAPE","sMAPE","COV_p10_p90_%"] if k in scores_sorted.columns},
        "metrics_path": best_row["path"]
    }
    print("[Mejor modelo por", BEST_METRIC, "]:", best["model"])
else:
    best = None
    print("No hay métricas overall para seleccionar mejor modelo.")

# Guardar registry
REGISTRY = RESULTS_DIR / "models" / "registry.json"
REGISTRY.parent.mkdir(parents=True, exist_ok=True)
with open(REGISTRY, "w", encoding="utf-8") as f:
    json.dump({"best_model": best}, f, ensure_ascii=False, indent=2)

print("[Guardado] registry.json ->", REGISTRY)



In [None]:
# 23 — Final fit & Serialize (según mejor modelo del registry)
# 23 — Final fit & Serialize (según mejor modelo del registry)
import json, joblib
from datetime import datetime
from pathlib import Path

MODELS_DIR = RESULTS_DIR / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# ---------- helper: escoger mejor NO-baseline por los CSV overall ----------
def _pick_best_non_baseline(results_dir: Path, prefer="sMAPE"):
    paths = list(results_dir.glob("*_metrics_overall.csv"))
    filas = []
    for p in paths:
        name = p.stem.replace("_metrics_overall", "")
        # excluir agregados y baselines/ensemble para despliegue
        if name in {"all_models", "baselines", "ensemble"}:
            continue
        try:
            dfm = pd.read_csv(p)
            if dfm.empty:
                continue
            row = dfm.iloc[0]
            def pick(col):
                # tolera mayúsculas/minúsculas
                for c in dfm.columns:
                    if c.strip().lower() == col.lower():
                        return pd.to_numeric(row[c], errors="coerce")
                # búsqueda por substring como fallback
                for c in dfm.columns:
                    if col.lower() in c.strip().lower():
                        return pd.to_numeric(row[c], errors="coerce")
                return None
            filas.append({
                "model": name,
                "MAE": pick("MAE"),
                "RMSE": pick("RMSE"),
                "sMAPE": pick("sMAPE"),
            })
        except Exception as e:
            print(f"[final-fit] no se pudo leer {p.name}: {e}")
    if not filas:
        return None
    s = pd.DataFrame(filas)
    # métrica preferida (todas minimizan); cae a RMSE o MAE si falta
    metric = prefer if prefer in s.columns else ("sMAPE" if "sMAPE" in s.columns else ("RMSE" if "RMSE" in s.columns else "MAE"))
    s = s.dropna(subset=[metric])
    if s.empty:
        return None
    best_row = s.sort_values(metric, ascending=True).iloc[0]
    return str(best_row["model"])

# ---------- leer registry y aplicar protección anti-baseline ----------
REGISTRY = MODELS_DIR / "registry.json"
with open(REGISTRY, "r", encoding="utf-8") as f:
    reg = json.load(f)
best = reg.get("best_model") or {}
best_name = (best.get("model") or "").lower()

# Si el registry eligió baseline/ensemble o viene vacío, forzamos el mejor NO-baseline
if best_name in {"", "baselines", "ensemble"}:
    alt = _pick_best_non_baseline(RESULTS_DIR, prefer="sMAPE")
    if alt:
        print(f"[Final-Fit] Ignorando '{best_name}' para despliegue. Usando mejor no-baseline: {alt}")
        best_name = alt
    else:
        # como último recurso puedes caer en LGBM (si quieres mantener el comportamiento previo)
        print("[Final-Fit] No hay modelos no-baseline con métricas overall. Se usará 'lgbm_direct' por defecto.")
        best_name = "lgbm_direct"

ts = datetime.now().strftime("%Y%m%d_%H%M%S")

def _save_meta(name, meta):
    with open(MODELS_DIR / f"{name}_{ts}.meta.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)


if best_name in ["lgbm_direct", ""]:
    # ===== Re-entrenar LGBM Directo con TODA la historia =====
    # Usamos tus funciones de #12: candidates y build_direct_labels
    df_full = df.copy()
    df_l_full = build_direct_labels(df_full, TARGET, H=HORIZON)

    # (opcional) re-selección de features por importancia con toda la historia:
    # usa la función lgbm_importance_until en la última fecha
    try:
        last_date = df_l_full["date"].max()
        imp_h1 = lgbm_importance_until(df_l_full, candidates, "y_1", last_date).head(TOPK_IMP)
        imp_h7 = lgbm_importance_until(df_l_full, candidates, "y_7", last_date).head(TOPK_IMP)
        selected_feats = sorted(set(imp_h1["feature"]).union(set(imp_h7["feature"])))
    except Exception:
        selected_feats = candidates

    # Entrena un modelo por cada horizonte y guárdalos en un dict (o 1 multioutput si tienes ese wrapper)
    final_models = {}
    for h in range(1, HORIZON+1):
        y_col = f"y_{h}"
        tr = df_l_full.dropna(subset=[y_col]).copy()
        if tr.empty: 
            continue
        X_tr, y_tr = tr[selected_feats], tr[y_col]
        if USE_LOG1P_TARGET:
            y_tr = np.log1p(y_tr)
        m = _fit_lgbm(X_tr, y_tr, objective="auto")  # usa tu helper de #12
        final_models[h] = m

    # Serializa el “ensemble” por horizonte
    joblib.dump({"models_by_h": final_models, "features": selected_feats}, MODELS_DIR / f"lgbm_direct_{ts}.joblib")
    _save_meta("lgbm_direct", {
        "model_type": "lgbm_direct",
        "target": TARGET,
        "horizon": int(HORIZON),
        "features": selected_feats,
        "use_log1p": bool(USE_LOG1P_TARGET)
    })
    print("[Guardado] lgbm_direct_", ts)

elif best_name == "lstm_direct":
    # ===== Re-entrenar LSTM Directo con TODA la historia =====
    # Reusa la misma receta que en #19
    try:
        base_feats = candidates.copy()
        if TARGET not in base_feats: base_feats = [TARGET] + base_feats
    except NameError:
        base_feats = [c for c in df.columns if c not in {"date","product"} and not c.startswith("y_") and pd.api.types.is_numeric_dtype(df[c])]
        if TARGET not in base_feats: base_feats = [TARGET] + base_feats

    df_train_ff = df.sort_values(["product","date"]).copy()
    df_train_ff[base_feats] = df_train_ff[base_feats].ffill().bfill()

    scaler = StandardScaler().fit(df_train_ff[base_feats].values)
    X_tr, Y_tr = _make_supervised_direct(df_train_ff, base_feats, TARGET, LSTM_LOOKBACK, HORIZON)
    X2 = scaler.transform(X_tr.reshape(-1, X_tr.shape[-1])).reshape(X_tr.shape)
    Y2 = np.log1p(Y_tr) if USE_LOG1P_TARGET else Y_tr

    final_lstm = _build_lstm_direct_model(n_feats=len(base_feats), horizon=HORIZON)
    es = keras.callbacks.EarlyStopping(monitor="loss", patience=LSTM_PATIENCE, restore_best_weights=True)
    final_lstm.fit(X2, Y2, epochs=LSTM_EPOCHS, batch_size=LSTM_BATCH, verbose=0, callbacks=[es])

    # Serializa
    final_lstm.save(MODELS_DIR / f"lstm_direct_{ts}.keras")
    joblib.dump(scaler, MODELS_DIR / f"lstm_direct_{ts}.scaler.joblib")
    _save_meta("lstm_direct", {
        "model_type": "lstm_direct",
        "target": TARGET,
        "horizon": int(HORIZON),
        "lookback": int(LSTM_LOOKBACK),
        "features": base_feats,
        "use_log1p": bool(USE_LOG1P_TARGET)
    })
    print("[Guardado] lstm_direct_", ts)

else:
    print(f"[Info] El mejor en registry es '{best_name}'. Para Prophet/SARIMAX, guarda por producto:")
    print(" - Prophet: model_to_json(m) por producto")
    print(" - SARIMAX: results.fit().save(path) por producto")


In [None]:
#18
meta = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
    "python": sys.version,
    "os": platform.platform(),
    "params": {
        "TARGET": TARGET,
        "HORIZON": HORIZON,
        "N_ORIGINS": N_ORIGINS,
        "MIN_TRAIN_DAYS": MIN_TRAIN_DAYS,
        "TOPK_IMP": TOPK_IMP,
        "USE_LOG1P_TARGET": USE_LOG1P_TARGET,
        "LGBM_OBJECTIVE": LGBM_OBJECTIVE,
        "TWEEDIE_POWERS": TWEEDIE_POWERS,
        "CAP_OUTLIERS": CAP_OUTLIERS,
        "OUTLIER_Q": OUTLIER_Q,
        "USE_RICH_CALENDAR": USE_RICH_CALENDAR,
        "UA_HOLIDAYS_PATH": UA_HOLIDAYS_PATH,
        "ADD_BUSINESS_AGGREGATES": ADD_BUSINESS_AGGREGATES,
        "USE_INDEX_WEATHER_HOLIDAYS": USE_INDEX_WEATHER_HOLIDAYS,
        "WEATHER_AGG": WEATHER_AGG,
        "HOLIDAY_COL": HOLIDAY_COL,
        "PROPHET_USE_REGRESSORS": PROPHET_USE_REGRESSORS,
        "RANDOM_STATE": RANDOM_STATE
    }
}

def file_sha1(path):
    try:
        with open(path, "rb") as f:
            import hashlib
            return hashlib.sha1(f.read()).hexdigest()
    except Exception:
        return None

meta["data_sha1"] = file_sha1(DATA_PATH)
meta["index_sha1"] = file_sha1(INDEX_PATH)

RESULTS_DIR.mkdir(exist_ok=True, parents=True)
with open(RESULTS_DIR / "metadata.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

print("metadata.json guardado en:", (RESULTS_DIR / "metadata.json").resolve())


In [25]:
from pathlib import Path
import sys, subprocess

MLFLOW_DIR = (RESULTS_DIR / "mlruns").resolve()
MLFLOW_DIR.mkdir(parents=True, exist_ok=True)

backend = MLFLOW_DIR.as_uri()  # p.ej. file:///C:/Users/Julian/tu-proyecto/results/mlruns
print("Backend store URI:", backend)

# Verificación rápida de instalación
import mlflow
print("MLflow version:", mlflow.__version__)

# Lanza la UI con el mismo Python del notebook
cmd = [sys.executable, "-m", "mlflow", "ui",
       "--backend-store-uri", backend,
       "--port", "5001",
       "--host", "127.0.0.1"]
print("Launching:", " ".join(cmd))
subprocess.Popen(cmd)


Backend store URI: file:///C:/Users/Julian/results/mlruns
MLflow version: 2.14.1
Launching: C:\Users\Julian\miniconda3\envs\coffee310\python.exe -m mlflow ui --backend-store-uri file:///C:/Users/Julian/results/mlruns --port 5001 --host 127.0.0.1


<Popen: returncode: None args: ['C:\\Users\\Julian\\miniconda3\\envs\\coffee...>