In [1]:
import os, json
import numpy as np
import pandas as pd
from pathlib import Path


In [3]:

PARQUET_PATH = "./data/market_panel.parquet"  # <-- cambia si es necesario
PARQUET_TDA_PATH = "./data/panel_tda.parquet"
FREQ = "B"                                    # frecuencia business-daily
H = 5                                        # horizonte de pronóstico (días hábiles)
N_WINDOWS = 4                                 # para backtesting rolling
SEED = 42

# Hiperparámetros base NBEATSx (seguro/rápido para empezar)
MAX_STEPS = 1500
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
SCALER_TYPE = "robust"   # consistente con tu y escalado robusto

# Datos para guardar el modelo
MODEL_DIR  = "./models/nbeatsx_run"   # carpeta del modelo
META_PATH  = f"{MODEL_DIR}/meta.json" # metadatos útiles

In [4]:
def cargar_panel(parquet_path):
    df = pd.read_parquet(parquet_path)
    # chequeos básicos
    req = {"unique_id","ds","y"}
    missing = req - set(df.columns)
    if missing:
        raise ValueError(f"Faltan columnas requeridas: {missing}")
    # tipos
    df["unique_id"] = df["unique_id"].astype(str)
    df["ds"] = pd.to_datetime(df["ds"])
    df["y"] = pd.to_numeric(df["y"], errors="coerce")
    # orden
    df = df.sort_values(["unique_id","ds"]).reset_index(drop=True)
    return df


def inferir_listas_exog(df):
    """
    Detecta exógenas típicas si existen en el parquet.
    - futr_exog: derivadas del tiempo (conocidas al horizonte)
    - hist_exog: señales del pasado (vix, realized vol)
    - stat_exog: constantes por serie (sector, etc.)
    """
    futr_candidatas = ["dow","dom","woy","month","qtr","eom","eoq","eoy"]
    hist_candidatas = ["vix", "Close"]
    # agregar rv_* si existen
    hist_candidatas += [c for c in df.columns if c.startswith("rv_")]


    futr_exog = [c for c in futr_candidatas if c in df.columns]
    hist_exog = [c for c in hist_candidatas if c in df.columns]


    return futr_exog, hist_exog


def train_test_cutoff(df, h):
    """
    Crea un cutoff por serie para separar train/test final (últimos h puntos de cada serie).
    Devuelve df_train (todas las series hasta -h) y df_test_mask (índices del último bloque).
    """
    # Para cada serie, marcamos las últimas h filas
    df = df.copy()
    df["row_id"] = np.arange(len(df))
    last_idx = []
    for uid, g in df.groupby("unique_id", sort=False):
        if len(g) >= h:
            last_idx.extend(g.tail(h)["row_id"].tolist())
        else:
            # si la serie es más corta que h, se marca completo como train (sin test)
            pass
    test_mask = df["row_id"].isin(last_idx)
    df_train = df.loc[~test_mask].drop(columns=["row_id"]).reset_index(drop=True)
    df_test_mask = df.loc[test_mask, ["unique_id","ds"]].reset_index(drop=True)
    return df_train, df_test_mask

In [5]:
def _Mape(y_true, y_pred, eps=1e-8):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred) + eps)
    return 200.0 * np.mean(num / den)

def _mae(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true))

def _rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true)**2))

def _mase(y_true, y_pred, y_insample, m=1):
    # naive estacional: |y_t - y_{t-m}|
    if len(y_insample) <= m:
        return np.nan
    naive_errors = np.abs(y_insample[m:] - y_insample[:-m])
    d = np.mean(naive_errors) if len(naive_errors) > 0 else np.nan
    if not np.isfinite(d) or d == 0:
        return np.nan
    return np.mean(np.abs(y_pred - y_true)) / d

def compute_cv_metrics(cv_df, panel, model_col="NBEATSx", seasonal_period=5):
    """
    cv_df: salida de nf.cross_validation (cols: unique_id, ds, cutoff, <model_col>, [y?])
    panel: dataframe original con y
    """
    df = cv_df.copy()
    if "y" not in df.columns:
        df = df.merge(panel[["unique_id","ds","y"]], on=["unique_id","ds"], how="left")

    # métricas por serie (agregadas sobre todas las ventanas)
    rows = []
    for uid, g in df.groupby("unique_id", sort=False):
        y_true = g["y"].to_numpy(dtype=float)
        y_pred = g[model_col].to_numpy(dtype=float)

        # insample para MASE: usar todo el historial de la serie (excluyendo filas NaN de y)
        insample = panel.loc[panel["unique_id"]==uid, "y"].dropna().to_numpy(dtype=float)

        Mape = _Mape(y_true, y_pred)
        mae   = _mae(y_true, y_pred)
        rmse  = _rmse(y_true, y_pred)
        mase  = _mase(y_true, y_pred, insample, m=seasonal_period)

        rows.append({"unique_id": uid, "MAPE": Mape, "MAE": mae, "RMSE": rmse, "MASE": mase})

    per_series = pd.DataFrame(rows)
    overall = per_series[["MAPE","MAE","RMSE","MASE"]].mean(numeric_only=True).to_frame("mean").T
    overall.insert(0, "unique_id", "OVERALL")
    metrics = pd.concat([overall, per_series], axis=0, ignore_index=True)
    return metrics


In [6]:
def backtest_nbeatsx(panel, freq, h, n_windows, futr_exog, hist_exog, stat_exog):
    """
    Backtesting rolling con NeuralForecast.NBEATSx.
    Devuelve: (cv_df, cv_metrics)
      - cv_df: predicciones por ventana
      - cv_metrics: métricas agregadas por modelo/serie
    """
    # IMPORTANTE: usamos librería Nixtla (requiere instanciar clases de la lib)
    from neuralforecast import NeuralForecast
    from neuralforecast.models import NBEATSx
    from neuralforecast.losses.pytorch import MSE

    # Regla rápida de input_size: ~7*h (ajústalo luego por performance)
    input_size = int(7 * h)

    # Modelo
# dentro de backtest_nbeatsx y fit_predict_nbeatsx
    model = NBEATSx(
        h=h,
        input_size=int(7*h),
        max_steps=MAX_STEPS,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        scaler_type=SCALER_TYPE,
        futr_exog_list=futr_exog,
        hist_exog_list=hist_exog,
        stat_exog_list=stat_exog,
        random_seed=SEED,
    )


    # Empaquetar y ejecutar cross_validation
    nf = NeuralForecast(models=[model], freq=freq)
    # n_windows rolling, step_size = h (ventanas no solapadas)
    cv_df = nf.cross_validation(
        df=panel,
        n_windows=n_windows,
        step_size=h,
        # test_size=h,
        verbose=True,
    )
    # Métricas
    cv_metrics = compute_cv_metrics(cv_df, panel, model_col="NBEATSx", seasonal_period=5)
    return cv_df, cv_metrics


def fit_predict_nbeatsx(panel_train, panel_full, freq, h, futr_exog, hist_exog, stat_exog):
    """
    Entrena en train y predice h en el panel completo (últimas fechas por serie).
    Devuelve: fcst_df con columnas ['unique_id','ds','NBEATSx'].
    """
    from neuralforecast import NeuralForecast
    from neuralforecast.models import NBEATSx
    from neuralforecast.losses.pytorch import MSE

    input_size = int(7 * h)

    model = NBEATSx(
        h=h,
        input_size=input_size,
        max_steps=MAX_STEPS,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        loss=MSE(),
        scaler_type=SCALER_TYPE,
        futr_exog_list =futr_exog,
        hist_exog_list =hist_exog,
        stat_exog_list =stat_exog,
        random_seed=SEED,
    )

    nf = NeuralForecast(models=[model], freq=freq)
    nf.fit(panel_train)

    # predict() pide que el panel_full tenga futr_exog para el horizonte (si las usas)
    fcst = nf.predict(futr_df=panel_full)
    return fcst

In [8]:
panel = cargar_panel(PARQUET_PATH)
panel = panel.drop(columns= ["logret_raw"])
panel = panel[panel["ds"] > "2019-01-31 00:00:00"]
# 3.2 Exógenas
futr_exog, hist_exog = inferir_listas_exog(panel)
print("futr_exog:", futr_exog)
print("hist_exog:", hist_exog)
stat_exog = None


futr_exog: ['dow', 'dom', 'woy', 'month', 'qtr', 'eom', 'eoq', 'eoy']
hist_exog: ['vix', 'Close', 'rv_5', 'rv_10', 'rv_21']


In [7]:

# 3.3 Backtesting (opcional pero recomendado)
print("\n>>> BACKTESTING")
cv_df, cv_metrics = backtest_nbeatsx(
    panel=panel,
    freq=FREQ,
    h=H,
    n_windows=N_WINDOWS,
    futr_exog=futr_exog,
    hist_exog=hist_exog,
    stat_exog=stat_exog

)
# Guardar resultados de CV
os.makedirs("./outputs", exist_ok=True)
cv_df.to_parquet("./outputs/nbeatsx_cv_predictions.parquet", index=False)
cv_metrics.to_parquet("./outputs/nbeatsx_cv_metrics.parquet", index=False)
print("Guardado backtesting en ./outputs/")

# 3.4 Entrenamiento final + predicción del último horizonte
print("\n>>> FIT FINAL + PREDICT")



>>> BACKTESTING


Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MAE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 3.2 M  | train
-------------------------------------------------------
3.2 M     Trainable params
440       Non-trainable params
3.2 M     Total params
12.847    Total estimated model params size (MB)
31        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1500` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

Guardado backtesting en ./outputs/

>>> FIT FINAL + PREDICT


In [8]:
panel_train, test_mask = train_test_cutoff(panel, h=H)
# Para predict() con futr_exog necesitas tener esas columnas también
# en el rango futuro; si tus futr_exog son puramente de calendario,
# basta con construir filas futuras (NeuralForecast internamente expande;
# si requiere futr_df explícito, se pasa panel completo).
fcst = fit_predict_nbeatsx(
    panel_train=panel_train,
    panel_full=panel,   # si usas futr_exog de calendario, el modelo las conoce por 'ds'
    freq=FREQ,
    h=H,
    futr_exog=futr_exog,
    hist_exog=hist_exog,
    stat_exog=stat_exog
)
fcst.to_parquet("./outputs/nbeatsx_final_forecast.parquet", index=False)
print("Guardado forecast final en ./outputs/nbeatsx_final_forecast.parquet")

# Vista rápida de métricas promedio
print("\nMétricas CV (primeras filas):")
print(cv_metrics.head(10).to_string(index=False))

Seed set to 42


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MSE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 3.2 M  | train
-------------------------------------------------------
3.2 M     Trainable params
440       Non-trainable params
3.2 M     Total params
12.847    Total estimated model params size (MB)
31        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1500` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

Guardado forecast final en ./outputs/nbeatsx_final_forecast.parquet

Métricas CV (primeras filas):
unique_id       MAPE      MAE     RMSE     MASE
  OVERALL 144.727100 0.763723 0.987706 0.765808
     AAPL 142.031405 0.796379 1.005430 0.802363
     AMZN 132.888966 0.803535 1.123757 0.820885
    BRK-B 154.405127 0.709165 0.818155 0.728582
    GOOGL 155.586250 0.801076 0.956690 0.779988
      LLY 122.293911 0.553250 0.765415 0.524365
     META 144.703886 0.738299 1.255008 0.728609
     MSFT 129.138223 0.552940 0.750129 0.548542
     NVDA 166.702057 0.737746 0.880457 0.767951
      NVO 132.984601 0.844296 0.985310 0.809980


## NBEATS + TDA

In [11]:
panel_tda = cargar_panel(PARQUET_TDA_PATH)
panel_tda = panel_tda.drop(columns= ["logret_raw"])
panel_tda = panel_tda[panel_tda["ds"] > "2019-01-31 00:00:00"]
# 3.2 Exógenas

hist_exog_tda = hist_exog +  ["tda_amplitude_h1_w21","tda_n_points_h1_w21"]
stat_exog = None

In [12]:
# 3.3 Backtesting (opcional pero recomendado)
print("\n>>> BACKTESTING")
cv_df_tda, cv_tda_metrics = backtest_nbeatsx(
    panel=panel_tda,
    freq=FREQ,
    h=H,
    n_windows=N_WINDOWS,
    futr_exog=futr_exog,
    hist_exog=hist_exog_tda,
    stat_exog=stat_exog

)
# Guardar resultados de CV
os.makedirs("./outputs", exist_ok=True)
cv_df_tda.to_parquet("./outputs/nbeatsx_cv_tda_predictions_TDA.parquet", index=False)
cv_tda_metrics.to_parquet("./outputs/nbeatsx_cv_tdas_metrics.parquet", index=False)
print("Guardado backtesting en ./outputs/")

# 3.4 Entrenamiento final + predicción del último horizonte
print("\n>>> FIT FINAL + PREDICT")


>>> BACKTESTING


Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MAE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 3.3 M  | train
-------------------------------------------------------
3.3 M     Trainable params
440       Non-trainable params
3.3 M     Total params
13.277    Total estimated model params size (MB)
31        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1500` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

Guardado backtesting en ./outputs/

>>> FIT FINAL + PREDICT


In [13]:
panel_train_tda, test_mask = train_test_cutoff(panel_tda, h=H)

# Para predict() con futr_exog necesitas tener esas columnas también
# en el rango futuro; si tus futr_exog son puramente de calendario,
# basta con construir filas futuras (NeuralForecast internamente expande;
# si requiere futr_df explícito, se pasa panel completo).
fcst = fit_predict_nbeatsx(
    panel_train=panel_train_tda,
    panel_full=panel_tda,   # si usas futr_exog de calendario, el modelo las conoce por 'ds'
    freq=FREQ,
    h=H,
    futr_exog=futr_exog,
    hist_exog=hist_exog_tda,
    stat_exog=stat_exog
)
fcst.to_parquet("./outputs/nbeatsx_final_forecast_TDA.parquet", index=False)
print("Guardado forecast final en ./outputs/nbeatsx_final_forecast_TDA.parquet")

# Vista rápida de métricas promedio
print("\nMétricas CV (primeras filas):")
print(cv_tda_metrics.head(10).to_string(index=False))

Seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type          | Params | Mode 
-------------------------------------------------------
0 | loss         | MSE           | 0      | train
1 | padder_train | ConstantPad1d | 0      | train
2 | scaler       | TemporalNorm  | 0      | train
3 | blocks       | ModuleList    | 3.3 M  | train
-------------------------------------------------------
3.3 M     Trainable params
440       Non-trainable params
3.3 M     Total params
13.277    Total estimated model params size (MB)
31        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1500` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

Guardado forecast final en ./outputs/nbeatsx_final_forecast_TDA.parquet

Métricas CV (primeras filas):
unique_id       MAPE      MAE     RMSE     MASE
  OVERALL 138.162656 0.753556 0.987983 0.752030
     AAPL 146.974767 0.650383 0.837232 0.655270
     AMZN 130.135715 0.868917 1.225404 0.887678
    BRK-B 109.301696 0.505050 0.687265 0.518879
    GOOGL 130.150629 0.855264 1.070553 0.832750
      LLY 134.626462 0.757138 0.976743 0.717607
     META 135.665791 0.728481 1.229224 0.718920
     MSFT 157.464532 0.618018 0.752733 0.613102
     NVDA 127.036164 0.486218 0.627543 0.506124
      NVO 158.481772 1.003588 1.159681 0.962797


In [42]:
# def guardar_modelo_nbeatsx(nf, futr_exog, hist_exog, stat_exog, freq, h):
#     os.makedirs(MODEL_DIR, exist_ok=True)
#     nf.save(MODEL_DIR)  # guarda pesos, normalizadores y config interna
#     meta = {
#         "freq": freq,
#         "h": int(h),
#         "futr_exog": futr_exog,
#         "hist_exog": hist_exog,
#         "stat_exog": stat_exog,
#         "saved_at": pd.Timestamp.utcnow().isoformat()
#     }
#     with open(META_PATH, "w", encoding="utf-8") as f:
#         json.dump(meta, f, ensure_ascii=False, indent=2)
