In [1]:
###############   DEPENDENCY LOAD BLOCK   ###############
# Designed for Google Colab with GPU support

import sys, subprocess, importlib, os

def pipi(args):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + args)

# Prevent torchvision::nms bug (safe even if torchvision not used)
os.environ["TORCHVISION_DISABLE_NMS_EXPORT"] = "1"

# --- Core check
import numpy as np
print("Using NumPy:", np.__version__)
try:
    import torch
    print("Using Torch:", torch.__version__)
except Exception:
    pass

# --- Install neuralforecast and dependencies
pipi(["neuralforecast==1.7.4", "--no-deps"])
pipi([
    "pytorch-lightning==2.2.5",
    "torchmetrics==1.3.1",
    "einops>=0.7.0",
    "pandas>=2.2.0",
    "scikit-learn>=1.3.0",
    "lightgbm>=4.3.0"
])
pipi(["utilsforecast==0.2.12"])
pipi(["coreforecast==0.0.12"])

# --- Import checks
import pandas as pd
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS
from neuralforecast.losses.pytorch import PMM, GMM, MAE as NF_MAE
from lightgbm import LGBMRegressor

print("Pandas:", pd.__version__)
print("NeuralForecast imported OK")
print("LightGBM imported OK")

Using NumPy: 2.0.2
Using Torch: 2.8.0+cu126
Pandas: 2.2.2
NeuralForecast imported OK
LightGBM imported OK


In [2]:
# =========================
# BLOCK 2 — ELECTRICITY PREPROCESSING (with truncation to first 100 unique_ids)
# =========================
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

import os
import pandas as pd
import numpy as np

CONFIG = {
    "RAW_CSV": "/content/drive/MyDrive/electricity.csv",   # <-- adjust path
    "OUT_DIR": "/content/drive/MyDrive/electricity_results_no_mlf",
    "VAL_SIZE": 2*7*24,     # 2 weeks (in hours)
    "TEST_WEEKS": 4,        # last 4 weeks = test (plus -1h to align)
    "FREQ": "h",
    "DAYFIRST": True,        # your data is day-first; keep True

    # Added configuration for models
    "HORIZON": 24,
    "NHITS_INPUT_SIZE": 24 * 7,
    "NHITS_STEP_SIZE": 24,
    "NHITS_LOSS": "MAE",
    "NHITS_PMM_COMPONENTS": 10,
    "NHITS_STEPS": 1000,
    "NHITS_LR": 1e-3,
    "NHITS_BATCH": 32,

    "LGBM_LAGS": [24, 24*2, 24*7],
    "LGBM_PARAMS": {"n_estimators": 100, "learning_rate": 0.1},

    "CHUNK_SIZE": 50,
    "SEEDS": [42, 43, 44, 45, 46],

    # >>> NEW: limit dataset size to the first N unique_ids (alphabetical) <<<
    "MAX_IDS": 100,
}

os.makedirs(CONFIG["OUT_DIR"], exist_ok=True)

def preprocess_electricity(cfg: dict):
    # --- Load
    df = pd.read_csv(cfg["RAW_CSV"], low_memory=False)

    # --- Required renames to canonical schema
    if "client" not in df.columns:
        raise ValueError("Input must contain a 'client' column.")
    if "consumption" not in df.columns:
        raise ValueError("Input must contain a 'consumption' column.")
    if "ds" not in df.columns:
        raise ValueError("Input must contain a 'ds' timestamp column.")

    # Create variables list (mu_*, pi_*, sigma_*) — add if missing
    variables = [f'mu_{i}' for i in range(1, 11)] + \
                [f'pi_{i}' for i in range(1, 11)] + \
                [f'sigma_{i}' for i in range(1, 11)]

    # Build total_mean / total_variance if missing
    if "total_mean" not in df.columns:     df["total_mean"] = 0.0
    if "total_variance" not in df.columns: df["total_variance"] = 0.0

    # Ensure the MDN mixture columns exist (fill with 0 if not)
    for c in variables:
        if c not in df.columns:
            df[c] = 0.0

    # Canonical projection
    Y_df = pd.DataFrame({
        "ds": df["ds"],
        "unique_id": df["client"].astype(str),
        "y": pd.to_numeric(df["consumption"], errors="coerce")
    })

    # Attach exog
    Y_df["total_mean"] = pd.to_numeric(df["total_mean"], errors="coerce")
    Y_df["total_variance"] = pd.to_numeric(df["total_variance"], errors="coerce")

    # Attach mixtures in a stable order
    for c in variables:
        Y_df[c] = pd.to_numeric(df[c], errors="coerce")

    # Ensure 'ds' is datetime (dayfirst with mixed formats)
    Y_df["ds"] = pd.to_datetime(Y_df["ds"], dayfirst=cfg["DAYFIRST"], format="mixed", errors="coerce")

    # Drop rows without timestamp; sort
    Y_df = (Y_df.dropna(subset=["ds"])
                 .sort_values(["unique_id","ds"])
                 .reset_index(drop=True))

    # Fill missing exog values with 0
    Y_df[["total_mean","total_variance"]] = Y_df[["total_mean","total_variance"]].fillna(0.0)
    Y_df[variables] = Y_df[variables].fillna(0.0)

    # If y is missing, set to 0 (or drop — here we follow your fill pattern)
    Y_df["y"] = Y_df["y"].fillna(0.0)

    # --- NEW: Truncate to the first N unique_ids (alphabetical, deterministic)
    max_ids = int(cfg.get("MAX_IDS", 0) or 0)
    if max_ids > 0:
        all_ids = sorted(Y_df["unique_id"].astype(str).unique().tolist())
        keep_ids = set(all_ids[:max_ids])
        before_rows = len(Y_df)
        before_ids  = Y_df["unique_id"].nunique()
        Y_df = Y_df[Y_df["unique_id"].astype(str).isin(keep_ids)].copy()
        after_rows = len(Y_df)
        after_ids  = Y_df["unique_id"].nunique()
        print(f"[TRUNCATE] Limited to first {max_ids} unique_ids "
              f"(alphabetical). Series: {before_ids} → {after_ids}; "
              f"Rows: {before_rows} → {after_rows}")

    # --- Fixed splits (your logic)
    max_date = Y_df["ds"].max()
    threshold_date = max_date - pd.Timedelta(weeks=cfg["TEST_WEEKS"]) - pd.Timedelta(hours=1)

    val_size  = int(cfg["VAL_SIZE"])
    test_size = Y_df[Y_df["ds"] > threshold_date]["ds"].nunique()

    unique_dates = sorted(Y_df["ds"].unique())
    if val_size + test_size > len(unique_dates):
        raise ValueError(
            f"Not enough timestamps to allocate val_size({val_size}) + test_size({test_size}). "
            f"Total unique timestamps: {len(unique_dates)}"
        )

    training_cutoff   = unique_dates[-(val_size + test_size)]
    validation_cutoff = unique_dates[-test_size]

    # Partitions
    Y_train_df = Y_df[Y_df["ds"] <= training_cutoff].copy()
    Y_val_df   = Y_df[(Y_df["ds"] > training_cutoff) & (Y_df["ds"] <= validation_cutoff)].copy()
    Y_test_df  = Y_df[Y_df["ds"] > validation_cutoff].copy()

    # --- Counts per unique_id (helper)
    def _count_periods(df, name):
        c = df.groupby("unique_id")["ds"].nunique().reset_index()
        c.columns = ["unique_id", f"{name}_periods"]
        return c

    train_counts = _count_periods(Y_train_df, "train")
    val_counts   = _count_periods(Y_val_df, "val")
    test_counts  = _count_periods(Y_test_df, "test")
    counts_df    = train_counts.merge(val_counts, on="unique_id", how="outer").merge(test_counts, on="unique_id", how="outer")

    # --- Save artifacts
    Y_df_out = os.path.join(cfg["OUT_DIR"], "electricity_clean_full.csv")
    Y_df.to_csv(Y_df_out, index=False)

    dense_test = Y_df[Y_df["ds"] > validation_cutoff][["unique_id","ds","y"]].copy()
    dense_test_out = os.path.join(cfg["OUT_DIR"], "test_actuals.csv")
    dense_test.to_csv(dense_test_out, index=False)

    split_meta = pd.DataFrame({
        "key": ["max_date", "threshold_date", "training_cutoff", "validation_cutoff",
                "val_size", "test_size"],
        "value": [max_date.isoformat(),
                  threshold_date.isoformat(),
                  pd.Timestamp(training_cutoff).isoformat(),
                  pd.Timestamp(validation_cutoff).isoformat(),
                  val_size,
                  test_size]
    })
    split_meta_out = os.path.join(cfg["OUT_DIR"], "split_meta.csv")
    split_meta.to_csv(split_meta_out, index=False)

    counts_out = os.path.join(cfg["OUT_DIR"], "partition_counts.csv")
    counts_df.to_csv(counts_out, index=False)

    print(f"[PREP] Rows: {len(Y_df)} | Clients: {Y_df['unique_id'].nunique()}")
    print(f"[PREP] Max date: {max_date} | Threshold date: {threshold_date}")
    print(f"[PREP] val_size(hours)={val_size} | test_size(hours)={test_size}")
    print(f"[SAVE] Clean full: {Y_df_out}")
    print(f"[SAVE] Test grid:  {dense_test_out}")
    print(f"[SAVE] Split meta: {split_meta_out}")
    print(f"[SAVE] Counts:     {counts_out}")

    return Y_df, Y_train_df, Y_val_df, Y_test_df, counts_df, threshold_date, val_size, test_size

# --- Run it
Y_df, Y_train_df, Y_val_df, Y_test_df, counts_df, threshold_date, val_size, test_size = preprocess_electricity(CONFIG)

# Quick preview
display(Y_df.head(3))
display(counts_df.head(10))

Mounted at /content/drive
[TRUNCATE] Limited to first 100 unique_ids (alphabetical). Series: 370 → 100; Rows: 12787570 → 3456100
[PREP] Rows: 3456100 | Clients: 100
[PREP] Max date: 2015-01-01 00:00:00 | Threshold date: 2014-12-03 23:00:00
[PREP] val_size(hours)=336 | test_size(hours)=673
[SAVE] Clean full: /content/drive/MyDrive/electricity_results_no_mlf/electricity_clean_full.csv
[SAVE] Test grid:  /content/drive/MyDrive/electricity_results_no_mlf/test_actuals.csv
[SAVE] Split meta: /content/drive/MyDrive/electricity_results_no_mlf/split_meta.csv
[SAVE] Counts:     /content/drive/MyDrive/electricity_results_no_mlf/partition_counts.csv


Unnamed: 0,ds,unique_id,y,total_mean,total_variance,mu_1,mu_2,mu_3,mu_4,mu_5,...,sigma_1,sigma_2,sigma_3,sigma_4,sigma_5,sigma_6,sigma_7,sigma_8,sigma_9,sigma_10
0,2011-01-22 00:00:00,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2011-01-22 01:00:00,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2011-01-22 02:00:00,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,unique_id,train_periods,val_periods,test_periods
0,1,33553,336,672
1,10,33553,336,672
2,100,33553,336,672
3,101,33553,336,672
4,102,33553,336,672
5,103,33553,336,672
6,104,33553,336,672
7,105,33553,336,672
8,106,33553,336,672
9,107,33553,336,672


In [3]:
# =========================
# BLOCK 1: PREP + MDN COVARIATES
# =========================

import os, gc, json
import numpy as np
import pandas as pd
from typing import Tuple
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim

# --- CONFIG (shared) ---
CONFIG = {
    "CLEAN_CSV_PATH": "/content/drive/MyDrive/electricity_results_no_mlf/electricity_clean_full.csv",
    "OUT_DIR": "/content/drive/MyDrive/myproject/results_mdn_nhits/",
    "FREQ": "h",

    # Split
    "VAL_SIZE": 2*7*24,     # 2 weeks validation
    "TEST_SPLIT_Q": 0.80,   # last 20% timestamps = test

    # MDN
    "MDN_MIXES": 24,
    "MDN_HIDDEN": 16,
    "MDN_EPOCHS": 400,
    "MDN_PATIENCE": 20,
    "MDN_LR": 1e-3,

    # Seeds
    "SEED_FOR_MDN": 1,
}
os.makedirs(CONFIG["OUT_DIR"], exist_ok=True)

def set_seed(seed: int):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def free_mem(*objs):
    for o in objs:
        try: del o
        except: pass
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def load_clean(path: str) -> pd.DataFrame:
    """
    Normalize to: ['unique_id','ds','y'] + (optionals).
    If 'unique_id' missing => single-series 'series_0'.
    """
    df = pd.read_csv(path, low_memory=False)

    # --- normalize ID ---
    if "unique_id" not in df.columns:
        if "client" in df.columns:
            df = df.rename(columns={"client": "unique_id"})
        else:
            df["unique_id"] = "series_0"

    # --- normalize target ---
    if "y" not in df.columns:
        if "consumption" in df.columns:
            df = df.rename(columns={"consumption": "y"})
        elif "value" in df.columns:
            df = df.rename(columns={"value": "y"})
        else:
            raise ValueError("Target not found. Expect 'y' or an alias.")

    # --- normalize timestamp ---
    if "ds" not in df.columns:
        for cand in ["datetime", "timestamp", "date", "Date", "time"]:
            if cand in df.columns:
                df = df.rename(columns={cand: "ds"})
                break
    if "ds" not in df.columns:
        raise ValueError("Timestamp not found. Expect 'ds' or common alias.")
    df["ds"] = pd.to_datetime(df["ds"], errors="coerce", utc=False)
    if df["ds"].isna().all():
        raise ValueError("Failed to parse timestamps in 'ds'.")

    # types
    df["unique_id"] = df["unique_id"].astype(str)
    df["y"] = pd.to_numeric(df["y"], errors="coerce").astype("float32")

    # calendar
    df = df.sort_values(["unique_id", "ds"]).reset_index(drop=True)
    df["Hour"] = df["ds"].dt.hour.astype("int16")
    df["weekday"] = df["ds"].dt.weekday.astype("int8")
    return df

# -------------------------
# MDN
# -------------------------
class MDN(nn.Module):
    def __init__(self, input_dims, hidden_dims, n_mixes):
        super().__init__()
        self.n_mixes = n_mixes
        self.h1 = nn.Linear(input_dims, hidden_dims)
        self.h2 = nn.Linear(hidden_dims, hidden_dims)
        self.out = nn.Linear(hidden_dims, n_mixes * 3)
        self.act = nn.ReLU()

    def forward(self, x):
        x = self.act(self.h1(x))
        x = self.act(self.h2(x))
        raw = self.out(x)
        return self._split_params(raw)

    def _split_params(self, raw):
        B, K = raw.size(0), self.n_mixes
        mu = raw[:, :K].view(B, K, 1)
        sigma = torch.exp(raw[:, K:2*K]).view(B, K, 1) + 1e-6
        pis = torch.softmax(raw[:, 2*K:], dim=1)
        return mu, sigma, pis

    @staticmethod
    def nll(mu, sigma, pis, target):
        B, K, _ = mu.shape
        y = target.view(B,1,1).expand(B,K,1)
        dist = torch.distributions.Normal(mu, sigma)
        log_probs = dist.log_prob(y).sum(dim=2)
        weighted = log_probs + torch.log(pis)
        mx = weighted.max(dim=1, keepdim=True).values
        log_sum = mx + torch.log(torch.sum(torch.exp(weighted-mx), dim=1, keepdim=True))
        return -log_sum.mean()

    def mixture_mean_var(self, x):
        mu, sigma, pis = self.forward(x)
        muK = mu.squeeze(-1); sigK = sigma.squeeze(-1)
        mean = torch.sum(pis*muK, dim=1)
        var = torch.sum(pis*(sigK**2 + muK**2), dim=1) - mean**2
        return mean, var.clamp_min(1e-12)

def train_mdn(features, target, n_mixes, hidden, epochs, patience, lr, seed, device):
    set_seed(seed)
    Xtr, Xva, ytr, yva = train_test_split(features, target, test_size=0.2, random_state=seed)
    ysc = StandardScaler()
    ytr_s = ysc.fit_transform(ytr.reshape(-1,1))
    yva_s = ysc.transform(yva.reshape(-1,1))

    Xtr_t = torch.tensor(Xtr, dtype=torch.float32, device=device)
    Xva_t = torch.tensor(Xva, dtype=torch.float32, device=device)
    ytr_t = torch.tensor(ytr_s, dtype=torch.float32, device=device)
    yva_t = torch.tensor(yva_s, dtype=torch.float32, device=device)

    model = MDN(Xtr.shape[1], hidden, n_mixes).to(device)
    opt = optim.Adam(model.parameters(), lr=lr)

    best = float("inf"); bad = 0; best_state = None
    for ep in range(1, epochs+1):
        model.train(); opt.zero_grad()
        mu, sig, pis = model(Xtr_t); loss = MDN.nll(mu, sig, pis, ytr_t)
        loss.backward(); opt.step()
        model.eval()
        with torch.no_grad():
            mu_v, sig_v, pis_v = model(Xva_t); vloss = MDN.nll(mu_v, sig_v, pis_v, yva_t)
        if vloss.item() < best - 1e-4:
            best = vloss.item(); bad = 0
            best_state = {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}
        else:
            bad += 1
            if bad >= patience: break
    if best_state: model.load_state_dict(best_state)
    free_mem(Xtr_t, Xva_t, ytr_t, yva_t)
    return model, ysc

def mdn_features_for_rows(model, ysc, hours, weekdays, device):
    X = np.column_stack([hours, weekdays]).astype(np.float32)
    Xt = torch.tensor(X, dtype=torch.float32, device=device)
    with torch.no_grad():
        mean_s, var_s = model.mixture_mean_var(Xt)
    mean_np = mean_s.cpu().numpy().reshape(-1, 1)
    var_np  = var_s.cpu().numpy()
    total_mean = ysc.inverse_transform(mean_np).ravel()
    total_std  = np.sqrt(var_np) * ysc.scale_[0]
    total_var  = total_std**2
    free_mem(Xt)
    return {"total_mean": total_mean, "total_variance": total_var}

# -------------------------
# MAIN (Block 1)
# -------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

base = load_clean(CONFIG["CLEAN_CSV_PATH"])

# Compute split boundaries (saved so LGBM/NHITS blocks can reuse exactly)
threshold_date = base["ds"].quantile(CONFIG["TEST_SPLIT_Q"])
VAL_SIZE = int(CONFIG["VAL_SIZE"])
TEST_SIZE = base[base["ds"] > threshold_date]["ds"].nunique()
unique_dates = sorted(base["ds"].unique())
training_cutoff = unique_dates[-(VAL_SIZE + TEST_SIZE)]
validation_cutoff = unique_dates[-TEST_SIZE]

base["is_train"] = base["ds"] <= training_cutoff
base["is_val"]   = (base["ds"] > training_cutoff) & (base["ds"] <= validation_cutoff)
base["is_test"]  = base["ds"] > validation_cutoff

# Train MDN on TRAIN rows only; covariates for ALL rows
Xtr = base.loc[base["is_train"], ["Hour", "weekday"]].to_numpy(np.float32)
ytr = base.loc[base["is_train"], "y"].to_numpy(np.float32)

mdn, ysc = train_mdn(
    Xtr, ytr,
    n_mixes=CONFIG["MDN_MIXES"],
    hidden=CONFIG["MDN_HIDDEN"],
    epochs=CONFIG["MDN_EPOCHS"],
    patience=CONFIG["MDN_PATIENCE"],
    lr=CONFIG["MDN_LR"],
    seed=CONFIG["SEED_FOR_MDN"],
    device=DEVICE
)

feats = mdn_features_for_rows(
    mdn, ysc,
    base["Hour"].to_numpy(), base["weekday"].to_numpy(),
    DEVICE
)
base["total_mean"]     = feats["total_mean"].astype("float32")
base["total_variance"] = feats["total_variance"].astype("float32")

# Persist outputs for next blocks
enriched_path = os.path.join(CONFIG["OUT_DIR"], "dataset_with_mdn.csv")
base.to_csv(enriched_path, index=False)

with open(os.path.join(CONFIG["OUT_DIR"], "splits.json"), "w") as f:
    json.dump({
        "training_cutoff": pd.Timestamp(training_cutoff).isoformat(),
        "validation_cutoff": pd.Timestamp(validation_cutoff).isoformat(),
        "VAL_SIZE": int(CONFIG["VAL_SIZE"]),
        "TEST_SPLIT_Q": float(CONFIG["TEST_SPLIT_Q"])
    }, f, indent=2)

# Also save dense test actuals
dense_test = base.loc[base["is_test"], ["unique_id","ds","y"]].copy()
dense_test.to_csv(os.path.join(CONFIG["OUT_DIR"], "test_actuals.csv"), index=False)

print(f"[OK] Wrote MDN-enriched dataset: {enriched_path}")
print(f"[OK] Wrote splits.json with cutoffs. Test actuals saved.")

[OK] Wrote MDN-enriched dataset: /content/drive/MyDrive/myproject/results_mdn_nhits/dataset_with_mdn.csv
[OK] Wrote splits.json with cutoffs. Test actuals saved.


In [4]:
# # =========================
# # BLOCK 2: LGBM FORECASTING (uses MDN covariates)
# # =========================

# import os, gc, json
# import numpy as np
# import pandas as pd
# from lightgbm import LGBMRegressor

# # --- CONFIG (must match Block 1 OUT_DIR and settings where relevant) ---
# CONFIG = {
#     "OUT_DIR": "/content/drive/MyDrive/myproject/results_mdn_nhits/",
#     "SEEDS_FOR_REPS": [1, 2, 3, 4, 5],
#     "USE_GPU": True,   # try GPU if available
# }

# def free_mem(*objs):
#     for o in objs:
#         try: del o
#         except: pass
#     gc.collect()

# def add_lags(df, lags=(1,24,168)):
#     out = df.sort_values(["unique_id","ds"]).copy()
#     for L in lags:
#         out[f"lag_{L}"] = out.groupby("unique_id")["y"].shift(L)
#     return out

# def get_lgbm_params(seed: int, prefer_gpu: bool = True) -> dict:
#     base = dict(
#         n_estimators=500,
#         learning_rate=0.05,
#         subsample=0.9,
#         colsample_bytree=0.9,
#         num_leaves=20,
#         min_child_samples=20,
#         random_state=seed,
#         verbosity=-1,
#     )
#     if prefer_gpu:
#         # Will raise if LightGBM was not built with CUDA; we'll catch later.
#         base.update(dict(device='gpu', max_bin=255, gpu_platform_id=-1, gpu_device_id=0))
#     else:
#         base.update(dict(device='cpu'))
#     return base

# def run_lgbm_once(df_all, use_mdn, validation_cutoff, lags=(24*14,24*21), seed=42, prefer_gpu=True):
#     feat = ["Hour","weekday"] + (["total_mean","total_variance"] if use_mdn else [])
#     lag_cols = [f"lag_{L}" for L in lags]
#     X_cols = feat + lag_cols

#     df = add_lags(df_all, lags).copy()
#     for c in X_cols + ["y"]:
#         if c in df.columns:
#             df[c] = pd.to_numeric(df[c], errors="coerce").astype("float32")

#     tr = df[df["ds"] <= pd.to_datetime(validation_cutoff)].dropna(subset=feat+["y"]).copy()
#     te = df[df["ds"] >  pd.to_datetime(validation_cutoff)].copy()

#     params = get_lgbm_params(seed, prefer_gpu=prefer_gpu)
#     try:
#         model = LGBMRegressor(**params)
#         model.fit(tr[X_cols], tr["y"].astype("float32"))
#     except Exception as e:
#         # Fallback to CPU if GPU build unavailable
#         print(f"[LGBM] GPU unavailable ({e}); falling back to CPU.")
#         params = get_lgbm_params(seed, prefer_gpu=False)
#         model = LGBMRegressor(**params)
#         model.fit(tr[X_cols], tr["y"].astype("float32"))

#     preds=[]
#     for uid, g in te.groupby("unique_id"):
#         g = g.sort_values("ds").copy()
#         if "lag_1" not in g.columns: g["lag_1"] = np.nan
#         g["lag_1"] = pd.to_numeric(g["lag_1"], errors="coerce").astype("float32")
#         for i in range(len(g)):
#             x_df = g.iloc[[i]][X_cols].astype("float32")
#             p = float(model.predict(x_df, validate_features=False)[0])
#             preds.append((uid, g.iloc[i]["ds"], p))
#             if i+1 < len(g):
#                 g.iat[i+1, g.columns.get_loc("lag_1")] = np.float32(p)

#     return pd.DataFrame(preds, columns=["unique_id","ds","yhat"])

# # -------------------------
# # MAIN (Block 2)
# # -------------------------
# enriched_path = os.path.join(CONFIG["OUT_DIR"], "dataset_with_mdn.csv")
# splits_path   = os.path.join(CONFIG["OUT_DIR"], "splits.json")

# df = pd.read_csv(enriched_path, low_memory=False, parse_dates=["ds"])
# with open(splits_path, "r") as f:
#     splits = json.load(f)
# validation_cutoff = splits["validation_cutoff"]

# # Two frames (with & without MDN covariates)
# df_with = df[["unique_id","ds","y","total_mean","total_variance","Hour","weekday"]].copy().sort_values(["unique_id","ds"])
# df_no   = df[["unique_id","ds","y","Hour","weekday"]].copy().sort_values(["unique_id","ds"])

# for label, use_mdn in [("lgbm_nomdn", False), ("lgbm_mdn", True)]:
#     print(f"\n[RUN] {label}")
#     for seed in CONFIG["SEEDS_FOR_REPS"]:
#         preds = run_lgbm_once(
#             df_all = (df_with if use_mdn else df_no),
#             use_mdn = use_mdn,
#             validation_cutoff = validation_cutoff,
#             lags = (24*14,24*21),
#             seed = seed,
#             prefer_gpu = CONFIG["USE_GPU"]
#         )
#         preds.to_csv(os.path.join(CONFIG["OUT_DIR"], f"test_forecasts_{label}_seed{seed}.csv"), index=False)
#         free_mem(preds)

# print("\n[OK] LGBM forecasts saved.")

In [8]:
# =========================
# BLOCK — LGBM DAILY-FIRST MAPE (TEST-ONLY, 2 models x N seeds)
# =========================
import os
import json
import numpy as np
import pandas as pd
from scipy.stats import t

# --- paths/config ---
try:
    OUT_DIR = CONFIG["OUT_DIR"]
    SEEDS   = CONFIG["SEEDS_FOR_REPS"]
except NameError:
    OUT_DIR = "/content/drive/MyDrive/myproject/results_mdn_nhits/"
    SEEDS   = [1, 2, 3, 4, 5]

LGBM_LABELS = [
    "lgbm_nomdn",
    "lgbm_mdn",
]

ACTUALS_PATH = os.path.join(OUT_DIR, "test_actuals.csv")
SUMMARY_OUT  = os.path.join(OUT_DIR, "mape_summary_lgbm.csv")
DETAIL_OUT   = os.path.join(OUT_DIR, "mape_daily_per_seed_lgbm.csv")

# --- helpers ---
def safe_read_csv(path, parse_dates=("ds",)):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing file: {path}")
    df = pd.read_csv(path, low_memory=False)
    for col in parse_dates:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    if "unique_id" in df.columns:
        df["unique_id"] = df["unique_id"].astype(str)
    return df

def dedup_hourly(df, y_col):
    """
    Remove duplicate hourly rows per (unique_id, ds), keeping the last.
    Only rows with finite y_col are kept.
    """
    d = df.copy()
    if "ds" not in d.columns:
        raise ValueError("dedup_hourly: 'ds' column required")
    d = d.dropna(subset=["ds"])
    d = d.sort_values(["unique_id", "ds"]).drop_duplicates(["unique_id", "ds"], keep="last")
    d = d[np.isfinite(d[y_col])]
    return d

def limit_to_test_frame(df, test_start, test_end):
    """Keep rows with ds in [test_start, test_end] inclusive."""
    d = df.copy()
    m = (d["ds"] >= test_start) & (d["ds"] <= test_end)
    return d.loc[m].copy()

def daily_agg_sum(df, y_col):
    """Aggregate hourly rows to a single daily sum across all series."""
    if "ds" not in df.columns:
        raise ValueError("daily_agg_sum: 'ds' column required")
    d = df.copy()
    d["date"] = d["ds"].dt.date
    return d.groupby("date", as_index=False)[y_col].sum()

def compute_daily_mape(y_true_daily, y_pred_daily):
    """
    Returns Series of daily MAPE (%) aligned on 'date'.
    Zero-actual days are excluded to avoid division by zero.
    """
    merged = y_true_daily.merge(y_pred_daily, on="date", how="inner")
    mask = (merged["y"] != 0) & np.isfinite(merged["y"]) & np.isfinite(merged["yhat"])
    if not mask.any():
        return pd.Series([], dtype=float)
    return 100.0 * (merged.loc[mask, "y"] - merged.loc[mask, "yhat"]).abs() / merged.loc[mask, "y"]

# --- load actuals (test-only) ---
dense_test = safe_read_csv(ACTUALS_PATH, parse_dates=("ds",))
need_actual = {"unique_id","ds","y"}
if not need_actual.issubset(dense_test.columns):
    raise ValueError("test_actuals.csv must contain 'unique_id','ds','y'.")

# Define the official test time window and valid ids from actuals
test_start = dense_test["ds"].min()
test_end   = dense_test["ds"].max()
valid_ids  = set(dense_test["unique_id"].unique().tolist())
print(f"[INFO] Test frame: {test_start} → {test_end} | Series in test: {len(valid_ids)}")

# Build daily actuals (sum across all clients)
daily_actual = (
    daily_agg_sum(dense_test, "y")
    .rename(columns={"y":"y"})
    .sort_values("date")
    .reset_index(drop=True)
)

# --- compute MAPE per seed and summarize (dedup + test-frame only) ---
detail_rows = []
summary_rows = []

for label in LGBM_LABELS:
    per_seed_means = []
    for seed in SEEDS:
        fpath = os.path.join(OUT_DIR, f"test_forecasts_{label}_seed{seed}.csv")
        if not os.path.exists(fpath):
            continue

        dfp = safe_read_csv(fpath, parse_dates=("ds",))
        need_pred = {"unique_id","ds","yhat"}
        if not need_pred.issubset(dfp.columns):
            raise ValueError(f"{fpath} must contain columns {need_pred}")

        # 1) Keep only series that exist in the test set
        dfp = dfp[dfp["unique_id"].isin(valid_ids)]

        # 2) Restrict to official test time frame
        dfp = limit_to_test_frame(dfp, test_start, test_end)

        # 3) Deduplicate hourly predictions per (unique_id, ds)
        dfp = dedup_hourly(dfp, "yhat")

        if dfp.empty:
            continue

        # Daily sums across all clients (duplicates removed upstream)
        daily_pred = (
            daily_agg_sum(dfp, "yhat")
            .rename(columns={"yhat":"yhat"})
            .sort_values("date")
            .reset_index(drop=True)
        )

        # daily MAPE series (aligned on 'date' and test-only)
        mape_series = compute_daily_mape(daily_actual, daily_pred)

        # Save aligned rows (date, y, yhat, MAPE) for detail
        if not mape_series.empty:
            merged = daily_actual.merge(daily_pred, on="date", how="inner")
            valid_idx = mape_series.index
            merged_valid = merged.iloc[valid_idx].copy()
            merged_valid = merged_valid.assign(model=label, seed=seed, MAPE=mape_series.values)[
                ["model","seed","date","y","yhat","MAPE"]
            ]
            detail_rows.append(merged_valid)
            per_seed_means.append(float(mape_series.mean()))

    # summarize across seeds for this model
    n = len(per_seed_means)
    if n == 0:
        summary_rows.append({"Model":label,"Seeds_Used":0,"Mean_MAPE":np.nan,"Margin_of_Error":np.nan,"CI_95":"NA"})
    else:
        arr = np.array(per_seed_means, dtype=float)
        mean_mape = float(np.mean(arr))
        if n > 1:
            sd = float(np.std(arr, ddof=1))
            se = sd / np.sqrt(n)
            tcrit = t.ppf(0.975, df=n-1)
            margin = float(tcrit * se)
            ci_expr = f"{mean_mape:.2f} ± {margin:.2f}"
        else:
            margin = np.nan
            ci_expr = "NA"
        summary_rows.append({"Model":label,"Seeds_Used":n,"Mean_MAPE":mean_mape,"Margin_of_Error":margin,"CI_95":ci_expr})

# --- write outputs ---
if detail_rows:
    detail_df = pd.concat(detail_rows, ignore_index=True).sort_values(["model","seed","date"])
    detail_df.to_csv(DETAIL_OUT, index=False)
    print(f"[DETAIL:LGBM] Saved daily per-seed MAPE → {DETAIL_OUT}")
else:
    print("[DETAIL:LGBM] No overlapping daily forecasts found; detail file not written.")

summary_df = pd.DataFrame(summary_rows).sort_values("Model")
summary_df.to_csv(SUMMARY_OUT, index=False)
print("\n--- LGBM: Mean of per-seed daily MAPEs (±95% CI across seeds) ---")
print(summary_df.to_string(index=False))
print(f"[SUMMARY:LGBM] Saved → {SUMMARY_OUT}")

[INFO] Test frame: 2014-03-19 02:00:00 → 2015-01-01 00:00:00 | Series in test: 100
[DETAIL:LGBM] Saved daily per-seed MAPE → /content/drive/MyDrive/myproject/results_mdn_nhits/mape_daily_per_seed_lgbm.csv

--- LGBM: Mean of per-seed daily MAPEs (±95% CI across seeds) ---
     Model  Seeds_Used  Mean_MAPE  Margin_of_Error       CI_95
  lgbm_mdn           5   4.945426         0.002475 4.95 ± 0.00
lgbm_nomdn           5   4.951883         0.006036 4.95 ± 0.01
[SUMMARY:LGBM] Saved → /content/drive/MyDrive/myproject/results_mdn_nhits/mape_summary_lgbm.csv


In [6]:
# # =========================
# # BLOCK 3: NHITS FORECASTING (uses MDN covariates) — chunked + robust writes
# # =========================

# import os, gc, json, time, tempfile
# import numpy as np
# import pandas as pd
# import torch

# # Ensure NF returns id as a COLUMN (must be set before importing NeuralForecast)
# os.environ["NIXTLA_ID_AS_COL"] = "1"

# from neuralforecast import NeuralForecast
# from neuralforecast.models import NHITS
# from neuralforecast.losses.pytorch import PMM, GMM, MAE as NF_MAE

# # --- CONFIG (must match Block 1 OUT_DIR and high-level settings) ---
# CONFIG = {
#     "OUT_DIR": "/content/drive/MyDrive/myproject/results_mdn_nhits/",
#     "FREQ": "h",

#     # Forecasting
#     "HORIZON": 24*14,       # 2 weeks horizon
#     "INPUT_SIZE": 24*21,    # 3 weeks input

#     # NHITS training (dense windows)
#     "NHITS_LR": 1e-3,
#     "NHITS_STEPS": 1000,
#     "NHITS_BATCH": 32,
#     # "NHITS_WINDOWS_BS": 16,  # uncomment to set explicitly

#     # cross-val stride (spacing between CV windows; NOT training stride)
#     "CV_STEP_SIZE": 1,

#     # Chunking
#     "MAX_IDS_PER_CHUNK": 100,

#     # Replications
#     "SEEDS_FOR_REPS": [1, 2, 3, 4, 5],
# }

# # ----------------- helpers -----------------
# def set_seed(seed: int):
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     if torch.cuda.is_available():
#         torch.cuda.manual_seed_all(seed)

# def free_mem(*objs):
#     for o in objs:
#         try: del o
#         except: pass
#     gc.collect()
#     if torch.cuda.is_available():
#         torch.cuda.empty_cache()

# def align_test_size(test_size_raw: int, h: int, step: int) -> int:
#     """Align test_size so that (test_size - h) % step == 0 (and >= h)."""
#     if test_size_raw < h:
#         return h
#     rem = (test_size_raw - h) % step
#     aligned = test_size_raw - rem
#     if aligned < h:
#         aligned = h
#     return int(aligned)

# def chunked(lst, n):
#     for i in range(0, len(lst), n):
#         yield lst[i:i+n]

# def build_nhits(loss_obj, seed, use_mdn, cfg):
#     return NHITS(
#         h=cfg["HORIZON"], input_size=cfg["INPUT_SIZE"],
#         loss=loss_obj,
#         n_pool_kernel_size=[16,8,1], n_freq_downsample=[24,12,1],
#         scaler_type="robust",

#         # TRAINING (dense windows)
#         max_steps=cfg["NHITS_STEPS"],
#         learning_rate=cfg["NHITS_LR"],
#         batch_size=cfg["NHITS_BATCH"],
#         windows_batch_size=cfg.get("NHITS_WINDOWS_BS", 16),
#         step_size=50,  # IMPORTANT: dense training windows (NOT CV stride)

#         # I/O & reproducibility
#         num_workers_loader=0, drop_last_loader=False,
#         random_seed=seed,
#         futr_exog_list=(["total_mean","total_variance"] if use_mdn else None),
#         inference_windows_batch_size=1,

#         accelerator="auto",
#         enable_checkpointing=False, logger=False, enable_model_summary=False,
#         detect_anomaly=False
#     )

# def safe_to_csv(df, final_path, attempts=5, sleep_s=1.0, fallback_dir="/content"):
#     """Write CSV atomically with retries; fall back to local disk on repeated mount errors."""
#     final_path = os.path.abspath(final_path)
#     dest_dir = os.path.dirname(final_path)
#     os.makedirs(dest_dir, exist_ok=True)

#     last_err = None
#     for k in range(1, attempts+1):
#         tmp_file = None
#         try:
#             with tempfile.NamedTemporaryFile("w", delete=False, dir=dest_dir, suffix=".tmp") as tmp:
#                 tmp_file = tmp.name
#                 df.to_csv(tmp_file, index=False)
#                 tmp.flush()
#                 os.fsync(tmp.fileno())
#             os.replace(tmp_file, final_path)
#             print(f"[WRITE] {final_path} (attempt {k})")
#             return final_path
#         except OSError as e:
#             last_err = e
#             try:
#                 if tmp_file and os.path.exists(tmp_file):
#                     os.remove(tmp_file)
#             except Exception:
#                 pass
#             print(f"[WARN] Write failed (attempt {k}/{attempts}) → {e}; retrying...")
#             time.sleep(sleep_s)

#     base = os.path.basename(final_path)
#     fb_dir = os.path.abspath(fallback_dir)
#     os.makedirs(fb_dir, exist_ok=True)
#     fb_path = os.path.join(fb_dir, base)
#     df.to_csv(fb_path, index=False)
#     print(f"[FALLBACK] Saved to local path: {fb_path}")
#     if last_err:
#         print(f"[LAST ERROR] {last_err}")
#     return fb_path

# # ----------------- MAIN -----------------
# enriched_path = os.path.join(CONFIG["OUT_DIR"], "dataset_with_mdn.csv")
# splits_path   = os.path.join(CONFIG["OUT_DIR"], "splits.json")

# df = pd.read_csv(enriched_path, low_memory=False, parse_dates=["ds"])
# with open(splits_path, "r") as f:
#     splits = json.load(f)
# validation_cutoff = pd.to_datetime(splits["validation_cutoff"])
# VAL_SIZE = int(splits["VAL_SIZE"])

# # Recompute TEST_SIZE from data to set CV sizes robustly
# threshold_date = df["ds"].quantile(float(splits.get("TEST_SPLIT_Q", 0.80)))
# TEST_SIZE_RAW = df[df["ds"] > threshold_date]["ds"].nunique()

# H = int(CONFIG["HORIZON"])
# STEP_DEFAULT = int(CONFIG["CV_STEP_SIZE"])
# step_size_cv = STEP_DEFAULT if (TEST_SIZE_RAW - H) >= 0 else 1
# TEST_SIZE_CV = align_test_size(TEST_SIZE_RAW, H, step_size_cv)
# if TEST_SIZE_CV < H:
#     TEST_SIZE_CV = H

# # Base frames (with & without MDN covariates)
# df_with = df[["unique_id","ds","y","total_mean","total_variance"]].copy().sort_values(["unique_id","ds"])
# df_no   = df[["unique_id","ds","y"]].copy().sort_values(["unique_id","ds"])

# # ---- Safety: pre-filter to ids with enough non-NaN target length ----
# min_required = CONFIG["INPUT_SIZE"] + CONFIG["HORIZON"] + VAL_SIZE + TEST_SIZE_CV
# len_per_id = df_no.groupby("unique_id")["y"].apply(lambda s: s.notna().sum()).astype(int)
# usable_ids = len_per_id[len_per_id >= min_required].index.astype(str).tolist()

# if not usable_ids:
#     raise RuntimeError(
#         f"No series meet minimum length {min_required}. "
#         "Reduce INPUT_SIZE/HORIZON/VAL_SIZE/TEST_SIZE_CV or include more history."
#     )

# # Restrict to usable ids
# df_with = df_with[df_with["unique_id"].astype(str).isin(usable_ids)]
# df_no   = df_no[df_no["unique_id"].astype(str).isin(usable_ids)]

# nhits_variants = [
#     # ("nhits_mae_nomdn", NF_MAE(), False),
#     # ("nhits_mae_mdn",   NF_MAE(), True),
#     ("nhits_gmm_nomdn", GMM(n_components=15, quantiles=[.5]), False),
#     ("nhits_pmm_nomdn", PMM(n_components=15, quantiles=[.5]), False),
# ]

# all_ids = sorted(set(usable_ids))
# CHUNK = int(CONFIG["MAX_IDS_PER_CHUNK"])

# for label, loss_obj, use_mdn in nhits_variants:
#     print(f"\n[RUN] {label}")
#     cols_in = ["unique_id","ds","y"] + (["total_mean","total_variance"] if use_mdn else [])
#     base_df = (df_with if use_mdn else df_no)[cols_in].copy()

#     for seed in CONFIG["SEEDS_FOR_REPS"]:
#         set_seed(seed)
#         chunk_outputs = []

#         for c_idx, id_chunk in enumerate(chunked(all_ids, CHUNK), start=1):
#             df_chunk = base_df[base_df["unique_id"].astype(str).isin(id_chunk)].copy()
#             df_chunk = df_chunk.dropna(subset=["ds","y"]).reset_index(drop=True)

#             print(f"[{label} | seed={seed}] Chunk {c_idx}: {len(id_chunk)} ids → {len(df_chunk):,} rows")

#             if df_chunk.empty:
#                 print("  [SKIP] Empty chunk after cleaning.")
#                 continue

#             # Ensure each id in this chunk still has enough points
#             ok_ids = (df_chunk.groupby("unique_id")["y"].apply(lambda s: s.notna().sum()) >= min_required)
#             valid_ids = ok_ids[ok_ids].index.astype(str).tolist()
#             if not valid_ids:
#                 print(f"  [SKIP] No ids with ≥ {min_required} points in this chunk.")
#                 continue
#             if len(valid_ids) < len(set(id_chunk)):
#                 df_chunk = df_chunk[df_chunk["unique_id"].astype(str).isin(valid_ids)].reset_index(drop=True)

#             model = build_nhits(loss_obj, seed, use_mdn, CONFIG)
#             nf = NeuralForecast(models=[model], freq=CONFIG["FREQ"])

#             Y_hat = nf.cross_validation(
#                 df=df_chunk,
#                 val_size=VAL_SIZE,
#                 test_size=TEST_SIZE_CV,
#                 n_windows=None,
#                 # step_size=step_size_cv,   # CV spacing; NOT training stride
#                 verbose=False
#             )

#             if "unique_id" not in Y_hat.columns or "ds" not in Y_hat.columns:
#                 Y_hat = Y_hat.reset_index()

#             ycols = [c for c in Y_hat.columns if c.upper().startswith("NHITS")]
#             if not ycols:
#                 print("  [SKIP] NHITS output column not found in this chunk; continuing.")
#                 free_mem(Y_hat, nf, model)
#                 continue
#             ycol = ycols[-1]

#             Y_chunk = (
#                 Y_hat.loc[Y_hat["ds"] > validation_cutoff, ["unique_id","ds",ycol]]
#                      .rename(columns={ycol: "yhat"})
#                      .sort_values(["unique_id","ds"])
#                      .reset_index(drop=True)
#             )

#             if Y_chunk.empty:
#                 print("  [SKIP] No test rows produced for this chunk.")
#             else:
#                 chunk_outputs.append(Y_chunk)

#             free_mem(Y_hat, Y_chunk, nf, model, df_chunk)

#         # Save concatenated predictions for this model/seed
#         if chunk_outputs:
#             Y_seed = pd.concat(chunk_outputs, ignore_index=True).sort_values(["unique_id","ds"])
#             out_path = os.path.join(CONFIG["OUT_DIR"], f"test_forecasts_{label}_seed{seed}.csv")
#             safe_to_csv(Y_seed, out_path)
#             free_mem(Y_seed, chunk_outputs)
#         else:
#             print(f"[WARN] No predictions produced for {label} seed={seed}")

# print("\n[OK] NHITS forecasts saved (chunked, robust writes).")

In [7]:
# =========================
# BLOCK — NHITS DAILY-FIRST MAPE (TEST-ONLY, 4 models x N seeds)
# =========================
import os
import json
import numpy as np
import pandas as pd
from scipy.stats import t

# --- paths/config ---
CONFIG = {
    "OUT_DIR": "/content/drive/MyDrive/myproject/results_mdn_nhits/",
    "FREQ": "h",

    # Forecasting (kept for reference; not used directly here)
    "HORIZON": 24*14,
    "INPUT_SIZE": 24*21,
    "NHITS_LR": 1e-3,
    "NHITS_STEPS": 1000,
    "NHITS_BATCH": 32,
    "CV_STEP_SIZE": 1,
    "MAX_IDS_PER_CHUNK": 100,
    "SEEDS_FOR_REPS": [1, 2, 3, 4, 5],
}

try:
    OUT_DIR = CONFIG["OUT_DIR"]
    SEEDS   = CONFIG["SEEDS_FOR_REPS"]
except NameError:
    OUT_DIR = "/content/drive/MyDrive/myproject/results_mdn_nhits/"
    SEEDS   = [1, 2, 3, 4, 5]

NHITS_LABELS = [
    "nhits_mae_nomdn",
    "nhits_mae_mdn",
    "nhits_gmm_nomdn",
    "nhits_pmm_nomdn",
]

ACTUALS_PATH = os.path.join(OUT_DIR, "test_actuals.csv")
SUMMARY_OUT  = os.path.join(OUT_DIR, "mape_summary_nhits.csv")
DETAIL_OUT   = os.path.join(OUT_DIR, "mape_daily_per_seed_nhits.csv")

# --- helpers ---
def safe_read_csv(path, parse_dates=("ds",)):
    if not os.path.exists(path):
        raise FileNotFoundError(f"Missing file: {path}")
    df = pd.read_csv(path, low_memory=False)
    for col in parse_dates:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")
    if "unique_id" in df.columns:
        df["unique_id"] = df["unique_id"].astype(str)
    return df

def dedup_hourly(df, y_col):
    """
    Remove duplicate hourly rows per (unique_id, ds), keeping the last.
    Only rows with finite y_col are kept.
    """
    d = df.copy()
    if "ds" not in d.columns:
        raise ValueError("dedup_hourly: 'ds' column required")
    d = d.dropna(subset=["ds"])
    d = d.sort_values(["unique_id", "ds"]).drop_duplicates(["unique_id", "ds"], keep="last")
    d = d[np.isfinite(d[y_col])]
    return d

def limit_to_test_frame(df, test_start, test_end):
    """Keep rows with ds in [test_start, test_end] inclusive."""
    d = df.copy()
    m = (d["ds"] >= test_start) & (d["ds"] <= test_end)
    return d.loc[m].copy()

def daily_agg_sum(df, y_col):
    """Aggregate hourly rows to a single daily sum across all series."""
    if "ds" not in df.columns:
        raise ValueError("daily_agg_sum: 'ds' column required")
    d = df.copy()
    d["date"] = d["ds"].dt.date
    return d.groupby("date", as_index=False)[y_col].sum()

def compute_daily_mape(y_true_daily, y_pred_daily):
    """
    Returns Series of daily MAPE (%) aligned on 'date'.
    Zero-actual days are excluded to avoid division by zero.
    """
    merged = y_true_daily.merge(y_pred_daily, on="date", how="inner")
    mask = (merged["y"] != 0) & np.isfinite(merged["y"]) & np.isfinite(merged["yhat"])
    if not mask.any():
        return pd.Series([], dtype=float)
    return 100.0 * (merged.loc[mask, "y"] - merged.loc[mask, "yhat"]).abs() / merged.loc[mask, "y"]

# --- load actuals (test-only) ---
dense_test = safe_read_csv(ACTUALS_PATH, parse_dates=("ds",))
need_actual = {"unique_id","ds","y"}
if not need_actual.issubset(dense_test.columns):
    raise ValueError("test_actuals.csv must contain 'unique_id','ds','y'.")

# Define the test time window from the actuals file
test_start = dense_test["ds"].min()
test_end   = dense_test["ds"].max()
valid_ids  = set(dense_test["unique_id"].unique().tolist())
print(f"[INFO] Test frame: {test_start} → {test_end} | Series in test: {len(valid_ids)}")

# Build daily actuals (sum across all clients), already unique by date
daily_actual = (
    daily_agg_sum(dense_test, "y")
    .rename(columns={"y":"y"})
    .sort_values("date")
    .reset_index(drop=True)
)

# --- compute MAPE per seed and summarize (dedup + test-frame only) ---
detail_rows = []
summary_rows = []

for label in NHITS_LABELS:
    per_seed_means = []
    for seed in SEEDS:
        fpath = os.path.join(OUT_DIR, f"test_forecasts_{label}_seed{seed}.csv")
        if not os.path.exists(fpath):
            continue

        dfp = safe_read_csv(fpath, parse_dates=("ds",))
        need_pred = {"unique_id","ds","yhat"}
        if not need_pred.issubset(dfp.columns):
            raise ValueError(f"{fpath} must contain columns {need_pred}")

        # 1) Keep only series that exist in the test set
        dfp = dfp[dfp["unique_id"].isin(valid_ids)]

        # 2) Restrict to official test time frame
        dfp = limit_to_test_frame(dfp, test_start, test_end)

        # 3) Deduplicate hourly predictions per (unique_id, ds)
        dfp = dedup_hourly(dfp, "yhat")

        if dfp.empty:
            continue

        # Daily sums across all clients (duplicates removed upstream)
        daily_pred = (
            daily_agg_sum(dfp, "yhat")
            .rename(columns={"yhat":"yhat"})
            .sort_values("date")
            .reset_index(drop=True)
        )

        # daily MAPE series (aligned on 'date' and test-only)
        mape_series = compute_daily_mape(daily_actual, daily_pred)

        # Save aligned rows (date, y, yhat, MAPE) for detail
        if not mape_series.empty:
            merged = daily_actual.merge(daily_pred, on="date", how="inner")
            valid_idx = mape_series.index
            merged_valid = merged.iloc[valid_idx].copy()
            merged_valid = merged_valid.assign(model=label, seed=seed, MAPE=mape_series.values)[
                ["model","seed","date","y","yhat","MAPE"]
            ]
            detail_rows.append(merged_valid)
            per_seed_means.append(float(mape_series.mean()))

    # summarize across seeds for this model
    n = len(per_seed_means)
    if n == 0:
        summary_rows.append({"Model":label,"Seeds_Used":0,"Mean_MAPE":np.nan,"Margin_of_Error":np.nan,"CI_95":"NA"})
    else:
        arr = np.array(per_seed_means, dtype=float)
        mean_mape = float(np.mean(arr))
        if n > 1:
            sd = float(np.std(arr, ddof=1))
            se = sd / np.sqrt(n)
            from scipy.stats import t as _t
            tcrit = _t.ppf(0.975, df=n-1)
            margin = float(tcrit * se)
            ci_expr = f"{mean_mape:.2f} ± {margin:.2f}"
        else:
            margin = np.nan
            ci_expr = "NA"
        summary_rows.append({"Model":label,"Seeds_Used":n,"Mean_MAPE":mean_mape,"Margin_of_Error":margin,"CI_95":ci_expr})

# --- write outputs ---
if detail_rows:
    detail_df = pd.concat(detail_rows, ignore_index=True).sort_values(["model","seed","date"])
    detail_df.to_csv(DETAIL_OUT, index=False)
    print(f"[DETAIL:NHITS] Saved daily per-seed MAPE → {DETAIL_OUT}")
else:
    print("[DETAIL:NHITS] No overlapping daily forecasts found; detail file not written.")

summary_df = pd.DataFrame(summary_rows).sort_values("Model")
summary_df.to_csv(SUMMARY_OUT, index=False)
print("\n--- NHITS: Mean of per-seed daily MAPEs (±95% CI across seeds) ---")
print(summary_df.to_string(index=False))
print(f"[SUMMARY:NHITS] Saved → {SUMMARY_OUT}")

[INFO] Test frame: 2014-03-19 02:00:00 → 2015-01-01 00:00:00 | Series in test: 100
[DETAIL:NHITS] Saved daily per-seed MAPE → /content/drive/MyDrive/myproject/results_mdn_nhits/mape_daily_per_seed_nhits.csv

--- NHITS: Mean of per-seed daily MAPEs (±95% CI across seeds) ---
          Model  Seeds_Used  Mean_MAPE  Margin_of_Error       CI_95
nhits_gmm_nomdn           5   7.881752         4.868146 7.88 ± 4.87
  nhits_mae_mdn           5   2.698247         0.430410 2.70 ± 0.43
nhits_mae_nomdn           5   2.823359         0.547312 2.82 ± 0.55
nhits_pmm_nomdn           5   3.341523         0.744345 3.34 ± 0.74
[SUMMARY:NHITS] Saved → /content/drive/MyDrive/myproject/results_mdn_nhits/mape_summary_nhits.csv
