In [None]:
### Beta version: Let's listen to only the most accurate forecasters

# ──────────────────────────────────────────────────────────────
# Top-N forecaster ensemble • mean vs inverse-MSE weighting
# ──────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
import warnings
from tqdm.auto import tqdm
from scipy.stats import norm, binomtest

# Silence NumPy floating-point runtime warnings and every Python warning
# category for the rest of the session.
np.seterr(all="ignore")
warnings.filterwarnings("ignore")

# Trailing-window lengths (number of prior releases) and ensemble sizes to scan.
WINDOWS = [3, 6, 12]
TOP_NS = list(range(5, 30, 5))
# Added to each forecaster's MSE before inversion so the weight stays finite.
RIDGE = 1e-6

# Panels to evaluate; both frames must already exist in the session.
PANELS = {
    "Full panel": df_full,
    "COVID-filtered panel": df,
}

def _score_ensemble(scored: pd.DataFrame) -> dict:
    """Score one weighting scheme's realised forecasts against the crowd median.

    Parameters
    ----------
    scored : DataFrame with columns ``smart``, ``median``, ``actual``,
        ``pred_dir`` and ``actual_dir``; ``actual`` must be non-null.

    Returns
    -------
    dict with ``obs``, ``RMSE_smart``, ``RMSE_median``, ``HitRate``,
    ``Binom_p``, ``PT_stat``, ``PT_p`` and ``DM_p``.
    """
    obs = len(scored)
    smart_err = scored["smart"] - scored["actual"]
    median_err = scored["median"] - scored["actual"]

    # Normal-approximation test on the mean squared-error loss differential.
    # A zero-variance differential carries no evidence, so report p = 1.0
    # instead of dividing by zero (same convention as `_spec_metrics`).
    loss_diff = smart_err ** 2 - median_err ** 2
    spread = loss_diff.std(ddof=1)
    if spread == 0:
        dm_p = 1.0
    else:
        dm_p = 2 * (1 - norm.cdf(abs(loss_diff.mean() / spread * np.sqrt(obs))))

    pred = scored["pred_dir"].astype(int)
    real = scored["actual_dir"].astype(int)
    hits = int((pred == real).sum())

    # Independence test between predicted and realised direction; it is
    # undefined when either direction never varies.
    p1, p2 = pred.mean(), real.mean()
    denom = p1 * p2 * (1 - p1) * (1 - p2)
    if denom == 0:
        pt_stat, pt_p = np.nan, 1.0
    else:
        pt_stat = ((pred & real).mean() - p1 * p2) / np.sqrt(denom / obs)
        pt_p = 2 * (1 - norm.cdf(abs(pt_stat)))

    return {
        "obs": obs,
        "RMSE_smart": np.sqrt((smart_err ** 2).mean()),
        "RMSE_median": np.sqrt((median_err ** 2).mean()),
        "HitRate": hits / obs,
        "Binom_p": binomtest(hits, obs, 0.5).pvalue,
        "PT_stat": pt_stat,
        "PT_p": pt_p,
        "DM_p": dm_p,
    }


def evaluate_panel(panel: pd.DataFrame, label: str) -> pd.DataFrame:
    """Back-test top-N forecaster ensembles with equal and inverse-MSE weights.

    For every window length in ``WINDOWS`` and every size in ``TOP_NS`` the
    function walks the release dates, ranks economists by their mean squared
    ``error`` over the trailing window, combines the current forecasts of the
    best N, and compares the result with the median of all forecasts for that
    release.

    Each weighting scheme ("mean", "inv_mse") is scored separately, giving one
    output row per (window, top_N, method). Previously both schemes were
    pooled into a single row that was always labelled "inv_mse".

    Parameters
    ----------
    panel : long-format frame with columns ``release_date``, ``economist``,
        ``forecast``, ``error`` and ``actual``.
    label : text shown in the progress bar.

    Returns
    -------
    DataFrame with columns ``window``, ``top_N``, ``method``, ``obs``,
    ``RMSE_smart``, ``RMSE_median``, ``HitRate``, ``Binom_p``, ``PT_stat``,
    ``PT_p``, ``DM_p``. The columns are present even when no row qualified.
    """
    out_cols = ["window", "top_N", "method", "obs",
                "RMSE_smart", "RMSE_median",
                "HitRate", "Binom_p", "PT_stat", "PT_p", "DM_p"]
    dates = np.sort(panel["release_date"].unique())
    metrics = []

    for W in tqdm(WINDOWS, desc=f"{label}: windows"):
        for N in TOP_NS:
            rows = []

            for idx in range(W, len(dates)):
                t = dates[idx]
                # Estimation window: the W releases strictly before t.
                hist = panel[panel["release_date"]
                             .between(dates[idx - W], dates[idx - 1])]

                # Keep economists with no missing forecast inside the window.
                elig = hist.groupby("economist")["forecast"].apply(
                    lambda s: s.notna().all())
                econs = elig[elig].index
                if econs.empty:
                    continue

                mse = (hist[hist["economist"].isin(econs)]
                       .groupby("economist")["error"]
                       .apply(lambda s: np.mean(s ** 2)))
                top = mse.nsmallest(N).index          # up to N forecasters

                at_t = panel[panel["release_date"] == t]
                f_t = (at_t[at_t["economist"].isin(top)]
                       .set_index("economist")["forecast"].dropna())
                if f_t.empty:
                    continue

                inv = 1 / (mse.loc[f_t.index] + RIDGE)
                weights = {
                    "mean": pd.Series(1 / len(f_t), index=f_t.index),
                    "inv_mse": inv / inv.sum(),
                }

                median_all = at_t["forecast"].dropna().median()
                actual = at_t["actual"].iloc[0]
                actual_dir = (int(actual > median_all)
                              if pd.notna(actual) else np.nan)

                for method, w in weights.items():
                    smart = float(np.dot(w.to_numpy(), f_t.to_numpy()))
                    rows.append((W, len(f_t), method, t,
                                 smart, median_all, actual,
                                 int(smart > median_all), actual_dir))

            if not rows:
                continue

            df_all = pd.DataFrame(rows, columns=[
                "window", "n_used", "method", "date",
                "smart", "median", "actual", "pred_dir", "actual_dir"
            ])
            # Score realised months only; copy so nothing writes to a slice.
            eval_df = df_all.dropna(subset=["actual"]).copy()
            if eval_df.empty:
                continue

            # One metrics row per weighting scheme, labelled with its own name.
            for method, scored in eval_df.groupby("method", sort=False):
                metrics.append({"window": W, "top_N": N, "method": method,
                                **_score_ensemble(scored)})

    return pd.DataFrame(metrics, columns=out_cols)

# Print floats with three decimals in every table below.
pd.set_option("display.float_format", "{:.3f}".format)

# Evaluate each panel and print its metrics ordered by window, size and method.
for name, pdf in PANELS.items():
    tbl = evaluate_panel(pdf, name)
    tbl = tbl.sort_values(["window", "top_N", "method"])
    tbl = tbl.reset_index(drop=True)
    print(f"\n--- {name} : Top-N ensemble (mean vs inv_mse) ---")
    print(tbl.to_string(index=False))

### Beta version: Let's listen only to forecasters that reacted to the ADP print

# ──────────────────────────────────────────────────────────────
# “Fresh-update” ensemble:
# • keep only forecasts with |asof − release_date| ≤ 3 days
# • smart forecasts:   mean  &  median   of those “fresh” forecasts
# • three directional signals:
#       pred_dir_mean   (mean > crowd median)
#       pred_dir_med    (median > crowd median)
#       pred_dir_vote   (majority of fresh forecasters above crowd median)
# • evaluation on realised months
# ──────────────────────────────────────────────────────────────
import numpy as np, pandas as pd, warnings
from tqdm.auto import tqdm
from scipy.stats import norm, binomtest

# Silence NumPy runtime warnings and all Python warnings for this cell.
np.seterr(all="ignore")
warnings.filterwarnings("ignore")

# Panels to evaluate; both frames are assumed to be in memory already.
PANELS = {}
PANELS["Full panel"] = df_full
PANELS["COVID-filtered panel"] = df

def evaluate_fresh(panel: pd.DataFrame, label: str) -> pd.DataFrame:
    """Score an ensemble built only from forecasts updated near the release.

    A forecast is "fresh" when ``|release_date - asof|`` is at most 3 days.
    For each release the mean and median of the fresh forecasts are compared
    with the median of all forecasts (the crowd median), and three directional
    calls are recorded: mean above crowd, median above crowd, and a strict
    majority of fresh forecasters above crowd.

    Fresh rows without a forecast value are ignored, so they neither vote nor
    make a month qualify on their own.

    Parameters
    ----------
    panel : long-format frame with columns ``release_date``, ``asof``,
        ``forecast`` and ``actual``.
    label : text shown in the progress bar.

    Returns
    -------
    One-row DataFrame of RMSEs, loss-differential p-values and directional
    statistics, or an empty DataFrame when no realised month qualified.
    """
    # ±3-day mask (absolute difference ≤ 3 days); work on a copy so the
    # caller's frame does not gain helper columns.
    panel = panel.copy()
    panel["asof_delta"] = (panel["release_date"] - panel["asof"]).abs().dt.days
    panel["is_fresh"] = panel["asof_delta"] <= 3

    rows = []
    for t in tqdm(np.sort(panel["release_date"].unique()), desc=label):
        month_all = panel[panel["release_date"] == t]
        fresh_fc = month_all.loc[month_all["is_fresh"], "forecast"].dropna()
        if fresh_fc.empty:           # no qualifying forecasts
            continue

        crowd_med = month_all["forecast"].dropna().median()
        smart_mean = fresh_fc.mean()
        smart_med = fresh_fc.median()

        # Strict majority of fresh forecasters above the crowd median.
        pred_dir_vote = int((fresh_fc > crowd_med).mean() > 0.5)

        rows.append((t, smart_mean, smart_med, crowd_med,
                     month_all["actual"].iloc[0],
                     int(smart_mean > crowd_med),
                     int(smart_med > crowd_med),
                     pred_dir_vote))

    cols = ["date", "smart_mean", "smart_med", "crowd_median", "actual",
            "dir_mean", "dir_med", "dir_vote"]
    # Copy after filtering so nothing is written to a slice.
    scored = pd.DataFrame(rows, columns=cols).dropna(subset=["actual"]).copy()
    if scored.empty:
        return pd.DataFrame()       # nothing to score

    n_obs = len(scored)
    err_mean = scored["smart_mean"] - scored["actual"]
    err_med = scored["smart_med"] - scored["actual"]
    err_crowd = scored["crowd_median"] - scored["actual"]
    actual_dir = (scored["actual"] > scored["crowd_median"]).astype(int)

    def _rmse(err):
        return np.sqrt((err ** 2).mean())

    def _dm_pvalue(err):
        # Normal-approximation test of the squared-error differential against
        # the crowd median. A zero-variance differential carries no evidence,
        # so report 1.0 (same convention as `_spec_metrics`).
        loss_diff = err ** 2 - err_crowd ** 2
        spread = loss_diff.std(ddof=1)
        if spread == 0:
            return 1.0
        return 2 * (1 - norm.cdf(abs(loss_diff.mean() / spread * np.sqrt(n_obs))))

    def _dir_metrics(flag_col):
        pred = scored[flag_col].astype(int)
        hits = int((pred == actual_dir).sum())
        binom_p = binomtest(hits, n_obs, 0.5).pvalue
        p1, p2 = pred.mean(), actual_dir.mean()
        denom = p1 * p2 * (1 - p1) * (1 - p2)
        if denom == 0:
            # One of the directions never varies: the test is undefined.
            pt_stat, pt_p = np.nan, 1.0
        else:
            pt_stat = ((pred & actual_dir).mean() - p1 * p2) / np.sqrt(denom / n_obs)
            pt_p = 2 * (1 - norm.cdf(abs(pt_stat)))
        return hits / n_obs, binom_p, pt_stat, pt_p

    hit_mn, bin_mn, _, ptp_mn = _dir_metrics("dir_mean")
    hit_md, bin_md, _, ptp_md = _dir_metrics("dir_med")
    hit_vt, bin_vt, _, ptp_vt = _dir_metrics("dir_vote")

    return pd.DataFrame([{
        "obs": n_obs,
        "RMSE_mean": _rmse(err_mean),
        "RMSE_medianSmart": _rmse(err_med),
        "RMSE_crowdMedian": _rmse(err_crowd),
        "DM_p_mean": _dm_pvalue(err_mean),
        "DM_p_medianSmart": _dm_pvalue(err_med),
        # directional metrics
        "HitRate_mean": hit_mn, "Binom_p_mean": bin_mn, "PT_p_mean": ptp_mn,
        "HitRate_med": hit_md, "Binom_p_med": bin_md, "PT_p_med": ptp_md,
        "HitRate_vote": hit_vt, "Binom_p_vote": bin_vt, "PT_p_vote": ptp_vt,
    }])

# ---------- run & display ----------
# Three-decimal floats in the printed tables.
pd.set_option("display.float_format", "{:.3f}".format)

# Score each panel; print its one-row table or a notice when nothing qualified.
for name, pdf in PANELS.items():
    tbl = evaluate_fresh(pdf, name)
    print(f"\n--- {name} : 3-day fresh-update ensemble ---")
    print("No months qualified (no fresh forecasts found)."
          if tbl.empty else tbl.to_string(index=False))


## 0: Baseline static forecast on full sample

Rolling 6-month fixed window. For every valid economist (one who, for a prediction at time *t*, has a contiguous forecast history covering the previous 6 releases), weight their prediction by inverse MSE.


This implements an out-of-sample error estimate with a rolling 6-month estimation window. Weights don't use information from the target month and actual value at month *t* is unseen. In other words, all errors are "live" errors that could have been observed in real time.

Briefly, the procedure:
1. Start at the 7th release (so that six prior releases are available)
2. From the estimation window, keep economists that supplied a forecast for all six months (per the contiguity rule)
3. Compute the MSE for each economist using errors against already known actuals (no lookahead)
4. Generate the forecast for release *t*
5. Store the OOS evaluation error
6. Roll the window forward one month and repeat

In [1]:
# old smart code that does ensemble averaging rather than majority vote

In [None]:
# # in-sample, done via smart forecast averaging 

# # -------------------------------------------------------------
# # Majority‑vote ensemble search  (k = 3, 5)
# #   • choose best combo on  FULL, trailing‑24 m, trailing‑12 m
# #   • show full stratified directional diagnostics
# #   • finish with a summary table + majority “verdict”
# # -------------------------------------------------------------

# # ──────────────────────  SETTINGS  ────────────────────────────
# TODAY   = pd.Timestamp.today().normalize()
# WINDOWS = {
#     "FULL" : None,
#     "T24M" : TODAY - pd.DateOffset(months=24),
#     "T12M" : TODAY - pd.DateOffset(months=12),
#     "T6M" : TODAY - pd.DateOffset(months=6),
# }

# # REGIMES was defined earlier; re‑use the same dict.
# # If not present in your notebook, paste the REGIMES definition here.

# # ─────────── 1) candidate pool (unchanged)  ───────────────────
# pool = set()
# for model_id, panel_map in eval_tables.items():
#     for panel in ["COVID", "Full"]:
#         df = panel_map.get(panel)
#         if df is None or df.empty:
#             continue
#         pool.add(df.loc[df["RMSE_smart"].idxmin(), "spec_id"])  # lowest RMSE
#         pool.add(df.loc[df["HitRate"].idxmax(),    "spec_id"])  # highest HR
#         rob = df[(df["DM_p"] < .10) & (df["PT_p"] < .10)]
#         if not rob.empty:
#             pool.add(rob.loc[rob["RMSE_smart"].idxmin(), "spec_id"])

# pool = sorted(pool)
# print(f"Total unique candidate specs selected: {len(pool)}")

# # ─────────── 2) collect FULL‑panel OOS frames  ────────────────
# oos_full = {}
# for mdl, panel_map in oos_maps.items():
#     oos_full.update(panel_map.get("Full", {}))

# pool = [s for s in pool if s in oos_full]
# assert len(pool) >= 3, "Need ≥3 viable specs with Full‑panel OOS"
# print(f"Usable specs with Full‑panel OOS: {len(pool)}\n")

# # ─────────── 3) helpers  ──────────────────────────────────────
# def merged_oos(combo):
#     """Return merged DataFrame (ens, median, actual) for entire history."""
#     k   = len(combo)
#     dfs = []
#     for i, sid in enumerate(combo):
#         tmp = (oos_full[sid][["date", "smart", "median", "actual"]]
#                .rename(columns={"smart":  f"smart_{i}",
#                                 "median": f"median_{i}",
#                                 "actual": f"actual_{i}"}))
#         dfs.append(tmp)
#     df = dfs[0]
#     for d in dfs[1:]:
#         df = df.merge(d, on="date")

#     df["median"] = df["median_0"]
#     df["actual"] = df["actual_0"]
#     df["ens"]    = df[[f"smart_{i}" for i in range(k)]].mean(axis=1)
#     return df[["date", "ens", "median", "actual"]]

# def dir_metrics(df):
#     """Directional hit‑rate + Binom/PT p‑values on realised part of df."""
#     realised = df.dropna(subset=["actual"]).copy()
#     realised["pred_dir"]   = (realised["ens"]    > realised["median"]).astype(int)
#     realised["actual_dir"] = (realised["actual"] > realised["median"]).astype(int)

#     hits = (realised["pred_dir"] == realised["actual_dir"]).sum()
#     n    = len(realised)
#     if n == 0:
#         return np.nan, n, np.nan, np.nan
#     hr   = hits / n
#     binom_p = stats.binomtest(hits, n, .5).pvalue
#     p1, p2  = realised["pred_dir"].mean(), realised["actual_dir"].mean()
#     c_joint = (realised["pred_dir"] & realised["actual_dir"]).mean()
#     pt_p = 2 * (1 - stats.norm.cdf(abs((c_joint - p1*p2) /
#                                        np.sqrt(p1*p2*(1-p1)*(1-p2)/n))))
#     return hr, n, binom_p, pt_p

# def stratified_table(df):
#     rows = []
#     for lbl, (start, end) in REGIMES.items():
#         sub = df[(df["date"] >= start) & (df["date"] <= end)]
#         hr, n, bp, pt = dir_metrics(sub)
#         if np.isnan(hr):
#             continue
#         rows.append({"Regime": lbl, "Obs": n,
#                      "HitRate": hr, "Binom_p": bp, "PT_p": pt})
#     return pd.DataFrame(rows)

# # ─────────── 4) exhaustive search for k = 3 and 5  ────────────
# summary_rows  = []   # for final table
# verdict_votes = []   # collect live “Beat/Miss” signals

# for k in (3, 5):
#     if len(pool) < k:
#         print(f"Skipping k={k}: pool too small\n")
#         continue

#     combos_k = list(itertools.combinations(pool, k))

#     for win_lbl, start_date in WINDOWS.items():
#         best = {"combo": None, "hr": -np.inf, "n": None,
#                 "binom": np.nan, "pt": np.nan,
#                 "median_live": np.nan, "signal": "n/a",
#                 "df_full": None}

#         for combo in combos_k:
#             df_all = merged_oos(combo)
#             # window‑restricted view for choosing winner
#             df_eval = df_all if start_date is None else df_all[df_all["date"] >= start_date]
#             hr, n, bp, pt = dir_metrics(df_eval)
#             if hr > best["hr"]:
#                 # live month info
#                 unreleased = df_all[df_all["actual"].isna()].sort_values("date")
#                 if not unreleased.empty:
#                     last = unreleased.iloc[-1]
#                     signal = "Beat" if last["ens"] > last["median"] else "Miss"
#                     median_live = last["median"]
#                 else:
#                     signal, median_live = "n/a", np.nan

#                 best.update({"combo": combo, "hr": hr, "n": n,
#                              "binom": bp, "pt": pt,
#                              "median_live": median_live, "signal": signal,
#                              "df_full": df_all})

#         # ── print detailed results ───────────────────────────
#         print(f"[{win_lbl}]  Best ensemble  k={k}")
#         print(f"  Specs        : {best['combo']}")
#         print(f"  HitRate      : {best['hr']:.2%}  over {best['n']} months")
#         print(f"  Binom p‑val  : {best['binom']:.3f}   PT p‑val : {best['pt']:.3f}")
#         if best["signal"] != "n/a":
#             print(f"  Consensus med: {best['median_live']:.0f} k")
#         print(f"  Live signal  : {best['signal']}")

#         # stratified across FULL history (not limited to window)
#         st = stratified_table(best["df_full"])
#         if st.empty:
#             print("  Stratified: no realised data yet.\n")
#         else:
#             print("  Stratified performance:")
#             print(st.to_string(index=False, float_format=lambda x: f"{x:0.3f}"))
#             print()

#         # gather for summary / verdict
#         summary_rows.append({
#             "Window": win_lbl, "k": k,
#             "Specs": best["combo"],
#             "HitRate": best["hr"], "Obs": best["n"],
#             "Binom_p": best["binom"], "PT_p": best["pt"],
#             "LiveSignal": best["signal"]
#         })
#         verdict_votes.append(best["signal"])

# # ─────────── 5) summary table & final verdict  ────────────────
# summary_df = (pd.DataFrame(summary_rows)
#               .sort_values(["Window", "k"])
#               .reset_index(drop=True))
# pd.set_option("display.float_format", "{:.3f}".format)
# print("\n================  ENSEMBLE SUMMARY  ================\n")
# print(summary_df.to_string(index=False))

# # majority vote across the six “best” ensembles (ties → ‘No consensus’)
# valid_votes = [v for v in verdict_votes if v in ("Beat", "Miss")]
# if valid_votes:
#     beat_count = valid_votes.count("Beat")
#     miss_count = valid_votes.count("Miss")
#     if beat_count > miss_count:
#         verdict = "Beat"
#     elif miss_count > beat_count:
#         verdict = "Miss"
#     else:
#         verdict = "No consensus"
# else:
#     verdict = "No live signal available"

# print(f"\n>>> FINAL VERDICT (majority across ensembles):  {verdict}\n")

Old ensemble dynamic backtest

In [None]:
# --- PATCH: ensure hit is integer, then rebuild the summaries ---
# Builds one summary row per method from `records`, plus a per-regime table
# for every method that has realised predictions.
summary_rows, strat_tables = [], {}

for tag, recs in records.items():
    dfp = pd.DataFrame(recs)
    if "hit" in dfp.columns:
        # cast "hit" to int so binomtest gets integers
        dfp = (dfp.dropna(subset=["hit"])
                  .assign(hit       = lambda d: d["hit"].astype(int),
                          pred_dir  = lambda d: d["pred_dir"].astype(int),
                          actual_dir= lambda d: d["actual_dir"].astype(int)))
    else:
        # No record carries a realised outcome (or there are no records at
        # all): dropna(subset=["hit"]) would raise KeyError, so fall through
        # with an empty frame and report NaN metrics for this method.
        dfp = dfp.iloc[0:0]

    ov_hr = dfp["hit"].mean() if not dfp.empty else math.nan
    strat = _stratified(dfp) if not dfp.empty else pd.DataFrame()
    ac    = _ac_score(ov_hr, strat["HitRate"], LAMBDA_AC) if not strat.empty else math.nan
    if not strat.empty:
        strat_tables[tag] = strat

    summary_rows.append(
        dict(Method=tag, Obs=len(dfp), HitRate=ov_hr, **{f"ACλ={LAMBDA_AC}": ac})
    )

# ---------- (same printing / plotting block as before) ----------
pd.set_option("display.float_format", "{:.3f}".format)

print("\n=== Back‑test summary (2017‑‑) ===")
print(pd.DataFrame(summary_rows).to_string(index=False))

for tag, tbl in strat_tables.items():
    print(f"\n--- Stratified diagnostics for {tag} ---")
    print(tbl.to_string(index=False, float_format=lambda x: f"{x:0.3f}"))
    # Look up the A×C score stored on this method's summary row.
    ac_val = next(r[f"ACλ={LAMBDA_AC}"] for r in summary_rows if r["Method"] == tag)
    print(f"Accuracy × Consistency (λ = {LAMBDA_AC}):  {ac_val:0.3f}\n")

print("\n==============  Summary Table  ==============")
print(pd.DataFrame(summary_rows)
        .set_index("Method")
        .to_string(float_format=lambda x: f"{x:0.3f}"))


In [None]:
# --- evolution plot -------------------------------------------------
# One line per method: hit-rate of the selected winner at each release date.
plt.figure(figsize=(8, 4))
for tag, pts in sel_hr_ts.items():
    if pts:                   # methods with no recorded points are skipped
        dates, h = zip(*pts)
        plt.plot(dates, h, label=tag)
plt.title("Rolling hit‑rate of winning MV ensemble")
plt.xlabel("Release date")
plt.ylabel("Hit‑rate")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# ==============================================================
#  Dynamic majority‑vote ensembles   (EVAL_START = 2010‑01 → present)
#    • trailing‑window winners  T3 / T6 / T12 / T24
#    • Best‑to‑Date (BTD) single‑spec benchmark
#    • Accuracy × Consistency score per method (λ = 1.0)
#    • Stratified diagnostics: 2010‑14 / 2015‑19 / 2020‑22 / 2023‑‑
#    • Evolution plot of winner hit‑rate (five lines)
# ==============================================================

import itertools, math, warnings, matplotlib.pyplot as plt
from collections import defaultdict
from typing import List, Tuple, Dict, Any

import numpy as np
import pandas as pd
from scipy import stats
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

# ---------- configuration ------------------------------------
# First release date that is evaluated.
EVAL_START = pd.Timestamp(year=2010, month=1, day=1)
# Trailing-window length in months -> method tag.
WINDOWS = {months: f"T{months}" for months in (3, 6, 12, 24)}
# Ensemble sizes considered by the combination search.
K_SET = (3, 5)
# p-value threshold for a "robust" candidate spec.
ALPHA = 0.10
# Weight on the dispersion term of the A×C score.
LAMBDA_AC = 1.0

# Regime label -> (first day, last day), both inclusive; the last regime is
# open-ended and closes at the moment this cell runs.
REGIMES = {
    "Post-GFC (2010-14)": (
        pd.Timestamp(year=2010, month=1, day=1),
        pd.Timestamp(year=2014, month=12, day=31),
    ),
    "Late expansion (2015-2019)": (
        pd.Timestamp(year=2015, month=1, day=1),
        pd.Timestamp(year=2019, month=12, day=31),
    ),
    "COVID (2020‑22)": (
        pd.Timestamp(year=2020, month=1, day=1),
        pd.Timestamp(year=2022, month=12, day=31),
    ),
    "Post‑COVID (2023‑‑)": (
        pd.Timestamp(year=2023, month=1, day=1),
        pd.Timestamp.today(),
    ),
}

# ---------- helper functions ---------------------------------
def _spec_metrics(oos: pd.DataFrame) -> Dict[str, Any]:
    """Summarise one spec's realised out-of-sample record.

    Rows without an ``actual`` are dropped. On what remains the function
    reports the observation count, the RMSE of ``smart``, the share of rows
    where ``smart`` and ``actual`` fall on the same side of ``median``, and
    two-sided normal p-values for the squared-error differential (``dm_p``)
    and for directional independence (``pt_p``). Each p-value is 1.0 when
    its denominator is exactly zero.
    """
    realised = oos.dropna(subset=["actual"])
    if realised.empty:
        return dict(obs=0, rmse=np.nan, hr=np.nan, dm_p=np.nan, pt_p=np.nan)

    smart_err = realised["smart"] - realised["actual"]
    median_err = realised["median"] - realised["actual"]
    pred_dir = (realised["smart"] > realised["median"]).astype(int)
    actual_dir = (realised["actual"] > realised["median"]).astype(int)

    obs = len(realised)
    rmse = np.sqrt((smart_err ** 2).mean())
    hr = (pred_dir == actual_dir).mean()

    # Loss differential versus the crowd median.
    loss_diff = smart_err ** 2 - median_err ** 2
    if loss_diff.std(ddof=1) == 0:
        dm_p = 1.0
    else:
        z_dm = loss_diff.mean() / loss_diff.std(ddof=1) * np.sqrt(obs)
        dm_p = 2 * (1 - stats.norm.cdf(abs(z_dm)))

    # Directional independence between predicted and realised direction.
    p1, p2 = pred_dir.mean(), actual_dir.mean()
    denom = p1 * p2 * (1 - p1) * (1 - p2)
    if denom == 0:
        pt_p = 1.0
    else:
        joint = (pred_dir & actual_dir).mean()
        z_pt = (joint - p1 * p2) / np.sqrt(denom / obs)
        pt_p = 2 * (1 - stats.norm.cdf(abs(z_pt)))

    return dict(obs=obs, rmse=rmse, hr=hr, dm_p=dm_p, pt_p=pt_p)


def _candidate_pool(stats_df: pd.DataFrame) -> List[str]:
    """Pick candidate spec ids from each (model, panel) group.

    From every group (restricted to specs with at least one observation) up
    to three specs are taken: the lowest-RMSE spec, the highest hit-rate
    spec, and the lowest-RMSE spec among those whose ``dm_p`` and ``pt_p``
    are both below ``ALPHA``. The union is returned sorted.
    """
    picks = set()
    for _key, group in stats_df.groupby(["model", "panel"]):
        scored = group[group["obs"] > 0]
        if scored.empty:
            continue

        lowest_rmse = scored["rmse"].idxmin()
        highest_hr = scored["hr"].idxmax()
        picks.add(scored.loc[lowest_rmse, "spec_id"])
        picks.add(scored.loc[highest_hr, "spec_id"])

        passes_both = (scored["dm_p"] < ALPHA) & (scored["pt_p"] < ALPHA)
        robust = scored[passes_both]
        if not robust.empty:
            picks.add(robust.loc[robust["rmse"].idxmin(), "spec_id"])
    return sorted(picks)


# Merged out-of-sample frames keyed by the tuple of spec ids.
_combo_cache: Dict[Tuple[str, ...], pd.DataFrame] = {}


def _merged_oos_cached(combo: Tuple[str, ...]) -> pd.DataFrame:
    """Return ``merged_oos(combo)``, computing it only on the first request.

    ``merged_oos`` is not defined in this cell and must exist in the session.
    """
    try:
        return _combo_cache[combo]
    except KeyError:
        merged = merged_oos(combo)
        _combo_cache[combo] = merged
        return merged


def _mv_hitrate(df: pd.DataFrame) -> float:
    """Hit-rate of a strict-majority vote across the ``smart_*`` columns.

    A row predicts "up" when more than half of its ``smart_*`` values exceed
    ``median``; the prediction is a hit when it matches ``actual > median``.
    Returns NaN for an empty frame.
    """
    if df.empty:
        return np.nan
    member_cols = [name for name in df.columns if name.startswith("smart_")]
    above = df[member_cols].gt(df["median"], axis=0)
    majority_up = (above.sum(axis=1) > len(member_cols) / 2).astype(int)
    realised_up = (df["actual"] > df["median"]).astype(int)
    return (majority_up == realised_up).mean()


def _combo_hitrate(combo, t_cut, window):
    """Majority-vote hit-rate of ``combo`` over the window ending before ``t_cut``.

    The window covers dates in ``[t_cut - window months, t_cut)`` and only
    rows with a realised ``actual``. Returns ``(hit_rate, n_rows)``.
    """
    merged = _merged_oos_cached(combo)
    window_start = t_cut - pd.DateOffset(months=window)
    in_window = (merged["date"] >= window_start) & (merged["date"] < t_cut)
    realised = merged[in_window & merged["actual"].notna()]
    return _mv_hitrate(realised), len(realised)


def _window_rmse(combo, t_cut, window):
    """RMSE of the ``ens`` column over the window ending before ``t_cut``.

    Uses the same window and realised-row filter as ``_combo_hitrate``;
    returns ``np.inf`` when no row qualifies.
    """
    merged = _merged_oos_cached(combo)
    window_start = t_cut - pd.DateOffset(months=window)
    in_window = (merged["date"] >= window_start) & (merged["date"] < t_cut)
    realised = merged[in_window & merged["actual"].notna()]
    if realised.empty:
        return np.inf
    sq_err = (realised["ens"] - realised["actual"]) ** 2
    return np.sqrt(sq_err.mean())


def _choose_best_combo(pool, t_cut, window):
    """Search all combinations of size ``K_SET`` for the best trailing hit-rate.

    Ties on hit-rate are broken by lower window RMSE, then by the smaller
    combo tuple. Returns ``(combo, hit_rate)``; ``combo`` is ``None`` (with
    hit-rate -1.0) when no combination has a defined hit-rate.
    """
    winner, top_hr, top_rmse = None, -1.0, np.inf
    for size in K_SET:
        if len(pool) < size:
            continue
        for candidate in itertools.combinations(pool, size):
            cand_hr = _combo_hitrate(candidate, t_cut, window)[0]
            # Undefined or strictly worse hit-rates can never win.
            if math.isnan(cand_hr) or cand_hr < top_hr:
                continue

            cand_rmse = _window_rmse(candidate, t_cut, window)
            if cand_hr > top_hr:
                winner, top_hr, top_rmse = candidate, cand_hr, cand_rmse
                continue

            # Equal hit-rate: prefer lower RMSE, then the smaller tuple.
            if cand_rmse < top_rmse or (cand_rmse == top_rmse and candidate < winner):
                winner, top_rmse = candidate, cand_rmse
    return winner, top_hr


def _ensemble_direction_mv(combo, t_date):
    """Majority-vote direction (1 = above median) of ``combo`` at ``t_date``.

    Reads the first merged row dated ``t_date`` and returns 1 when more than
    half of its ``smart_*`` values exceed ``median``, otherwise 0.
    """
    merged = _merged_oos_cached(combo)
    record = merged[merged["date"] == t_date].iloc[0]
    above = [record[col] > record["median"]
             for col in record.index if col.startswith("smart_")]
    return int(sum(above) > len(above) / 2)


def _actual_direction(any_spec_df, t_date):
    """Return 1 when ``actual`` exceeds ``median`` on the first row dated ``t_date``, else 0."""
    on_date = any_spec_df["date"] == t_date
    record = any_spec_df.loc[on_date].iloc[0]
    beat = record["actual"] > record["median"]
    return int(beat)


def _stratified(df_preds: pd.DataFrame) -> pd.DataFrame:
    """Directional diagnostics of ``df_preds`` split by the ``REGIMES`` date ranges.

    For each regime with at least one row the table reports the row count,
    the mean of ``hit``, a two-sided binomial p-value against 0.5 and a
    directional-independence p-value (1.0 when its denominator is zero).
    Expects columns ``date``, ``hit``, ``pred_dir`` and ``actual_dir``.
    """
    table = []
    for regime, (first_day, last_day) in REGIMES.items():
        in_regime = (df_preds["date"] >= first_day) & (df_preds["date"] <= last_day)
        chunk = df_preds[in_regime]
        if chunk.empty:
            continue

        n = len(chunk)
        hit_rate = chunk["hit"].mean()
        binom_p = stats.binomtest(chunk["hit"].sum(), n, .5).pvalue

        p1, p2 = chunk["pred_dir"].mean(), chunk["actual_dir"].mean()
        denom = p1 * p2 * (1 - p1) * (1 - p2)
        if denom == 0:
            pt_p = 1.0
        else:
            joint = (chunk["pred_dir"] & chunk["actual_dir"]).mean()
            z_pt = (joint - p1 * p2) / math.sqrt(denom / n)
            pt_p = 2 * (1 - stats.norm.cdf(abs(z_pt)))

        table.append({"Regime": regime, "Obs": n, "HitRate": hit_rate,
                      "Binom_p": binom_p, "PT_p": pt_p})
    return pd.DataFrame(table)


def _ac_score(overall_hr, reg_hits, lam=LAMBDA_AC):
    """Accuracy × Consistency score: miss-rate plus ``lam`` times regime dispersion.

    Returns NaN when fewer than two regime hit-rates are supplied or when
    ``overall_hr`` is NaN. The dispersion is the sample standard deviation
    (ddof=1) of ``reg_hits``.
    """
    scorable = len(reg_hits) >= 2 and not math.isnan(overall_hr)
    if not scorable:
        return np.nan
    miss_rate = 1 - overall_hr
    dispersion = np.std(reg_hits, ddof=1)
    return miss_rate + lam * dispersion

# -------------------------------------------------------------
# 1. timeline + warm‑up discovery (skip dates before EVAL_START)
# -------------------------------------------------------------
# Union of every date found in any spec's out-of-sample frame, ascending.
all_dates = sorted(set(
    pd.to_datetime(d)
    for mdl in oos_maps.values()
    for pnl in mdl.values()
    for o in pnl.values()
    for d in o["date"].unique()
))

# Count the leading dates that fall before EVAL_START.
warmup_idx = 0
with tqdm(total=len(all_dates), desc="Finding warm‑up start") as bar:
    for stamp in all_dates:
        if not stamp < EVAL_START:
            break
        warmup_idx += 1
        bar.update(1)

# -------------------------------------------------------------
# 2. rolling evaluation
# -------------------------------------------------------------
# For every date t: rebuild per-spec metrics from data strictly before t,
# select the best majority-vote combo per trailing window plus the
# Best-to-Date single spec, then record each selection's direction at t.
records   = {tag: [] for tag in WINDOWS.values()}
records["BTD"] = []
sel_hr_ts = {tag: [] for tag in list(WINDOWS.values()) + ["BTD"]}

# helpers for BTD
cum_hits: Dict[str, int] = defaultdict(int)
cum_obs : Dict[str, int] = defaultdict(int)

dates_iter = tqdm(all_dates[warmup_idx:], desc="Rolling evaluation", unit="month")
# One Full-panel spec frame, used only to look up `actual` and `median` at t.
sample_oos_any = next(iter(oos_maps[next(iter(oos_maps))]["Full"].values()))

for t in dates_iter:
    if t < EVAL_START:
        continue

    # --- per‑spec metrics through t‑1 -----------------------
    rows_stats = []
    for m_id, pnl_map in oos_maps.items():
        for p_name, spec_dict in pnl_map.items():
            for sid, oos in spec_dict.items():
                met = _spec_metrics(oos[oos["date"] < t])
                met.update(model=m_id, panel=p_name, spec_id=sid)
                rows_stats.append(met)
    stats_df = pd.DataFrame(rows_stats)
    # Wait until at least one spec has 12 realised observations.
    if stats_df["obs"].max() < 12:
        continue

    # --- candidate pool -------------------------------------
    pool_t = _candidate_pool(stats_df)
    if len(pool_t) < 3:
        continue

    # --- choose MV winners per window -----------------------
    best_combo: Dict[str, Tuple[str, ...]] = {}
    win_hr     : Dict[str, float]          = {}

    for W in WINDOWS:
        combo, hr = _choose_best_combo(pool_t, t, W)
        if combo:
            tag = WINDOWS[W]
            best_combo[tag] = combo
            win_hr[tag]     = hr

    # --- BTD single‑spec ------------------------------------
    for row in stats_df.itertuples():
        # Specs with no realised history have a NaN hit-rate: int(NaN) raises
        # and a zero count would divide by zero below, so skip them.
        if row.obs <= 0 or pd.isna(row.hr):
            continue
        # round(): hr * obs can land just below the true integer hit count.
        cum_hits[row.spec_id] = int(round(row.hr * row.obs))   # total hits to date
        cum_obs [row.spec_id] = int(row.obs)                   # total obs to date
    scored_sids = [s for s, n in cum_obs.items() if n > 0]
    if scored_sids:
        best_sid = max(scored_sids, key=lambda s: cum_hits[s] / cum_obs[s])
        best_combo["BTD"] = (best_sid,)
        win_hr["BTD"]     = cum_hits[best_sid] / cum_obs[best_sid]

    if not best_combo:
        continue

    # --- actual known? --------------------------------------
    # A date missing from the sample frame is treated as "not yet realised"
    # instead of raising IndexError.
    actual_at_t = sample_oos_any.loc[sample_oos_any["date"] == t, "actual"]
    actual_known = (not actual_at_t.empty) and pd.notna(actual_at_t.iloc[0])

    # --- record predictions & hit‑rate evolution ------------
    for tag, combo in best_combo.items():
        pred_dir = _ensemble_direction_mv(combo, t)  # works for 1- or many‑member
        rec = dict(date=t, pred_dir=pred_dir, sel_hr=win_hr[tag])
        if actual_known:
            rec["actual_dir"] = _actual_direction(sample_oos_any, t)
            rec["hit"]        = int(rec["pred_dir"] == rec["actual_dir"])
        records[tag].append(rec)
        sel_hr_ts[tag].append((t, win_hr[tag]))

# -------------------------------------------------------------
# 3. summaries, stratified diagnostics, AC score
# -------------------------------------------------------------

# --- PATCH: ensure hit is integer, then rebuild the summaries ---
# Builds one summary row per method from `records`, plus a per-regime table
# for every method that has realised predictions.
summary_rows, strat_tables = [], {}

for tag, recs in records.items():
    dfp = pd.DataFrame(recs)
    if "hit" in dfp.columns:
        # cast "hit" to int so binomtest gets integers
        dfp = (dfp.dropna(subset=["hit"])
                  .assign(hit       = lambda d: d["hit"].astype(int),
                          pred_dir  = lambda d: d["pred_dir"].astype(int),
                          actual_dir= lambda d: d["actual_dir"].astype(int)))
    else:
        # No record carries a realised outcome (or there are no records at
        # all): dropna(subset=["hit"]) would raise KeyError, so fall through
        # with an empty frame and report NaN metrics for this method.
        dfp = dfp.iloc[0:0]

    ov_hr = dfp["hit"].mean() if not dfp.empty else math.nan
    strat = _stratified(dfp) if not dfp.empty else pd.DataFrame()
    ac    = _ac_score(ov_hr, strat["HitRate"], LAMBDA_AC) if not strat.empty else math.nan
    if not strat.empty:
        strat_tables[tag] = strat

    summary_rows.append(
        dict(Method=tag, Obs=len(dfp), HitRate=ov_hr, **{f"ACλ={LAMBDA_AC}": ac})
    )

# ---------- (same printing / plotting block as before) ----------
pd.set_option("display.float_format", "{:.3f}".format)

# The header reports the configured evaluation start; the old text was a
# hard-coded "2017".
print(f"\n=== Back‑test summary ({EVAL_START:%Y}‑‑) ===")
print(pd.DataFrame(summary_rows).to_string(index=False))

for tag, tbl in strat_tables.items():
    print(f"\n--- Stratified diagnostics for {tag} ---")
    print(tbl.to_string(index=False, float_format=lambda x: f"{x:0.3f}"))
    # Look up the A×C score stored on this method's summary row.
    ac_val = next(r[f"ACλ={LAMBDA_AC}"] for r in summary_rows if r["Method"] == tag)
    print(f"Accuracy × Consistency (λ = {LAMBDA_AC}):  {ac_val:0.3f}\n")

print("\n==============  Summary Table  ==============")
print(pd.DataFrame(summary_rows)
        .set_index("Method")
        .to_string(float_format=lambda x: f"{x:0.3f}"))


In [None]:
import itertools, math
from collections import defaultdict
from typing import List, Tuple, Dict, Any

import numpy as np
import pandas as pd
from scipy import stats
from tqdm.auto import tqdm

# ---------------- configuration -----------------------------
# WINDOWS maps a trailing look-back length in months (passed to
# pd.DateOffset(months=...) by the helpers below) to the tag under which
# that window's predictions are stored in `records`.
WINDOWS = {6: "T6", 12: "T12", 24: "T24"}     # trailing windows
# Combination sizes tried by _choose_best_combo (itertools.combinations).
K_SET   = (3, 5)                     # ensemble sizes
# A spec is "robust" in _candidate_pool when both dm_p and pt_p are < ALPHA.
ALPHA   = 0.10                       # p‑value threshold for robust winner
# ------------------------------------------------------------

# ---------- helper functions (unchanged except tqdm) --------
def _spec_metrics(oos: pd.DataFrame) -> Dict[str, Any]:
    """Score one spec's out-of-sample frame on its realised rows.

    Reads the columns ``actual``, ``smart``, ``median`` and ``pred_dir``.
    Rows whose ``actual`` is missing are ignored.

    Returns a dict with:
      * ``obs``  - number of realised rows
      * ``rmse`` - RMSE of ``smart`` against ``actual``
      * ``hr``   - share of rows where ``pred_dir`` equals the realised
                   direction (``actual > median``)
      * ``dm_p`` - two-sided normal p-value of the mean squared-error
                   differential (smart minus median); 1.0 if its std is 0
      * ``pt_p`` - two-sided normal p-value of the directional independence
                   statistic; 1.0 if either direction series is constant
    All statistics are NaN (and ``obs`` is 0) when nothing is realised.
    """
    realised = oos.dropna(subset=["actual"]).copy()
    if realised.empty:
        return {"obs": 0, "rmse": np.nan, "hr": np.nan,
                "dm_p": np.nan, "pt_p": np.nan}

    n_obs = len(realised)
    sq_smart = (realised["smart"] - realised["actual"]) ** 2
    sq_median = (realised["median"] - realised["actual"]) ** 2
    went_up = (realised["actual"] > realised["median"]).astype(int)
    called_up = realised["pred_dir"]

    rmse = np.sqrt(sq_smart.mean())
    hit_rate = (called_up == went_up).mean()

    # squared-loss differential test against the median forecast
    loss_gap = sq_smart - sq_median
    gap_sd = loss_gap.std(ddof=1)
    if gap_sd == 0:
        dm_p = 1.0
    else:
        dm_stat = loss_gap.mean() / gap_sd * math.sqrt(n_obs)
        dm_p = 2 * (1 - stats.norm.cdf(abs(dm_stat)))

    # directional test: joint "both up" frequency vs. product of marginals
    p_pred, p_act = called_up.mean(), went_up.mean()
    spread = p_pred * p_act * (1 - p_pred) * (1 - p_act)
    if spread == 0:
        pt_p = 1.0
    else:
        joint = (called_up & went_up).mean()
        pt_stat = (joint - p_pred * p_act) / math.sqrt(spread / n_obs)
        pt_p = 2 * (1 - stats.norm.cdf(abs(pt_stat)))

    return {"obs": n_obs, "rmse": rmse, "hr": hit_rate,
            "dm_p": dm_p, "pt_p": pt_p}


def _candidate_pool(stats_df: pd.DataFrame) -> List[str]:
    """Pick the spec ids that form the ensemble candidate pool.

    Within every (model, panel) group, restricted to specs with at least
    one realised observation, the pool receives:
      * the spec with the lowest ``rmse``,
      * the spec with the highest ``hr``,
      * the lowest-``rmse`` spec among those with both ``dm_p`` and
        ``pt_p`` below ``ALPHA`` (when any exist).

    Returns the distinct spec ids in sorted order.
    """
    picks = set()
    for _, block in stats_df.groupby(["model", "panel"]):
        scored = block[block["obs"] > 0]
        if scored.empty:
            continue

        by_rmse = scored["rmse"].idxmin()
        picks.add(scored.loc[by_rmse, "spec_id"])

        by_hit = scored["hr"].idxmax()
        picks.add(scored.loc[by_hit, "spec_id"])

        significant = scored[(scored["dm_p"] < ALPHA) & (scored["pt_p"] < ALPHA)]
        if significant.empty:
            continue
        picks.add(significant.loc[significant["rmse"].idxmin(), "spec_id"])
    return sorted(picks)


# Memo of merged_oos() results, keyed by the spec-id tuple.
_combo_cache: Dict[Tuple[str, ...], pd.DataFrame] = {}
def _merged_oos_cached(combo: Tuple[str, ...]):
    """Return ``merged_oos(combo)``, computing it only on the first request."""
    try:
        return _combo_cache[combo]
    except KeyError:
        # first time this combination is seen: build and remember it
        frame = _combo_cache[combo] = merged_oos(combo)
        return frame


def _combo_hitrate(combo: Tuple[str, ...], t_cut: pd.Timestamp,
                   window: int) -> Tuple[float, int]:
    """Directional hit-rate of an ensemble over a trailing window.

    Uses the realised rows of the merged frame with
    ``t_cut - window months <= date < t_cut``.  A hit is a row where
    ``ens > median`` agrees with ``actual > median``.

    Returns ``(hit_rate, n_rows)``; ``(nan, 0)`` when the window is empty.
    """
    merged = _merged_oos_cached(combo)
    window_start = t_cut - pd.DateOffset(months=window)
    in_window = ((merged["date"] >= window_start) &
                 (merged["date"] <  t_cut) & merged["actual"].notna())
    recent = merged[in_window]
    if recent.empty:
        return np.nan, 0

    went_up = (recent["actual"] > recent["median"]).astype(int)
    called_up = recent["ens"] > recent["median"]
    return called_up.eq(went_up).mean(), len(recent)


def _choose_best_combo(pool: List[str], t_cut: pd.Timestamp,
                       window: int) -> Tuple[Tuple[str, ...], float]:
    """Search all K_SET-sized combinations of ``pool`` for the best ensemble.

    Ranking: highest trailing hit-rate first; ties are broken by the lower
    trailing RMSE, and remaining ties by the smaller tuple.
    Combinations whose hit-rate is NaN (no data in the window) are ignored.

    Returns ``(best_combo, best_hit_rate)``; ``(None, -1.0)`` when no
    combination could be scored.
    """
    winner, top_hr, top_rmse = None, -1.0, np.inf

    # sizes larger than the pool cannot be formed
    usable_sizes = (k for k in K_SET if len(pool) >= k)
    candidates = itertools.chain.from_iterable(
        itertools.combinations(pool, k) for k in usable_sizes)

    for candidate in candidates:
        hit_rate = _combo_hitrate(candidate, t_cut, window)[0]
        if math.isnan(hit_rate) or hit_rate < top_hr:
            continue

        cand_rmse = _window_rmse(candidate, t_cut, window)
        if hit_rate > top_hr:
            winner, top_hr, top_rmse = candidate, hit_rate, cand_rmse
        elif cand_rmse < top_rmse or (cand_rmse == top_rmse and candidate < winner):
            # equal hit-rate: prefer lower RMSE, then the smaller tuple
            winner, top_rmse = candidate, cand_rmse
    return winner, top_hr


def _window_rmse(combo: Tuple[str, ...], t_cut: pd.Timestamp, window: int):
    """RMSE of the ensemble forecast over the trailing window.

    Uses realised rows with ``t_cut - window months <= date < t_cut``;
    returns ``inf`` when there are none.
    """
    merged = _merged_oos_cached(combo)
    window_start = t_cut - pd.DateOffset(months=window)
    in_window = ((merged["date"] >= window_start) &
                 (merged["date"] <  t_cut) & merged["actual"].notna())
    recent = merged[in_window]
    if recent.empty:
        return np.inf
    sq_err = (recent["ens"] - recent["actual"]) ** 2
    return math.sqrt(sq_err.mean())


def _ensemble_direction(combo: Tuple[str, ...], t_date: pd.Timestamp) -> int:
    """Return 1 if the ensemble forecast for ``t_date`` is above the median, else 0.

    Raises IndexError when the merged frame has no row for ``t_date``.
    """
    merged = _merged_oos_cached(combo)
    on_date = merged.loc[merged["date"] == t_date]
    first = on_date.iloc[0]
    return 1 if first["ens"] > first["median"] else 0


def _actual_direction(spec_any: pd.DataFrame, t_date: pd.Timestamp) -> int:
    """Return 1 if the realised value for ``t_date`` is above the median, else 0.

    Raises IndexError when ``spec_any`` has no row for ``t_date``.
    """
    on_date = spec_any.loc[spec_any["date"] == t_date]
    first = on_date.iloc[0]
    return 1 if first["actual"] > first["median"] else 0


def stratified_table(df_preds: pd.DataFrame) -> pd.DataFrame:
    """Directional diagnostics per regime.

    ``df_preds`` needs the columns ``date``, ``pred_dir`` and
    ``actual_dir``; the two direction columns are coerced to int on a copy.
    For every entry of ``REGIMES`` with at least one row inside its
    inclusive date range, one output row is produced with the columns
    ``Regime``, ``Obs``, ``HitRate``, ``Binom_p`` (two-sided binomial test
    against 0.5) and ``PT_p`` (1.0 when either direction is constant).
    """
    preds = df_preds.astype({"pred_dir": int, "actual_dir": int})

    table = []
    for label, (lo, hi) in REGIMES.items():
        in_regime = (preds["date"] >= lo) & (preds["date"] <= hi)
        window = preds[in_regime]
        n = len(window)
        if n == 0:
            continue

        called, realised = window["pred_dir"], window["actual_dir"]
        hits = (called == realised).sum()
        binom_p = stats.binomtest(hits, n, .5).pvalue

        p_call, p_real = called.mean(), realised.mean()
        spread = p_call * p_real * (1 - p_call) * (1 - p_real)
        if spread == 0:
            pt_p = 1.0
        else:
            joint = (called & realised).mean()
            pt_stat = (joint - p_call * p_real) / math.sqrt(spread / n)
            pt_p = 2 * (1 - stats.norm.cdf(abs(pt_stat)))

        table.append({"Regime": label, "Obs": n,
                      "HitRate": hits / n, "Binom_p": binom_p, "PT_p": pt_p})

    return pd.DataFrame(table)
# ------------------------------------------------------------
# 2.  Timeline & warm‑up discovery
# ------------------------------------------------------------
# Flatten model -> panel -> spec once; both steps below scan every frame.
_oos_frames = [oos
               for mdl in oos_maps.values()
               for pnl in mdl.values()
               for oos in pnl.values()]

# Union of all dates that appear in any spec's out-of-sample frame.
all_dates = sorted({pd.to_datetime(d)
                    for frame in _oos_frames
                    for d in frame["date"].unique()})

# The warm-up date is the first date before which EVERY spec already has
# at least 24 realised observations.  len(all_dates) means "none found".
warmup_idx = len(all_dates)
with tqdm(total=len(all_dates), desc="Finding warm‑up start") as pbar:
    for idx, t0 in enumerate(all_dates):
        if all(((frame["date"] < t0) & frame["actual"].notna()).sum() >= 24
               for frame in _oos_frames):
            warmup_idx = idx
            break
        pbar.update(1)

# Previously an unsatisfied warm-up crashed here with IndexError; report it
# instead.  all_dates[warmup_idx:] is then empty, so nothing is evaluated.
if warmup_idx < len(all_dates):
    print(f"Warm‑up starts at {all_dates[warmup_idx].date()}")
else:
    print("Warm-up never completes: no date has >= 24 realised observations "
          "for every spec; rolling evaluation will be skipped.")

# ------------------------------------------------------------
# 3.  Rolling evaluation with progress bar
# ------------------------------------------------------------
records = {label: [] for label in WINDOWS.values()}
dates_iter = tqdm(all_dates[warmup_idx:], desc="Rolling evaluation", unit="month")

# any single OOS frame of the "Full" panel is used as the source of actuals
sample_oos_any = next(iter(oos_maps[next(iter(oos_maps))]["Full"].values()))

for t in dates_iter:
    # --- per-spec stats, using only rows dated strictly before t ---
    stats_df = pd.DataFrame([
        {**_spec_metrics(oos[oos["date"] < t]),
         "model": model_id, "panel": panel_name, "spec_id": spec_id}
        for model_id, panel_map in oos_maps.items()
        for panel_name, spec_dict in panel_map.items()
        for spec_id, oos in spec_dict.items()
    ])
    if stats_df["obs"].max() < 12:
        continue

    # --- candidate pool (at least three specs are needed for a combo) ---
    pool_t = _candidate_pool(stats_df)
    if len(pool_t) < 3:
        continue

    # --- best combination per trailing window, keyed by the window tag ---
    best_combo_dict = {}
    for W, tag in WINDOWS.items():
        combo = _choose_best_combo(pool_t, t, W)[0]
        if combo is not None:
            best_combo_dict[tag] = combo

    if not best_combo_dict:
        continue

    actual_known = not math.isnan(
        sample_oos_any.loc[sample_oos_any["date"] == t, "actual"].iloc[0]
    )

    # --- live predictions & logging ---
    for tag, combo in best_combo_dict.items():
        rec = {"date": t, "pred_dir": _ensemble_direction(combo, t)}
        if actual_known:
            # realised rows additionally carry the outcome and the hit flag
            rec["actual_dir"] = _actual_direction(sample_oos_any, t)
            rec["hit"] = int(rec["pred_dir"] == rec["actual_dir"])
        records[tag].append(rec)

# -------------------- build overall summary -----------------------
summary_rows, strat_tables = [], {}
for tag, recs in records.items():
    # Naming the columns up front keeps the frame well-formed when a tag has
    # no records, or when none of its records carries "hit"/"actual_dir"
    # (no realised actual yet).  Without it dropna(subset=["hit"]) raised
    # KeyError in those cases.
    df_preds = (pd.DataFrame(recs, columns=["date", "pred_dir",
                                            "actual_dir", "hit"])
                  .dropna(subset=["hit"])
                  .astype({"pred_dir": int, "actual_dir": int}))

    summary_rows.append({
        "Method": tag,
        "Obs": len(df_preds),
        "HitRate": df_preds["hit"].mean() if not df_preds.empty else np.nan
    })

    # regime diagnostics only make sense with at least one realised row
    if not df_preds.empty:
        strat_tables[tag] = stratified_table(df_preds)

print("\n=== Dynamic robust‑ensemble back‑test summary ===")
print(pd.DataFrame(summary_rows)
        .to_string(index=False, float_format=lambda x: f"{x:0.3f}"))

for tag, tbl in strat_tables.items():
    print(f"\n--- Stratified diagnostics for {tag} ---")
    print(tbl.to_string(index=False, float_format=lambda x: f"{x:0.3f}"))


In [None]:
# OLD MWU
def backtest_mwu(
    panel: pd.DataFrame,
    windows: list[int] = CONT_WINDOWS,
    etas:    list[float] = ETA_GRID,
    ridge:   float       = RIDGE,
    eps:     float       = 0.0          # set to 1e-6 if you want a weight floor
):
    """
    Walk-forward grid over (window, eta) for Multiplicative-Weights-Update.

    For every spec the release dates are visited in order:
      1. The active set is rebuilt: economists with a forecast on each of
         the previous W release dates (none missing).
      2. Weights are re-indexed to the active set.  New entrants get raw
         weight 1.0 before renormalisation; dropped economists are removed.
      3. If at least two active economists have a forecast today, the
         weighted forecast is recorded together with median and actual.
      4. If today's actual is known, weights are multiplied by
         exp(-eta * (squared error + ridge)) and renormalised; economists
         without a forecast today only incur the ridge term.

    Parameters
    ----------
    panel   : long frame with columns release_date, economist, forecast,
              median_forecast, actual.  An optional ``name`` attribute is
              used as the panel label.
    windows : contiguity windows W (number of releases) to evaluate.
    etas    : learning rates to evaluate.
    ridge   : constant added to every loss.
    eps     : additive weight floor applied before renormalising (0 = off).

    Side effect: for specs whose last record is unrealised, the weights
    used for that forecast are appended to LIVE_WEIGHT_SNAPSHOTS.

    Returns: eval_df, live_df, oos_map keyed by spec_id.
    """
    pname     = getattr(panel, "name", "panel")
    dates     = np.sort(panel["release_date"].unique())
    # start with an empty weight vector – it will be initialised on first active set
    w0        = pd.Series(dtype=float)

    eval_rows, live_rows, oos_map = [], [], {}

    for W, eta in tqdm(product(windows, etas),
                       total=len(windows) * len(etas),
                       desc=f"{pname} grid"):
        spec_id = f"mwu_w{W}_eta{eta:.3f}"
        w       = w0.copy()          # (re)initialise per spec
        w_last  = None
        recs    = []

        # ---------------- walk-forward ----------------
        for idx, d in enumerate(dates):
            cur = panel.loc[panel["release_date"] == d]

            # Step 1: economists with *contiguous* history of length W
            if idx < W:
                active = pd.Index([])
            else:
                hist = panel.loc[panel["release_date"].isin(dates[idx-W:idx])]
                active = (
                    hist.groupby("economist")["forecast"]
                    .apply(lambda s: len(s) == W and s.notna().all())  # strict check
                    .pipe(lambda s: s[s].index)
                )

            if active.empty:
                continue  # nothing to do for this date

            # Step 2: rebuild / extend the weight vector to the active set
            w = w.reindex(active)              # drop inactive, add new
            # NOTE(review): incumbents were normalised to sum to 1 on the
            # previous step, so a raw 1.0 gives each newcomer as much mass as
            # all incumbents together - confirm this prior is intended.
            if w.isna().any():
                w.fillna(1.0, inplace=True)
            if w.sum() == 0.0:                 # all zeros (unlikely, but safe)
                w[:] = 1.0
            if eps > 0.0:                      # optional floor
                w = (w + eps) / (w + eps).sum()
            else:
                w /= w.sum()

            # Step 3: form today's forecast if >= 2 economists are available
            f_t   = cur.set_index("economist")["forecast"].reindex(w.index)
            avail = f_t.notna()
            if avail.sum() >= 2:
                w_av   = w[avail].copy()
                w_av  /= w_av.sum()
                w_last = w_av.copy()           # snapshot before seeing actual
                smart  = float(np.dot(w_av, f_t[avail]))
                median = float(cur["median_forecast"].iloc[0])
                actual = float(cur["actual"].iloc[0])
                recs.append((d, smart, median, actual, int(smart > median)))

            # Step 4: update weights once actual is known
            if pd.notna(cur["actual"].iloc[0]):
                a_val = cur["actual"].iloc[0]
                loss  = (f_t - a_val).pow(2).fillna(0.0) + ridge
                w    *= np.exp(-eta * loss)
                if w.sum() == 0.0:             # numerical underflow fallback
                    w[:] = 1.0
                if eps > 0.0:
                    w = (w + eps) / (w + eps).sum()
                else:
                    w /= w.sum()

        if not recs:
            continue

        # ---------------- store results ----------------
        oos = pd.DataFrame(recs, columns=["date", "smart", "median",
                                          "actual", "pred_dir"])
        oos_map[spec_id] = oos

        # live row + snapshot (only when the latest record is unrealised)
        last = oos.iloc[-1]
        if pd.isna(last["actual"]):
            live_rows.append({
                "spec_id": spec_id, "panel": pname,
                "window":  W,        "eta":   eta,
                "date":    last["date"],
                "smart":   last["smart"],
                "median":  last["median"],
                "pred_dir": last["pred_dir"]
            })
            if w_last is not None:
                snap_meta = {
                    "date":  last["date"],
                    "panel": pname,
                    "model": "mwu",
                    "spec":  spec_id
                }
                for econ, wt in w_last.items():
                    LIVE_WEIGHT_SNAPSHOTS.append(
                        {**snap_meta,
                         "economist": econ,
                         "weight":    float(wt)}
                    )

        # realised evaluation
        df_eval = oos.dropna(subset=["actual"]).copy()
        if df_eval.empty:
            continue
        df_eval["smart_err"]  = df_eval["smart"]  - df_eval["actual"]
        df_eval["median_err"] = df_eval["median"] - df_eval["actual"]
        df_eval["actual_dir"] = (df_eval["actual"] > df_eval["median"]).astype(int)

        obs      = len(df_eval)
        rmse_s   = np.sqrt((df_eval["smart_err"]**2).mean())
        rmse_m   = np.sqrt((df_eval["median_err"]**2).mean())

        # Squared-loss differential test.  With a single observation the std
        # is NaN and with a constant differential it is 0; both previously
        # produced NaN or a spurious p = 0, so report "no evidence" instead.
        diff     = df_eval["smart_err"]**2 - df_eval["median_err"]**2
        sd_diff  = diff.std(ddof=1)
        if not np.isfinite(sd_diff) or sd_diff == 0:
            dm_p = 1.0
        else:
            dm_p = 2 * (1 - stats.norm.cdf(abs(diff.mean() /
                                               sd_diff * np.sqrt(obs))))

        hits     = (df_eval["pred_dir"] == df_eval["actual_dir"]).sum()
        hit_rate = hits / obs
        binom_p  = stats.binomtest(hits, obs, .5).pvalue

        # Directional test; undefined (0/0) when either direction series is
        # constant, in which case p = 1.0 is reported.
        p1, p2   = df_eval["pred_dir"].mean(), df_eval["actual_dir"].mean()
        pt_den   = p1 * p2 * (1 - p1) * (1 - p2)
        if pt_den == 0:
            pt_p = 1.0
        else:
            c_joint = (df_eval["pred_dir"] & df_eval["actual_dir"]).mean()
            pt_p    = 2 * (1 - stats.norm.cdf(abs((c_joint - p1*p2) /
                                                  np.sqrt(pt_den / obs))))

        eval_rows.append({
            "spec_id":     spec_id,
            "panel":       pname,
            "window":      W,
            "eta":         eta,
            "obs":         obs,
            "RMSE_smart":  rmse_s,
            "RMSE_median": rmse_m,
            "SmartBetter": int(rmse_s < rmse_m),
            "HitRate":     hit_rate,
            "Binom_p":     binom_p,
            "PT_p":        pt_p,
            "DM_p":        dm_p
        })

    # ------------- wrap-up -------------
    eval_df = pd.DataFrame(eval_rows)
    live_df = pd.DataFrame(live_rows)
    eval_df["model_id"] = "mwu"
    live_df["model_id"] = "mwu"
    return eval_df, live_df, oos_map

# ---------------- STRATIFIED DIAGNOSTICS -----------------------
def stratified_mwu(oos: pd.DataFrame, regimes=REGIMES) -> pd.DataFrame:
    """Per-regime accuracy and direction diagnostics for one spec.

    ``oos`` needs the columns date, smart, median and actual; rows without
    an actual are ignored.  ``regimes`` maps a label to an inclusive
    (start, end) date range.  Regimes with no rows are skipped.

    Returns one row per regime with Obs, RMSE_smart, RMSE_median,
    SmartBetter, HitRate, Binom_p, PT_p and DM_p (empty frame when nothing
    is realised).  PT_p and DM_p are reported as 1.0 when the statistic is
    undefined (constant direction series, single observation, or zero
    variance of the loss differential).
    """
    rows = []
    df  = oos.dropna(subset=["actual"]).copy()
    if df.empty:
        return pd.DataFrame()

    df["smart_err"]  = df["smart"]  - df["actual"]
    df["median_err"] = df["median"] - df["actual"]
    df["pred_dir"]   = (df["smart"] > df["median"]).astype(int)
    df["actual_dir"] = (df["actual"] > df["median"]).astype(int)

    for lbl, (start, end) in regimes.items():
        sub = df[(df["date"] >= start) & (df["date"] <= end)]
        if sub.empty:
            continue
        obs      = len(sub)
        rm_s     = np.sqrt((sub["smart_err"]**2).mean())
        rm_m     = np.sqrt((sub["median_err"]**2).mean())

        # Short regimes easily yield one row or a constant differential;
        # the unguarded ratio then gave NaN or a spurious p = 0.
        diff     = sub["smart_err"]**2 - sub["median_err"]**2
        sd_diff  = diff.std(ddof=1)
        if not np.isfinite(sd_diff) or sd_diff == 0:
            dm_p = 1.0
        else:
            dm_p = 2 * (1 - stats.norm.cdf(abs(diff.mean() / sd_diff * np.sqrt(obs))))

        hits     = (sub["pred_dir"] == sub["actual_dir"]).sum()
        hit_rate = hits / obs
        binom_p  = stats.binomtest(hits, obs, .5).pvalue

        # Directional test is 0/0 when either direction never varies.
        p1, p2   = sub["pred_dir"].mean(), sub["actual_dir"].mean()
        pt_den   = p1 * p2 * (1 - p1) * (1 - p2)
        if pt_den == 0:
            pt_p = 1.0
        else:
            c_joint = (sub["pred_dir"] & sub["actual_dir"]).mean()
            pt_p    = 2 * (1 - stats.norm.cdf(abs((c_joint - p1*p2) /
                                                  np.sqrt(pt_den / obs))))

        rows.append({
            "Regime":      lbl,
            "Obs":         obs,
            "RMSE_smart":  rm_s,
            "RMSE_median": rm_m,
            "SmartBetter": int(rm_s < rm_m),
            "HitRate":     hit_rate,
            "Binom_p":     binom_p,
            "PT_p":        pt_p,
            "DM_p":        dm_p
        })

    return pd.DataFrame(rows)

# --------------------- DRIVER & PRINTING ----------------------
for name, pnl in PANELS.items():
    # backtest_mwu reads the panel label from this attribute
    pnl.name = name
    ev, lv, om = backtest_mwu(pnl)

    # file the three outputs under model "mwu" / this panel
    eval_tables["mwu"][name] = ev
    live_tables["mwu"][name] = lv
    oos_maps   ["mwu"][name] = om
    _all_eval.append(ev)
    _all_live.append(lv)

    if not om:
        actual_dir["mwu"][name] = np.array([], dtype=int)
        continue

    # realised direction vector, taken from the first spec's OOS frame
    realised = next(iter(om.values())).dropna(subset=["actual"])
    went_up = realised["actual"] > realised["median"]
    actual_dir["mwu"][name] = went_up.astype(int).values

pd.set_option("display.float_format", "{:.3f}".format)

def print_key_specs_mwu(df: pd.DataFrame, panel_name: str) -> None:
    """Print the three headline specs of an MWU evaluation table.

    Reports the spec with the lowest ``RMSE_smart``, the spec with the
    highest ``HitRate`` and, among specs with ``DM_p`` and ``PT_p`` both
    below 0.10, the one with the lowest ``RMSE_smart`` (or a "None" line).
    """
    def describe(row) -> str:
        # "<spec_id> (w=<window>, η=<eta>)"
        return f"{row['spec_id']} (w={int(row['window'])}, η={row['eta']:.3f})"

    best_rmse = df.loc[df["RMSE_smart"].idxmin()]
    best_hit = df.loc[df["HitRate"].idxmax()]

    significant = df[(df["DM_p"] < .10) & (df["PT_p"] < .10)]
    if significant.empty:
        robust_txt = "None (DM_p & PT_p ≥ 0.10)"
    else:
        robust_txt = describe(significant.loc[significant["RMSE_smart"].idxmin()])

    print(f"\n{panel_name} panel key specs:")
    print(f"  • Lowest RMSE    : {describe(best_rmse)}")
    print(f"  • Highest HitRate: {describe(best_hit)}")
    print(f"  • Robust Winner  : {robust_txt}")

from collections import defaultdict   # hoisted: was re-imported on every loop pass

# back‑test tables + key specs
print("\n=== Back‑test summary (COVID panel) ===")
print(eval_tables["mwu"]["COVID"].to_string(index=False))
print_key_specs_mwu(eval_tables["mwu"]["COVID"], "COVID")

print("\n=== Back‑test summary (Full panel) ===")
print(eval_tables["mwu"]["Full"].to_string(index=False))
print_key_specs_mwu(eval_tables["mwu"]["Full"], "Full")

# stratified diagnostics on the highest-HitRate spec (Full)
# (the selection below is by HitRate, not by RMSE)
best_spec = eval_tables["mwu"]["Full"].loc[
    eval_tables["mwu"]["Full"]["HitRate"].idxmax(), "spec_id"
]
print(f"\n=== Stratified diagnostics (FULL • best spec {best_spec}) ===")
st_tbl = stratified_mwu(oos_maps["mwu"]["Full"][best_spec])
print("No realised data." if st_tbl.empty else st_tbl.to_string(index=False))

# consolidated live forecasts
print("\n================ CONSOLIDATED LIVE FORECASTS ================\n")
for panel in ["COVID", "Full"]:
    lv = live_tables["mwu"][panel]
    if lv.empty:
        continue
    ev = eval_tables["mwu"][panel]

    # headline specs of this panel; one spec may win several categories
    selections = {
        "Lowest RMSE"     : ev.loc[ev["RMSE_smart"].idxmin()],
        "Highest HitRate" : ev.loc[ev["HitRate"].idxmax()]
    }
    rob = ev[(ev["DM_p"] < .10) & (ev["PT_p"] < .10)]
    if not rob.empty:
        selections["Robust Winner"] = rob.loc[rob["RMSE_smart"].idxmin()]

    # group the category labels by spec so each spec is printed once
    label_map, spec_info = defaultdict(set), {}
    for lbl, row in selections.items():
        label_map[row["spec_id"]].add(lbl)
        spec_info[row["spec_id"]] = row

    for sid, labels in label_map.items():
        sid_rows = lv[lv["spec_id"] == sid]
        if sid_rows.empty:
            # selected spec has no pending (unrealised) forecast; the
            # unguarded .iloc[-1] raised IndexError here
            continue
        row = sid_rows.iloc[-1]
        txt = " & ".join(sorted(labels))
        sig = "Beat" if row["pred_dir"] else "Miss"
        print(f"--- {panel} • {txt} ---")
        print(f"Date   : {row['date'].date()}")
        print(f"Smart  : {row['smart']:.1f} k")
        print(f"Median : {row['median']:.1f} k")
        print(f"Signal : {sig}  ({sid})\n")


**Old MWU prototyping**

In [None]:
# =============================================================
#  MWU-AddExp · ECONOMIST level
#    • 12-month probation with *dynamic* contiguity test
#    • back-test starts 2010-01-01
#    • first forecast only if ≥5 probated economists
#    • per-economist weight-cap = 50 %
#    • prints: COVID → Full → stratified (Full) → consolidated live
# =============================================================
import pandas as pd, numpy as np, itertools, math, scipy.stats as st
from tqdm.auto import tqdm

# ---------------- HYPER-PARAMETERS ---------------------------
PROBATION_M   = 12                                # months added to an economist's first forecast date (see probation_date)
INITIAL_T0    = pd.Timestamp("2010-01-01")        # back-test uses release dates >= this
ALPHA_GRID    = [0.10, 0.20]                      # alpha: weight share split among newcomers when they join
ETA_GRID      = np.arange(0.001, 0.011, 0.001)    # eta: learning rate in exp(-eta * loss)
CAP           = .50                               # upper clip applied to every weight before renormalising
MIN_EXPERTS   = 5                                 # minimum available forecasts needed to record a smart forecast
EPS_FLOOR     = 1e-8                              # lower clip applied to every weight
RIDGE         = 1e-6                              # constant added to the squared-error loss

# ---------------- DYNAMIC REGIMES ---------------------------
# Anchors are evaluated when this cell runs, so the open-ended and trailing
# regimes move with the run date.
TODAY             = pd.Timestamp.today().normalize()
FIRST_DATE        = pd.to_datetime(df_full["release_date"].min()).normalize()
TRAILING_START_12 = TODAY - pd.DateOffset(months=12)
TRAILING_START_24 = TODAY - pd.DateOffset(months=24)

# label -> (start, end); consumers compare df["date"] >= start and <= end,
# i.e. both bounds are inclusive.  The trailing regimes overlap the others.
REGIMES = {
    f"{FIRST_DATE:%Y-%m} to 2007-12 (pre-GFC)"      : (FIRST_DATE, "2007-12-31"),
    "2008-01 to 2009-12 (GFC)"                      : ("2008-01-01", "2009-12-31"),
    "2010-01-2014-12 (early-expansion, post GFC)"   : ("2010-01-01", "2014-12-31"),
    "2015-01-2019-12 (late-expansion, post GFC)"    : ("2015-01-01", "2019-12-31"),
    "2020-01 to 2022-12 (COVID)"                    : ("2020-01-01", "2022-12-31"),
    f"2023-01 to {TODAY.date()} (post-COVID)"       : ("2023-01-01", TODAY),
    "Trailing 24-months"                            : (TRAILING_START_24, TODAY),
    "Trailing 12-months"                            : (TRAILING_START_12, TODAY),
}

# ---------------- COVERAGE MATRICES --------------------------
# Work on a copy so the datetime conversion does not touch the caller's frame.
df_full = df_full.copy()
df_full["release_date"] = pd.to_datetime(df_full["release_date"])

# Boolean economist x release_date matrix: True where the economist has at
# least one row for that release.  Columns are sorted by date so that label
# slices such as coverage.loc[e, start:end] cover a contiguous date range.
# NOTE(review): a row counts as coverage even if its forecast is NaN - confirm.
coverage = (df_full.assign(flag=1)
                     .pivot_table(index="economist",
                                  columns="release_date",
                                  values="flag",
                                  aggfunc="size")
                     .notna()
                     .sort_index(axis=1))

# Date of each economist's earliest row, and the date PROBATION_M months later.
first_forecast = df_full.groupby("economist")["release_date"].min()
probation_date = first_forecast + pd.DateOffset(months=PROBATION_M)

# ---------------- CORE BACK-TESTER ---------------------------
def mwu_addexp_panel(panel: pd.DataFrame, alpha: float, eta: float):
    """Back-test one (alpha, eta) MWU spec with late-joining experts.

    Walks the panel's release dates from INITIAL_T0 onwards.  On each date:
      * economists past their probation date with unbroken coverage since
        then join the pool; incumbents are scaled by (1 - alpha) and the
        newcomers share alpha equally;
      * weights are clipped to [EPS_FLOOR, CAP] and renormalised;
      * if at least MIN_EXPERTS pool members have a forecast, the weighted
        forecast is recorded with the median and the actual;
      * if the actual is known, each member with a forecast is multiplied
        by exp(-eta * (squared error + RIDGE)).

    Parameters
    ----------
    panel : long frame with columns release_date, economist, asof,
            forecast, median_forecast, actual; an optional ``name``
            attribute is used as the panel label.
    alpha : total weight share handed to each batch of newcomers.
    eta   : learning rate.

    Returns
    -------
    (eval_df, live_df, oos_map): a one-row evaluation frame, a frame with
    the pending live forecast (empty if the last record is realised) and
    ``{spec_id: oos_frame}``.  All three are empty when no forecast was
    produced; eval_df is empty when no forecast has been realised yet.
    """
    pname   = getattr(panel, "name", "panel")
    dates   = np.sort(panel.loc[panel["release_date"] >= INITIAL_T0,
                                "release_date"].unique())

    # initial pool (passed probation *and* contiguous up to INITIAL_T0)
    init_exp = [e for e in coverage.index
                if (probation_date[e] < INITIAL_T0)
                and coverage.loc[e, probation_date[e]:INITIAL_T0].all()]

    w = pd.Series(1/len(init_exp), index=init_exp, dtype=float) if init_exp else pd.Series(dtype=float)

    spec_id = f"mwu_eta{eta:.3f}_alpha{alpha:.2f}"
    recs, live_rows, oos_map = [], [], {}

    joined = set(init_exp)

    for d in dates:

        # ------ dynamic newcomer test ----------------------------------
        pot = [e for e in coverage.index if e not in joined and d >= probation_date[e]]
        newcomers = [e for e in pot if coverage.loc[e, probation_date[e]:d].all()]
        if newcomers:
            joined.update(newcomers)
            # Series.update() only overwrites labels that already exist, so
            # it never added the newcomers; they must be appended instead.
            fresh = pd.Series(alpha / len(newcomers), index=newcomers, dtype=float)
            w = fresh if w.empty else pd.concat([w * (1 - alpha), fresh])
            w = w.clip(lower=EPS_FLOOR)
            w /= w.sum()

        # enforce cap
        # NOTE(review): a single clip + renormalise can push weights back
        # above CAP; left as in the original.
        if not w.empty:
            w = w.clip(upper=CAP)
            w /= w.sum()

        # ------ current forecasts -------------------------------------
        cur_month = panel.loc[panel["release_date"] == d]
        # one row per pool member: the latest entry by "asof"
        cur_last  = (cur_month[cur_month["economist"].isin(w.index)]
                     .sort_values("asof")
                     .groupby("economist", as_index=False).last())

        f_t   = cur_last.set_index("economist")["forecast"]
        avail = f_t.notna()
        n_avail = avail.sum()

        if n_avail >= MIN_EXPERTS:
            # weights restricted to members with a forecast, renormalised
            w_av  = w.reindex(f_t.index).fillna(0.0) * avail
            w_av /= w_av.sum()
            smart   = float(np.dot(w_av, f_t.fillna(0.0)))
            median  = float(cur_month["median_forecast"].iloc[0])
            actual  = float(cur_month["actual"].iloc[0])
            recs.append((d, smart, median, actual, int(smart > median)))

        # ------ MWU weight update -------------------------------------
        if pd.notna(cur_month["actual"].iloc[0]) and n_avail:
            y    = cur_month["actual"].iloc[0]
            # Only members who actually forecast are penalised.  Filling a
            # missing forecast with 0 before squaring charged absentees a
            # loss of y**2, as if they had forecast zero; they now get
            # loss 0, matching backtest_mwu's treatment of missing forecasts.
            loss = ((f_t[avail] - y).pow(2) + RIDGE).reindex(w.index).fillna(0.0)
            w *= np.exp(-eta * loss)
            w = w.clip(lower=EPS_FLOOR).clip(upper=CAP)
            w /= w.sum()

    # ---------- assemble outputs -------------------------------------
    if not recs:
        return pd.DataFrame(), pd.DataFrame(), {}

    oos = pd.DataFrame(recs, columns=["date","smart","median","actual","pred_dir"])
    oos_map[spec_id] = oos

    if pd.isna(oos.iloc[-1, 3]):   # live row: latest actual not yet released
        last = oos.iloc[-1]
        live_rows.append({"spec_id":spec_id,"panel":pname,
                          "date":last["date"],"smart":last["smart"],
                          "median":last["median"],"pred_dir":last["pred_dir"]})

    realised = oos.dropna(subset=["actual"]).copy()
    if realised.empty:
        # nothing to evaluate yet; the tests below are undefined for n = 0
        return pd.DataFrame(), pd.DataFrame(live_rows), oos_map

    realised["smart_err"]  = realised["smart"]  - realised["actual"]
    realised["median_err"] = realised["median"] - realised["actual"]
    realised["actual_dir"] = (realised["actual"] > realised["median"]).astype(int)

    # squared-loss differential test; 1.0 when the std is zero
    diff = realised["smart_err"]**2 - realised["median_err"]**2
    dm_p = (2*(1-st.norm.cdf(abs(diff.mean()/diff.std(ddof=1)
                       * math.sqrt(len(realised)))))) \
           if diff.std(ddof=1) else 1.0

    hits = (realised["pred_dir"] == realised["actual_dir"]).sum()
    eval_row = {
        "spec_id"    : spec_id,
        "panel"      : pname,
        "alpha"      : alpha,
        "eta"        : eta,
        "obs"        : len(realised),
        "RMSE_smart" : math.sqrt((realised["smart_err"]**2).mean()),
        "RMSE_median": math.sqrt((realised["median_err"]**2).mean()),
        "SmartBetter": int(((realised["smart_err"]**2).mean()
                            < (realised["median_err"]**2).mean())),
        "HitRate"    : hits / len(realised),
        "Binom_p"    : st.binomtest(hits, len(realised), .5).pvalue,
        # NOTE(review): this is a binomial test of the joint "both up" count
        # against 0.25, not the normal-approximation statistic used for PT_p
        # elsewhere in this notebook - confirm which one is intended.
        "PT_p"       : st.binomtest(
                           (realised["pred_dir"] & realised["actual_dir"]).sum(),
                           len(realised), .25).pvalue,
        "DM_p"       : dm_p
    }
    return pd.DataFrame([eval_row]), pd.DataFrame(live_rows), oos_map

# ---------------- stratified, printing helpers --------------- (unchanged)
def stratified_mwu(oos: pd.DataFrame) -> pd.DataFrame:
    """Per-regime diagnostics for one spec's out-of-sample frame.

    Rows without an actual are ignored.  For every REGIMES entry with at
    least one row in its inclusive date range, reports Obs, RMSE_smart,
    RMSE_median, SmartBetter, HitRate, Binom_p (hits vs 0.5), PT_p
    (joint "both up" count vs 0.25) and DM_p (paired t-test on the squared
    errors).  Returns an empty frame when nothing is realised.
    """
    realised = oos.dropna(subset=["actual"]).copy()
    if realised.empty:
        return pd.DataFrame()

    realised["smart_err"]  = realised["smart"]  - realised["actual"]
    realised["median_err"] = realised["median"] - realised["actual"]
    realised["pred_dir"]   = (realised["smart"] > realised["median"]).astype(int)
    realised["actual_dir"] = (realised["actual"] > realised["median"]).astype(int)

    table = []
    for label, (lo, hi) in REGIMES.items():
        window = realised[(realised["date"] >= lo) & (realised["date"] <= hi)]
        if window.empty:
            continue

        n = len(window)
        sq_smart = window["smart_err"] ** 2
        sq_median = window["median_err"] ** 2
        agree = window["pred_dir"] == window["actual_dir"]
        both_up = window["pred_dir"] & window["actual_dir"]

        table.append({
            "Regime": label,
            "Obs": n,
            "RMSE_smart": math.sqrt(sq_smart.mean()),
            "RMSE_median": math.sqrt(sq_median.mean()),
            "SmartBetter": int(sq_smart.mean() < sq_median.mean()),
            "HitRate": agree.mean(),
            "Binom_p": st.binomtest(agree.sum(), n, .5).pvalue,
            "PT_p": st.binomtest(both_up.sum(), n, .25).pvalue,
            "DM_p": st.ttest_rel(sq_smart, sq_median).pvalue,
        })
    return pd.DataFrame(table)

# ---------------- GRID RUN -----------------------------------
# Each panel is copied so that tagging it with a name leaves the source
# frames untouched.
PANELS = {"COVID": df.copy(), "Full": df_full.copy()}
eval_tbls, live_tbls, oos_maps = {}, {}, {}

for pname, pnl in PANELS.items():
    pnl.name = pname          # read back by mwu_addexp_panel as the panel label
    ev_list, lv_list, omap = [], [], {}

    for alpha, eta in tqdm(itertools.product(ALPHA_GRID, ETA_GRID),
                           total=len(ALPHA_GRID) * len(ETA_GRID),
                           desc=f"{pname} grid"):
        ev, lv, om = mwu_addexp_panel(pnl, alpha, eta)
        ev_list.append(ev)
        lv_list.append(lv)
        omap.update(om)

    # evaluation rows ordered by RMSE, then learning rate
    if ev_list:
        eval_tbls[pname] = (pd.concat(ev_list, ignore_index=True)
                              .sort_values(["RMSE_smart", "eta"]))
    else:
        eval_tbls[pname] = pd.DataFrame()
    live_tbls[pname] = pd.concat(lv_list, ignore_index=True)
    oos_maps[pname]  = omap

pd.set_option("display.float_format", "{:.3f}".format)

# ---------------- NICE PRINTING ------------------------------
def label_dict(table):
    """Map each headline spec_id to the set of labels it earned.

    Labels: "Lowest RMSE" (minimum RMSE_smart), "Highest HitRate" (maximum
    HitRate) and "Robust Winner" (minimum RMSE_smart among rows with
    SmartBetter == 1 and both DM_p and PT_p below 0.10).  A spec that wins
    several categories carries all of the corresponding labels.

    Returns an empty dict for an empty table.
    """
    if table.empty:
        return {}

    out = {}

    def tag(spec, label):
        # setdefault(spec, {label}) silently dropped every label after the
        # first one for the same spec; accumulate into the set instead.
        out.setdefault(spec, set()).add(label)

    tag(table.loc[table["RMSE_smart"].idxmin(), "spec_id"], "Lowest RMSE")
    tag(table.loc[table["HitRate"].idxmax(), "spec_id"], "Highest HitRate")

    robust = table[(table["SmartBetter"] == 1)
                   & (table["DM_p"] < .10)
                   & (table["PT_p"] < .10)]
    if not robust.empty:
        tag(robust.loc[robust["RMSE_smart"].idxmin(), "spec_id"], "Robust Winner")
    return out

def print_backtests():
    """Print the evaluation table and headline specs of the COVID and Full panels."""
    shown = ["spec_id", "panel", "alpha", "eta", "obs", "RMSE_smart", "RMSE_median",
             "SmartBetter", "HitRate", "Binom_p", "PT_p", "DM_p"]
    for panel in ("COVID", "Full"):
        tbl = eval_tbls[panel]
        print(f"\n=== Back-test summary ({panel} panel) ===")
        if tbl.empty:
            print("No realised observations.")
            continue

        print(tbl[shown].to_string(index=False))

        headline = label_dict(tbl)
        if not headline:
            continue
        print(f"\n{panel} panel key specs:")
        for spec, labels in headline.items():
            joined = " & ".join(sorted(labels))
            print(f"  • {joined}: {spec}")

def print_stratified():
    """Print regime diagnostics for the Full panel's highest-HitRate spec.

    Does nothing when the Full evaluation table is empty.
    """
    full_tbl = eval_tbls["Full"]
    if full_tbl.empty:
        return

    top_row = full_tbl["HitRate"].idxmax()
    best_spec = full_tbl.loc[top_row, "spec_id"]
    print(f"\n=== Stratified diagnostics (FULL • best spec {best_spec}) ===")
    diagnostics = stratified_mwu(oos_maps["Full"][best_spec])
    print(diagnostics.to_string(index=False))

def print_live():
    """Print the pending (not yet realised) forecast for every labelled spec.

    Labelled specs are collected per panel from ``eval_tbls`` via
    ``label_dict``; the matching row is then looked up in ``live_tbls``.
    Panels without any live row are skipped silently.
    """
    combined = {}
    for panel, tbl in eval_tbls.items():
        for sp, labs in label_dict(tbl).items():
            combined.setdefault((panel, sp), set()).update(labs)
    if not combined:
        return

    print("\n================ CONSOLIDATED LIVE FORECASTS ================\n")
    for (panel, sp), labs in combined.items():
        live = live_tbls[panel]
        # A live table built from zero rows has no columns at all, so indexing
        # "spec_id" would raise KeyError; treat it as "nothing to report".
        if live.empty or "spec_id" not in live.columns:
            continue
        matches = live[live["spec_id"] == sp]
        if matches.empty:
            continue
        row = matches.iloc[-1]
        verdict = "Beat" if row["pred_dir"] else "Miss"
        print(f"--- {panel} • {' & '.join(sorted(labs))} ---")
        print(f"Date   : {row['date'].date()}")
        print(f"Smart  : {row['smart']:.1f} k")
        print(f"Median : {row['median']:.1f} k")
        print(f"Signal : {verdict}  ({sp})\n")

# Emit the console reports: back-test tables, stratified diagnostics, live forecasts.
print_backtests()
print_stratified()
print_live()


In [None]:
# =============================================================
#  MWU-AddExp  ·  ECONOMIST level  ·  regime-frozen expert pools
#    • 12-month contiguity filter at regime start
#    • experts may “sleep” ≤ MAX_SLEEP consecutive releases
#    • back-test runs separately over the fixed structural regimes
#    • optional per-economist weight-cap (set WEIGHT_CAP = 0 for “no cap”)
# =============================================================
import pandas as pd, numpy as np, itertools, math, scipy.stats as st
from tqdm.auto import tqdm

# ─────────── USER-TUNABLE KNOBS ───────────────────────────────
ETA_GRID     = np.arange(0.001, 0.011, .001)   # MWU learning-rates swept by the grid runner (step 0.001)
WEIGHT_CAP   = 0                             # upper clip applied to each weight; 0 → no cap
MIN_EXPERTS  = 5                               # smart forecast recorded only if ≥ MIN_EXPERTS experts have a non-null forecast
PROBATION_M  = 12                              # months: probation offset and look-back window checked at regime start
MAX_SLEEP    = 2                               # ≥ MAX_SLEEP + 1 consecutive misses ⇒ expert dropped
EPS_FLOOR    = 1e-8                            # lower clip applied to every weight before renormalising
RIDGE        = 1e-6                            # constant added to each squared-error loss
# ──────────────────────────────────────────────────────────────

# ---------- regime anchors -------------------------------------------------
# TODAY closes the open-ended last regime; FIRST_DATE is only used to build
# the label of the first regime.
TODAY      = pd.Timestamp.today().normalize()
FIRST_DATE = pd.to_datetime(df_full["release_date"].min()).normalize()

# label → (start, end), both bounds inclusive where they are applied.
# Bounds are date strings except the final end, which is the TODAY Timestamp.
# NOTE(review): the first label is derived from FIRST_DATE but its window start
# is hard-coded to 2003-06-01 — confirm the panel really begins in 2003-06,
# otherwise the label and the window disagree.
REGIMES = {
    f"{FIRST_DATE:%Y-%m} to 2007-12 (pre-GFC)"     : ("2003-06-01", "2007-12-31"),
    "2008-01 to 2009-12 (GFC)"                     : ("2008-01-01", "2009-12-31"),
    "2010-01-2014-12 (early-expansion post-GFC)"   : ("2010-01-01", "2014-12-31"),
    "2015-01-2019-12 (late-expansion post-GFC)"    : ("2015-01-01", "2019-12-31"),
    "2020-01 to 2022-12 (COVID)"                   : ("2020-01-01", "2022-12-31"),
    f"2023-01 to {TODAY.date()} (post-COVID)"      : ("2023-01-01", TODAY),
}

# ---------- coverage matrices ---------------------------------------------
# Work on a private copy so the datetime conversion does not touch the caller's frame.
df_full = df_full.copy()
df_full["release_date"] = pd.to_datetime(df_full["release_date"])

# Boolean economist × release_date matrix, columns in ascending date order.
# A cell is True when at least one row exists for that pair; the forecast
# value itself is not inspected here.
coverage = (df_full.assign(flag=1)
                     .pivot_table(index="economist",
                                  columns="release_date",
                                  values="flag",
                                  aggfunc="size")
                     .notna()
                     .sort_index(axis=1))

# Earliest release each economist appears in, and that date shifted forward
# by PROBATION_M months (the date from which the economist may join a pool).
first_forecast = df_full.groupby("economist")["release_date"].min()
prob_date      = first_forecast + pd.DateOffset(months=PROBATION_M)

# ---------- helpers --------------------------------------------------------
def contiguous_pool(start):
    """
    Return the economists eligible to enter a regime beginning at `start`.

    An economist qualifies when both hold:
      * their probation date (first forecast + PROBATION_M months) is on or
        before `start`;
      * every release column of `coverage` that falls in the window
        [start - PROBATION_M months, start - 1 day] is True for them.
    """
    window_open  = start - pd.DateOffset(months=PROBATION_M)
    window_close = start - pd.DateOffset(days=1)

    eligible = []
    for econ in coverage.index:
        passed_probation = prob_date[econ] <= start
        if passed_probation and coverage.loc[econ, window_open:window_close].all():
            eligible.append(econ)
    return eligible

def _renormalise_weights(w, cap, floor):
    """Floor, normalise and (if cap > 0) cap a weight vector so it sums to one.

    Capping is done after normalisation: any weight above `cap` is pinned to
    `cap` and the excess is redistributed proportionally over the remaining
    weights, repeating until no weight exceeds the cap.
    """
    if w.empty:
        return w
    w = w.clip(lower=floor)
    w = w / w.sum()
    if cap <= 0:
        return w
    if cap * len(w) <= 1.0:
        # No vector summing to one can respect the cap; equal weights are the
        # closest feasible choice.
        return pd.Series(1.0 / len(w), index=w.index, dtype=float)

    pinned = pd.Series(False, index=w.index)
    while True:
        over = (w > cap) & ~pinned
        if not over.any():
            return w
        pinned |= over
        free = ~pinned
        w[pinned] = cap
        if not free.any():
            return w
        w[free] = w[free] * ((1.0 - cap * pinned.sum()) / w[free].sum())


def run_regime(panel, pool, eta):
    """MWU-AddExp within one regime for a *fixed* expert pool.

    Parameters
    ----------
    panel : rows of one regime; reads the columns release_date, economist,
            asof, forecast, median_forecast and actual.
    pool  : economists that start the regime with equal weight.
    eta   : MWU learning rate.

    Returns
    -------
    (rows, w_hist)
        rows   – tuples (date, smart, median, actual, pred_dir), one per
                 release with at least MIN_EXPERTS non-null forecasts;
                 pred_dir is 1 when smart > median.
        w_hist – (date, weights) snapshots taken at the same releases,
                 before that release's weight update.
    """
    if not pool:
        return [], []

    w      = pd.Series(1/len(pool), index=pool, dtype=float)          # equal start
    sleep  = pd.Series(0, index=pool, dtype=int)                      # consecutive-miss counter
    rows   = []                                                       # OOS performance rows
    w_hist = []                                                       # (date, weight vector) snapshots

    for d, grp in panel.groupby("release_date", sort=True):
        # one row per still-active expert, taken from the end of the asof-sorted group
        cur = (grp[grp["economist"].isin(w.index)]
               .sort_values("asof")
               .groupby("economist", as_index=False)
               .last())
        f_t     = cur.set_index("economist")["forecast"]
        avail   = f_t.notna()
        n_avail = avail.sum()
        actual  = grp["actual"].iloc[0]

        if n_avail >= MIN_EXPERTS:  # produce smart forecast
            # weights restricted to experts with a forecast, rescaled to sum to one
            w_av  = w.reindex(f_t.index).fillna(0.0) * avail
            w_av /= w_av.sum()
            smart   = float(np.dot(w_av, f_t.fillna(0.0)))
            median  = float(grp["median_forecast"].iloc[0])
            rows.append((d, smart, median, float(actual), int(smart > median)))
            w_hist.append((d, w.copy()))

        # MWU weight update after realisation
        if pd.notna(actual) and n_avail:
            # Score only forecasts that were actually submitted.  Experts with
            # a missing forecast get zero loss (weight unchanged) instead of
            # being scored as if they had forecast 0.0.
            loss = (f_t[avail] - actual).pow(2) + RIDGE
            w = w * np.exp(-eta * loss.reindex(w.index).fillna(0.0))

        # sleep bookkeeping: an expert is "present" when it has a row for this
        # release, whether or not the forecast value is null
        sleepers = sleep.index.difference(cur["economist"])
        sleep.loc[sleepers] += 1
        sleep.loc[cur["economist"]] = 0
        to_drop = sleep[sleep > MAX_SLEEP].index
        if len(to_drop):
            w = w.drop(to_drop)
            sleep = sleep.drop(to_drop)

        # floor + renormalise, then enforce the cap on the normalised weights
        w = _renormalise_weights(w, WEIGHT_CAP, EPS_FLOOR)

    return rows, w_hist

# ---------- grid runner ----------------------------------------------------
def backtest(panel_name, df_panel):
    """Run the MWU grid over every learning rate in ETA_GRID for one panel.

    For each eta the regimes in REGIMES are run independently (fresh pool and
    weights per regime) and their out-of-sample rows are concatenated.

    Returns
    -------
    (eval_df, live_df, oos_map)
        eval_df – one row of accuracy statistics per eta with realised data
        live_df – one row per eta whose last OOS row has no actual yet
                  (a column-less empty frame when there is none)
        oos_map – spec_id → DataFrame of all OOS rows for that eta
    """
    eval_rows, live_rows, oos_map = [], [], {}
    for eta in tqdm(ETA_GRID, desc=f"{panel_name} grid"):
        all_oos = []
        for start, end in REGIMES.values():
            start_ts = pd.Timestamp(start); end_ts = pd.Timestamp(end)
            mask = (df_panel["release_date"] >= start_ts) & (df_panel["release_date"] <= end_ts)
            if not mask.any(): continue
            pool = contiguous_pool(start_ts)
            oos, _ = run_regime(df_panel.loc[mask], pool, eta)
            all_oos.extend(oos)

        if not all_oos:
            continue

        spec_id = f"mwu_eta{eta:.3f}"
        oos_df  = pd.DataFrame(all_oos, columns=["date","smart","median","actual","pred_dir"])
        oos_map[spec_id] = oos_df

        # live row (if last actual still NA)
        if oos_df["actual"].isna().iloc[-1]:
            last = oos_df.iloc[-1]
            live_rows.append({"spec_id":spec_id,"panel":panel_name,
                              "date":last["date"],"smart":last["smart"],
                              "median":last["median"],"pred_dir":last["pred_dir"]})

        # evaluation — copy so the new columns are written to an independent frame
        realised = oos_df.dropna(subset=["actual"]).copy()
        if realised.empty: continue
        realised["smart_err"]  = realised["smart"]  - realised["actual"]
        realised["median_err"] = realised["median"] - realised["actual"]
        realised["actual_dir"] = (realised["actual"] > realised["median"]).astype(int)

        n_obs      = len(realised)
        mse_smart  = (realised["smart_err"]**2).mean()
        mse_median = (realised["median_err"]**2).mean()
        hits       = realised["pred_dir"] == realised["actual_dir"]

        # Normal-approximation test on the squared-error differential.  With a
        # single observation the std is NaN and with identical losses it is 0;
        # neither supports a test, so report p = 1.0 instead of NaN.
        diff = realised["smart_err"]**2 - realised["median_err"]**2
        sd   = diff.std(ddof=1)
        if pd.isna(sd) or sd == 0:
            dm_p = 1.0
        else:
            dm_p = 2*(1-st.norm.cdf(abs(diff.mean()/sd * math.sqrt(n_obs))))

        eval_rows.append({
            "spec_id":spec_id,"panel":panel_name,"eta":eta,
            "obs":n_obs,
            "RMSE_smart":math.sqrt(mse_smart),
            "RMSE_median":math.sqrt(mse_median),
            "SmartBetter":int(mse_smart < mse_median),
            "HitRate":hits.mean(),
            "Binom_p":st.binomtest(hits.sum(), n_obs, .5).pvalue,
            "PT_p":st.binomtest((realised["pred_dir"]&realised["actual_dir"]).sum(),
                                n_obs,.25).pvalue,
            "DM_p":dm_p
        })
    return pd.DataFrame(eval_rows), pd.DataFrame(live_rows), oos_map

# ---------- run COVID & FULL panels ---------------------------------------
# Panels are copied so the back-test cannot mutate the frames held in memory.
PANELS = {"COVID": df.copy(),          # df = COVID subset already in memory
          "Full" : df_full.copy()}

# Per-panel results: evaluation table, live-forecast table, spec_id → OOS frame.
eval_tbls, live_tbls, oos_maps = {}, {}, {}
for name, pnl in PANELS.items():
    # Tags the frame with its panel name; backtest() itself receives the name explicitly.
    pnl.name = name
    eval_tbls[name], live_tbls[name], oos_maps[name] = backtest(name, pnl)

# Render every float in printed DataFrames with three decimals.
pd.set_option("display.float_format", "{:.3f}".format)

# ---------- reporting helpers ---------------------------------------------
def label_specs(tbl):
    """Map each noteworthy spec_id in `tbl` to the set of titles it earned.

    Titles awarded:
      * "Lowest RMSE"     – row with the smallest ``RMSE_smart``
      * "Highest HitRate" – row with the largest ``HitRate``
      * "Robust Winner"   – lowest ``RMSE_smart`` among rows with
        ``SmartBetter == 1``, ``DM_p < .10`` and ``PT_p < .10`` (if any)

    Returns an empty dict for an empty table.  A spec that wins several
    titles carries all of them in its set.
    """
    if tbl.empty:
        return {}

    out = {}
    # setdefault(..., set()).add(...) accumulates titles; passing the label set
    # itself as the default would silently discard every title after the first.
    out.setdefault(tbl.loc[tbl["RMSE_smart"].idxmin(), "spec_id"], set()).add("Lowest RMSE")
    out.setdefault(tbl.loc[tbl["HitRate"].idxmax(), "spec_id"], set()).add("Highest HitRate")

    robust = tbl[(tbl["SmartBetter"] == 1) & (tbl["DM_p"] < .10) & (tbl["PT_p"] < .10)]
    if not robust.empty:
        out.setdefault(robust.loc[robust["RMSE_smart"].idxmin(), "spec_id"], set()).add("Robust Winner")
    return out

def print_backtests():
    """Print the back-test summary table and labelled specs for the COVID and Full panels.

    Reads the module-level ``eval_tbls`` mapping; a panel whose table is empty
    gets a one-line notice instead of a table.
    """
    summary_cols = ["spec_id","eta","obs","RMSE_smart","RMSE_median",
                    "SmartBetter","HitRate","Binom_p","PT_p","DM_p"]
    for panel_name in ("COVID", "Full"):
        summary = eval_tbls[panel_name]
        print(f"\n=== Back-test summary ({panel_name} panel) ===")
        if summary.empty:
            print("No realised observations.")
            continue
        print(summary[summary_cols].to_string(index=False))
        highlights = label_specs(summary)
        for spec, titles in highlights.items():
            print(f"  • {' & '.join(sorted(titles))}: {spec}")

def print_stratified():
    """Print stratified diagnostics for the Full-panel spec with the highest HitRate.

    Does nothing when the Full-panel evaluation table is empty.
    """
    summary = eval_tbls["Full"]
    if summary.empty:
        return
    top_row = summary["HitRate"].idxmax()
    winner = summary.loc[top_row, "spec_id"]
    print(f"\n=== Stratified diagnostics (FULL • best spec {winner}) ===")
    diagnostics = stratified_mwu(oos_maps["Full"][winner])
    print(diagnostics.to_string(index=False))

def print_live():
    """Print the pending (not yet realised) forecast for every labelled spec.

    Labelled specs are collected per panel from ``eval_tbls`` via
    ``label_specs``; the matching row is then looked up in ``live_tbls``.
    Panels without any live row are skipped silently.
    """
    combined = {}
    for p, t in eval_tbls.items():
        for sp, lbs in label_specs(t).items():
            combined.setdefault((p, sp), set()).update(lbs)
    if not combined:
        return

    print("\n================ CONSOLIDATED LIVE FORECASTS ================\n")
    for (p, sp), lbs in combined.items():
        live = live_tbls[p]
        # When no release is pending, the live table is built from zero rows
        # and has no columns, so indexing "spec_id" would raise KeyError.
        if live.empty or "spec_id" not in live.columns:
            continue
        matches = live[live["spec_id"] == sp]
        if matches.empty:
            continue
        r = matches.iloc[-1]
        verdict = "Beat" if r["pred_dir"] else "Miss"
        print(f"--- {p} • {' & '.join(sorted(lbs))} ---")
        print(f"Date   : {r['date'].date()}")
        print(f"Smart  : {r['smart']:.1f} k")
        print(f"Median : {r['median']:.1f} k")
        print(f"Signal : {verdict}  ({sp})\n")

# Emit the console reports: back-test tables, stratified diagnostics, live forecasts.
print_backtests()
print_stratified()
print_live()


Legacy MWU implementation 

In [None]:
# =============================================================
#  Multiplicative Weights Update
# =============================================================
ETA_GRID     = np.arange(0.001, 0.011, .001)   # MWU learning-rates swept by the grid runner (step 0.001)
WEIGHT_CAP   = 0.50                            # upper clip applied to a single forecaster's weight; 0 = no cap
MIN_EXPERTS  = 5                               # smart forecast recorded only if ≥ MIN_EXPERTS experts have a non-null forecast
PROBATION_M  = 12                              # months: probation offset and contiguity window
MAX_SLEEP    = 3                               # expert dropped once its consecutive-miss counter exceeds this (i.e. after 4 misses)
EPS_FLOOR    = 1e-8                            # lower clip applied to every weight before renormalising
RIDGE        = 1e-6                            # tiny constant added to each squared-error loss

# Define the coverage matrix.
# Work on a private copy so the datetime conversion does not touch the caller's frame.
df_full = df_full.copy()
df_full["release_date"] = pd.to_datetime(df_full["release_date"])

# Boolean economist × release_date matrix, columns in ascending date order.
# A cell is True when at least one row exists for that pair; the forecast
# value itself is not inspected here.
coverage = (df_full.assign(flag=1)
                     .pivot_table(index="economist",
                                  columns="release_date",
                                  values="flag",
                                  aggfunc="size")
                     .notna()
                     .sort_index(axis=1))

# Earliest release each economist appears in, and that date shifted forward
# by PROBATION_M months.
first_forecast = df_full.groupby("economist")["release_date"].min()
prob_date      = first_forecast + pd.DateOffset(months=PROBATION_M)

def contiguous_pool(start: pd.Timestamp) -> list[str]:
    """Return the economists eligible to enter a regime beginning at `start`.

    An economist qualifies when their probation date (first forecast +
    PROBATION_M months) is on or before `start` and every release column of
    `coverage` in [start - PROBATION_M months, start - 1 day] is True for them.
    """
    window_open  = start - pd.DateOffset(months=PROBATION_M)
    window_close = start - pd.DateOffset(days=1)

    eligible = []
    for econ in coverage.index:
        passed_probation = prob_date[econ] <= start
        if passed_probation and coverage.loc[econ, window_open:window_close].all():
            eligible.append(econ)
    return eligible

def _renormalise_weights(w: pd.Series, cap: float, floor: float) -> pd.Series:
    """Floor, normalise and (if cap > 0) cap a weight vector so it sums to one.

    Capping is done after normalisation: any weight above `cap` is pinned to
    `cap` and the excess is redistributed proportionally over the remaining
    weights, repeating until no weight exceeds the cap.
    """
    if w.empty:
        return w
    w = w.clip(lower=floor)
    w = w / w.sum()
    if cap <= 0:
        return w
    if cap * len(w) <= 1.0:
        # No vector summing to one can respect the cap; equal weights are the
        # closest feasible choice.
        return pd.Series(1.0 / len(w), index=w.index, dtype=float)

    pinned = pd.Series(False, index=w.index)
    while True:
        over = (w > cap) & ~pinned
        if not over.any():
            return w
        pinned |= over
        free = ~pinned
        w[pinned] = cap
        if not free.any():
            return w
        w[free] = w[free] * ((1.0 - cap * pinned.sum()) / w[free].sum())


def run_regime(panel: pd.DataFrame, pool: list[str], eta: float):
    """
    Performs MWU for a single regime with a fixed starting pool.

    Reads the columns release_date, economist, asof, forecast,
    median_forecast and actual from `panel`.

    Returns (rows, hist):
      rows – tuples (date, smart, median, actual, pred_dir), one per release
             with at least MIN_EXPERTS non-null forecasts; pred_dir is 1 when
             smart > median.
      hist – (date, weights) snapshots taken at the same releases, before
             that release's weight update.
    """
    if not pool:
        return [], []                                   # no experts

    w      = pd.Series(1/len(pool), index=pool, dtype=float)
    sleep  = pd.Series(0, index=pool, dtype=int)
    rows, hist = [], []                                # OOS rows, weight history

    for d, grp in panel.groupby("release_date", sort=True):
        # one row per still-active expert, taken from the end of the asof-sorted group
        cur = (grp[grp["economist"].isin(w.index)]
               .sort_values("asof")
               .groupby("economist", as_index=False)
               .last())
        f_t    = cur.set_index("economist")["forecast"]
        avail  = f_t.notna()
        actual = grp["actual"].iloc[0]

        # record smart forecast
        if avail.sum() >= MIN_EXPERTS:
            # weights restricted to experts with a forecast, rescaled to sum to one
            w_av = w.reindex(f_t.index).fillna(0.0) * avail
            w_av /= w_av.sum()
            smart  = float(np.dot(w_av, f_t.fillna(0.0)))
            median = float(grp["median_forecast"].iloc[0])
            rows.append((d, smart, median, float(actual), int(smart > median)))
            hist.append((d, w.copy()))

        # MWU weight update
        if pd.notna(actual) and avail.any():
            # Score only forecasts that were actually submitted.  Experts with
            # a missing forecast get zero loss (weight unchanged) instead of
            # being scored as if they had forecast 0.0.
            loss = (f_t[avail] - actual).pow(2) + RIDGE
            w = w * np.exp(-eta * loss.reindex(w.index).fillna(0.0))

        # "sleep" bookkeeping: an expert is "present" when it has a row for
        # this release, whether or not the forecast value is null
        sleepers = sleep.index.difference(cur["economist"])
        sleep.loc[sleepers] += 1
        sleep.loc[cur["economist"]] = 0
        to_drop = sleep[sleep > MAX_SLEEP].index
        if len(to_drop):
            w = w.drop(to_drop)
            sleep = sleep.drop(to_drop)

        # floor + renormalise, then enforce the cap on the normalised weights
        # (clipping before normalising lets weights climb back above the cap)
        w = _renormalise_weights(w, WEIGHT_CAP, EPS_FLOOR)

    return rows, hist                                    # list, list[(date,w)]


def backtest_mwu(panel_name: str, df_panel: pd.DataFrame):
    """
    Runs the MWU backtesting grid over every learning rate in ETA_GRID.

    For each eta the regimes in REGIMES are run independently and their
    out-of-sample rows concatenated.  When the last OOS row has no actual
    yet, a live row is recorded and the matching weight vector is appended
    to the module-level LIVE_WEIGHT_SNAPSHOTS list, one record per economist.

    Returns (eval_df, live_df, oos_map):
      eval_df – one row of accuracy statistics per eta with realised data
      live_df – one row per eta with a pending release (a column-less empty
                frame when there is none)
      oos_map – spec_id → DataFrame of all OOS rows for that eta
    """
    eval_rows, live_rows, oos_map = [], [], {}

    for eta in tqdm(ETA_GRID, desc=f"{panel_name} grid"):
        all_oos, all_hist = [], []
        for start, end in REGIMES.values():
            # REGIMES mixes date strings and Timestamps; normalise both bounds
            start_ts, end_ts = pd.Timestamp(start), pd.Timestamp(end)
            mask = (df_panel["release_date"] >= start_ts) & (df_panel["release_date"] <= end_ts)
            if not mask.any():
                continue
            pool = contiguous_pool(start_ts)
            oos, hist = run_regime(df_panel.loc[mask], pool, eta)
            all_oos.extend(oos)
            all_hist.extend(hist)

        if not all_oos:
            continue

        spec_id = f"mwu_eta{eta:.3f}"
        oos_df  = pd.DataFrame(all_oos,
                               columns=["date","smart","median","actual","pred_dir"])
        oos_map[spec_id] = oos_df

        # live row + weight snapshot
        if oos_df["actual"].isna().iloc[-1]:
            last = oos_df.iloc[-1]
            live_rows.append({
                "spec_id": spec_id, "panel": panel_name,
                "date":    last["date"], "smart":  last["smart"],
                "median":  last["median"], "pred_dir": last["pred_dir"]
            })
            live_dt = last["date"]
            # most recent weight snapshot recorded for the live date, if any
            w_last  = next((w for d, w in reversed(all_hist) if d == live_dt), None)
            if w_last is not None:
                meta = {"date": live_dt, "panel": panel_name,
                        "model": "mwu", "spec": spec_id}
                for econ, wt in w_last.items():
                    LIVE_WEIGHT_SNAPSHOTS.append(
                        {**meta, "economist": econ, "weight": float(wt)}
                    )

        # realized evaluation — copy so the new columns are written to an independent frame
        realised = oos_df.dropna(subset=["actual"]).copy()
        if realised.empty:
            continue
        realised["smart_err"]  = realised["smart"]  - realised["actual"]
        realised["median_err"] = realised["median"] - realised["actual"]
        realised["actual_dir"] = (realised["actual"] > realised["median"]).astype(int)

        n_obs      = len(realised)
        mse_smart  = (realised["smart_err"]**2).mean()
        mse_median = (realised["median_err"]**2).mean()
        hits       = realised["pred_dir"] == realised["actual_dir"]

        # Normal-approximation test on the squared-error differential.  With a
        # single observation the std is NaN and with identical losses it is 0;
        # neither supports a test, so report p = 1.0 instead of NaN.
        diff = realised["smart_err"]**2 - realised["median_err"]**2
        sd   = diff.std(ddof=1)
        if pd.isna(sd) or sd == 0:
            dm_p = 1.0
        else:
            dm_p = 2*(1 - st.norm.cdf(abs(diff.mean()/sd * math.sqrt(n_obs))))

        eval_rows.append({
            "spec_id":     spec_id,
            "panel":       panel_name,
            "eta":         eta,
            "obs":         n_obs,
            "RMSE_smart":  math.sqrt(mse_smart),
            "RMSE_median": math.sqrt(mse_median),
            "SmartBetter": int(mse_smart < mse_median),
            "HitRate":     hits.mean(),
            "Binom_p":     st.binomtest(hits.sum(), n_obs, .5).pvalue,
            "PT_p":        st.binomtest(
                               (realised["pred_dir"] & realised["actual_dir"]).sum(),
                               n_obs, .25).pvalue,
            "DM_p":        dm_p
        })

    return pd.DataFrame(eval_rows), pd.DataFrame(live_rows), oos_map

# ---------- run for both panels ------------------------------
# Panels are copied so the back-test cannot mutate the frames held in memory.
PANELS = {"COVID": df.copy(), "Full": df_full.copy()}

# scratch dicts used only by the legacy print helpers
mwu_eval_tbls = {}
mwu_live_tbls = {}
mwu_oos_tbls  = {}

for pname, pnl in PANELS.items():
    pnl.name = pname               # tags the frame; backtest_mwu() receives the name explicitly

    ev, lv, om = backtest_mwu(pname, pnl)

    mwu_eval_tbls[pname] = ev
    mwu_live_tbls[pname] = lv
    mwu_oos_tbls [pname] = om

    # push into the global containers, keyed model ➜ panel
    #    (ev and lv are DataFrames; om maps spec_id ➜ DataFrame)
    # NOTE(review): eval_tables, live_tables, oos_maps, actual_dir, _all_eval
    # and _all_live are expected to exist already — they are not created here.
    eval_tables.setdefault("mwu", {})[pname] = ev
    live_tables.setdefault("mwu", {})[pname] = lv
    oos_maps   .setdefault("mwu", {})[pname] = om

    # actual directions cache (for later diagnostics):
    # 1 where actual > median, taken from the first spec's realised OOS rows;
    # an empty int array when the panel produced no evaluation rows
    actual_dir.setdefault("mwu", {})[pname] = (
        np.array([], dtype=int) if ev.empty else
        (next(iter(om.values()))
            .dropna(subset=["actual"])
            .assign(flag=lambda d: (d["actual"] > d["median"]).astype(int))["flag"]
            .values)
    )

    _all_eval.append(ev)
    _all_live.append(lv)

def label_specs(tbl):
    """Return {spec_id → set(labels)}; a spec winning several titles keeps all of them.

    Titles: "Lowest RMSE" (min RMSE_smart), "Highest HitRate" (max HitRate)
    and, when any row has SmartBetter == 1 with DM_p and PT_p both below
    0.10, "Robust Winner" (min RMSE_smart among those rows).  An empty table
    yields an empty dict.
    """
    if tbl.empty:
        return {}

    # (frame to search, column, Series method picking the row, title awarded)
    contests = [
        (tbl, "RMSE_smart", "idxmin", "Lowest RMSE"),
        (tbl, "HitRate",    "idxmax", "Highest HitRate"),
    ]
    significant = tbl[(tbl["SmartBetter"]==1) & (tbl["DM_p"]<.10) & (tbl["PT_p"]<.10)]
    if not significant.empty:
        contests.append((significant, "RMSE_smart", "idxmin", "Robust Winner"))

    titles = defaultdict(set)
    for frame, column, picker, title in contests:
        row_label = getattr(frame[column], picker)()
        titles[frame.loc[row_label, "spec_id"]].add(title)
    return titles

# ---------- printing and stratified helpers ------------------------------
def print_backtests():
    """Print the legacy MWU back-test table and labelled specs for the COVID and Full panels.

    Reads the module-level ``mwu_eval_tbls`` mapping; a panel whose table is
    empty gets a one-line notice instead of a table.
    """
    summary_cols = ["spec_id","eta","obs","RMSE_smart","RMSE_median",
                    "SmartBetter","HitRate","Binom_p","PT_p","DM_p"]
    for panel_name in ("COVID", "Full"):
        summary = mwu_eval_tbls[panel_name]
        print(f"\n=== Back-test summary ({panel_name} panel) ===")
        if summary.empty:
            print("No realised observations.")
            continue
        print(summary[summary_cols].to_string(index=False))
        highlights = label_specs(summary)
        for spec, titles in highlights.items():
            print(f"  • {' & '.join(sorted(titles))}: {spec}")

def stratified_mwu(oos_df: pd.DataFrame) -> pd.DataFrame:
    """Break realised out-of-sample results down by the regimes in REGIMES.

    Rows without an actual are discarded; regimes with no remaining rows are
    omitted.  Each output row carries the regime label, observation count,
    RMSE of the smart and median forecasts, a SmartBetter flag, the
    directional hit rate and three p-values (Binom_p, PT_p, DM_p — the last
    from a paired t-test on squared errors).  Returns an empty frame when no
    row has an actual.
    """
    scored = oos_df.dropna(subset=["actual"]).copy()
    if scored.empty:
        return pd.DataFrame()

    scored["smart_err"]  = scored["smart"]  - scored["actual"]
    scored["median_err"] = scored["median"] - scored["actual"]
    # directions are measured relative to the median forecast
    scored["pred_dir"]   = (scored["smart"] > scored["median"]).astype(int)
    scored["actual_dir"] = (scored["actual"] > scored["median"]).astype(int)

    records = []
    for regime_label, (lo, hi) in REGIMES.items():
        in_regime = (scored["date"] >= lo) & (scored["date"] <= hi)
        chunk = scored[in_regime]
        if chunk.empty:
            continue

        n_obs     = len(chunk)
        sq_smart  = chunk["smart_err"]**2
        sq_median = chunk["median_err"]**2
        hits      = chunk["pred_dir"] == chunk["actual_dir"]
        both_up   = chunk["pred_dir"] & chunk["actual_dir"]

        records.append({
            "Regime"      : regime_label,
            "Obs"         : n_obs,
            "RMSE_smart"  : math.sqrt(sq_smart.mean()),
            "RMSE_median" : math.sqrt(sq_median.mean()),
            "SmartBetter" : int(sq_smart.mean() < sq_median.mean()),
            "HitRate"     : hits.mean(),
            "Binom_p"     : st.binomtest(hits.sum(), n_obs, .5).pvalue,
            "PT_p"        : st.binomtest(both_up.sum(), n_obs, .25).pvalue,
            "DM_p"        : st.ttest_rel(sq_smart, sq_median).pvalue
        })
    return pd.DataFrame(records)

def print_stratified():
    """Print stratified diagnostics for the legacy Full-panel spec with the highest HitRate.

    Does nothing when the Full-panel evaluation table is empty.
    """
    summary = mwu_eval_tbls["Full"]
    if summary.empty:
        return
    top_row = summary["HitRate"].idxmax()
    winner = summary.loc[top_row, "spec_id"]
    print(f"\n=== Stratified diagnostics (FULL • best spec {winner}) ===")
    diagnostics = stratified_mwu(mwu_oos_tbls["Full"][winner])
    print(diagnostics.to_string(index=False))

def print_live():
    """Print the pending (not yet realised) legacy MWU forecast for every labelled spec.

    Labelled specs are collected per panel from ``mwu_eval_tbls`` via
    ``label_specs``; the matching row is then looked up in ``mwu_live_tbls``.
    Panels without any live row are skipped silently.
    """
    combined = {}
    for p, tbl in mwu_eval_tbls.items():
        for sp, lbs in label_specs(tbl).items():
            combined.setdefault((p, sp), set()).update(lbs)
    if not combined:
        return

    print("\n================ CONSOLIDATED LIVE FORECASTS ================\n")
    for (p, sp), lbs in combined.items():
        lv = mwu_live_tbls[p]
        # When no release is pending, the live table is built from zero rows
        # and has no columns, so indexing "spec_id" would raise KeyError.
        if lv.empty or "spec_id" not in lv.columns:
            continue
        matches = lv[lv["spec_id"] == sp]
        if matches.empty:
            continue
        r = matches.iloc[-1]
        verdict = "Beat" if r["pred_dir"] else "Miss"
        print(f"--- {p} • {' & '.join(sorted(lbs))} ---")
        print(f"Date   : {r['date'].date()}")
        print(f"Smart  : {r['smart']:.1f} k")
        print(f"Median : {r['median']:.1f} k")
        print(f"Signal : {verdict}  ({sp})\n")

# console output: three-decimal floats, then back-test tables,
# stratified diagnostics and live forecasts, in that order
pd.set_option("display.float_format", "{:.3f}".format)
print_backtests()
print_stratified()
print_live()
