In [None]:
### Beta version: Let's listen to only the most accurate forecasters

# ──────────────────────────────────────────────────────────────
# Top-N forecaster ensemble • mean vs inverse-MSE weighting
# ──────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
import warnings
from tqdm.auto import tqdm
from scipy.stats import norm, binomtest

# Grid searched by evaluate_panel: look-back lengths (in release dates) and
# the maximum number of best-ranked forecasters kept in the ensemble.
WINDOWS = [3, 6, 12]
TOP_NS  = list(range(5, 30, 5))
# Small constant added to each MSE before inversion so a zero MSE cannot
# produce an infinite weight.
RIDGE   = 1e-6

# Blanket-silence Python warnings and NumPy floating-point error reports.
np.seterr(all="ignore")
warnings.filterwarnings("ignore")

# Panels to evaluate; both frames must already exist in the session.
PANELS = {
    "Full panel": df_full,
    "COVID-filtered panel": df,
}

def evaluate_panel(panel: pd.DataFrame, label: str) -> pd.DataFrame:
    """Score rolling Top-N forecaster ensembles against the crowd median.

    For every look-back length ``W`` in ``WINDOWS`` and every ``N`` in
    ``TOP_NS`` the function walks through the sorted release dates.  At each
    date it ranks the eligible economists by their mean squared ``error``
    over the previous ``W`` release dates, keeps the ``N`` best, and combines
    their forecasts for the current date in two ways: equal weights
    (``"mean"``) and normalised ``1 / (MSE + RIDGE)`` weights (``"inv_mse"``).

    Each weighting method is scored on its own rows, so every output row
    describes exactly one (window, top_N, method) combination.

    Parameters
    ----------
    panel : pd.DataFrame
        Long-format panel with the columns ``release_date``, ``economist``,
        ``forecast``, ``error`` and ``actual``.
    label : str
        Text shown as prefix of the progress bar.

    Returns
    -------
    pd.DataFrame
        One row per scored (window, top_N, method) with the columns
        ``window``, ``top_N``, ``method``, ``obs``, ``RMSE_smart``,
        ``RMSE_median``, ``HitRate``, ``Binom_p``, ``PT_stat``, ``PT_p`` and
        ``DM_p``.  When nothing could be scored the frame is empty but still
        carries these columns, so callers can sort or select on them.
    """
    out_cols = ["window", "top_N", "method", "obs",
                "RMSE_smart", "RMSE_median",
                "HitRate", "Binom_p", "PT_stat", "PT_p", "DM_p"]

    dates = np.sort(panel["release_date"].unique())
    metrics = []

    for W in tqdm(WINDOWS, desc=f"{label}: windows"):
        for N in TOP_NS:
            rows = []

            for idx in range(W, len(dates)):
                t = dates[idx]
                # Estimation window: the W release dates immediately before t
                # (``between`` is inclusive on both ends).
                hist = panel[panel["release_date"]
                             .between(dates[idx - W], dates[idx - 1])]

                # Eligible = no missing forecast among the economist's rows
                # in the window.
                # NOTE(review): this does not check that the economist has a
                # row for each of the W dates; it is only a full contiguity
                # test if the panel holds a (possibly NaN) row for every
                # economist/date pair -- confirm against the panel builder.
                elig = hist.groupby("economist")["forecast"].apply(lambda s: s.notna().all())
                econs = elig[elig].index
                if econs.empty:
                    continue

                mse = (hist[hist["economist"].isin(econs)]
                       .groupby("economist")["error"]
                       .apply(lambda s: np.mean(s**2)))
                top = mse.nsmallest(N).index          # up to N forecasters

                # Slice the target month once and reuse it below.
                month = panel[panel["release_date"] == t]
                cur = month[month["economist"].isin(top)]
                f_t = cur.set_index("economist")["forecast"].dropna()
                if f_t.empty:
                    continue

                weights = {
                    "mean": pd.Series(1 / len(f_t), index=f_t.index),
                    "inv_mse": 1 / (mse.loc[f_t.index] + RIDGE)
                }
                weights["inv_mse"] /= weights["inv_mse"].sum()

                median_all = month["forecast"].dropna().median()
                actual = month["actual"].iloc[0]

                for method, w in weights.items():
                    smart = np.dot(w, f_t)
                    pred_dir = int(smart > median_all)
                    actual_dir = int(actual > median_all) if pd.notna(actual) else np.nan
                    rows.append((W, len(f_t), method, t,
                                 smart, median_all, actual,
                                 pred_dir, actual_dir))

            if not rows:
                continue

            df_all = pd.DataFrame(rows, columns=[
                "window", "top_N", "method", "date",
                "smart", "median", "actual", "pred_dir", "actual_dir"
            ])
            # Only months with a realised value can be scored.
            scored = df_all.dropna(subset=["actual"])

            # Score each weighting scheme separately.  Pooling the two
            # methods would blend their errors, double-count observations and
            # attach an arbitrary method label to the result.
            for method, eval_df in scored.groupby("method", sort=False):
                # Work on a copy so the new columns are not written to a slice.
                eval_df = eval_df.copy()
                eval_df["smart_err"]  = eval_df["smart"]  - eval_df["actual"]
                eval_df["median_err"] = eval_df["median"] - eval_df["actual"]

                obs     = len(eval_df)
                rmse_s  = np.sqrt((eval_df["smart_err"]**2).mean())
                rmse_m  = np.sqrt((eval_df["median_err"]**2).mean())

                # Two-sided normal test on the mean squared-error
                # differential (no autocorrelation correction is applied).
                diff = eval_df["smart_err"]**2 - eval_df["median_err"]**2
                dm_p = 2 * (1 - norm.cdf(abs(diff.mean() / diff.std(ddof=1) *
                                             np.sqrt(obs))))

                # directional metrics
                hits     = (eval_df["pred_dir"] == eval_df["actual_dir"]).sum()
                hit_rate = hits / obs
                binom_p  = binomtest(hits, obs, 0.5).pvalue

                # Independence-style statistic: joint "both up" frequency
                # versus the product of the marginal "up" frequencies.
                p1, p2 = eval_df["pred_dir"].mean(), eval_df["actual_dir"].mean()
                joint  = ((eval_df["pred_dir"].astype(int) &
                           eval_df["actual_dir"].astype(int))).mean()
                pt_stat = (joint - p1 * p2) / np.sqrt(p1 * p2 * (1 - p1) * (1 - p2) / obs)
                pt_p    = 2 * (1 - norm.cdf(abs(pt_stat)))

                metrics.append({
                    "window": W, "top_N": N, "method": method, "obs": obs,
                    "RMSE_smart": rmse_s, "RMSE_median": rmse_m,
                    "HitRate": hit_rate, "Binom_p": binom_p,
                    "PT_stat": pt_stat, "PT_p": pt_p, "DM_p": dm_p
                })

    return pd.DataFrame(metrics, columns=out_cols)

# Print every float with three decimals in the tables below.
pd.set_option("display.float_format", "{:.3f}".format)

for name, pdf in PANELS.items():
    tbl = evaluate_panel(pdf, name)
    print(f"\n--- {name} : Top-N ensemble (mean vs inv_mse) ---")
    if tbl.empty:
        # An empty result may have no columns at all, in which case sorting
        # on "window"/"top_N"/"method" would raise KeyError.
        print("No window/top-N configuration could be scored.")
        continue
    tbl = (tbl.sort_values(["window", "top_N", "method"])
              .reset_index(drop=True))
    print(tbl.to_string(index=False))

### Beta version: Let's listen only to forecasters that reacted to the ADP print

# ──────────────────────────────────────────────────────────────
# “Fresh-update” ensemble:
# • keep only forecasts with |asof − release_date| ≤ 3 days
# • smart forecasts:   mean  &  median   of those “fresh” forecasts
# • three directional signals:
#       pred_dir_mean   (mean > crowd median)
#       pred_dir_med    (median > crowd median)
#       pred_dir_vote   (majority of fresh forecasters above crowd median)
# • evaluation on realised months
# ──────────────────────────────────────────────────────────────
import numpy as np, pandas as pd, warnings
from tqdm.auto import tqdm
from scipy.stats import norm, binomtest

# Blanket-silence Python warnings and NumPy floating-point error reports.
np.seterr(all="ignore")
warnings.filterwarnings("ignore")

# Panels to evaluate; both frames are assumed to be in memory already.
PANELS = {"Full panel": df_full, "COVID-filtered panel": df}

def evaluate_fresh(panel: pd.DataFrame, label: str, max_days: int = 3) -> pd.DataFrame:
    """Score an ensemble built only from recently updated ("fresh") forecasts.

    A forecast is fresh when the absolute gap between its ``asof`` date and
    the ``release_date`` is at most ``max_days`` days.  For every release
    date with at least one fresh, non-missing forecast the function computes
    the mean and the median of the fresh forecasts plus three directional
    signals relative to the median of all forecasts for that date ("crowd
    median"): mean above it, median above it, and a strict majority of fresh
    forecasters above it.  Only dates with a non-missing ``actual`` are
    scored.

    Parameters
    ----------
    panel : pd.DataFrame
        Long-format panel with the columns ``release_date``, ``asof``,
        ``forecast`` and ``actual``; the two date columns must be
        datetime-like because their difference is read through ``.dt.days``.
        The input frame is not modified.
    label : str
        Text shown next to the progress bar.
    max_days : int, default 3
        Largest allowed absolute distance, in days, between ``asof`` and
        ``release_date`` for a forecast to count as fresh.

    Returns
    -------
    pd.DataFrame
        A single-row frame with ``obs``, the three RMSEs, the two
        squared-error-differential p-values and, per directional signal, hit
        rate, binomial p-value and PT p-value.  A completely empty frame is
        returned when no date can be scored.
    """
    # Work on a copy so the helper columns never leak into the caller's frame.
    panel = panel.copy()
    panel["asof_delta"] = (panel["release_date"] - panel["asof"]).abs().dt.days
    panel["is_fresh"]   = panel["asof_delta"] <= max_days

    dates = np.sort(panel["release_date"].unique())
    rows  = []

    for t in tqdm(dates, desc=label):
        month_all = panel[panel["release_date"] == t]
        # Missing forecasts are dropped up front: a NaN compares as "not
        # above the median" and would otherwise count as a down-vote.
        fresh_fc = month_all.loc[month_all["is_fresh"], "forecast"].dropna()

        if fresh_fc.empty:           # no usable fresh forecast this month
            continue

        crowd_med  = month_all["forecast"].dropna().median()
        smart_mean = fresh_fc.mean()
        smart_med  = fresh_fc.median()

        # individual vote: share of fresh forecasters above the crowd median
        pred_dir_vote = int((fresh_fc > crowd_med).mean() > 0.5)   # strict majority

        # level-based directions
        pred_dir_mean = int(smart_mean > crowd_med)
        pred_dir_med  = int(smart_med  > crowd_med)

        actual = month_all["actual"].iloc[0]

        rows.append((t, smart_mean, smart_med, crowd_med, actual,
                     pred_dir_mean, pred_dir_med, pred_dir_vote))

    cols = ["date", "smart_mean", "smart_med", "crowd_median", "actual",
            "dir_mean", "dir_med", "dir_vote"]
    monthly = pd.DataFrame(rows, columns=cols)

    # Copy so the error columns below are not assigned onto a slice.
    eval_df = monthly.dropna(subset=["actual"]).copy()
    if eval_df.empty:
        return pd.DataFrame()       # nothing to score

    n_obs = len(eval_df)

    # errors versus actual
    eval_df["err_mean"]  = eval_df["smart_mean"]   - eval_df["actual"]
    eval_df["err_med"]   = eval_df["smart_med"]    - eval_df["actual"]
    eval_df["err_crowd"] = eval_df["crowd_median"] - eval_df["actual"]

    # Realised direction (1 = actual above the crowd median), computed once.
    actual_up = (eval_df["actual"] > eval_df["crowd_median"]).astype(int)

    def dir_metrics(flag_col):
        """Return (hit_rate, binom_p, pt_stat, pt_p) for one 0/1 signal column."""
        flags = eval_df[flag_col].astype(int)
        hits = (flags == actual_up).sum()
        hit_rate = hits / n_obs
        binom_p  = binomtest(hits, n_obs, 0.5).pvalue
        # Joint "both up" frequency versus the product of the marginals.
        p1, p2 = flags.mean(), actual_up.mean()
        joint  = (flags & actual_up).mean()
        pt_stat = (joint - p1*p2) / np.sqrt(p1*p2*(1-p1)*(1-p2)/n_obs)
        pt_p    = 2*(1 - norm.cdf(abs(pt_stat)))
        return hit_rate, binom_p, pt_stat, pt_p

    def dm_pvalue(err_col):
        """Two-sided normal p-value for the squared-error gap versus the crowd median."""
        gap = eval_df[err_col]**2 - eval_df["err_crowd"]**2
        return 2*(1 - norm.cdf(abs(gap.mean()/gap.std(ddof=1) * np.sqrt(n_obs))))

    rmse_mean  = np.sqrt((eval_df["err_mean"]**2).mean())
    rmse_med   = np.sqrt((eval_df["err_med"]**2).mean())
    rmse_crowd = np.sqrt((eval_df["err_crowd"]**2).mean())

    # Diebold-Mariano style comparisons against the crowd median
    dm_p_mean = dm_pvalue("err_mean")
    dm_p_med  = dm_pvalue("err_med")

    # directional stats (the raw PT statistic is not reported)
    hit_mn, bin_mn, _, ptp_mn = dir_metrics("dir_mean")
    hit_md, bin_md, _, ptp_md = dir_metrics("dir_med")
    hit_vt, bin_vt, _, ptp_vt = dir_metrics("dir_vote")

    return pd.DataFrame([{
        "obs": n_obs,
        "RMSE_mean":   rmse_mean,
        "RMSE_medianSmart": rmse_med,
        "RMSE_crowdMedian": rmse_crowd,
        "DM_p_mean":   dm_p_mean,
        "DM_p_medianSmart": dm_p_med,
        # directional metrics
        "HitRate_mean": hit_mn,   "Binom_p_mean": bin_mn, "PT_p_mean": ptp_mn,
        "HitRate_med":  hit_md,   "Binom_p_med":  bin_md, "PT_p_med":  ptp_md,
        "HitRate_vote": hit_vt,   "Binom_p_vote": bin_vt, "PT_p_vote": ptp_vt
    }])

# ---------- evaluate each panel and print its summary ----------
pd.set_option("display.float_format", "{:.3f}".format)

for name, pdf in PANELS.items():
    tbl = evaluate_fresh(pdf, name)
    print(f"\n--- {name} : 3-day fresh-update ensemble ---")
    # An empty frame means no month could be scored for this panel.
    if not tbl.empty:
        print(tbl.to_string(index=False))
        continue
    print("No months qualified (no fresh forecasts found).")


## 0: Baseline static forecast on full sample

Rolling, fixed-length 6-month window. An economist is *valid* for a prediction at time *t* if they have a contiguous forecast history over the previous six releases; each valid economist's prediction is weighted by the inverse of their MSE over that window.


This implements an out-of-sample error estimate with a rolling 6-month estimation window. The weights use no information from the target month, and the actual value at month *t* is unseen when the forecast is made. In other words, all errors are "live" errors that could have been observed in real time.

Briefly, the procedure:
1. Start at the 7th release, so that six prior releases are available.
2. From the estimation window, keep the economists who supplied a forecast for all six months (the contiguity rule).
3. Compute each economist's MSE using errors against already-known actuals (no look-ahead).
4. Generate the forecast for release *t*.
5. Store the out-of-sample (OOS) evaluation error.
6. Roll the window forward one month and repeat.