# Forecasting Consensus Expectations: Continuing Claims

## Data preprocessing

**Imports**

In [12]:
import os
import warnings
from collections import defaultdict
from glob import glob
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy import stats, special
from scipy.optimize import brentq
from scipy.stats import t as student_t, norm, binomtest, jarque_bera
from tqdm.auto import tqdm

**Helper functions**

In [13]:
def _is_summary_header(header: str, skip_tokens: set[str]) -> bool:
    """Return True when any whole word of ``header`` is one of ``skip_tokens``."""
    # Split on every non-letter so "Survey Median" -> ["survey", "median"],
    # while a surname such as "Lowell" stays one word and is not mistaken
    # for the "low" summary column (plain substring matching would drop it).
    words = "".join(ch if ch.isalpha() else " " for ch in header.lower()).split()
    return not skip_tokens.isdisjoint(words)


def load_hist_claims(path: str) -> pd.DataFrame:
    """
    Loads and pivots Bloomberg's historical continuing claims data to a long dataframe format (one row per economist forecast).

    The sheet is read without a header. Row 1 is used as the economist-name
    row, row 2 as the firm row, and data starts at row 3. Column 0 is parsed
    as the release date and column 1 as the period. Each economist column's
    as-of date is taken from the column immediately to its left.

    Parameters
    ----------
    path : str
        Path to the Excel workbook.

    Returns
    -------
    pd.DataFrame
        Columns: release_date, period, actual, median_forecast, economist,
        firm, forecast, asof. Rows without a numeric forecast are dropped.
        An empty frame with these columns is returned when the sheet has no
        economist columns.

    Raises
    ------
    ValueError
        If the name row has no header containing "Actual" or "Median".
    """
    raw = pd.read_excel(path, header=None, engine="openpyxl")
    name_row, firm_row = raw.iloc[1], raw.iloc[2]

    def _find_summary_col(label: str):
        hits = name_row.str.contains(label, case=False, na=False)
        # idxmax() on an all-False mask would silently return the first
        # column, so a missing header has to be reported explicitly.
        if not hits.any():
            raise ValueError(
                f"No column header containing '{label}' found in {path!r}"
            )
        return hits.idxmax()

    # find summary columns once
    col_actual = _find_summary_col("Actual")
    col_median = _find_summary_col("Median")

    base = pd.DataFrame(
        {"release_date":    pd.to_datetime(raw.iloc[3:, 0]),
         "period":          pd.to_datetime(raw.iloc[3:, 1]),
         "actual":          pd.to_numeric(raw.iloc[3:, col_actual], errors="coerce"),
         "median_forecast": pd.to_numeric(raw.iloc[3:, col_median], errors="coerce")}
    )

    skip_tokens = {"median", "average", "survey", "high",
                   "low", "previous", "prior", "actual"}
    frames = []
    ASOF_FMT = "%m/%d/%Y"

    for col in range(2, raw.shape[1]):
        hdr = name_row[col]
        # non-string headers (e.g. empty cells) and summary columns are not economists
        if not isinstance(hdr, str) or _is_summary_header(hdr, skip_tokens):
            continue

        tmp = base.copy()
        tmp["economist"] = hdr
        tmp["firm"]      = firm_row[col]
        tmp["forecast"]  = pd.to_numeric(raw.iloc[3:, col], errors="coerce")
        # as-of date sits in the column directly left of the forecast column;
        # values that do not parse with ASOF_FMT become NaT
        tmp["asof"]      = pd.to_datetime(raw.iloc[3:, col-1],
                                          format=ASOF_FMT, errors="coerce")
        frames.append(tmp)

    if not frames:
        # pd.concat raises on an empty list; return an empty, correctly-shaped table
        return pd.DataFrame(
            columns=[*base.columns, "economist", "firm", "forecast", "asof"]
        )

    return (pd.concat(frames, ignore_index=True)
              .dropna(subset=["forecast"]))


**TODO:** verify that the single-release logic below works with continuing claims data (it was originally written for NFP).

In [14]:
def load_single_release(raw_path: str,
                        release_date: str,
                        period_start: str) -> pd.DataFrame:
    """
    Converts a single Bloomberg continuing claims release into the long table layout for merge.

    The first sheet is read without a header. A row counts as a forecaster
    row when column 0 is empty and column 3 holds a numeric value; columns
    1-5 of such rows are read as economist, firm, forecast, as-of date and
    rank. The realized value is the first numeric cell found in any row that
    contains the text "Actual" (case-insensitive).

    Parameters
    ----------
    raw_path : str
        Path to the Excel workbook of the single release.
    release_date : str
        Release date; parsed with ``pd.to_datetime``.
    period_start : str
        Reference period; parsed with ``pd.to_datetime``.

    Returns
    -------
    pd.DataFrame
        Columns: release_date, period, economist, firm, forecast, actual,
        asof. ``actual`` is NaN when no numeric value is found on an "Actual"
        row or when ``release_date`` lies in the future. There is no
        ``median_forecast`` column in this table.
    """
    raw = pd.read_excel(raw_path, sheet_name=0, header=None,
                        engine="openpyxl")

    # --- forecaster block ------------------------------------------------
    fcst_num = pd.to_numeric(raw[3], errors="coerce")
    mask = raw[0].isna() & fcst_num.notna()

    block = (raw.loc[mask, [1, 2, 3, 4, 5]]
               .rename(columns={1: "economist",
                                2: "firm",
                                3: "forecast",
                                4: "asof",
                                5: "rank"}))

    block["forecast"] = fcst_num[mask]
    block["asof"]     = pd.to_datetime(block["asof"], errors="coerce")

    # fallback: if economist name missing, use firm in parentheses
    # (vectorised so that an empty forecaster block is handled as well)
    firm_label = "(" + block["firm"].astype(str) + ")"
    block["economist"] = block["economist"].where(
        block["economist"].notna(), firm_label
    )

    # --- actual print ----------------------------------------------------
    actual_mask = (raw.astype(str)
                      .apply(lambda col: col.str.contains("Actual", case=False))
                      .any(axis=1))

    actual_val = np.nan
    if actual_mask.any():
        # flatten the matching rows in row-major order and take the first
        # numeric cell; stay at NaN when the row carries no number at all
        cells = pd.Series(raw.loc[actual_mask].to_numpy(dtype=object).ravel(),
                          dtype=object)
        numeric_cells = pd.to_numeric(cells, errors="coerce").dropna()
        if not numeric_cells.empty:
            actual_val = numeric_cells.iloc[0]

    rel_date = pd.to_datetime(release_date)
    if rel_date > pd.Timestamp.today().normalize():
        actual_val = np.nan       # future release – actual not known yet

    # --- tidy output -----------------------------------------------------
    tidy = (block.assign(release_date=rel_date,
                         period=pd.to_datetime(period_start),
                         actual=actual_val)
                 .loc[:, ["release_date", "period", "economist",
                          "firm", "forecast", "actual", "asof"]])

    return tidy

In [15]:
def build_claims_long(hist_path: str,
                   singles: list[tuple[str, str, str]] | None = None
) -> pd.DataFrame:
    """
    Simple helper for concatenating single releases with historical claims data.

    Parameters
    ----------
    hist_path : str
        Path passed to ``load_hist_claims``.
    singles : list of (file_path, release_date, period_start), optional
        Extra releases; each tuple is passed to ``load_single_release``.
        Entries whose file does not exist are skipped with a warning.

    Returns
    -------
    pd.DataFrame
        The historical table, followed by the rows of every single release
        that could be loaded. When nothing is appended, the historical table
        is returned as loaded (its index is not reset).
    """
    frames = [load_hist_claims(hist_path)]

    for fp, rdate, pstart in singles or []:
        if not os.path.exists(fp):
            # a mistyped path would otherwise drop a release without a trace
            warnings.warn(f"Single-release file not found, skipping: {fp}",
                          stacklevel=2)
            continue
        frames.append(load_single_release(fp, rdate, pstart))

    if len(frames) == 1:
        return frames[0]

    # one concat at the end instead of re-copying the growing table per file
    return pd.concat(frames, ignore_index=True)

**Build continuing claims long dataframe + run checks**

In [16]:
# Example single-release tuple format: (file_path, release_date, period_start)
# SINGLE_RELEASES: list[tuple[str, str, str]] = [
#     # ("../raw/nfp_aug25.xlsx", "2025-09-05", "2025-08-01"),
# ]

In [17]:
# Historical workbook that feeds the long table
HIST_PATH = "../raw/ContinuingClaims_historical.xlsx"

# Optional one-off releases to append; leave empty when there are none
SINGLE_RELEASES = []

claims_long = build_claims_long(hist_path=HIST_PATH, singles=SINGLE_RELEASES)
claims_long.head()

Unnamed: 0,release_date,period,actual,median_forecast,economist,firm,forecast,asof
1247,2003-07-10,2003-06-29,3818.0,3732.5,David H Sloan,4CAST/Continuum Economics,3730.0,2003-07-03
1369,2005-11-10,2005-10-29,2818.0,2792.0,David H Sloan,4CAST/Continuum Economics,2760.0,2005-11-08
2126,2020-05-14,2020-05-02,22833.0,25120.0,David H Sloan,4CAST/Continuum Economics,26000.0,2020-05-08
2492,2004-05-27,2004-05-15,2948.0,2922.5,Michael R Englund,Action Economics LLC,2930.0,2004-05-26
2493,2004-06-03,2004-05-22,3003.0,2925.0,Michael R Englund,Action Economics LLC,2940.0,2004-05-28


In [18]:
# Keep one row per (release_date, economist): after sorting by as-of date the
# last row of each group is retained. Then add two derived columns:
#   surprise = actual - median_forecast
#   error    = forecast - actual
claims_last = (
    claims_long
    .sort_values(by=["release_date", "economist", "asof"])
    .groupby(["release_date", "economist"], as_index=False)
    .tail(1)
    .reset_index(drop=True)
    .assign(
        surprise=lambda d: d["actual"] - d["median_forecast"],
        error=lambda d: d["forecast"] - d["actual"],
    )
)

claims_last.head()

Unnamed: 0,release_date,period,actual,median_forecast,economist,firm,forecast,asof,surprise,error
0,2002-08-08,2002-07-27,3532.0,3480.0,Bill Sharp,JPMorgan Chase & Co,3475.0,2002-08-06,52.0,-57.0
1,2002-08-08,2002-07-27,3532.0,3480.0,Ethan S Harris,BofA Securities Inc,3480.0,2002-08-02,52.0,-52.0
2,2002-08-08,2002-07-27,3532.0,3480.0,Maxwell Clarke,Idea Global,3519.0,2002-08-08,52.0,-13.0
3,2002-08-15,2002-08-03,3576.0,,Ethan S Harris,BofA Securities Inc,3500.0,2002-08-09,,-76.0
4,2002-08-15,2002-08-03,3576.0,,Maxwell Clarke,Idea Global,3540.0,2002-08-12,,-36.0


In [19]:
# --- sanity checks ---

# every (release_date, economist) pair may appear only once
assert not claims_last.duplicated(["release_date", "economist"]).any()

# a forecast's as-of date must be on or before the release date
# (rows without an as-of date are left out of this check)
valid_asof = claims_last["asof"].notna()
assert (claims_last.loc[valid_asof, "asof"]
        .le(claims_last.loc[valid_asof, "release_date"])
        .all())

# at most one distinct realized print per release date
assert claims_last.groupby("release_date")["actual"].nunique().le(1).all()

In [20]:
# Full sample: releases on or after 2002-04-01
# (original note: "beginning of moving seasonal factors" — TODO confirm the rationale for this date)
df_full = claims_last.loc[lambda d: d["release_date"] >= "2002-04-01"]

# Same sample with every release dated 2020-01-01 through 2022-12-31
# (both ends inclusive) removed, i.e. the COVID window is excluded
df = df_full.loc[
    lambda d: ~d["release_date"].between("2020-01-01", "2022-12-31")
]

In [21]:
# preview of the full sample (releases on or after 2002-04-01)
df_full.head()

Unnamed: 0,release_date,period,actual,median_forecast,economist,firm,forecast,asof,surprise,error
0,2002-08-08,2002-07-27,3532.0,3480.0,Bill Sharp,JPMorgan Chase & Co,3475.0,2002-08-06,52.0,-57.0
1,2002-08-08,2002-07-27,3532.0,3480.0,Ethan S Harris,BofA Securities Inc,3480.0,2002-08-02,52.0,-52.0
2,2002-08-08,2002-07-27,3532.0,3480.0,Maxwell Clarke,Idea Global,3519.0,2002-08-08,52.0,-13.0
3,2002-08-15,2002-08-03,3576.0,,Ethan S Harris,BofA Securities Inc,3500.0,2002-08-09,,-76.0
4,2002-08-15,2002-08-03,3576.0,,Maxwell Clarke,Idea Global,3540.0,2002-08-12,,-36.0


In [22]:
# preview of the sample with the 2020-01-01 to 2022-12-31 releases removed
df.head()

Unnamed: 0,release_date,period,actual,median_forecast,economist,firm,forecast,asof,surprise,error
0,2002-08-08,2002-07-27,3532.0,3480.0,Bill Sharp,JPMorgan Chase & Co,3475.0,2002-08-06,52.0,-57.0
1,2002-08-08,2002-07-27,3532.0,3480.0,Ethan S Harris,BofA Securities Inc,3480.0,2002-08-02,52.0,-52.0
2,2002-08-08,2002-07-27,3532.0,3480.0,Maxwell Clarke,Idea Global,3519.0,2002-08-08,52.0,-13.0
3,2002-08-15,2002-08-03,3576.0,,Ethan S Harris,BofA Securities Inc,3500.0,2002-08-09,,-76.0
4,2002-08-15,2002-08-03,3576.0,,Maxwell Clarke,Idea Global,3540.0,2002-08-12,,-36.0


**Export**

In [23]:
# Output location and file names
OUT_DIR = "../out"
DF_FILE = "contClaims_df.parquet"            # sample without the 2020–2022 releases
DF_FULL_FILE = "contClaims_df_full.parquet"  # full history

os.makedirs(OUT_DIR, exist_ok=True)

# write both tables first, then report, so nothing is printed if a write fails
for _frame, _fname in ((df, DF_FILE), (df_full, DF_FULL_FILE)):
    _frame.to_parquet(os.path.join(OUT_DIR, _fname),
                      engine="pyarrow", index=False)

print(f"Saved clean  ➜  {OUT_DIR}/{DF_FILE}")
print(f"Saved full   ➜  {OUT_DIR}/{DF_FULL_FILE}")

Saved clean  ➜  ../out/contClaims_df.parquet
Saved full   ➜  ../out/contClaims_df_full.parquet
