# Forecasting Consensus Expectations: Nonfarm Payrolls (NFP) 

## Data preprocessing

**Imports**

In [48]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

from tqdm.auto import tqdm
from scipy import stats, special
from scipy.optimize import brentq
from collections import defaultdict
from itertools import product
from scipy.stats import t as student_t, norm, binomtest, jarque_bera

**Load & Preprocess**

In [49]:
HIST_PATH = "../raw/nfp_historical.xlsx"
JUNE_PATH = "../raw/nfp_june.xlsx"
JULY_PATH  = "../raw/nfp_july.xlsx"

hist_raw = pd.read_excel(HIST_PATH,
                         sheet_name=0,
                         header=None,
                         engine='openpyxl')

In [50]:
hist_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,727,728,729,730,731,732,733,734,735,736
0,NFP TCH,,,,,,,,,,...,,,,,,,,,,
1,Summary,,Median Survey,Actual,Economist,,Peter Dixon,,Sonia Meskin,,...,,James A Shugg,,John McAuley,,Nash Peyton,,Stephanie Roth,,Crandall/Jordan
2,Release Date,Observation Date,,,Firm,As of,,As of,,As of,...,As of,Westpac Banking Corp,As of,Wilkinson Boyd Captl Mrkts Inc,As of,Wilmington Trust Company,As of,Wolfe Research LLC,As of,Wrightson ICAP LLC
3,1997-01-10 00:00:00,1996-12-31 00:00:00,--,262,,--,--,--,--,--,...,--,--,--,--,--,--,--,--,--,--
4,1997-02-07 00:00:00,1997-01-31 00:00:00,--,271,,--,--,--,--,--,...,--,--,--,--,--,--,--,--,--,--


In [51]:
# Raw data is messy. Want to process this into a long table format for our smart NFP forecasting (where each row is a forecast by a single economist/sell-side institution)

name_row = hist_raw.iloc[1]
firm_row = hist_raw.iloc[2]

In [52]:
base = hist_raw.iloc[3:, :3].copy()
base.columns = ["release_date", "period", "actual"]
base["release_date"] = pd.to_datetime(base["release_date"])
base["period"] = pd.to_datetime(base["period"])
base["actual"] = pd.to_numeric(base["actual"], errors="coerce")

In [53]:
# numeric forecast cols
econ_cols = [idx for idx, val in name_row.items()
             if isinstance(val,str) and val not in {'Summary', 'Actual', 'Economist'}
]

In [54]:
# collect long dataframes, one per economist 

ASOF_FMT = "%m/%d/%Y"

long_frames = []

for col in econ_cols:
    asof_col = col - 1
    
    tmp = base.copy()
    
    tmp["economist"] = name_row[col]
    tmp["firm"]      = firm_row[col]
    
    tmp["forecast"]  = pd.to_numeric(hist_raw.iloc[3:, col], errors="coerce")
    
    tmp["asof"]      = pd.to_datetime(hist_raw.iloc[3:, asof_col], format=ASOF_FMT, errors="coerce")
    
    long_frames.append(tmp)
    
hist_long = pd.concat(long_frames, ignore_index=True)

In [55]:
hist_long

Unnamed: 0,release_date,period,actual,economist,firm,forecast,asof
0,1997-01-10,1996-12-31,,Median Survey,,,1996-12-31
1,1997-02-07,1997-01-31,,Median Survey,,,1997-01-31
2,1997-03-07,1997-02-28,,Median Survey,,,1997-02-28
3,1997-04-04,1997-03-31,,Median Survey,,,1997-03-31
4,1997-05-02,1997-04-30,,Median Survey,,,1997-04-30
...,...,...,...,...,...,...,...
111107,2025-04-04,2025-03-31,140.0,Crandall/Jordan,Wrightson ICAP LLC,80.0,2025-03-14
111108,2025-05-02,2025-04-30,137.5,Crandall/Jordan,Wrightson ICAP LLC,60.0,2025-04-11
111109,2025-06-06,2025-05-31,126.0,Crandall/Jordan,Wrightson ICAP LLC,150.0,2025-05-16
111110,2025-07-03,2025-06-30,106.0,Crandall/Jordan,Wrightson ICAP LLC,110.0,2025-06-12


In [56]:
# Helper to parse new releases
def parse_one_release(raw_path: str,
                      release_date: str,
                      period_start: str) -> pd.DataFrame:
    """
    Convert a Bloomberg NFP release.xls file to the long format
    used in df_full (one row = one economist forecast).
    """
    raw = pd.read_excel(raw_path,
                        sheet_name=0,
                        header=None,
                        engine='openpyxl')     

    # identify rows that hold the forecaster block
    forecast_col = pd.to_numeric(raw[3], errors='coerce')
    mask = raw[0].isna() & forecast_col.notna()

    block = (raw.loc[mask, [1, 2, 3, 4, 5]]
                .rename(columns={1: "economist",
                                 2: "firm",
                                 3: "forecast",
                                 4: "asof",
                                 5: "rank"}))

    block["forecast"] = forecast_col[mask]
    block["asof"] = pd.to_datetime(block["asof"], errors='coerce')

    # if economist name is blank, fall back to firm in parentheses
    block["economist"] = block.apply(
        lambda r: r["economist"] if pd.notna(r["economist"])
        else f"({r['firm']})",
        axis=1
    )

    # pull the actual NFP print from the same sheet
    actual_mask = raw.apply(
        lambda r: r.astype(str).str.contains("Actual", case=False).any(),
        axis=1
    )
    actual_candidates = (
    pd.to_numeric(raw.loc[actual_mask].stack(), errors="coerce")
      .dropna()
    )
    
    rel_date = pd.to_datetime(release_date)          # string → Timestamp
    if rel_date > pd.Timestamp.today().normalize():
        actual_val = np.nan                          # unreleased month
    else:
        actual_val = (actual_candidates.iloc[0]
                    if not actual_candidates.empty
                    else np.nan)

    # final tidy frame 
    tidy = (block.assign(release_date=pd.to_datetime(release_date),
                         period      =pd.to_datetime(period_start),
                         actual      =actual_val)
                  .loc[:, ["release_date","period","economist","firm",
                            "forecast","actual","asof"]])

    return tidy


In [57]:
# june_long = parse_one_release(JUNE_PATH,
#                               release_date="2025-06-06",
#                               period_start="2025-05-01")

# july_long = parse_one_release(JULY_PATH,
#                               release_date="2025-07-03",   
#                               period_start="2025-06-01")

In [58]:
# Stack panels  
# nfp_long = pd.concat([hist_long, june_long, july_long],
#                      ignore_index=True, sort=False)

nfp_long = hist_long


# Compute forecast error 
nfp_long["error"] = nfp_long["forecast"] - nfp_long["actual"]

# Per release, we only want the last forecast 
econ_forecasts_long = (
    nfp_long
    .sort_values(["release_date", "economist", "asof"])
    .groupby(["release_date", "economist"], as_index=False)
    .tail(1)
    .reset_index(drop=True)
)

In [59]:
df_full = econ_forecasts_long.copy()
df_full.head()

Unnamed: 0,release_date,period,actual,economist,firm,forecast,asof,error
0,1997-01-10,1996-12-31,,Adam Chester,Lloyds Bank PLC,,NaT,
1,1997-01-10,1996-12-31,,Alessandro Truppia,Aletti Gestielle Sgr Spa,,NaT,
2,1997-01-10,1996-12-31,,Alexandre De Azara,Banco UBS SA,,NaT,
3,1997-01-10,1996-12-31,,Alison Lynn Reaser,Point Loma Nazarene University,,NaT,
4,1997-01-10,1996-12-31,,Allan Von Mehren,Danske Bank AS,,NaT,


In [60]:
# Quick sanity checks

# should only have one forecast per economist, per release
assert df_full.duplicated(subset=["release_date", "economist"]).sum() == 0,\
    "Duplicate (release date, economist) combinations detected"

# forecast timestamp has to be before release date. caveat: NA values will make the assertion fail, so we account NAs.
valid_asof = df_full["asof"].notna()
assert (df_full.loc[valid_asof, "asof"] <= df_full.loc[valid_asof, "release_date"]).all(),\
    "Some forecasts have asof (non-NA) after release date."

# each release date should have only one realized (actual) value, or 0 (if actual is NaN, current print not yet released) 
assert (df_full.groupby("release_date")["actual"].nunique() <= 1).all(), \
    "Multiple actual values found for the same release date"
    

# Note (economist, release_date) is not unique as some banks have multiple economist forecasts for a single release

In [61]:
# get median

median_by_release = (
    df_full.groupby("release_date")["forecast"]
    .median()
    .rename("median_forecast")
)

df_full = df_full.merge(median_by_release, on="release_date", how="left")

In [62]:
df_full.head()

Unnamed: 0,release_date,period,actual,economist,firm,forecast,asof,error,median_forecast
0,1997-01-10,1996-12-31,,Adam Chester,Lloyds Bank PLC,,NaT,,
1,1997-01-10,1996-12-31,,Alessandro Truppia,Aletti Gestielle Sgr Spa,,NaT,,
2,1997-01-10,1996-12-31,,Alexandre De Azara,Banco UBS SA,,NaT,,
3,1997-01-10,1996-12-31,,Alison Lynn Reaser,Point Loma Nazarene University,,NaT,,
4,1997-01-10,1996-12-31,,Allan Von Mehren,Danske Bank AS,,NaT,,


In [63]:
# Sanity output 
print("NAs: ")
print(df_full.isna().sum(), "\n")
print("Rows: ", econ_forecasts_long.shape[0])
print("Economists: ", econ_forecasts_long['economist'].nunique())
print("NFP releases: ", econ_forecasts_long['release_date'].nunique())

NAs: 
release_date           0
period                 0
actual              5168
economist              0
firm                1032
forecast           89046
asof               89030
error              89046
median_forecast     5168
dtype: int64 

Rows:  111112
Economists:  323
NFP releases:  344


**Filter COVID, define directional label**

In [67]:
# Surprise = actual minus consensus (median forecast) for directional forecasting
df_full["surprise"] = df_full["actual"] - df_full["median_forecast"]

# Sort dates
df_full["release_date"] = pd.to_datetime(df_full["release_date"])
df_full = df_full.sort_values("release_date")

# Begin in April 2003 (first month fully under NAICS + birth-death)
df_full = df_full[df_full["release_date"] >= "2003-06-01"]

# Filter out COVID dates (avoid pandemic volatility in df)
df = df_full[~df_full["release_date"].between("2020-01-01", "2022-12-31")]
df.head()

Unnamed: 0,release_date,period,actual,economist,firm,forecast,asof,error,median_forecast,surprise
25089,2003-06-06,2003-05-31,-30.0,Nathaniel Karp,BBVA USA,,NaT,,-30.0,0.0
24888,2003-06-06,2003-05-31,-30.0,Avery Shenfeld,Canadian Imperial Bank of Commerce,-20.0,2003-05-30,10.0,-30.0,0.0
24878,2003-06-06,2003-05-31,-30.0,Andrew Gretzinger,Manulife Asset Management Limited,,NaT,,-30.0,0.0
24871,2003-06-06,2003-05-31,-30.0,Adam Chester,Lloyds Bank PLC,,NaT,,-30.0,0.0
24872,2003-06-06,2003-05-31,-30.0,Alessandro Truppia,Aletti Gestielle Sgr Spa,-15.0,2003-06-04,15.0,-30.0,0.0


In [69]:
sum(df["actual"] == df["median_forecast"])

58463

TypeError: 'method' object is not iterable

In [68]:
import os

# ------------------------------------------------------------------
# export 2008‑09 vintage to Excel
# ------------------------------------------------------------------
out_dir = "../out"
os.makedirs(out_dir, exist_ok=True)

gfc_mask = (df["release_date"] >= "2008-01-01") & (df["release_date"] <= "2009-12-31")
df.loc[gfc_mask].to_excel(os.path.join(out_dir, "old_gfc.xlsx"),
                          index=False, engine="openpyxl")  # engine optional in 2.1+
print("✓ Saved to", os.path.join(out_dir, "old_gfc.xlsx"))


PermissionError: [Errno 13] Permission denied: '../out\\old_gfc.xlsx'

In [None]:
# OUT_DIR = "../out"         
# DF_FILE       = "nfp_df.parquet"
# DF_FULL_FILE  = "nfp_df_full.parquet"

# # ensure directory exists 
# os.makedirs(OUT_DIR, exist_ok=True)

# df.to_parquet(os.path.join(OUT_DIR, DF_FILE),  engine="pyarrow", index=False)
# df_full.to_parquet(os.path.join(OUT_DIR, DF_FULL_FILE), engine="pyarrow", index=False)

# # write
# print(f"✔️  Saved df  ➜  {OUT_DIR}/{DF_FILE}")
# print(f"✔️  Saved df_full ➜  {OUT_DIR}/{DF_FULL_FILE}")