# Forecasting Consensus Expectations: Nonfarm Payrolls (NFP) 

## Data preprocessing

**Imports**

In [63]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

from tqdm.auto import tqdm
from scipy import stats, special
from scipy.optimize import brentq
from collections import defaultdict
from itertools import product
from scipy.stats import t as student_t, norm, binomtest, jarque_bera

**Load & Preprocess**

In [64]:
# Input files. They carry an .xls extension but are parsed with pd.read_html.
HIST_PATH, JUNE_PATH, JULY_PATH = (
    "../raw/nfp_historical.xls",
    "../raw/nfp_june.xls",
    "../raw/nfp_july.xls",
)

# read_html returns a list of tables; the first one is the sheet we want.
# (JULY_PATH is only consumed later, by parse_one_release.)
hist_raw, june_raw = (pd.read_html(path)[0] for path in (HIST_PATH, JUNE_PATH))

In [65]:
# Peek at the historical sheet exactly as read_html parsed it (no cleaning yet)
hist_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,712,713,714,715,716,717,718,719,720,721
0,NFP TCH,,,,,,,,,,...,,,,,,,,,,
1,Summary,,Actual,Economist,,Peter Dixon,,Sonia Meskin,,Nikola Mirtchev,...,,James A Shugg,,John McAuley,,Nash Peyton,,Stephanie Roth,,Crandall/Jordan
2,Release Date,Observation Date,,Firm,As of,,As of,,As of,3d Currency Management Limited,...,As of,Westpac Banking Corp,As of,Wilkinson Boyd Captl Mrkts Inc,As of,Wilmington Trust Company,As of,Wolfe Research LLC,As of,Wrightson ICAP LLC
3,02/04/2000,01/31/2000,387,,--,--,--,--,--,--,...,--,--,02/04/2000,200,--,--,--,--,02/04/2000,225
4,03/03/2000,02/29/2000,43,,--,--,--,--,--,--,...,--,--,02/29/2000,200,--,--,--,--,03/03/2000,180


In [66]:
# Peek at the single-release (June) sheet exactly as read_html parsed it
june_raw.head()

Unnamed: 0,0,1,2,3,4,5
0,NFP TCH Index,,,,,
1,,Release Date,6/6/2025,,,
2,,Time,08:30,,,
3,,Country/Region,US,,,
4,,Event,Change in Nonfarm Payrolls,,,


In [67]:
# The raw export is wide and messy. Goal: reshape it into a long table for the
# NFP forecasting work, where each row is one forecast made by a single
# economist / sell-side institution.

# Header rows of the wide sheet: row 1 is used for economist names,
# row 2 for firm names.
name_row, firm_row = (hist_raw.iloc[i] for i in (1, 2))

In [68]:
# Release-level columns shared by every economist: first three columns of the
# data rows (everything from row 3 down).
base = hist_raw.iloc[3:, :3].copy()
base.columns = ["release_date", "period", "actual"]

# parse the two date columns, then the realised print (non-numeric -> NaN)
for date_col in ("release_date", "period"):
    base[date_col] = pd.to_datetime(base[date_col])
base["actual"] = pd.to_numeric(base["actual"], errors="coerce")

In [69]:
# Forecast columns: those whose name-row cell is a string (an economist name)
# other than the fixed header labels.
NON_ECONOMIST_LABELS = {'Summary', 'Actual', 'Economist'}

econ_cols = []
for idx, val in name_row.items():
    if isinstance(val, str) and val not in NON_ECONOMIST_LABELS:
        econ_cols.append(idx)

In [70]:
# Build one long frame per economist column, then stack them.

ASOF_FMT = "%m/%d/%Y"

# data rows of the wide sheet (same rows that `base` was cut from)
data_rows = hist_raw.iloc[3:]

# For each forecast column, the "As of" timestamp sits in the column
# immediately to its left (col - 1).
long_frames = [
    base.assign(
        economist=name_row[col],
        firm=firm_row[col],
        forecast=pd.to_numeric(data_rows.iloc[:, col], errors="coerce"),
        asof=pd.to_datetime(data_rows.iloc[:, col - 1],
                            format=ASOF_FMT, errors="coerce"),
    )
    for col in econ_cols
]

hist_long = pd.concat(long_frames, ignore_index=True)

In [71]:
# Display the stacked long panel (one row per release x economist column;
# rows without a forecast are still present as NaN / NaT)
hist_long

Unnamed: 0,release_date,period,actual,economist,firm,forecast,asof
0,2000-02-04,2000-01-31,387,Peter Dixon,,,NaT
1,2000-03-03,2000-02-29,43,Peter Dixon,,,NaT
2,2000-04-07,2000-03-31,416,Peter Dixon,,,NaT
3,2000-05-05,2000-04-30,340,Peter Dixon,,,NaT
4,2000-06-02,2000-05-31,231,Peter Dixon,,340.0,2000-05-30
...,...,...,...,...,...,...,...
96667,2025-01-10,2024-12-31,256,Crandall/Jordan,Wrightson ICAP LLC,150.0,2024-12-20
96668,2025-02-07,2025-01-31,143,Crandall/Jordan,Wrightson ICAP LLC,125.0,2025-01-17
96669,2025-03-07,2025-02-28,151,Crandall/Jordan,Wrightson ICAP LLC,100.0,2025-02-14
96670,2025-04-04,2025-03-31,228,Crandall/Jordan,Wrightson ICAP LLC,80.0,2025-03-14


In [72]:
# Helper to parse new releases
def parse_one_release(raw_path: str,
                      release_date: str,
                      period_start: str) -> pd.DataFrame:
    """
    Convert a single-release Bloomberg NFP .xls export to the long format
    used in df_full (one row = one economist forecast).

    Parameters
    ----------
    raw_path : str
        Path to the export. It is read with ``pd.read_html`` and only the
        first table is used.
    release_date : str
        Release date of the print; parsed with ``pd.to_datetime``.
    period_start : str
        Date labelling the reference period; parsed with ``pd.to_datetime``
        and stored in the ``period`` column.

    Returns
    -------
    pd.DataFrame
        Columns: release_date, period, economist, firm, forecast, actual, asof.
        If the sheet contains no forecaster rows the frame is empty but still
        has these columns.

    Notes
    -----
    ``actual`` is set to NaN whenever ``release_date`` is later than today's
    date, so the output depends on the day the function is executed.
    Otherwise ``actual`` is the first numeric cell found in any row that
    mentions "Actual" (NaN if there is none).
    """
    raw = pd.read_html(raw_path)[0]

    # forecaster rows: blank first column and a numeric value in column 3
    forecast_col = pd.to_numeric(raw[3], errors='coerce')
    mask = raw[0].isna() & forecast_col.notna()

    block = (raw.loc[mask, [1, 2, 3, 4, 5]]
                .rename(columns={1: "economist",
                                 2: "firm",
                                 3: "forecast",
                                 4: "asof",
                                 5: "rank"}))

    block["forecast"] = forecast_col[mask]
    block["asof"] = pd.to_datetime(block["asof"], errors='coerce')

    # If economist name is blank, fall back to firm in parentheses.
    # Done column-wise on purpose: a row-wise DataFrame.apply hands back a
    # DataFrame (not a Series) when `block` is empty, which breaks the
    # column assignment.
    firm_label = block["firm"].map("({})".format)
    block["economist"] = block["economist"].where(block["economist"].notna(),
                                                  firm_label)

    # pull the actual NFP print from the same sheet: any row with a cell
    # containing "Actual" (case-insensitive), first numeric value wins
    actual_mask = (raw.astype(str)
                      .apply(lambda col: col.str.contains("Actual", case=False))
                      .any(axis=1))
    actual_candidates = (
        pd.to_numeric(raw.loc[actual_mask].stack(), errors="coerce")
          .dropna()
    )

    rel_date = pd.to_datetime(release_date)          # string -> Timestamp
    if rel_date > pd.Timestamp.today().normalize():
        actual_val = np.nan                          # unreleased month
    elif actual_candidates.empty:
        actual_val = np.nan                          # no numeric "Actual" cell
    else:
        actual_val = actual_candidates.iloc[0]

    # final tidy frame
    tidy = (block.assign(release_date=rel_date,
                         period      =pd.to_datetime(period_start),
                         actual      =actual_val)
                  .loc[:, ["release_date", "period", "economist", "firm",
                           "forecast", "actual", "asof"]])

    return tidy


In [73]:
# Parse the two single-release files into the long format.
# NOTE(review): `period` is passed here as the first day of the reference
# month, whereas the historical panel shows month-end dates in `period`
# (e.g. 2000-01-31) -- confirm which convention downstream code expects.
june_long, july_long = (
    parse_one_release(path, release_date=released, period_start=period)
    for path, released, period in [
        (JUNE_PATH, "2025-06-06", "2025-05-01"),
        (JULY_PATH, "2025-07-03", "2025-06-01"),
    ]
)

In [74]:
# Stack panels: historical long panel plus the two single-release frames
nfp_long = pd.concat([hist_long, june_long, july_long],
                     ignore_index=True, sort=False)


# Compute forecast error (forecast minus realised print)
nfp_long["error"] = nfp_long["forecast"] - nfp_long["actual"]

# Per release, we only want the last forecast.
# na_position="first" is essential: sort_values places NaT last by default,
# so tail(1) would keep an empty placeholder row (asof = NaT, no forecast)
# in preference to a dated forecast whenever the same economist name shows
# up in more than one source column for a release.
econ_forecasts_long = (
    nfp_long
    .sort_values(["release_date", "economist", "asof"], na_position="first")
    .groupby(["release_date", "economist"], as_index=False)
    .tail(1)
    .reset_index(drop=True)
)

In [75]:
# Work on a copy so that columns added to df_full later on do not
# modify econ_forecasts_long
df_full = econ_forecasts_long.copy()
df_full.head()

Unnamed: 0,release_date,period,actual,economist,firm,forecast,asof,error
0,2000-02-04,2000-01-31,387.0,Adam Chester,Lloyds Bank PLC,,NaT,
1,2000-02-04,2000-01-31,387.0,Alessandro Truppia,Aletti Gestielle Sgr Spa,,NaT,
2,2000-02-04,2000-01-31,387.0,Alexandre De Azara,Banco UBS SA,,NaT,
3,2000-02-04,2000-01-31,387.0,Alison Lynn Reaser,Point Loma Nazarene University,,NaT,
4,2000-02-04,2000-01-31,387.0,Allan Von Mehren,Danske Bank AS,,NaT,


In [76]:
# Quick sanity checks on the de-duplicated panel

# 1) exactly one row per (release_date, economist)
assert not df_full.duplicated(subset=["release_date", "economist"]).any(),\
    "Duplicate (release date, economist) combinations detected"

# 2) a forecast cannot be time-stamped after its release date. Rows with a
#    missing asof are excluded first, since comparing NaT would fail the check.
valid_asof = df_full["asof"].notna()
stamped = df_full.loc[valid_asof, ["asof", "release_date"]]
assert stamped["asof"].le(stamped["release_date"]).all(),\
    "Some forecasts have asof (non-NA) after release date."

# 3) each release date carries at most one distinct realised (actual) value;
#    zero distinct values happens when actual is NaN (print not yet released)
assert df_full.groupby("release_date")["actual"].nunique().le(1).all(), \
    "Multiple actual values found for the same release date"


# Note: (firm, release_date) is NOT unique -- a single firm can have several
# economists forecasting the same release. Uniqueness only holds for
# (economist, release_date), as asserted in check 1.

In [77]:
# Consensus = median of the economists' forecasts for each release

median_by_release = (
    df_full.groupby("release_date")["forecast"]
    .median()
    .rename("median_forecast")
)

# Drop any median_forecast left over from a previous run of this cell first;
# otherwise re-running would merge a second copy and produce
# median_forecast_x / median_forecast_y columns.
df_full = (
    df_full
    .drop(columns="median_forecast", errors="ignore")
    .merge(median_by_release, on="release_date", how="left")
)

In [78]:
# Confirm the median_forecast column was attached to every row of a release
df_full.head()

Unnamed: 0,release_date,period,actual,economist,firm,forecast,asof,error,median_forecast
0,2000-02-04,2000-01-31,387.0,Adam Chester,Lloyds Bank PLC,,NaT,,267.5
1,2000-02-04,2000-01-31,387.0,Alessandro Truppia,Aletti Gestielle Sgr Spa,,NaT,,267.5
2,2000-02-04,2000-01-31,387.0,Alexandre De Azara,Banco UBS SA,,NaT,,267.5
3,2000-02-04,2000-01-31,387.0,Alison Lynn Reaser,Point Loma Nazarene University,,NaT,,267.5
4,2000-02-04,2000-01-31,387.0,Allan Von Mehren,Danske Bank AS,,NaT,,267.5


In [79]:
# Sanity output: missing values per column, then panel dimensions
print("NAs: ")
print(df_full.isna().sum(), "\n")

panel_stats = (
    ("Rows: ", len(econ_forecasts_long)),
    ("Economists: ", econ_forecasts_long['economist'].nunique()),
    ("NFP releases: ", econ_forecasts_long['release_date'].nunique()),
)
for label, count in panel_stats:
    print(label, count)

NAs: 
release_date           0
period                 0
actual                 0
economist              0
firm                 608
forecast           75507
asof               75507
error              75507
median_forecast        0
dtype: int64 

Rows:  96830
Economists:  341
NFP releases:  306


**Filter COVID, define directional label**

In [80]:
# Surprise = actual minus consensus (median forecast) for directional forecasting
df_full["surprise"] = df_full["actual"] - df_full["median_forecast"]

# Sort dates
df_full["release_date"] = pd.to_datetime(df_full["release_date"])
df_full = df_full.sort_values("release_date")

# Sample start: keep releases dated 2003-06-01 or later.
# NOTE(review): the earlier note on this line read "Begin in April 2003 (first
# month fully under NAICS + birth-death)", which does not match this
# release-date cutoff -- confirm which start is intended.
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
df_full = df_full[df_full["release_date"] >= "2003-06-01"].copy()

# Filter out COVID dates (avoid pandemic volatility in df): drops releases
# dated 2020-01-01 through 2022-12-31. df_full keeps them.
df = df_full[~df_full["release_date"].between("2020-01-01", "2022-12-31")].copy()
df.head()

Unnamed: 0,release_date,period,actual,economist,firm,forecast,asof,error,median_forecast,surprise
12932,2003-06-06,2003-05-31,-17.0,Nash Peyton,Wilmington Trust Company,,NaT,,-30.0,13.0
12937,2003-06-06,2003-05-31,-17.0,Nicholas Van Ness,Credit Agricole CIB,,NaT,,-30.0,13.0
12936,2003-06-06,2003-05-31,-17.0,Nela Richardson,Automatic Data Processing Inc,,NaT,,-30.0,13.0
12935,2003-06-06,2003-05-31,-17.0,Neil Dutta,Renaissance Macro Research LLC,,NaT,,-30.0,13.0
12934,2003-06-06,2003-05-31,-17.0,Neal M Soss,Credit Suisse Securities USA LLC,-25.0,2003-05-30,-8.0,-30.0,13.0


In [81]:
# Output location and file names
OUT_DIR = "../out"
DF_FILE = "nfp_df.parquet"
DF_FULL_FILE = "nfp_df_full.parquet"

# ensure directory exists
os.makedirs(OUT_DIR, exist_ok=True)

# write: df (COVID window removed) and df_full (COVID window kept)
for frame, fname in ((df, DF_FILE), (df_full, DF_FULL_FILE)):
    frame.to_parquet(os.path.join(OUT_DIR, fname), engine="pyarrow", index=False)

# report where each file went
print(f"✔️  Saved df  ➜  {OUT_DIR}/{DF_FILE}")
print(f"✔️  Saved df_full ➜  {OUT_DIR}/{DF_FULL_FILE}")

✔️  Saved df  ➜  ../out/nfp_df.parquet
✔️  Saved df_full ➜  ../out/nfp_df_full.parquet
