# Forecasting Consensus Expectations: Initial Jobless Claims

## Data preprocessing

**Imports**

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

from tqdm.auto import tqdm
from scipy import stats, special
from scipy.optimize import brentq
from collections import defaultdict
from itertools import product
from scipy.stats import t as student_t, norm, binomtest, jarque_bera

**Load & Preprocess**

In [3]:
# Locations of the raw survey exports: the full history plus the two
# July 2025 releases.
HIST_PATH = "../raw/ijc_historical.xlsx"
JULY_REL1_PATH = "../raw/ijc_73.xlsx"
JULY_REL2_PATH = "../raw/ijc_710.xlsx"

# All three workbooks are read identically: first sheet, no header row (so the
# columns are positional integers), parsed with openpyxl.
hist_raw, july_first_raw, july_second_raw = (
    pd.read_excel(path, sheet_name=0, header=None, engine="openpyxl")
    for path in (HIST_PATH, JULY_REL1_PATH, JULY_REL2_PATH)
)

In [4]:
# Peek at the raw layout: row 0 holds the ticker, row 1 the economist names,
# row 2 the firm names / "As of" markers, and the data rows start at row 3.
hist_raw.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,463,464,465,466,467,468,469,470,471,472
0,INJCJC,,,,,,,,,,...,,,,,,,,,,
1,Summary,,Median Survey,Actual,Economist,,David H Sloan,,Bruin/Cabezas,,...,,Jay H Bryson,,,,James A Shugg,,Nash Peyton,,Crandall/Jordan
2,Release Date,Observation Date,,,Firm,As of,4CAST/Continuum Economics,As of,ABN Amro Bank NV,As of,...,As of,Wells Fargo Bank NA,As of,Westpac Banking Corp,As of,Westpac Banking Corp,As of,Wilmington Trust Company,As of,Wrightson ICAP LLC
3,2005-02-17 00:00:00,2005-02-12 00:00:00,315,302,,2005-02-15 00:00:00,310,--,--,2005-02-11 00:00:00,...,--,--,--,--,2005-02-11 00:00:00,315,--,--,2005-02-16 00:00:00,330
4,2005-02-24 00:00:00,2005-02-19 00:00:00,309,312,,2005-02-22 00:00:00,295,--,--,--,...,--,--,--,--,2005-02-18 00:00:00,305,--,--,2005-02-22 00:00:00,315


In [5]:
# The single-release sheet opens with a key/value header block
# (Release Date, Time, Country/Region, Event, ...).
july_first_raw.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,INJCJC Index,,,,,
1,,Release Date,7/3/2025,,,
2,,Time,08:30,,,
3,,Country/Region,US,,,
4,,Event,Initial Jobless Claims,,,


In [89]:
# The raw sheet is wide (one column pair per forecaster). It gets reshaped into
# a long table for IJC forecasting, where each row is one forecast made by a
# single economist / sell-side institution.
# Row 1 carries the economist names, row 2 the matching firm names.
name_row, firm_row = hist_raw.iloc[1], hist_raw.iloc[2]

In [90]:
# Release-level columns shared by every forecaster: the first four columns of
# the data rows (row 3 onwards), with dates and numbers coerced to proper dtypes.
base = (
    hist_raw.iloc[3:, :4]
    .set_axis(["release_date", "period", "median_survey", "actual"], axis=1)
    .assign(
        release_date=lambda d: pd.to_datetime(d["release_date"]),
        period=lambda d: pd.to_datetime(d["period"]),
        actual=lambda d: pd.to_numeric(d["actual"], errors="coerce"),
        median_survey=lambda d: pd.to_numeric(d["median_survey"], errors="coerce"),
    )
)

In [91]:
# Locate the economist forecast columns by position: a column qualifies when
# its name-row cell is a string that is not one of the sheet's structural labels.
# NOTE(review): columns whose name-row cell is blank but whose firm-row cell is
# filled (e.g. the firm-only "Westpac Banking Corp" column visible in the raw
# preview) are skipped here, whereas parse_one_release labels such forecasters
# "(firm)" -- confirm that dropping them from the history is intended.
skip_labels = {"Summary", "Actual", "Economist", "Median Survey"}


def _is_forecaster_label(cell) -> bool:
    """True for a name-row cell that holds an economist name."""
    return isinstance(cell, str) and cell not in skip_labels


econ_cols = [idx for idx, cell in enumerate(name_row) if _is_forecaster_label(cell)]

# Build one long frame per forecaster, then stack them.
ASOF_FMT = "%m/%d/%Y"

# For forecaster column `idx`, the matching as-of date sits in column `idx - 1`.
# Values that fail to parse as a number / date (e.g. "--") become NaN / NaT.
long_frames = [
    base.assign(
        economist=name_row.iloc[idx],
        firm=firm_row.iloc[idx],
        forecast=pd.to_numeric(hist_raw.iloc[3:, idx], errors="coerce"),
        asof=pd.to_datetime(
            hist_raw.iloc[3:, idx - 1],
            format=ASOF_FMT, errors="coerce", cache=True),
    )
    for idx in econ_cols
]

hist_long = pd.concat(long_frames, ignore_index=True)


In [92]:
# Display the stacked long table: one row per (release, economist) pair.
hist_long

Unnamed: 0,release_date,period,median_survey,actual,economist,firm,forecast,asof
0,2005-02-17,2005-02-12,315.000,302,David H Sloan,4CAST/Continuum Economics,310.0,2005-02-15
1,2005-02-24,2005-02-19,309.000,312,David H Sloan,4CAST/Continuum Economics,295.0,2005-02-22
2,2005-03-03,2005-02-26,310.000,310,David H Sloan,4CAST/Continuum Economics,305.0,2005-03-01
3,2005-03-10,2005-03-05,310.000,327,David H Sloan,4CAST/Continuum Economics,305.0,2005-03-07
4,2005-03-17,2005-03-12,315.000,318,David H Sloan,4CAST/Continuum Economics,320.0,2005-03-14
...,...,...,...,...,...,...,...,...
223225,2025-05-29,2025-05-24,230.000,240,Crandall/Jordan,Wrightson ICAP LLC,230.0,2025-05-23
223226,2025-06-05,2025-05-31,235.000,247,Crandall/Jordan,Wrightson ICAP LLC,240.0,2025-05-30
223227,2025-06-12,2025-06-07,242.000,248,Crandall/Jordan,Wrightson ICAP LLC,245.0,2025-06-06
223228,2025-06-18,2025-06-14,245.000,245,Crandall/Jordan,Wrightson ICAP LLC,245.0,2025-06-13


In [93]:
# Helpers to parse new releases
def _read_release_sheet(raw_path: str) -> pd.DataFrame:
    """Load one release export as a header-less frame with positional columns.

    The reader is chosen from the file's leading bytes rather than its
    extension: a zip container (``PK``, i.e. .xlsx) or an OLE2 container
    (legacy binary .xls) is read with ``pd.read_excel``; anything else
    (including non-local paths) goes through ``pd.read_html`` as before.
    """
    excel_magic = (b"PK\x03\x04", b"\xd0\xcf\x11\xe0")
    if os.path.isfile(raw_path):
        with open(raw_path, "rb") as fh:
            if fh.read(4).startswith(excel_magic):
                # header=None keeps integer column labels, which the parser
                # below relies on (raw[0], raw[3], ...).
                return pd.read_excel(raw_path, sheet_name=0, header=None)
    return pd.read_html(raw_path)[0]


def parse_one_release(raw_path: str,
                      release_date: str,
                      period_start: str) -> pd.DataFrame:
    """
    Convert a single IJC release export to the long format used in
    df_full (one row = one economist forecast).

    Parameters
    ----------
    raw_path : path of the release export (real Excel workbook or HTML table).
    release_date : release date of the print; parsed with ``pd.to_datetime``.
    period_start : value stored in the ``period`` column; parsed with
        ``pd.to_datetime``.

    Returns
    -------
    DataFrame with columns release_date, period, economist, firm, forecast,
    actual, asof. ``actual`` is NaN when ``release_date`` is after today or
    when no numeric value is found on a row mentioning "Actual". No
    ``median_survey`` column is produced.
    """
    raw = _read_release_sheet(raw_path)

    # forecaster rows: blank first column and a numeric value in column 3
    forecast_col = pd.to_numeric(raw[3], errors="coerce")
    mask = raw[0].isna() & forecast_col.notna()

    block = (raw.loc[mask, [1, 2, 3, 4, 5]]
                .rename(columns={1: "economist",
                                 2: "firm",
                                 3: "forecast",
                                 4: "asof",
                                 5: "rank"}))

    block["forecast"] = forecast_col[mask]
    block["asof"] = pd.to_datetime(block["asof"], errors="coerce")

    # if economist name is blank, fall back to firm in parentheses
    # (vectorised, so an empty forecaster block no longer trips a row-wise apply)
    firm_label = block["firm"].map(lambda firm: f"({firm})")
    block["economist"] = block["economist"].where(
        block["economist"].notna(), firm_label)

    # pull the actual IJC print from the same sheet: first numeric cell found
    # on any row that mentions "Actual" (case-insensitive)
    actual_mask = (
        raw.astype(str)
           .apply(lambda col: col.str.contains("Actual", case=False, na=False))
           .any(axis=1)
    )
    actual_candidates = (
        pd.to_numeric(raw.loc[actual_mask].stack(), errors="coerce")
          .dropna()
    )

    rel_date = pd.to_datetime(release_date)          # string -> Timestamp
    if rel_date > pd.Timestamp.today().normalize():
        actual_val = np.nan                          # print not released yet
    elif actual_candidates.empty:
        actual_val = np.nan
    else:
        actual_val = actual_candidates.iloc[0]

    # final tidy frame
    tidy = (block.assign(release_date=rel_date,
                         period=pd.to_datetime(period_start),
                         actual=actual_val)
                 .loc[:, ["release_date", "period", "economist", "firm",
                          "forecast", "actual", "asof"]])

    return tidy


In [94]:
# Parse the two July 2025 releases. `period_start` is the week-ending Saturday
# that precedes the release (2025-06-28 for the 07-03 print), matching the
# release/period pairs in the historical sheet.
july_rel1_long = parse_one_release(JULY_REL1_PATH,
                              release_date="2025-07-03",
                              period_start="2025-06-28")

# Was "2025-06-01", more than five weeks before the release; the week covered
# by the 07-10 print ends on 2025-07-05.
july_rel2_long = parse_one_release(JULY_REL2_PATH,
                              release_date="2025-07-10",
                              period_start="2025-07-05")

In [95]:
# Stack the historical panel and the two new releases into one long table.
# (The `nfp_` prefix is a leftover name; the data are initial jobless claims.)
nfp_long = pd.concat([hist_long, july_rel1_long, july_rel2_long],
                     ignore_index=True, sort=False)


# Forecast error: forecast minus actual (positive = claims were over-predicted)
nfp_long["error"] = nfp_long["forecast"] - nfp_long["actual"]

# Per release, keep only each economist's last forecast.
# sort_values puts NaT last by default, which would make tail(1) prefer an
# undated (blank) row over a dated forecast whenever an economist has several
# rows for one release; na_position="first" makes the latest dated row win.
econ_forecasts_long = (
    nfp_long
    .sort_values(["release_date", "economist", "asof"], na_position="first")
    .groupby(["release_date", "economist"], as_index=False)
    .tail(1)
    .reset_index(drop=True)
)

In [96]:
# Work on an independent copy so later edits never touch econ_forecasts_long.
df_full = econ_forecasts_long.copy(deep=True)
df_full.head(n=5)

Unnamed: 0,release_date,period,median_survey,actual,economist,firm,forecast,asof,error
0,2005-02-17,2005-02-12,315.0,302.0,Adam Chester,Lloyds Bank PLC,320.0,2005-02-17,18.0
1,2005-02-17,2005-02-12,315.0,302.0,Alan Chernoff,Rutgers The State University of NJ,,NaT,
2,2005-02-17,2005-02-12,315.0,302.0,Alison Lynn Reaser,Point Loma Nazarene University,310.0,2005-02-14,8.0
3,2005-02-17,2005-02-12,315.0,302.0,Andreas Busch,Bantleon AG,,NaT,
4,2005-02-17,2005-02-12,315.0,302.0,Andrew Douglas,Dubuque Bank & Trust,,NaT,


In [97]:
# Quick sanity checks

# (1) exactly one row per economist, per release
dup_keys = df_full.duplicated(subset=["release_date", "economist"])
assert not dup_keys.any(), \
    "Duplicate (release date, economist) combinations detected"

# (2) a forecast's as-of date may not fall after its release date.
#     Rows without an as-of date are excluded, since NaT would fail the comparison.
valid_asof = df_full["asof"].notna()
dated = df_full.loc[valid_asof, ["asof", "release_date"]]
assert dated["asof"].le(dated["release_date"]).all(), \
    "Some forecasts have asof (non-NA) after release date."

# (3) every release date carries at most one distinct actual value
#     (zero when the actual is NaN, i.e. the print is not yet released)
actuals_per_release = df_full.groupby("release_date")["actual"].nunique()
assert actuals_per_release.le(1).all(), \
    "Multiple actual values found for the same release date"

In [98]:
# Re-inspect the first rows once the sanity checks have passed.
df_full.head(n=5)

Unnamed: 0,release_date,period,median_survey,actual,economist,firm,forecast,asof,error
0,2005-02-17,2005-02-12,315.0,302.0,Adam Chester,Lloyds Bank PLC,320.0,2005-02-17,18.0
1,2005-02-17,2005-02-12,315.0,302.0,Alan Chernoff,Rutgers The State University of NJ,,NaT,
2,2005-02-17,2005-02-12,315.0,302.0,Alison Lynn Reaser,Point Loma Nazarene University,310.0,2005-02-14,8.0
3,2005-02-17,2005-02-12,315.0,302.0,Andreas Busch,Bantleon AG,,NaT,
4,2005-02-17,2005-02-12,315.0,302.0,Andrew Douglas,Dubuque Bank & Trust,,NaT,


In [99]:
# Sanity output: missing-value counts and panel dimensions.
# Every figure is taken from df_full so the counts all describe the same frame,
# and the release count is labelled IJC (this notebook covers jobless claims,
# not NFP).
print("NAs: ")
print(df_full.isna().sum(), "\n")
print("Rows: ", df_full.shape[0])
print("Economists: ", df_full["economist"].nunique())
print("IJC releases: ", df_full["release_date"].nunique())

NAs: 
release_date          0
period                0
median_survey        81
actual               37
economist             0
firm                  0
forecast         182273
asof             182273
error            182310
dtype: int64 

Rows:  223311
Economists:  222
NFP releases:  1065


**Define surprise, trim sample start, filter COVID**

In [100]:
# Build the analysis sample in one pass:
#   * surprise = actual minus consensus (median forecast), for directional forecasting
#   * release_date as datetime, rows ordered by release date
#   * sample starts on 2006-01-01
# NOTE(review): an earlier comment here read "Begin in January 2014 (First week
# after EUC-08)", which does not match the 2006-01-01 cutoff -- confirm the
# intended start date.
df_full = (
    df_full
    .assign(
        surprise=lambda d: d["actual"] - d["median_survey"],
        release_date=lambda d: pd.to_datetime(d["release_date"]),
    )
    .sort_values("release_date")
    .loc[lambda d: d["release_date"] >= "2006-01-01"]
)

# df additionally drops the COVID window (2020-01-01 through 2022-12-31,
# inclusive) to keep pandemic volatility out of the sample.
in_covid_window = df_full["release_date"].between("2020-01-01", "2022-12-31")
df = df_full[~in_covid_window]
df.head()

Unnamed: 0,release_date,period,median_survey,actual,economist,firm,forecast,asof,error,surprise
9802,2006-01-05,2005-12-31,320.0,291.0,Michelle Meyer,Mastercard International Inc,,NaT,,-29.0
9793,2006-01-05,2005-12-31,320.0,291.0,Michael Derks,Fxpro Financial Services Ltd,,NaT,,-29.0
9794,2006-01-05,2005-12-31,320.0,291.0,Michael E Feroli,JP Morgan Securities LLC,,NaT,,-29.0
9795,2006-01-05,2005-12-31,320.0,291.0,Michael Gapen,Morgan Stanley & Co LLC,,NaT,,-29.0
9796,2006-01-05,2005-12-31,320.0,291.0,Michael Moran,Daiwa Capital Mkts,,NaT,,-29.0


In [101]:
# Output location and file names for the two panels.
OUT_DIR = "../out"
DF_FILE       = "ijc_df.parquet"
DF_FULL_FILE  = "ijc_df_full.parquet"

# create the output directory if it is missing
os.makedirs(OUT_DIR, exist_ok=True)

# write both frames (COVID-filtered `df` and the full `df_full`) without the index
df_path = os.path.join(OUT_DIR, DF_FILE)
df_full_path = os.path.join(OUT_DIR, DF_FULL_FILE)
df.to_parquet(df_path, engine="pyarrow", index=False)
df_full.to_parquet(df_full_path, engine="pyarrow", index=False)

# confirm what was saved
print(f"✔️  Saved df  ➜  {OUT_DIR}/{DF_FILE}")
print(f"✔️  Saved df_full ➜  {OUT_DIR}/{DF_FULL_FILE}")

✔️  Saved df  ➜  ../out/ijc_df.parquet
✔️  Saved df_full ➜  ../out/ijc_df_full.parquet
