# Forecasting Consensus Expectations: Initial Jobless Claims

## Data preprocessing

**Imports**

In [6]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

from tqdm.auto import tqdm
from scipy import stats, special
from scipy.optimize import brentq
from collections import defaultdict
from itertools import product
from scipy.stats import t as student_t, norm, binomtest, jarque_bera

In [9]:
def load_hist_ijc(path: str) -> pd.DataFrame:
    """
    Parse the Bloomberg IJC history workbook into long format.

    Layout read from the first sheet (0-based positions, no header row):
      * row 1 holds economist names, row 2 the matching firm names;
      * rows 3 onward hold one release per row;
      * columns 0-3 are release date, reference period, median survey
        and actual print;
      * every economist forecast column has its "as-of" date column
        immediately to its left.

    Parameters
    ----------
    path : str
        Location of the history workbook (read with the openpyxl engine).

    Returns
    -------
    pd.DataFrame
        One row per (release row, economist column) with columns
        release_date, period, median_survey, actual, economist, firm,
        forecast, asof.  Cells that are not numeric / not ``%m/%d/%Y``
        dates are kept as NaN / NaT rather than dropped.

    Raises
    ------
    ValueError
        If the name row contains no economist column.
    """
    n_base = 4                                   # release-level columns 0..3
    raw = pd.read_excel(path, sheet_name=0, header=None, engine="openpyxl")

    name_row = raw.iloc[1]        # row holding economist names
    firm_row = raw.iloc[2]        # row holding firm names
    body     = raw.iloc[3:]       # one release per row

    base = body.iloc[:, :n_base].copy()
    base.columns = ["release_date", "period", "median_survey", "actual"]
    for col in ("release_date", "period"):
        base[col] = pd.to_datetime(base[col])
    for col in ("median_survey", "actual"):
        base[col] = pd.to_numeric(base[col], errors="coerce")

    # Economist columns can only live to the right of the base block.
    # Looking there only means an unexpected header label above columns 0-3
    # is never mistaken for an economist, and `pos - 1` can never wrap
    # around to the last column.  The skip set is kept for labels that may
    # appear further right.
    skip = {"Summary", "Actual", "Economist", "Median Survey"}
    econ_cols = [i for i, v in enumerate(name_row)
                 if i >= n_base and isinstance(v, str) and v not in skip]
    if not econ_cols:
        raise ValueError(f"no economist columns found in the name row of {path!r}")

    # `body` and `base` share the same index, so assign() aligns row by row.
    frames = [
        base.assign(
            economist=name_row.iloc[pos],
            firm=firm_row.iloc[pos],
            forecast=pd.to_numeric(body.iloc[:, pos], errors="coerce"),
            # the "as-of" column sits directly left of the forecast column
            asof=pd.to_datetime(body.iloc[:, pos - 1],
                                format="%m/%d/%Y", errors="coerce"),
        )
        for pos in econ_cols
    ]

    return pd.concat(frames, ignore_index=True)


def load_single_release(path: str,
                        release_date: str,
                        period_start: str) -> pd.DataFrame:
    """
    Parse a Bloomberg single-release .xlsx workbook into long format.

    The first sheet is read without a header.  A row counts as a
    forecaster row when column 0 is blank and column 3 is numeric; for
    those rows columns 1-4 are read as economist, firm, forecast and
    as-of date (column 5, the rank, is read but not returned).

    Parameters
    ----------
    path : str
        Workbook location.  The openpyxl engine is forced, so legacy
        ``.xls`` files are not readable here.
    release_date : str
        Release date of the print; stamped on every row.
    period_start : str
        Reference period of the print; stamped on every row.

    Returns
    -------
    pd.DataFrame
        Columns release_date, period, economist, firm, forecast, actual,
        asof -- one row per forecaster row (possibly zero rows).  Unlike
        load_hist_ijc() there is no median_survey column.
        ``actual`` is the first numeric cell found in any row mentioning
        "Actual" (case-insensitive); it is NaN when no such value exists
        or when ``release_date`` is later than today.
    """
    raw = pd.read_excel(path, sheet_name=0, header=None, engine="openpyxl")

    # identify forecaster rows
    fc_col = pd.to_numeric(raw[3], errors="coerce")
    mask   = raw[0].isna() & fc_col.notna()
    block  = (raw.loc[mask, [1, 2, 3, 4, 5]]
                 .rename(columns={1: "economist", 2: "firm",
                                  3: "forecast",  4: "asof", 5: "rank"}))

    block["forecast"] = fc_col[mask]
    block["asof"]     = pd.to_datetime(block["asof"], errors="coerce")

    # Fill blank economist cells with "(Firm)".  Done column-wise instead of
    # with a row-wise DataFrame.apply, which hands back a DataFrame (not a
    # Series) when there are no forecaster rows and breaks the assignment.
    names    = block["economist"].astype(object)
    fallback = block["firm"].map("({})".format)
    block["economist"] = names.where(names.notna(), fallback)

    # pull actual print if available: first numeric cell of any row that
    # mentions "Actual" anywhere
    has_actual = (raw.astype(str)
                     .apply(lambda col: col.str.contains("Actual", case=False,
                                                         na=False))
                     .any(axis=1))
    act_vals = pd.to_numeric(raw.loc[has_actual].stack(),
                             errors="coerce").dropna()

    rel_ts = pd.to_datetime(release_date)
    # a release dated after today cannot have printed yet
    if rel_ts > pd.Timestamp.today().normalize() or act_vals.empty:
        actual = np.nan
    else:
        actual = act_vals.iloc[0]

    return (block.assign(release_date=rel_ts,
                         period=pd.to_datetime(period_start),
                         actual=actual)
                 .loc[:, ["release_date", "period", "economist", "firm",
                          "forecast", "actual", "asof"]])

In [None]:
# ─── 0. configuration ─────────────────────────
HIST_PATH = "../raw/ijc_historical.xlsx"
# (workbook path, release date, reference-period date) per single release
SINGLE_RELEASES = [
    ("../raw/ijc_73.xlsx",  "2025-07-03", "2025-06-28"),
    ("../raw/ijc_710.xlsx", "2025-07-10", "2025-07-05"),
    ("../raw/ijc_717.xlsx", "2025-07-17", "2025-07-12")
]
SAMPLE_START    = "2006-01-01"                   # first release kept in either output
EXCLUDED_WINDOW = ("2020-01-01", "2022-12-31")   # releases left out of `sample` (inclusive)
OUT_DIR         = "../out"

# ─── 1. ingest + load ─────────────────────────
hist_df   = load_hist_ijc(HIST_PATH)
single_df = [load_single_release(p, d, s) for p, d, s in SINGLE_RELEASES]

ijc_long  = pd.concat([hist_df, *single_df], ignore_index=True, sort=False)

# ─── 2. latest forecast per economist ─────────
ijc_long["error"] = ijc_long["forecast"] - ijc_long["actual"]

# Keep, per (release_date, economist), the row with the most recent as-of
# date.  Rows with a missing as-of must sort FIRST: with the default
# (NaT last) `.tail(1)` would pick an undated row over a dated forecast
# whenever both exist for the same pair.
ijc_full = (ijc_long
            .sort_values(["release_date", "economist", "asof"],
                         na_position="first")
            .groupby(["release_date", "economist"], as_index=False)
            .tail(1)
            .reset_index(drop=True))

# checks: unique key, no forecast dated after its release, one actual per release
assert ijc_full.duplicated(["release_date", "economist"]).sum() == 0
valid_asof = ijc_full["asof"].notna()
assert (ijc_full.loc[valid_asof, "asof"]
        <= ijc_full.loc[valid_asof, "release_date"]).all()
assert (ijc_full.groupby("release_date")["actual"].nunique() <= 1).all()

# ─── 3. surprise + subsets ────────────────────
# NOTE: frames from load_single_release() carry no median_survey column, so
# `surprise` is NaN for those releases.
ijc_full = ijc_full.assign(
    surprise=ijc_full["actual"] - ijc_full["median_survey"]
)
# stable sort keeps the economist ordering inside each release reproducible
ijc_full = ijc_full.sort_values("release_date", kind="stable")
ijc_full = ijc_full[ijc_full["release_date"] >= SAMPLE_START]

# `sample` = full panel minus the excluded window (start filter already applied)
sample = ijc_full[~ijc_full["release_date"].between(*EXCLUDED_WINDOW)]

# ─── 4. to parquet ────────────────────────────
os.makedirs(OUT_DIR, exist_ok=True)
sample.to_parquet(f"{OUT_DIR}/ijc_df.parquet",      engine="pyarrow", index=False)
ijc_full.to_parquet(f"{OUT_DIR}/ijc_df_full.parquet", engine="pyarrow", index=False)

print("✔ Saved cleaned sample  ➜ ijc_df.parquet")
print("✔ Saved full panel      ➜ ijc_df_full.parquet")


✔ Saved cleaned sample  ➜ ijc_df.parquet
✔ Saved full panel      ➜ ijc_df_full.parquet
