# Forecasting Consensus Expectations: Consumer Price Index (CPI)

## Data preprocessing

**Imports**

In [50]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

from tqdm.auto import tqdm
from scipy import stats, special
from scipy.optimize import brentq
from collections import defaultdict
from itertools import product
from scipy.stats import t as student_t, norm, binomtest, jarque_bera

**Load & Preprocess**

In [51]:
# Input workbooks for the two core-CPI series (month-over-month and year-over-year).
# Both paths are relative ("../raw"), so they resolve against the kernel's working
# directory -- presumably the notebook's own folder; confirm before running elsewhere.
CPI_MOM_PATH = "../raw/CORECPI_MoM_historical.xlsx"
CPI_YOY_PATH = "../raw/CORECPI_YoY_historical.xlsx"

In [52]:

# General-purpose loader for Bloomberg historical survey workbooks

def load_bbg_history(path: str, series_name: str,
                     min_forecasts: int = 3) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Convert a Bloomberg *historical.xlsx* file to a one-row-per-forecaster long panel.

    Expected sheet layout (first sheet, read without a header):
      * row 1 holds economist names, row 2 holds firm names, data starts at row 3;
      * columns 0-3 are release date, reference period, median survey, actual;
      * every economist's forecast column is immediately preceded by the column
        holding that forecast's as-of date (formatted ``MM/DD/YYYY``).

    Parameters
    ----------
    path : str
        Location of the Excel workbook.
    series_name : str
        Label stored in the ``series`` column, e.g. "Core CPI M/M".
    min_forecasts : int, default 3
        Minimum number of non-NaN forecasts (counted over the whole file, before
        any date filtering) an economist needs in order to be kept.

    Returns
    -------
    (df, df_full) : tuple of pd.DataFrame
        ``df_full`` contains all releases dated 2006-01-01 or later; ``df`` is
        the same panel with releases from 2020-01-01 through 2022-12-31 removed.
        Both have the columns
        release_date | period | median_survey | actual | economist | firm |
        forecast | asof | error | surprise | series

    Raises
    ------
    AssertionError
        If a (release_date, economist) pair is duplicated, an as-of date falls
        after its release date, or one release date carries several actuals.
    """
    hist_raw = pd.read_excel(path, sheet_name=0, header=None, engine="openpyxl")

    # --- identify header rows -------------------------------------------------
    name_row, firm_row = hist_raw.iloc[1], hist_raw.iloc[2]

    base = hist_raw.iloc[3:, :4].copy()
    base.columns = ["release_date", "period", "median_survey", "actual"]
    base["release_date"]  = pd.to_datetime(base["release_date"])
    base["period"]        = pd.to_datetime(base["period"])
    base["median_survey"] = pd.to_numeric(base["median_survey"], errors="coerce")
    base["actual"]        = pd.to_numeric(base["actual"],        errors="coerce")

    # --- long-form economist block -------------------------------------------
    # Any string in the name row that is not one of these labels is treated as
    # an economist column.
    skip = {"Summary", "Actual", "Economist", "Median Survey"}
    econ_cols = [i for i, v in enumerate(name_row) if isinstance(v, str) and v not in skip]
    ASOF_FMT  = "%m/%d/%Y"

    long_frames = []
    for pos in econ_cols:
        asof_pos = pos - 1          # as-of date sits just left of the forecast
        tmp = base.copy()

        tmp["economist"] = name_row.iloc[pos]
        tmp["firm"]      = firm_row.iloc[pos]
        tmp["forecast"]  = pd.to_numeric(hist_raw.iloc[3:, pos], errors="coerce")
        tmp["asof"]      = pd.to_datetime(hist_raw.iloc[3:, asof_pos],
                                          format=ASOF_FMT, errors="coerce", cache=True)
        long_frames.append(tmp)

    hist_long = pd.concat(long_frames, ignore_index=True)

    # --- keep only the last forecast per economist per release ---------------
    # na_position="first" matters when one economist name occupies several
    # columns: rows without an as-of date must sort *before* dated rows,
    # otherwise tail(1) would select the empty row and discard the real forecast.
    df_full = (hist_long
               .sort_values(["release_date", "economist", "asof"],
                            na_position="first")
               .groupby(["release_date", "economist"], as_index=False)
               .tail(1)
               .reset_index(drop=True))

    # economists must have >= min_forecasts non-NaN forecasts overall
    valid_counts = (df_full
                    .groupby("economist")["forecast"]
                    .transform(lambda s: s.notna().sum()))
    df_full = df_full[valid_counts >= min_forecasts].copy()

    # --- sanity checks (same as IJC) -----------------------------------------
    assert df_full.duplicated(subset=["release_date", "economist"]).sum() == 0
    valid_asof = df_full["asof"].notna()
    assert (df_full.loc[valid_asof, "asof"]
            <= df_full.loc[valid_asof, "release_date"]).all()
    assert (df_full.groupby("release_date")["actual"].nunique() <= 1).all()

    # --- diagnostic columns ---------------------------------------------------
    df_full["error"]    = df_full["forecast"] - df_full["actual"]
    df_full["surprise"] = df_full["actual"]   - df_full["median_survey"]
    df_full["series"]   = series_name         # "Core CPI M/M" or "Core CPI Y/Y"

    # optional: match your NFP time cut-offs
    df_full = df_full[df_full["release_date"] >= "2006-01-01"].copy()

    # COVID exclusion (mirrors IJC logic); copy so callers can add columns to
    # either frame without triggering SettingWithCopyWarning.
    df = df_full[~df_full["release_date"].between("2020-01-01", "2022-12-31")].copy()

    return df, df_full


In [53]:
# load_bbg_history returns a pair: the first frame omits releases dated
# 2020-01-01 through 2022-12-31, the second ("_full") keeps them.
cpi_mom, cpi_mom_full = load_bbg_history(CPI_MOM_PATH, "Core CPI M/M")
cpi_yoy, cpi_yoy_full = load_bbg_history(CPI_YOY_PATH, "Core CPI Y/Y")

In [54]:
# Folder that receives the processed panels; created if it is missing.
OUT_DIR = "../out"
os.makedirs(OUT_DIR, exist_ok=True)

# Output file name -> DataFrame to persist (insertion order = write order).
cpi_out = dict([
    ("cpi_mom_df.parquet",      cpi_mom),
    ("cpi_mom_df_full.parquet", cpi_mom_full),
    ("cpi_yoy_df.parquet",      cpi_yoy),
    ("cpi_yoy_df_full.parquet", cpi_yoy_full),
])

# Write each panel as parquet (without the index) and report where it went.
for fname, frame in cpi_out.items():
    target = os.path.join(OUT_DIR, fname)
    frame.to_parquet(target, engine="pyarrow", index=False)
    print(f"✔️  Saved ➜  {OUT_DIR}/{fname}")

✔️  Saved ➜  ../out/cpi_mom_df.parquet
✔️  Saved ➜  ../out/cpi_mom_df_full.parquet
✔️  Saved ➜  ../out/cpi_yoy_df.parquet
✔️  Saved ➜  ../out/cpi_yoy_df_full.parquet
