# PCR + Baseline 30m IC Evaluation

This notebook evaluates two signals at the same entry times:

- `baseline`: `data/processed/futures_only_signal`
- `baseline_plus_pcr`: baseline signal + PCR signal from `data/processed/index_russell_pcr_signal_30m`

It computes:

1. Cross-sectional IC by date (pooled across entry times), plus an optional breakdown by (date, entry time)
2. IC by ticker
3. IC by ticker by year

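All calculations use the same sample — only (date, ticker, entry time) rows where the baseline signal, the PCR signal, and the realized return are all available — and the same target: the realized unhedged ADR return from the NBBO mid at the entry time to that day's close.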


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Raise pandas' display caps so the larger IC tables below render without truncation.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 200)


In [None]:
# Resolve the data root: use ./data when it holds the ADR reference file,
# otherwise fall back to ../data (e.g. when the kernel starts one level below).
if Path('data/raw/adr_info.csv').exists():
    DATA_DIR = Path('data')
else:
    DATA_DIR = Path('../data')
assert (DATA_DIR / 'raw' / 'adr_info.csv').exists(), 'Could not resolve data directory.'

_processed_dir = DATA_DIR / 'processed'
_adr_raw_dir = DATA_DIR / 'raw' / 'adrs'

# Per-ticker signal stores, laid out as `ticker=<name>/data.parquet`.
# NOTE(review): the notebook intro names `index_russell_pcr_signal_30m`, but this
# reads `index_russell_pcr_signal` — confirm which directory is intended.
PCR_SIGNAL_DIR = _processed_dir / 'index_russell_pcr_signal'
BASELINE_SIGNAL_DIR = _processed_dir / 'futures_only_signal'

# ADR quotes (source of the entry mid) and the daily close file (exit price).
ADR_NBBO_DIR = _adr_raw_dir / 'bbo-1m' / 'nbbo'
ADR_CLOSE_FILE = _adr_raw_dir / 'adr_PX_LAST_adjust_none.csv'

# Entry times as HH:MM strings. They are matched against each frame's own
# timestamps; no time zone is applied here.
ENTRY_TIMES = ['13:00', '13:30', '14:00', '14:30', '15:00', '15:30']

# Minimum paired observations before an IC is reported (NaN otherwise):
# per ticker / ticker-year, and per cross-sectional group respectively.
MIN_OBS_TICKER = 30
MIN_OBS_CROSS_SECTION = 8


In [None]:
def _safe_corr(x: pd.Series, y: pd.Series, min_obs: int = 2) -> float:
    z = pd.concat([x, y], axis=1).dropna()
    if len(z) < min_obs:
        return np.nan
    if z.iloc[:, 0].std() == 0 or z.iloc[:, 1].std() == 0:
        return np.nan
    return float(z.iloc[:, 0].corr(z.iloc[:, 1]))


def _coerce_datetime_index(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with a DatetimeIndex when possible."""
    out = df.copy()
    if isinstance(out.index, pd.DatetimeIndex):
        return out

    candidate_cols = ['timestamp', 'datetime', 'date_time', 'index', '__index_level_0__']
    for col in candidate_cols:
        if col in out.columns:
            ts = pd.to_datetime(out[col], errors='coerce')
            if ts.notna().any():
                out = out.loc[ts.notna()].copy()
                out.index = pd.DatetimeIndex(ts[ts.notna()])
                return out

    idx = pd.to_datetime(out.index, errors='coerce')
    if idx.notna().any():
        out = out.loc[idx.notna()].copy()
        out.index = pd.DatetimeIndex(idx[idx.notna()])
        return out

    return out


def _extract_signal_at_times(signal_df: pd.DataFrame, entry_times):
    """Pick the `signal` value observed at each entry time, one value per day.

    Parameters
    ----------
    signal_df : frame with a `signal` column and timestamps that
        `_coerce_datetime_index` can recover.
    entry_times : iterable of 'HH:MM' strings.

    Returns
    -------
    dict mapping each entry time to a Series indexed by tz-naive midnight dates.
    A time with no matching rows — or every time, when the frame has no usable
    timestamps or no `signal` column — maps to an empty float Series.

    Rows are matched on hour and minute only, so any timestamp inside that
    minute qualifies. Times are read in the index's own zone (the zone is
    dropped only after matching). When several rows fall on the same day,
    `GroupBy.first` keeps the first non-null one.
    """
    frame = _coerce_datetime_index(signal_df)
    usable = isinstance(frame.index, pd.DatetimeIndex) and 'signal' in frame.columns
    if not usable:
        return {t: pd.Series(dtype=float) for t in entry_times}

    per_time = {}
    for t in entry_times:
        hour, minute = (int(part) for part in t.split(':'))
        at_entry = (frame.index.hour == hour) & (frame.index.minute == minute)
        picked = frame.loc[at_entry, 'signal']
        if len(picked) == 0:
            per_time[t] = pd.Series(dtype=float)
            continue

        # Collapse timestamps to tz-naive dates so they join against daily data.
        stamps = picked.index
        if stamps.tz is not None:
            stamps = stamps.tz_localize(None)
        daily = pd.Series(picked.values, index=stamps.normalize())
        per_time[t] = daily.groupby(level=0).first().sort_index()
    return per_time


def _extract_nbbo_mid_at_times(nbbo_df: pd.DataFrame, entry_times):
    """Compute the NBBO mid quote at each entry time, one value per day.

    Parameters
    ----------
    nbbo_df : frame with `nbbo_bid` and `nbbo_ask` columns and timestamps that
        `_coerce_datetime_index` can recover.
    entry_times : iterable of 'HH:MM' strings.

    Returns
    -------
    dict mapping each entry time to a Series of mids indexed by tz-naive
    midnight dates. A time with no matching rows — or every time, when no
    DatetimeIndex can be recovered — maps to an empty float Series.

    The mid is the simple average of bid and ask; quotes are not validated here.
    `between_time(t, t)` keeps only rows whose time of day equals `t` exactly,
    read in the index's own zone. When several rows fall on the same day,
    `GroupBy.first` keeps the first non-null one.
    """
    frame = _coerce_datetime_index(nbbo_df)
    if not isinstance(frame.index, pd.DatetimeIndex):
        return {t: pd.Series(dtype=float) for t in entry_times}

    mid = (frame['nbbo_bid'] + frame['nbbo_ask']) / 2

    per_time = {}
    for t in entry_times:
        picked = mid.between_time(t, t)
        if len(picked) == 0:
            per_time[t] = pd.Series(dtype=float)
            continue

        # Collapse timestamps to tz-naive dates so they join against daily data.
        stamps = picked.index
        if stamps.tz is not None:
            stamps = stamps.tz_localize(None)
        daily = pd.Series(picked.values, index=stamps.normalize())
        per_time[t] = daily.groupby(level=0).first().sort_index()
    return per_time


In [None]:
# ADR reference list; the ticker is the first whitespace-separated token of `adr`.
adr_info = pd.read_csv(DATA_DIR / 'raw' / 'adr_info.csv').assign(
    ticker=lambda info: info['adr'].str.split().str[0]
)
all_tickers = sorted(adr_info['ticker'].unique())


def _partitioned_tickers(signal_dir):
    """Ticker names parsed from the `ticker=<name>` sub-directories of `signal_dir`."""
    return sorted(
        entry.name.split('=', 1)[1]
        for entry in signal_dir.glob('ticker=*')
        if entry.is_dir()
    )


pcr_tickers = _partitioned_tickers(PCR_SIGNAL_DIR)
baseline_tickers = _partitioned_tickers(BASELINE_SIGNAL_DIR)

# Evaluate only tickers that are in the reference list AND have both signals.
tickers = sorted(set(all_tickers) & set(pcr_tickers) & set(baseline_tickers))
print(f'Tickers in evaluation universe: {len(tickers)}')

# Daily closes, one column per ticker, indexed by midnight-normalized dates so
# they join against the per-day signal and entry-mid series.
_closes_all = pd.read_csv(ADR_CLOSE_FILE, index_col=0, parse_dates=True)
_closes_all.index = pd.DatetimeIndex(_closes_all.index).normalize()
# reindex adds an all-NaN column for any universe ticker absent from the file.
close_df = _closes_all.reindex(columns=tickers)


In [None]:
# Build the long-format evaluation panel: one row per (date, ticker, entry_time)
# holding both signals and the realized entry-mid -> close return.

# Column layout of the panel; also used as the schema of the empty fallback so
# that later cells can still reference these columns when nothing was built.
_PANEL_COLUMNS = [
    'date',
    'ticker',
    'entry_time',
    'baseline_signal',
    'pcr_signal',
    'realized_return',
    'baseline_plus_pcr_signal',
]

rows = []

for ticker in tqdm(tickers, desc='Building panel'):
    # Check the in-memory close series first so a ticker without any close
    # data is skipped before its parquet files are read.
    if ticker not in close_df.columns:
        continue
    daily_close = close_df[ticker].dropna()
    if daily_close.empty:
        continue

    pcr_path = PCR_SIGNAL_DIR / f'ticker={ticker}' / 'data.parquet'
    base_path = BASELINE_SIGNAL_DIR / f'ticker={ticker}' / 'data.parquet'
    nbbo_path = ADR_NBBO_DIR / f'ticker={ticker}' / 'data.parquet'

    if not (pcr_path.exists() and base_path.exists() and nbbo_path.exists()):
        continue

    pcr_df = pd.read_parquet(pcr_path)
    base_df = pd.read_parquet(base_path)
    nbbo_df = pd.read_parquet(nbbo_path, columns=['nbbo_bid', 'nbbo_ask'])

    pcr_by_time = _extract_signal_at_times(pcr_df, ENTRY_TIMES)
    base_by_time = _extract_signal_at_times(base_df, ENTRY_TIMES)
    entry_mid_by_time = _extract_nbbo_mid_at_times(nbbo_df, ENTRY_TIMES)

    for et in ENTRY_TIMES:
        pcr_s = pcr_by_time.get(et, pd.Series(dtype=float))
        base_s = base_by_time.get(et, pd.Series(dtype=float))
        entry_mid = entry_mid_by_time.get(et, pd.Series(dtype=float))

        if len(pcr_s) == 0 or len(base_s) == 0 or len(entry_mid) == 0:
            continue

        # Only dates where both signals, the entry mid and the close all exist.
        common = pcr_s.index.intersection(base_s.index).intersection(entry_mid.index).intersection(daily_close.index)
        if len(common) == 0:
            continue

        p = pcr_s.loc[common].astype(float)
        b = base_s.loc[common].astype(float)
        em = entry_mid.loc[common].astype(float)
        dc = daily_close.loc[common].astype(float)

        # A non-positive mid is not a usable entry price. Dividing by zero would
        # give +/-inf, which dropna() keeps and which turns every correlation
        # that includes the row into NaN. Mask such mids so the row is dropped.
        em = em.where(em > 0)

        # Realized unhedged ADR return from entry mid to daily close.
        r = (dc / em) - 1.0

        panel = pd.DataFrame({
            'date': common,
            'ticker': ticker,
            'entry_time': et,
            'baseline_signal': b.values,
            'pcr_signal': p.values,
            'realized_return': r.values,
        }).dropna()

        if len(panel) == 0:
            continue

        # The combined signal is the plain sum of the two components.
        panel['baseline_plus_pcr_signal'] = panel['baseline_signal'] + panel['pcr_signal']
        rows.append(panel)

# Keep the panel schema even when no rows were built, so the grouping cells
# below do not fail on a missing column.
panel_df = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(columns=_PANEL_COLUMNS)
print('Panel rows:', len(panel_df))
print('Unique tickers:', panel_df['ticker'].nunique() if len(panel_df) else 0)
print('Date range:', panel_df['date'].min() if len(panel_df) else None, '->', panel_df['date'].max() if len(panel_df) else None)


In [None]:
# 1) Cross-sectional IC by date (pooled across entry times)
def cs_ic_by_group(df, signal_col, ret_col, min_obs=MIN_OBS_CROSS_SECTION):
    """Correlation between `signal_col` and `ret_col` within each `date` group.

    All rows sharing a date are pooled, whatever their entry time or ticker, so
    `min_obs` counts rows rather than distinct tickers. Dates that fail the
    `_safe_corr` checks get NaN.

    Returns a Series of ICs indexed by date, sorted ascending.
    """
    ic_by_date = {
        day: _safe_corr(day_rows[signal_col], day_rows[ret_col], min_obs=min_obs)
        for day, day_rows in df.groupby('date')
    }
    return pd.Series(ic_by_date).sort_index()


# Daily cross-sectional IC for each signal against the same realized return.
cs_date_baseline = cs_ic_by_group(panel_df, 'baseline_signal', 'realized_return')
cs_date_plus_pcr = cs_ic_by_group(panel_df, 'baseline_plus_pcr_signal', 'realized_return')

# Side-by-side table; `improvement` is (baseline + PCR) minus baseline.
cs_date = pd.DataFrame({
    'baseline_cs_ic': cs_date_baseline,
    'baseline_plus_pcr_cs_ic': cs_date_plus_pcr,
}).assign(
    improvement=lambda ic: ic['baseline_plus_pcr_cs_ic'] - ic['baseline_cs_ic']
)

print('Cross-sectional IC by date (head):')
display(cs_date.head())

print('Mean CS IC by date:')
cs_date_means = cs_date.mean(numeric_only=True)
display(cs_date_means.to_frame('mean').T)


In [None]:
# Optional: cross-sectional IC by (date, entry_time)
def cs_ic_by_date_time(df, signal_col, ret_col, min_obs=MIN_OBS_CROSS_SECTION):
    """Correlation between `signal_col` and `ret_col` within each (date, entry_time) group.

    Groups that fail the `_safe_corr` checks get NaN.

    Returns a Series indexed by a sorted (`date`, `entry_time`) MultiIndex; an
    empty input yields an empty Series with the same two index levels.
    """
    level_names = ['date', 'entry_time']
    keys = []
    values = []
    for (day, entry_time), cell in df.groupby(level_names):
        keys.append((day, entry_time))
        values.append(_safe_corr(cell[signal_col], cell[ret_col], min_obs=min_obs))

    # from_tuples accepts an empty list as long as the level names are given.
    index = pd.MultiIndex.from_tuples(keys, names=level_names)
    return pd.Series(values, index=index).sort_index()

# Cross-sectional IC per (date, entry_time) for each signal.
cs_dt_baseline = cs_ic_by_date_time(panel_df, 'baseline_signal', 'realized_return')
cs_dt_plus_pcr = cs_ic_by_date_time(panel_df, 'baseline_plus_pcr_signal', 'realized_return')

# Side-by-side table; `improvement` is (baseline + PCR) minus baseline.
cs_dt = pd.DataFrame({
    'baseline_cs_ic': cs_dt_baseline,
    'baseline_plus_pcr_cs_ic': cs_dt_plus_pcr,
}).assign(
    improvement=lambda ic: ic['baseline_plus_pcr_cs_ic'] - ic['baseline_cs_ic']
)

print('Cross-sectional IC by (date, entry_time), sample:')
display(cs_dt.head(12))


In [None]:
# 2) IC by ticker (time-series IC across all dates + entry times)
# One record per ticker: ICs are NaN when the ticker has fewer than
# MIN_OBS_TICKER complete observations (or a constant series).
ticker_rows = []
for ticker, ticker_panel in panel_df.groupby('ticker'):
    realized = ticker_panel['realized_return']
    base_ic = _safe_corr(ticker_panel['baseline_signal'], realized, min_obs=MIN_OBS_TICKER)
    plus_ic = _safe_corr(ticker_panel['baseline_plus_pcr_signal'], realized, min_obs=MIN_OBS_TICKER)

    if pd.notna(base_ic) and pd.notna(plus_ic):
        improvement = plus_ic - base_ic
    else:
        improvement = np.nan

    ticker_rows.append(dict(
        ticker=ticker,
        n_obs=len(ticker_panel),
        baseline_ic=base_ic,
        baseline_plus_pcr_ic=plus_ic,
        improvement=improvement,
    ))

# Largest improvement first.
ticker_ic = pd.DataFrame(ticker_rows).set_index('ticker')
ticker_ic = ticker_ic.sort_values('improvement', ascending=False)

print('IC by ticker:')
display(ticker_ic)

print('Mean ticker IC:')
ticker_ic_means = ticker_ic[['baseline_ic', 'baseline_plus_pcr_ic', 'improvement']].mean(numeric_only=True)
display(ticker_ic_means.to_frame('mean').T)


In [None]:
# 3) IC by ticker by year
# Adds a `year` column to the shared panel in place (same result on re-run).
panel_df['year'] = pd.DatetimeIndex(panel_df['date']).year

# One record per (ticker, year): ICs are NaN when the group has fewer than
# MIN_OBS_TICKER complete observations (or a constant series).
ty_rows = []
for (ticker, year), cell in panel_df.groupby(['ticker', 'year']):
    realized = cell['realized_return']
    base_ic = _safe_corr(cell['baseline_signal'], realized, min_obs=MIN_OBS_TICKER)
    plus_ic = _safe_corr(cell['baseline_plus_pcr_signal'], realized, min_obs=MIN_OBS_TICKER)

    if pd.notna(base_ic) and pd.notna(plus_ic):
        improvement = plus_ic - base_ic
    else:
        improvement = np.nan

    ty_rows.append(dict(
        ticker=ticker,
        year=int(year),
        n_obs=len(cell),
        baseline_ic=base_ic,
        baseline_plus_pcr_ic=plus_ic,
        improvement=improvement,
    ))

ticker_year_ic = pd.DataFrame(ty_rows).sort_values(['ticker', 'year'])
print('Ticker-by-year IC (sample):')
display(ticker_year_ic.head(50))

# Unweighted mean over ticker-years within each year (NaN ICs are skipped by mean()).
print('Yearly mean IC across ticker-years:')
_ic_columns = ['baseline_ic', 'baseline_plus_pcr_ic', 'improvement']
yearly_summary = ticker_year_ic.groupby('year')[_ic_columns].mean()
display(yearly_summary)


In [None]:
# Three-panel summary: running daily CS IC, per-ticker IC comparison, yearly improvement.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_cs, ax_ticker, ax_year = axes

# Panel 1: expanding (cumulative) mean of the daily cross-sectional IC per signal.
running_cs_ic = cs_date[['baseline_cs_ic', 'baseline_plus_pcr_cs_ic']].expanding().mean()
running_cs_ic.plot(ax=ax_cs)
ax_cs.set_title('Expanding Mean CS IC by Date')
ax_cs.axhline(0.0, color='gray', linewidth=0.8)

# Panel 2: per-ticker IC, baseline vs baseline + PCR. Points above the dashed
# 45-degree line are tickers where adding PCR raised the IC.
tmp = ticker_ic.dropna(subset=['baseline_ic', 'baseline_plus_pcr_ic'])
base_vals = tmp['baseline_ic']
plus_vals = tmp['baseline_plus_pcr_ic']
ax_ticker.scatter(base_vals, plus_vals, alpha=0.7)
lims = [
    np.nanmin([base_vals.min(), plus_vals.min()]),
    np.nanmax([base_vals.max(), plus_vals.max()]),
]
ax_ticker.plot(lims, lims, 'k--', linewidth=1)
ax_ticker.set_xlabel('Baseline IC')
ax_ticker.set_ylabel('Baseline + PCR IC')
ax_ticker.set_title('IC by Ticker')

# Panel 3: yearly mean of the per-ticker-year IC improvement.
yearly_summary['improvement'].plot(kind='bar', ax=ax_year, color='steelblue')
ax_year.axhline(0.0, color='gray', linewidth=0.8)
ax_year.set_title('Yearly Mean IC Improvement (Baseline + PCR - Baseline)')

plt.tight_layout()


In [None]:
# Save analysis tables for reuse in scripts
out_dir = DATA_DIR / 'processed' / 'reports' / 'pcr_plus_baseline_30m_ic'
out_dir.mkdir(parents=True, exist_ok=True)

# (table, file name, write index?) — the ticker-year table keeps its keys in
# ordinary columns, so its positional index is not written.
_exports = (
    (cs_date, 'cross_sectional_ic_by_date.csv', True),
    (cs_dt, 'cross_sectional_ic_by_date_entry_time.csv', True),
    (ticker_ic, 'ic_by_ticker.csv', True),
    (ticker_year_ic, 'ic_by_ticker_by_year.csv', False),
)
for _table, _file_name, _write_index in _exports:
    _table.to_csv(out_dir / _file_name, index=_write_index)

print('Saved outputs to:', out_dir)
