# 📊 Factors IC Analysis

This notebook computes **Information Coefficient (IC)** for alpha factors in your research pipeline.

**IC** = Spearman correlation between factor scores and *next-period* returns.

It supports:
- Daily IC per factor
- Rolling mean IC and ICIR
- IC **decay** (multi-horizon)
- **Quintile portfolios** and Q5–Q1 long–short backtest
- IC **heatmap** across time and factors

If either data file is missing, the notebook falls back to auto-generated demo data so you can run it immediately.

In [None]:
import os, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Default (width, height) applied to matplotlib figures created after this line.
plt.rcParams['figure.figsize'] = (10,4)

# ---- Configuration: point these at your data, or keep the defaults ----
# Factor panel CSV: needs 'date' and 'symbol' columns plus one column per factor.
FACTORS_CSV = './data/factors.csv'
# Returns panel CSV: needs 'date' and 'symbol' columns plus RETURNS_COL, 'ret' or 'price'.
RETURNS_CSV = './data/returns.csv'
# Forward-return column; derived from 'ret' or 'price' when the file lacks it.
RETURNS_COL = 'fwd_ret'
# Longest horizon evaluated in the IC decay table.
MAX_LAG = 10
# Number of buckets used for the quantile portfolios.
N_BUCKETS = 5
# Window length (rows of the daily IC table) for the rolling mean IC.
ROLL_WIN = 60

def _demo():
    """Generate a reproducible synthetic factor / forward-return panel.

    The panel covers 180 business days starting 2024-01-01 for 80 symbols
    and is indexed by (date, symbol). The global NumPy RNG is seeded with 42,
    so every call returns the same numbers.

    Returns:
        (dfF, dfR): ``dfF`` has the columns 'momentum', 'value' and
        'sentiment'; ``dfR`` has the single column 'fwd_ret'.
    """
    np.random.seed(42)
    dates = pd.date_range('2024-01-01', periods=180, freq='B')
    symbols = [f'STK{i:03d}' for i in range(80)]
    panel = pd.MultiIndex.from_product([dates, symbols], names=['date','symbol'])
    n_obs = len(panel)

    def noise():
        # One standard-normal draw per (date, symbol) row.
        return np.random.normal(0, 1, n_obs)

    # Hidden driver shared by 'momentum' and (at half weight) 'sentiment'.
    latent = noise()
    # Dict values are evaluated left to right, which fixes the order of the
    # random draws: momentum noise, value, sentiment noise.
    dfF = pd.DataFrame(
        {
            'momentum': latent + noise(),
            'value': noise(),
            'sentiment': 0.5*latent + noise(),
        },
        index=panel,
    )

    # Forward returns load weakly on momentum; the 0.01 factor scales them
    # to daily-return-like magnitudes.
    raw_ret = 0.04*dfF['momentum'] + noise()
    dfR = pd.DataFrame({'fwd_ret': raw_ret*0.01}, index=panel)
    return dfF, dfR

def _read_panel(path, bad_columns_msg):
    """Read a long-format CSV and index it by (date, symbol), sorted.

    Raises:
        ValueError: with ``bad_columns_msg`` if the file lacks a 'date' or
            'symbol' column.
    """
    df = pd.read_csv(path)
    if not {'date','symbol'}.issubset(df.columns):
        raise ValueError(bad_columns_msg)
    df['date'] = pd.to_datetime(df['date'])
    return df.set_index(['date','symbol']).sort_index()


def load_factors_returns():
    """Load the factor and forward-return panels, or fall back to demo data.

    Reads FACTORS_CSV and RETURNS_CSV. If either file is missing, the
    synthetic panel from ``_demo()`` is returned immediately, without
    touching the file that does exist, so a lone or malformed file can no
    longer block the fallback.

    When the returns file has no RETURNS_COL column it is derived:
      * from 'ret': each symbol's next-row return (shift by -1), or
      * from 'price': per-symbol percentage change, then the same shift.

    Returns:
        (dfF, dfR): both indexed by (date, symbol); ``dfR`` contains only
        the RETURNS_COL column.

    Raises:
        ValueError: if a file lacks 'date'/'symbol' columns, or the returns
            file has none of RETURNS_COL, 'ret', 'price'.
    """
    if not (os.path.exists(FACTORS_CSV) and os.path.exists(RETURNS_CSV)):
        return _demo()

    dfF = _read_panel(
        FACTORS_CSV,
        'factors.csv must have columns: date, symbol, <factors...>',
    )
    dfR = _read_panel(
        RETURNS_CSV,
        'returns.csv must have columns: date, symbol, <fwd_ret or ret/price>',
    )

    if RETURNS_COL not in dfR.columns:
        # derive fwd returns heuristically
        if 'ret' not in dfR.columns:
            if 'price' not in dfR.columns:
                raise ValueError('RETURNS_CSV missing fwd_ret/ret/price columns')
            # Wide (date x symbol) price table -> per-symbol simple returns.
            pr = dfR['price'].unstack('symbol').sort_index()
            rets = pr.pct_change().stack().rename('ret') # type: ignore
            dfR = dfR.join(rets, how='left')
        # The index is sorted, so within a symbol the next row is the next date.
        dfR[RETURNS_COL] = dfR['ret'].groupby(level='symbol').shift(-1)
    return dfF, dfR[[RETURNS_COL]]

# Load both panels (CSV files or demo fallback) and collect the factor names.
dfF, dfR = load_factors_returns()
_key_cols = ('date','symbol')
factors = [col for col in dfF.columns if col not in _key_cols]
_total = len(factors)
print('Loaded factors:', factors[:6], '... (total', _total, ')')
print('Obs:', len(dfF))


## Daily IC per Factor

In [None]:
def daily_ic(dfF, dfR, factor):
    """Compute the per-date Spearman rank correlation of a factor with returns.

    Args:
        dfF: factor frame indexed by (date, symbol); must contain ``factor``.
        dfR: returns frame indexed by (date, symbol); its LAST column is
            used as the return to correlate against.
        factor: name of the factor column in ``dfF``.

    Returns:
        A float Series indexed by date, sorted by date. A date is omitted
        when it has 3 or fewer rows with both values present, or when
        either variable is constant across those rows (rank correlation is
        undefined there). The Series is empty (dtype float) if no date
        qualifies.
    """
    # Join once and group afterwards; filtering dfR separately for every
    # date costs a full scan of the returns frame per date.
    joined = dfF[[factor]].join(dfR, how='inner')
    ics = {}
    for d, G in joined.groupby(level='date'):
        x = G[factor].to_numpy(dtype=float)
        y = G.iloc[:, -1].to_numpy(dtype=float)  # last column = fwd_ret
        mask = ~(np.isnan(x) | np.isnan(y))
        if mask.sum() <= 3:
            continue
        x, y = x[mask], y[mask]
        # Check for variation AFTER masking, on exactly the data that is
        # correlated; max > min is an exact "not constant" test.
        if x.max() > x.min() and y.max() > y.min():
            ic, _ = spearmanr(x, y)
            ics[d] = ic
    return pd.Series(ics, dtype=float).sort_index()

# One daily IC series per factor, combined into a table with one column per factor.
ICs = {name: daily_ic(dfF, dfR, name) for name in factors}
ICdf = pd.DataFrame(ICs)
ICdf.plot(title='Daily IC by Factor')
plt.axhline(0, linestyle='--')  # zero reference line
plt.show()
ICdf.tail()

## IC Summary (mean, std, ICIR)

In [None]:
# Per-factor mean and standard deviation of daily IC, and their ratio (ICIR).
ic_mean, ic_std = ICdf.mean(), ICdf.std()
# The small constant keeps the ratio finite when a factor's IC never varies.
ic_ir = ic_mean.div(ic_std.add(1e-12))
summary = pd.DataFrame(
    {'mean_ic': ic_mean, 'ic_std': ic_std, 'ic_ir': ic_ir}
).sort_values('ic_ir', ascending=False)
summary.round(3)

## Rolling Mean IC

In [None]:
# Require at least max(5, ROLL_WIN//5) observations per window, but never
# more than the window itself: pandas raises ValueError when
# min_periods > window, which the uncapped value would hit for ROLL_WIN < 5.
_min_obs = min(ROLL_WIN, max(5, ROLL_WIN//5))
roll = ICdf.rolling(ROLL_WIN, min_periods=_min_obs).mean()
roll.plot(title=f'Rolling Mean IC (window={ROLL_WIN})'); plt.axhline(0, linestyle='--'); plt.show()

## IC Decay (1..MAX_LAG days ahead)

In [None]:
def ic_decay(dfF, dfR, factor, max_lag=5):
    """Mean daily IC of a factor against returns at increasing horizons.

    For lag L the return column is shifted, per symbol, by -(L-1) rows, so
    lag 1 uses the returns exactly as given and lag L pairs the factor at a
    row with the return stored L-1 rows later for the same symbol.
    (If ``dfR`` already holds next-period returns, lag L is then the return
    L periods ahead.) The shift is positional: it assumes each symbol's
    rows are in chronological order without gaps.

    Args:
        dfF: factor frame indexed by (date, symbol).
        dfR: returns frame indexed by (date, symbol). The LAST column is
            used, matching the column ``daily_ic`` reads.
        factor: name of the factor column in ``dfF``.
        max_lag: largest lag to evaluate (lags 1..max_lag).

    Returns:
        dict mapping lag -> mean of the daily IC series for that lag.
    """
    # Shift the same column daily_ic correlates against (the last one);
    # shifting column 0 would leave the correlated column untouched
    # whenever dfR has more than one column.
    by_symbol = dfR.iloc[:, -1].groupby(level='symbol')
    out = {}
    for L in range(1, max_lag+1):
        dfLag = by_symbol.shift(-(L-1)).to_frame()
        out[L] = daily_ic(dfF, dfLag, factor).mean()
    return out

# Rows = factors, columns = lag_1 .. lag_MAX_LAG, values = mean IC at that lag.
_lag_labels = [f'lag_{i}' for i in range(1, MAX_LAG+1)]
_decay_by_factor = {f: ic_decay(dfF, dfR, f, MAX_LAG) for f in factors}
decay_tbl = pd.DataFrame(_decay_by_factor).T
decay_tbl.columns = _lag_labels
decay_tbl.round(3)

## Quintile Portfolios & Q5–Q1 Long–Short

In [None]:
def quintile_portfolio(dfF, dfR, factor, n_buckets=5):
    """Cumulative mean return of factor-sorted buckets.

    On each date the symbols are sorted by ``factor`` (ascending) and cut
    into ``n_buckets`` near-equal groups; the mean of the LAST column of
    ``dfR`` is recorded per group. Rows with any missing value are dropped,
    and dates with fewer than ``n_buckets`` remaining rows are skipped.

    Args:
        dfF: factor frame indexed by (date, symbol).
        dfR: returns frame indexed by (date, symbol).
        factor: name of the factor column in ``dfF``.
        n_buckets: number of groups per date.

    Returns:
        dict ``{'Q1': Series, ..., 'Q<n_buckets>': Series}``; each Series is
        the running sum of that bucket's per-date mean return, indexed by
        the 0-based position of the date among the dates that were kept.
        Q1 holds the lowest factor values. Empty dict if no date qualifies.
    """
    buckets = {i: [] for i in range(1, n_buckets+1)}
    # Join once and group by date instead of scanning dfR for every date.
    joined = dfF[[factor]].join(dfR, how='inner').dropna()
    for _, G in joined.groupby(level='date'):
        if len(G) < n_buckets:
            continue
        # Split the plain return array: np.array_split on a DataFrame goes
        # through a deprecated pandas code path.
        fwd = G.sort_values(factor).iloc[:, -1].to_numpy(dtype=float)
        for i, part in enumerate(np.array_split(fwd, n_buckets), start=1):
            buckets[i].append(part.mean())  # mean fwd_ret of bucket
    # series per bucket
    return {f'Q{i}': pd.Series(v).cumsum() for i, v in buckets.items() if len(v)>0}

# Run the bucket backtest on the first factor and plot the results.
factor_ = factors[0] if len(factors)>0 else None
# Compare against None so a falsy-but-valid column name is not rejected.
if factor_ is not None:
    ser = quintile_portfolio(dfF, dfR, factor_, N_BUCKETS)
    if ser:
        dfq = pd.DataFrame(ser)
        dfq.plot(title=f'Quintile Cum. Returns (factor={factor_})'); plt.show()
        top_ = f'Q{N_BUCKETS}'
        if 'Q1' in dfq.columns and top_ in dfq.columns:
            # Long the top bucket, short the bottom one.
            ls = dfq[top_] - dfq['Q1']
            # Name the actual top bucket in the title; it is only Q5 when N_BUCKETS == 5.
            plt.figure(); plt.plot(ls.index, ls.values); plt.title(f'Long–Short ({top_} - Q1) Cum. Return'); plt.show() # type: ignore
    else:
        print('Insufficient data for quintiles.')
else:
    print('No factors found.')

## IC Heatmap (factors × time)

In [None]:
# Heatmap of daily IC: one row per factor, one column per date position.
_n_cols = len(ICdf.columns)
if _n_cols > 0:
    _fig_width = min(12, 1+_n_cols*0.8)
    plt.figure(figsize=(_fig_width, 5))
    # Missing ICs are drawn as 0; transpose so factors run along the y-axis.
    _heat = ICdf.fillna(0.0).to_numpy().T
    plt.imshow(_heat, aspect='auto', interpolation='nearest')
    plt.colorbar(label='IC')
    plt.yticks(range(_n_cols), ICdf.columns) # type: ignore
    plt.title('IC Heatmap (factors x time)')
    plt.xlabel('time index')
    plt.tight_layout()
    plt.show()