
# Chapter 4 — Depth of Influence: Lag Correlations, HAC Regressions, Event Study

This notebook performs the **core quantitative analysis**:
- Lagged correlations between daily sentiment indices and changes in RSI/MACD;
- Distributed-lag regressions (OLS with HAC / Newey–West covariance);
- Event study around **strong news days** using a simple market model.

**Inputs (expected)** from previous notebooks:
- `./outputs/panel_AAPL.csv`
- `./outputs/panel_XOM.csv`


## 0. Environment (Colab-friendly)

In [None]:

# If running in Colab, uncomment:
# !pip install -q pandas==2.2.2 numpy==1.26.4 statsmodels==0.14.2 matplotlib==3.8.4


## 1. Imports & Config

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import statsmodels.api as sm

# Working directories: panels are read from OUT_DIR, figures are written to
# FIG_DIR and CSV tables to TAB_DIR.
OUT_DIR, FIG_DIR, TAB_DIR = map(Path, ('./outputs', './figures', './tables'))

# Create any missing directory up front so later to_csv/savefig calls succeed.
for d in (OUT_DIR, FIG_DIR, TAB_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Tickers analysed in this notebook and the sentiment lags of interest.
TICKERS = ['AAPL','XOM']
LAGS = [0,1,2]


## 2. Load Panels

In [None]:

def load_panel(ticker: str) -> pd.DataFrame:
    """Read ``panel_<ticker>.csv`` from ``OUT_DIR`` in chronological order.

    The file must contain a ``date`` column; it is parsed with
    ``pd.to_datetime``, the rows are sorted by it, and the index is reset to
    0..n-1.
    """
    csv_path = OUT_DIR / f'panel_{ticker}.csv'
    frame = pd.read_csv(csv_path)
    frame['date'] = pd.to_datetime(frame['date'])
    return frame.sort_values('date').reset_index(drop=True)

# Load every ticker's panel once; later cells look panels up by ticker symbol.
panels = dict(zip(TICKERS, map(load_panel, TICKERS)))
# Notebook preview of the first rows of one panel.
panels['AAPL'].head(3)


## 3. Lagged Correlations

In [None]:

def lagged_correlations(df: pd.DataFrame, sentiment_col: str, target_col: str, max_lag:int=2):
    """Correlate a sentiment column with a target column at several row offsets.

    For each ``k`` in ``-max_lag..max_lag``:

    * ``k > 0``: the sentiment series is shifted down by ``k`` rows, so each
      target value is paired with the sentiment from ``k`` rows earlier;
    * ``k < 0``: the target series is shifted down by ``|k|`` rows, so each
      sentiment value is paired with the target from ``|k|`` rows earlier;
    * ``k == 0``: both series are used as they are.

    Pairs containing a non-finite value are dropped. The correlation is
    reported as NaN unless more than three pairs remain.

    Returns a DataFrame with columns ``lag`` and ``corr``, one row per ``k``.
    """
    records = []
    for k in range(-max_lag, max_lag+1):
        sentiment, target = df[sentiment_col], df[target_col]
        # At most one of the two series is shifted for any given k.
        xs = sentiment.shift(k) if k > 0 else sentiment
        ys = target.shift(-k) if k < 0 else target
        x, y = xs.values, ys.values
        ok = np.isfinite(x) & np.isfinite(y)
        if ok.sum() > 3:
            r = np.corrcoef(x[ok], y[ok])[0,1]
        else:
            r = np.nan
        records.append((k, r))
    return pd.DataFrame(records, columns=['lag','corr'])

def plot_lagcorr(df_res: pd.DataFrame, title: str, path: Path):
    """Save a stem plot of correlation versus lag.

    Parameters
    ----------
    df_res : DataFrame with ``lag`` and ``corr`` columns.
    title : Figure title.
    path : Destination image file; written at 200 dpi.
    """
    plt.figure(figsize=(6,4))
    # ``use_line_collection`` is no longer accepted by ``stem`` in the
    # Matplotlib release this notebook pins (3.8.x); passing it raises
    # TypeError. Line collections are the only behaviour now.
    plt.stem(df_res['lag'], df_res['corr'])
    plt.axhline(0, linestyle='--')
    plt.title(title)
    plt.xlabel('Lag (k)')
    plt.ylabel('Correlation')
    plt.tight_layout()
    plt.savefig(path, dpi=200)
    # Close the figure so repeated calls in a loop do not accumulate figures.
    plt.close()

# For every ticker, target and sentiment measure: compute the lagged
# correlations, store them as a CSV table and save the matching stem plot.
for t in TICKERS:
    df = panels[t]
    combos = [(target, s_col)
              for target in ('dRSI','dMACD','ret')
              for s_col in ('s_mean','s_absmax','n_news')]
    for target, s_col in combos:
        res = lagged_correlations(df, s_col, target, max_lag=2)
        table_path = TAB_DIR / f'lagcorr_{t}_{s_col}_{target}.csv'
        figure_path = FIG_DIR / f'lagcorr_{t}_{s_col}_{target}.png'
        res.to_csv(table_path, index=False)
        plot_lagcorr(res, f'{t} Lag Corr: {s_col} → {target}', figure_path)


## 4. Distributed-Lag Regressions (HAC/Newey–West)

In [None]:

def ols_hac(df: pd.DataFrame, y_col: str, s_prefix: str='s_mean', lags=(0,1,2), controls: list=None, nw_maxlags:int=2):
    """Fit a distributed-lag OLS regression with a HAC covariance estimate.

    Parameters
    ----------
    df : Panel that already contains the columns ``<s_prefix>_lag<k>`` for
        every ``k`` in ``lags``, plus ``y_col`` and any ``controls``.
    y_col : Name of the dependent-variable column.
    s_prefix : Prefix of the lagged sentiment regressors.
    lags : Iterable of lag numbers selecting the regressor columns. The
        default is an immutable tuple so it cannot be shared and mutated
        between calls.
    controls : Optional list of extra regressor column names.
    nw_maxlags : ``maxlags`` passed to the HAC covariance estimator.

    Returns
    -------
    (res, X_cols) : the fitted statsmodels results object and the list of
        regressor names (without the constant), in design-matrix order.

    Rows with a missing value in the dependent variable or any regressor are
    excluded from the fit.
    """
    X_cols = [f'{s_prefix}_lag{k}' for k in lags]
    if controls:
        X_cols.extend(controls)
    # has_constant='add' always adds an intercept column named 'const'.
    X = sm.add_constant(df[X_cols].copy(), has_constant='add')
    y = df[y_col].copy()
    mask = X.notna().all(axis=1) & y.notna()
    model = sm.OLS(y[mask], X[mask])
    res = model.fit(cov_type='HAC', cov_kwds={'maxlags': nw_maxlags})
    return res, X_cols

def collect_table(res, X_cols, model_name: str, ticker: str, y_col: str) -> pd.DataFrame:
    """Turn a fitted regression result into a tidy coefficient table.

    One row is produced for ``const`` and for each name in ``X_cols``, with
    the coefficient, standard error and p-value read from ``res.params``,
    ``res.bse`` and ``res.pvalues``. The model name, ticker, target column,
    adjusted R-squared and observation count are repeated on every row.
    """
    terms = ['const'] + X_cols
    columns = {'term': terms}
    for label, series in (('coef', res.params), ('se', res.bse), ('pval', res.pvalues)):
        # reindex keeps the rows in `terms` order; absent terms become NaN.
        columns[label] = series.reindex(terms).values
    table = pd.DataFrame(columns)
    table['model'] = model_name
    table['ticker'] = ticker
    table['target'] = y_col
    table['adjR2'] = res.rsquared_adj
    table['n'] = int(res.nobs)
    return table

# Fit one distributed-lag regression per (ticker, target, sentiment measure)
# and stack the coefficient tables into a single summary CSV.
summary_rows = []
for t in TICKERS:
    df = panels[t].copy()
    # Market and sector returns are used as controls only when the panel has them.
    controls = [c for c in ('SPY_ret', 'Sector_ret') if c in df.columns]

    for y_col in ['dRSI','dMACD','ret']:
        for s_prefix in ['s_mean','s_absmax']:
            # Use the notebook-level LAGS setting instead of repeating the
            # literal list, so the configured lags and the fitted lags agree.
            res, Xc = ols_hac(df, y_col=y_col, s_prefix=s_prefix, lags=LAGS, controls=controls, nw_maxlags=2)
            tab = collect_table(res, Xc, f'HAC_DLR_{s_prefix}', t, y_col)
            summary_rows.append(tab)

summary = pd.concat(summary_rows, ignore_index=True)
summary.to_csv(TAB_DIR/'dlr_hac_summary.csv', index=False)

def plot_dlr_betas(df_tab: pd.DataFrame, ticker: str, target: str, s_prefix: str, path: Path):
    """Plot the lag coefficients of one regression with 1.96*SE error bars.

    Rows of ``df_tab`` are selected by ticker, target and the model name
    ``HAC_DLR_<s_prefix>``; only terms containing ``<s_prefix>_lag`` are kept.
    The lag number is parsed from the term name and used as the x-axis. The
    figure is saved to ``path`` at 200 dpi.
    """
    model_name = f'HAC_DLR_{s_prefix}'
    wanted = (df_tab['ticker']==ticker)&(df_tab['target']==target)&(df_tab['model']==model_name)
    d = df_tab[wanted].copy()
    # Drop the constant and control terms; keep only the sentiment lags.
    is_lag_term = d['term'].str.contains(f'{s_prefix}_lag')
    d = d[is_lag_term]
    d['k'] = d['term'].str.extract(r'lag(\d+)').astype(int)
    d = d.sort_values('k')

    lag_numbers = d['k'].values
    betas = d['coef'].values
    std_errs = d['se'].values

    plt.figure(figsize=(6,4))
    plt.errorbar(lag_numbers, betas, yerr=1.96*std_errs, fmt='o-')
    plt.axhline(0, linestyle='--')
    plt.title(f'{ticker} Betas (HAC) — {s_prefix} → {target}')
    plt.xlabel('Lag k')
    plt.ylabel('Beta')
    plt.tight_layout()
    plt.savefig(path, dpi=200)
    plt.close()

# One beta plot per ticker, target and sentiment measure.
for t in TICKERS:
    for target in ('dRSI','dMACD','ret'):
        figure_paths = {
            's_mean': FIG_DIR/f'betas_{t}_{target}_smean.png',
            's_absmax': FIG_DIR/f'betas_{t}_{target}_sabsmax.png',
        }
        for s_prefix, fig_path in figure_paths.items():
            plot_dlr_betas(summary, t, target, s_prefix, fig_path)


## 5. Event Study (Strong News Days)

In [None]:

def event_study_simple(df: pd.DataFrame, strength_col: str='s_absmax', pct: float=0.8, win=(-1,2), market_col: str='SPY_ret'):
    """Run a simple event study around the rows with the strongest news signal.

    Parameters
    ----------
    df : Panel with ``date``, ``ret`` and ``strength_col`` columns, one row
        per period in chronological order. NOTE: the columns ``ret_hat`` and
        ``AR`` are written into ``df`` itself, so pass a copy if the caller's
        frame must stay untouched.
    strength_col : Column whose absolute value ranks the rows.
    pct : Quantile of ``|strength_col|``; rows at or above it are events.
    win : ``(first, last)`` inclusive row offsets around each event.
    market_col : Market-return column for the market model. When it is
        ``None`` or absent from ``df``, the expected return is 0 and the
        abnormal return equals ``ret``.

    Returns
    -------
    (car_df, ev) : ``car_df`` has columns ``rel_day``, ``CAR_pos`` and
        ``CAR_neg``, the running sums over the window of the mean abnormal
        return for events with positive and non-positive strength. ``ev``
        lists each event's ``date`` and ``strength``.
    """
    empty_car = pd.DataFrame(columns=['rel_day','CAR_pos','CAR_neg'])

    strength = df[strength_col]
    abs_strength = strength.abs()
    thr = abs_strength.quantile(pct)
    # Work with row POSITIONS, not index labels: the window arithmetic below
    # (pos + tau) is only meaningful positionally, and labels would be wrong
    # for any frame whose index is not 0..n-1.
    ev_pos = np.flatnonzero((abs_strength >= thr).to_numpy())
    if ev_pos.size == 0:
        return empty_car, pd.DataFrame()

    if market_col is not None and market_col in df.columns:
        # Market model fitted on the full sample, with missing returns set to 0.
        X = sm.add_constant(df[market_col].fillna(0.0))
        y = df['ret'].fillna(0.0)
        mm = sm.OLS(y, X).fit()
        df['ret_hat'] = mm.predict(X)
    else:
        df['ret_hat'] = 0.0

    df['AR'] = df['ret'] - df['ret_hat']

    ar = df['AR'].to_numpy()
    strength_vals = strength.to_numpy()
    n_rows = len(df)

    rows = []
    for pos in ev_pos:
        pos = int(pos)
        is_positive = 1 if strength_vals[pos] > 0 else 0
        for tau in range(win[0], win[1]+1):
            j = pos + tau
            # Skip window rows that fall outside the sample.
            if 0 <= j < n_rows:
                rows.append({'event_idx': pos, 'rel_day': tau, 'AR': ar[j],
                             'pos_event': is_positive})

    ev = pd.DataFrame({
        'date': df['date'].iloc[ev_pos].reset_index(drop=True),
        'strength': strength.iloc[ev_pos].reset_index(drop=True),
    })
    es = pd.DataFrame(rows)
    if es.empty:
        return empty_car, ev

    # Mean abnormal return per (relative day, event sign), as a plain dict so
    # that a missing combination can fall back to 0.0 below.
    mean_ar = es.groupby(['rel_day','pos_event'])['AR'].mean().to_dict()
    days = sorted(es['rel_day'].unique())

    car = {}
    for flag, name in ((1, 'CAR_pos'), (0, 'CAR_neg')):
        per_day = [mean_ar.get((d, flag), 0.0) for d in days]
        car[name] = np.cumsum(per_day)

    car_df = pd.DataFrame({'rel_day': days, 'CAR_pos': car['CAR_pos'], 'CAR_neg': car['CAR_neg']})
    return car_df, ev

def plot_car(car_df: pd.DataFrame, title: str, path: Path):
    """Save a line plot of ``CAR_pos`` and ``CAR_neg`` against ``rel_day``.

    Both series are drawn with circle markers and a legend; a dashed line
    marks zero. The figure is written to ``path`` at 200 dpi and then closed.
    """
    plt.figure(figsize=(6,4))
    rel_days = car_df['rel_day']
    series_specs = (('CAR_pos', 'Positive events'), ('CAR_neg', 'Negative events'))
    for column, legend_label in series_specs:
        plt.plot(rel_days, car_df[column], marker='o', label=legend_label)
    plt.axhline(0, linestyle='--')
    plt.title(title)
    plt.xlabel('Event window (days)')
    plt.ylabel('Mean CAR')
    plt.legend()
    plt.tight_layout()
    plt.savefig(path, dpi=200)
    plt.close()

# Run the event study per ticker; always store the tables, and draw the CAR
# figure only when there is something to plot.
for t in TICKERS:
    df = panels[t].copy()
    # Fall back to no market model when the panel has no SPY return column.
    mkt_col = 'SPY_ret' if 'SPY_ret' in df.columns else None
    car, evtab = event_study_simple(df, strength_col='s_absmax', pct=0.8, win=(-1,2),
                                    market_col=mkt_col)
    car.to_csv(TAB_DIR/f'event_car_{t}.csv', index=False)
    evtab.to_csv(TAB_DIR/f'event_dates_{t}.csv', index=False)
    if car.empty:
        continue
    plot_car(car, f'{t} Event Study (Top-Quantile |s_absmax|)', FIG_DIR/f'event_car_{t}.png')
