
# Chapter 3 — Merge Prices & Sentiment, Build TA Features

This notebook prepares the integrated daily panels per ticker:
- **Prices** (Open, High, Low, Close, AdjClose, Volume)
- **Technical indicators**: RSI(14), MACD(12,26,9), Signal, Histogram
- **Aggregated sentiment** per day: mean s*, abs-max s*, news count
- **Lagged sentiment features** for Chapter 4 (k = 0..2)
- **Optional** market controls (SPY for both tickers; XLK for AAPL; XLE for XOM; VIX is a placeholder and is not implemented in this notebook)

**Inputs (expected)**
- Outputs of `02_sentiment_models.ipynb`: `./outputs/*_inference.csv`
- Optional news CSVs with a `time` column for post-close mapping

**Outputs**
- `./outputs/panel_AAPL.csv`
- `./outputs/panel_XOM.csv`
- Figures (sanity checks) in `./figures/`


## 0. Environment (Colab-friendly)

In [None]:

# If running in Colab, uncomment:
# !pip install -q yfinance==0.2.40 pandas==2.2.2 numpy==1.26.4 matplotlib==3.8.4
# !pip install -q ta==0.11.0  # optional, we compute RSI/MACD manually anyway


## 1. Imports & Config

In [None]:

import os
from pathlib import Path
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from datetime import datetime

# --- Project directories (relative to the notebook's working directory) ---
DATA_DIR = Path('./data')
OUT_DIR = Path('./outputs')
FIG_DIR = Path('./figures')

# Create the writable directories up front so later cells can save into them.
for d in (OUT_DIR, FIG_DIR):
    d.mkdir(parents=True, exist_ok=True)

# --- Universe and sample window ---
TICKERS = ['AAPL', 'XOM']
# Optional control symbols; each one maps to itself.
MARKET_CONTROLS = {symbol: symbol for symbol in ('SPY', 'XLK', 'XLE')}
START = '2024-09-01'
END = '2024-12-31'

print("Config OK")


## 2. Technical Indicators (RSI & MACD)

In [None]:

def compute_RSI(close: pd.Series, n:int=14) -> pd.Series:
    """Relative Strength Index from simple rolling means of gains and losses.

    Parameters
    ----------
    close : pd.Series
        Price series in chronological order.
    n : int, default 14
        Window length (number of price changes) for the rolling means.

    Returns
    -------
    pd.Series
        RSI aligned with ``close``. The first ``n`` entries are NaN because a
        full window of ``n`` price changes is required (``min_periods=n``).

    Notes
    -----
    Averages are plain rolling means (not exponentially smoothed). A window
    with zero average loss divides by zero and evaluates to 100; a window with
    neither gains nor losses evaluates to NaN.
    """
    change = close.diff()
    # Split each change into its positive part (gain) and the magnitude of its
    # negative part (loss), then average each over the window.
    avg_gain = change.clip(lower=0).rolling(n, min_periods=n).mean()
    avg_loss = (-change.clip(upper=0)).rolling(n, min_periods=n).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def compute_MACD(close: pd.Series, fast:int=12, slow:int=26, signal:int=9):
    """MACD line, signal line and histogram from exponential moving averages.

    Parameters
    ----------
    close : pd.Series
        Price series in chronological order.
    fast, slow : int
        EMA spans whose difference forms the MACD line (fast minus slow).
    signal : int
        EMA span applied to the MACD line to obtain the signal line.

    Returns
    -------
    tuple of pd.Series
        ``(macd, signal_line, histogram)`` where
        ``histogram = macd - signal_line``. All three are aligned with
        ``close``.
    """
    def _ema(series: pd.Series, span: int) -> pd.Series:
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    macd_line = _ema(close, fast) - _ema(close, slow)
    signal_line = _ema(macd_line, signal)
    return macd_line, signal_line, macd_line - signal_line


## 3. Download Prices (AAPL/XOM)

In [None]:

def load_prices(ticker: str, start=START, end=END) -> pd.DataFrame:
    """Download daily prices for one ticker and attach indicator columns.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    start, end : str
        Date bounds forwarded to ``yf.download``.

    Returns
    -------
    pd.DataFrame
        One row per downloaded date with a string ``date`` column
        (``YYYY-MM-DD``), the standardised price columns that were present
        (``Open``, ``High``, ``Low``, ``Close``, ``AdjClose``, ``Volume``),
        plus ``RSI``, ``MACD``, ``Signal``, ``Hist``, ``ret`` (log return of
        ``Close``), ``dRSI``, ``dMACD`` and ``ticker``.

    Raises
    ------
    ValueError
        If the download returns no rows, or the result lacks a ``Close`` or
        ``date`` column after standardisation. Without this check the failure
        would surface later as an opaque ``KeyError``.
    """
    df = yf.download(ticker, start=start, end=end, auto_adjust=False, progress=False)
    if df is None or df.empty:
        raise ValueError(
            f"No price data returned for {ticker!r} between {start} and {end}"
        )

    # Flatten potential multiindex columns (defensive): ('Close', 'AAPL') -> 'Close_AAPL'
    df.columns = ['_'.join(c) if isinstance(c, tuple) else c for c in df.columns]

    # Standardize names: keep only the field part before the first underscore.
    rename_map = {}
    for c in df.columns:
        base = c.split('_')[0].capitalize()
        if base in ['Open','High','Low','Close','Volume']:
            rename_map[c] = base
        elif base == 'Adjclose' or c.lower().startswith('adj close'):
            rename_map[c] = 'AdjClose'
    df = df.rename(columns=rename_map).reset_index().rename(columns={'Date':'date'})

    missing = [col for col in ('date', 'Close') if col not in df.columns]
    if missing:
        raise ValueError(
            f"Downloaded data for {ticker!r} is missing required column(s) {missing}; "
            f"got {list(df.columns)}"
        )

    # Dates are stored as plain 'YYYY-MM-DD' strings so they can be merged
    # with the daily sentiment table on equal keys.
    df['date'] = pd.to_datetime(df['date']).dt.date.astype(str)

    # Indicators (computed on the Close column)
    df['RSI'] = compute_RSI(df['Close'])
    macd, sig, hist = compute_MACD(df['Close'])
    df['MACD'] = macd
    df['Signal'] = sig
    df['Hist'] = hist
    df['ret'] = np.log(df['Close'] / df['Close'].shift(1))
    df['dRSI'] = df['RSI'].diff()
    df['dMACD'] = df['MACD'].diff()
    df['ticker'] = ticker
    return df

# Fetch each configured ticker once; keys follow the order of TICKERS.
prices = dict(zip(TICKERS, map(load_prices, TICKERS)))
# Cell output: the first three rows of every downloaded frame.
{t: frame.head(3) for t, frame in prices.items()}


## 4. Load Sentiment Inference & Aggregate by Day

In [None]:

from glob import glob

def find_inference_files():
    """Return the paths of all ``*_inference.csv`` files in ``OUT_DIR``.

    Returns
    -------
    list of str
        Matching paths in sorted order. ``glob`` itself gives no ordering
        guarantee, and a later cell falls back to "the first available" file,
        so sorting keeps that choice reproducible across machines and runs.
    """
    return sorted(glob(str(OUT_DIR / "*_inference.csv")))

def aggregate_daily(inf_csv: Path) -> pd.DataFrame:
    """Aggregate per-article sentiment predictions into one row per day and ticker.

    The CSV is read with ``pd.read_csv`` and must provide the columns
    ``date``, ``ticker``, ``pred``, ``proba_negative``, ``proba_neutral`` and
    ``proba_positive``.

    For every article the signed confidence ``s*`` is the largest of the three
    class probabilities, multiplied by -1 for ``pred == 'negative'``, +1 for
    ``pred == 'positive'`` and 0 for anything else.

    Parameters
    ----------
    inf_csv : Path
        Location of the inference CSV (anything ``pd.read_csv`` accepts).

    Returns
    -------
    pd.DataFrame
        Columns ``date`` (``YYYY-MM-DD`` string), ``ticker``, ``s_mean``
        (mean of ``s*``), ``s_absmax`` (largest ``|s*|``) and ``n_news``
        (number of rows), with exactly one row per (date, ticker).
    """
    dfp = pd.read_csv(inf_csv)

    # Normalise to calendar days BEFORE grouping. Grouping on the raw strings
    # would keep '2024-09-03 09:30' and '2024-09-03 15:30' as separate groups,
    # yielding duplicate (date, ticker) keys that multiply rows in later merges.
    # Rows whose date cannot be represented (NaT) are dropped, as the groupby
    # would have dropped missing keys anyway.
    dfp = dfp.assign(date=pd.to_datetime(dfp['date'])).dropna(subset=['date'])
    dfp = dfp.assign(date=dfp['date'].dt.date.astype(str))

    # Build signed confidence s* (vectorised instead of a row-wise apply).
    sign = dfp['pred'].map({'negative': -1.0, 'positive': 1.0}).fillna(0.0)
    strength = dfp[['proba_negative', 'proba_neutral', 'proba_positive']].max(axis=1)
    s_star = sign * strength
    dfp = dfp.assign(s_star=s_star, s_abs=s_star.abs())

    daily = (dfp.groupby(['date','ticker'], as_index=False)
                .agg(s_mean=('s_star','mean'),
                     s_absmax=('s_abs','max'),
                     n_news=('s_star','size')))
    return daily

# Build one daily sentiment table per model, keyed by the file stem
# with the '_inference' suffix removed.
inf_files = find_inference_files()
print("Found inference files:", inf_files)
daily_agg = {}
for f in inf_files:
    name = Path(f).stem.replace('_inference','')
    daily_agg[name] = aggregate_daily(Path(f))

# Choose the sentiment source: the preferred model when present, otherwise the
# first table that was loaded, otherwise an empty frame with the expected columns.
PREFERRED = 'finbert-prosus'
if PREFERRED in daily_agg:
    news_daily = daily_agg[PREFERRED]
else:
    k = next(iter(daily_agg), None)
    if k is None:
        news_daily = pd.DataFrame(columns=['date','ticker','s_mean','s_absmax','n_news'])
    else:
        news_daily = daily_agg[k]
news_daily.head(3)


## 5. Optional: Post-Close Mapping (if time is available)

In [None]:

# If your raw news CSVs include 'time' (HH:MM in exchange TZ), you can remap post-close news to T+1.
# Here we provide a placeholder function. Adjust 'close_hour' per exchange if needed.
def map_to_trading_date(date_str: str, time_str: str, close_hour:int=16) -> str:
    """Assign a news timestamp to the day whose prices it can first affect.

    Parameters
    ----------
    date_str : str
        Calendar date of the news item, parseable by ``pd.to_datetime``.
    time_str : str
        Time of day of the news item (e.g. ``'HH:MM'``).
    close_hour : int, default 16
        Hour at or after which the item counts as post-close.

    Returns
    -------
    str
        ``YYYY-MM-DD``: the same day when the item arrives before
        ``close_hour``, otherwise the next calendar day.

    Notes
    -----
    Only one calendar day is added; weekends and market holidays are not
    skipped, so a post-close Friday item maps to Saturday.
    """
    dt = pd.to_datetime(f"{date_str} {time_str}")
    if dt.hour >= close_hour:
        dt = dt + pd.Timedelta(days=1)
    # Timestamp.date() returns a plain datetime.date, which has no .astype();
    # isoformat() yields the 'YYYY-MM-DD' string used as the merge key elsewhere.
    return dt.date().isoformat()

# In this notebook we assume inference files already have 'date' aligned to trading date.


## 6. Merge Prices + Sentiment by Trading Date

In [None]:

def merge_panel(df_prices: pd.DataFrame, df_news_daily: pd.DataFrame) -> pd.DataFrame:
    """Left-join daily sentiment onto a price frame and add lagged sentiment columns.

    Parameters
    ----------
    df_prices : pd.DataFrame
        Price rows containing ``date`` and ``ticker`` columns.
    df_news_daily : pd.DataFrame
        Daily sentiment with ``date``, ``ticker``, ``s_mean``, ``s_absmax``
        and ``n_news`` columns.

    Returns
    -------
    pd.DataFrame
        Every row of ``df_prices``; days without news get 0 for the three
        sentiment columns. For each lag ``k`` in 0, 1, 2 the columns
        ``s_mean_lag{k}``, ``s_absmax_lag{k}`` and ``n_news_lag{k}`` are
        appended.

    Notes
    -----
    Lags are positional row shifts, so the rows are expected to be in
    chronological order. No rows are dropped: the first ``k`` values of each
    lag-``k`` column are NaN.
    """
    sentiment_cols = ['s_mean', 's_absmax', 'n_news']
    neutral_fill = {'s_mean': 0.0, 's_absmax': 0.0, 'n_news': 0}

    m = df_prices.merge(df_news_daily, on=['date','ticker'], how='left')
    # Days with no matching news row are treated as neutral / zero articles.
    m[sentiment_cols] = m[sentiment_cols].fillna(neutral_fill)

    # Lagged copies for Chapter 4, grouped by lag so the column order is
    # s_mean, s_absmax, n_news within each lag.
    for lag in range(3):
        for col in sentiment_cols:
            m[f'{col}_lag{lag}'] = m[col].shift(lag)
    return m

# Build and export one merged panel per ticker.
panel = {}
for t in TICKERS:
    # Work on copies so the cached price and sentiment frames stay untouched.
    news_t = news_daily.loc[news_daily['ticker'] == t].copy()
    p = prices[t].copy()
    panel[t] = merge_panel(p, news_t)

    outp = OUT_DIR / f'panel_{t}.csv'
    panel[t].to_csv(outp, index=False)
    print("[Saved]", outp)

panel['AAPL'].head(5)


## 7. (Optional) Market Controls (SPY/XLK/XLE)

In [None]:

def add_controls(panel_df: pd.DataFrame, ctrl_ticker: str, col_prefix:str) -> pd.DataFrame:
    """Append the daily log return of a control instrument to a panel.

    Parameters
    ----------
    panel_df : pd.DataFrame
        Panel with a string ``date`` column (``YYYY-MM-DD``).
    ctrl_ticker : str
        Symbol of the control instrument, downloaded over ``START``..``END``.
    col_prefix : str
        Prefix of the new column, which is named ``f'{col_prefix}_ret'``.

    Returns
    -------
    pd.DataFrame
        ``panel_df`` left-joined on ``date`` with the control's log return of
        ``AdjClose``. If the target column already exists (e.g. the cell is
        re-run) it is replaced rather than duplicated.

    Raises
    ------
    ValueError
        If the download is empty or contains no adjusted-close column.
    """
    target_col = f'{col_prefix}_ret'

    # auto_adjust=False mirrors load_prices. NOTE(review): newer yfinance
    # releases are understood to default to adjusted prices and omit
    # 'Adj Close' unless this is passed explicitly; confirm against the
    # installed version.
    ctrl = yf.download(ctrl_ticker, start=START, end=END, auto_adjust=False, progress=False)
    if ctrl is None or ctrl.empty:
        raise ValueError(
            f"No price data returned for control {ctrl_ticker!r} between {START} and {END}"
        )

    # Flatten potential multiindex columns (defensive, as in load_prices):
    # keep the first level, assumed to carry the price-field names.
    if isinstance(ctrl.columns, pd.MultiIndex):
        ctrl.columns = ctrl.columns.get_level_values(0)

    ctrl = ctrl.rename(columns={'Adj Close':'AdjClose'}).reset_index().rename(columns={'Date':'date'})
    if 'AdjClose' not in ctrl.columns or 'date' not in ctrl.columns:
        raise ValueError(
            f"Control download for {ctrl_ticker!r} lacks 'Adj Close'/'Date'; "
            f"got {list(ctrl.columns)}"
        )

    ctrl['date'] = pd.to_datetime(ctrl['date']).dt.date.astype(str)
    ctrl['ret_ctrl'] = np.log(ctrl['AdjClose'] / ctrl['AdjClose'].shift(1))
    ctrl = ctrl[['date','ret_ctrl']].rename(columns={'ret_ctrl': target_col})

    # Drop a stale copy of the target column so repeated calls stay idempotent.
    base = panel_df.drop(columns=[target_col], errors='ignore')
    return base.merge(ctrl, on='date', how='left')

# SPY is the broad-market control for both tickers; the sector ETF differs
# per ticker (XLK for AAPL, XLE for XOM).
for t, sector_etf in [('AAPL', 'XLK'), ('XOM', 'XLE')]:
    panel[t] = add_controls(panel[t], 'SPY', 'SPY')
    panel[t] = add_controls(panel[t], sector_etf, 'Sector')

# Overwrite the exported panels so the CSVs include the control columns.
for t in TICKERS:
    outp = OUT_DIR / f'panel_{t}.csv'
    panel[t].to_csv(outp, index=False)
    print("[Updated]", outp)


## 8. Sanity-Check Plots

In [None]:

def sanity_plot(panel_df: pd.DataFrame, ticker:str, fig_path:Path):
    """Save a close-price chart with one marker per day, sized by sentiment strength.

    Parameters
    ----------
    panel_df : pd.DataFrame
        Panel with ``date``, ``Close`` and ``s_absmax`` columns.
    ticker : str
        Symbol shown in the figure title.
    fig_path : Path
        Destination of the saved image (written at 200 dpi).

    The figure is closed after saving, so nothing is displayed inline.
    """
    dates = pd.to_datetime(panel_df['date'])
    close = panel_df['Close']
    # Marker area grows with |s_absmax|; the +10 offset keeps days without
    # news visible. Colour is left at the matplotlib default.
    marker_sizes = (panel_df['s_absmax'].abs().fillna(0.0) * 200) + 10

    fig, ax1 = plt.subplots(figsize=(10,4))
    ax1.plot(dates, close)
    ax1.scatter(dates, close, s=marker_sizes)
    ax1.set_title(f'{ticker}: Close with daily sentiment markers')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Close')
    ax1.grid(True)
    fig.tight_layout()
    fig.savefig(fig_path, dpi=200)
    plt.close(fig)

# One sanity figure per ticker, saved as <TICKER>_close_sentiment.png in FIG_DIR.
for t in ('AAPL', 'XOM'):
    sanity_plot(panel[t], t, FIG_DIR/f'{t}_close_sentiment.png')
print("Saved sanity plots to ./figures")
