
# 🗞️ Sentiment Models — Research & Backtest

_Date generated: 2025-09-03_

End-to-end notebook for prototyping **news / transcript sentiment** signals and testing whether they predict future returns.

**What you get**
- Data loaders (CSV) with **synthetic fallback**
- Simple **lexicon** rule-based sentiment
- Lightweight **Multinomial Naive Bayes** text model (NumPy only; no scikit-learn dependency)
- Optional **transformer** inference (if `transformers` is available locally)
- Aggregation to **daily ticker sentiment**
- Predictive tests: **IC**, **long/short bucket**, and **event study**


## 0) Parameters

In [None]:

# Expected CSVs (optional). Each loader checks whether its path exists and
# falls back to synthetic data when it does not.
# news.csv columns: date, ticker, title, body (an optional integer `label`
#   column is used for training when present; otherwise labels are derived
#   from the lexicon score)
# transcripts.csv columns: date, ticker, speaker, text
# returns.csv columns: date, TICK1, TICK2, ... (daily returns in decimal)
PATH_NEWS = "data/news.csv"
PATH_TRANS = "data/transcripts.csv"
PATH_RETURNS = "data/returns.csv"

# Controls
TOP_K_WORDS = 5000          # vocab size for NB model (most frequent tokens kept)
MIN_DOCS_PER_TICKER = 10    # event study skips tickers with fewer non-null signal rows
EVENT_WINDOW = 5            # ± business days around events for event study
IC_HORIZON = 1              # rows the returns are shifted for the IC test (1 = next day)


## 1) Setup & Helpers

In [None]:

import os, re, math, random, warnings
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-wide settings: silence library warnings and render floats with
# thousands separators and four decimals.
warnings.filterwarnings("ignore")
pd.set_option("display.float_format", "{:,.4f}".format)

# Seed both the stdlib and the NumPy global generators so reruns are repeatable.
random.seed(7)
np.random.seed(7)

# Basic tokenizer: a token is a run of ASCII letters and apostrophes.
_word_re = re.compile(r"[A-Za-z']+")
def tokenize(text: str) -> List[str]:
    """Split *text* into lower-cased word tokens.

    Anything that is not a ``str`` (for example ``None`` or a float NaN coming
    out of a DataFrame cell) yields an empty list instead of raising.
    """
    if isinstance(text, str):
        return list(map(str.lower, _word_re.findall(text)))
    return []

def train_test_split(df, test_frac=0.2, seed=7):
    """Randomly partition the rows of *df* into a train and a test frame.

    Row positions are shuffled with a NumPy generator seeded by *seed*; the
    first ``int(len(df) * (1 - test_frac))`` shuffled positions form the train
    frame and the remainder the test frame. Returns ``(train, test)``.
    """
    n_rows = len(df)
    order = np.arange(n_rows)
    np.random.default_rng(seed).shuffle(order)
    n_train = int(n_rows * (1 - test_frac))
    train_pos, test_pos = order[:n_train], order[n_train:]
    return df.iloc[train_pos], df.iloc[test_pos]


## 2) Load Data (CSV or synthetic)

In [None]:

def load_news(path=PATH_NEWS) -> pd.DataFrame:
    """Return the news table, read from *path* or synthesized when it is absent.

    When *path* exists it is read as CSV with ``date`` parsed as datetimes and
    returned as-is. Otherwise a deterministic synthetic corpus is built with
    columns ``date``, ``ticker``, ``title``, ``body`` and ``label``
    (1 = positive item, 0 = negative item).
    """
    if os.path.exists(path):
        df = pd.read_csv(path, parse_dates=["date"])
        return df
    # Synthetic news: 220 business days, a handful of tickers, templated phrases.
    dates = pd.bdate_range("2024-01-01", periods=220)
    tickers = ["AAPL","MSFT","AMZN","GOOG","TSLA","META","NFLX","NVDA"]
    pos_phr = ["beats expectations", "strong demand", "surge", "upgrade", "record revenue", "partnership"]
    neg_phr = ["misses estimates", "probe", "downgrade", "weak guidance", "lawsuit", "recall"]
    rows = []
    # Fixed seed: the corpus is identical on every run. The order of the rng
    # calls below determines the generated text, so do not reorder them.
    rng = np.random.default_rng(1)
    for d in dates:
        # 1-3 distinct tickers get a story on each day.
        for t in rng.choice(tickers, size=rng.integers(1, 4), replace=False):
            # Slight positive skew: 55% positive vs 45% negative items.
            polarity = rng.choice([1, -1], p=[0.55, 0.45])
            if polarity == 1:
                title = f"{t} {rng.choice(pos_phr)}"
                body = f"{t} reports {rng.choice(['robust','solid','accelerating'])} growth and {rng.choice(pos_phr)}."
            else:
                title = f"{t} {rng.choice(neg_phr)}"
                body = f"{t} faces {rng.choice(['headwinds','concerns','slowing'])} and {rng.choice(neg_phr)}."
            # The label records the polarity used to generate the text.
            rows.append({"date": d, "ticker": t, "title": title, "body": body, "label": 1 if polarity==1 else 0})
    df = pd.DataFrame(rows)
    return df

def load_transcripts(path=PATH_TRANS) -> pd.DataFrame:
    """Return the transcript table, read from *path* or synthesized when absent.

    When *path* exists it is read as CSV with ``date`` parsed as datetimes.
    Otherwise a deterministic synthetic set of utterances is built with columns
    ``date``, ``ticker``, ``speaker`` and ``text`` (no label column).
    """
    if os.path.exists(path):
        return pd.read_csv(path, parse_dates=["date"])
    # Synthetic transcripts: neutral to mildly polarized
    dates = pd.bdate_range("2024-01-01", periods=60, freq='B')
    tickers = ["AAPL","MSFT","AMZN","GOOG","TSLA","META","NFLX","NVDA"]
    speakers = ["CEO","CFO","Analyst"]
    rows = []
    # Fixed seed: output is identical on every run. The order of the rng calls
    # below determines the generated rows, so do not reorder them.
    rng = np.random.default_rng(2)
    phrases_pos = ["confident", "pipeline strong", "margin expansion", "tailwinds"]
    phrases_neg = ["cautious", "pressure", "uncertain", "headwinds"]
    for d in dates:
        # 1-3 distinct tickers per day, each with 1-2 utterances.
        for t in rng.choice(tickers, size=rng.integers(1,4), replace=False):
            for _ in range(rng.integers(1,3)):
                # Even split between an upbeat and a cautious sentence.
                pol = rng.choice([1,0], p=[0.5, 0.5])
                if pol:
                    text = f"We are {rng.choice(phrases_pos)} and see {rng.choice(phrases_pos)} ahead."
                else:
                    text = f"We remain {rng.choice(phrases_neg)} due to {rng.choice(phrases_neg)}."
                rows.append({"date": d, "ticker": t, "speaker": rng.choice(speakers), "text": text})
    return pd.DataFrame(rows)

def load_returns(path=PATH_RETURNS, n_assets=8, days=260) -> pd.DataFrame:
    """Return daily returns indexed by date, one column per ticker.

    When *path* exists it is read as CSV, indexed by its ``date`` column and
    sorted chronologically; *n_assets* and *days* are ignored in that case.
    Otherwise independent Gaussian returns are simulated for *n_assets*
    tickers over *days* business days starting 2024-01-01.

    Parameters
    ----------
    path : CSV location to try first.
    n_assets : number of synthetic columns. The first eight use the same
        tickers as the synthetic news/transcripts; any beyond that are named
        ``TICK9``, ``TICK10``, ...
    days : number of synthetic business days.

    Raises
    ------
    ValueError
        If synthetic data is needed and *n_assets* is smaller than 1.
    """
    if os.path.exists(path):
        return pd.read_csv(path, parse_dates=["date"]).set_index("date").sort_index()
    if n_assets < 1:
        raise ValueError("n_assets must be at least 1")
    universe = ["AAPL","MSFT","AMZN","GOOG","TSLA","META","NFLX","NVDA"]
    # Honour n_assets (it used to be silently ignored). With the default of 8
    # the columns, and therefore the simulated values, are unchanged.
    cols = universe[:n_assets] + [f"TICK{i}" for i in range(len(universe) + 1, n_assets + 1)]
    rng = np.random.default_rng(3)
    dates = pd.bdate_range("2024-01-01", periods=days)
    # Per-asset drift and volatility, then one independent series per asset.
    # Series are drawn asset by asset so the random stream stays stable.
    mu = rng.normal(0.0005, 0.0002, size=len(cols))
    sd = rng.uniform(0.01, 0.02, size=len(cols))
    data = [rng.normal(m, s, size=len(dates)) for m, s in zip(mu, sd)]
    return pd.DataFrame(np.array(data).T, index=dates, columns=cols)

# Materialize the three inputs once; every later cell reads these names.
news = load_news()
trans = load_transcripts()
rets = load_returns()

# Cell output: the first rows of each table as a quick sanity check.
news.head(), trans.head(), rets.head()


## 3) Lexicon Sentiment (rule-based)

In [None]:

# Hand-picked polarity word lists; matching is done on lower-cased tokens.
POS = {'beat','beats','strong','surge','upgrade','record','growth','bullish','confident','tailwinds','expansion'}
NEG = {'miss','misses','probe','downgrade','weak','lawsuit','recall','headwinds','uncertain','pressure','cautious'}

def lexicon_score(text: str) -> float:
    """Return the net polarity of *text* in [-1, 1].

    The score is ``(positive hits - negative hits) / (total hits)``; text with
    no lexicon hits at all scores exactly 0.0.
    """
    n_pos = 0
    n_neg = 0
    for tok in tokenize(text):
        if tok in POS:
            n_pos += 1
        if tok in NEG:
            n_neg += 1
    total = n_pos + n_neg
    if total == 0:
        return 0.0
    return (n_pos - n_neg) / max(1, total)

def apply_lexicon(df: pd.DataFrame, text_cols: List[str]) -> pd.Series:
    """Score every row of *df* with the lexicon model.

    The values of those *text_cols* that actually exist in *df* are converted
    with ``str`` and joined with single spaces before scoring. Returns a Series
    of scores aligned to ``df.index``.
    """
    present = [c for c in text_cols if c in df.columns]

    def _row_text(row) -> str:
        return " ".join(str(row[c]) for c in present)

    scores = [lexicon_score(_row_text(row)) for _, row in df.iterrows()]
    return pd.Series(scores, index=df.index)

# Attach the rule-based score to both corpora: headline plus body for news,
# the utterance text for transcripts.
news["lexicon_score"] = apply_lexicon(news, ["title", "body"])
trans["lexicon_score"] = apply_lexicon(trans, ["text"])
# Cell output: preview of the scored news rows.
news[["date","ticker","title","lexicon_score"]].head()


## 4) Lightweight Text Model — Multinomial Naive Bayes

In [None]:

class MiniCountVectorizer:
    """Minimal bag-of-words vectorizer built on the notebook's ``tokenize``.

    After ``fit``:
      * ``id2tok_`` lists the kept tokens, most frequent first;
      * ``vocab_`` maps each kept token to its column index.
    """

    def __init__(self, max_features=5000):
        self.max_features = max_features
        self.vocab_ = {}
        self.id2tok_ = []

    def fit(self, texts: List[str]):
        """Learn the ``max_features`` most frequent tokens of *texts*; returns ``self``."""
        freq = Counter(tok for doc in texts for tok in tokenize(doc))
        self.id2tok_ = [tok for tok, _ in freq.most_common(self.max_features)]
        self.vocab_ = {tok: col for col, tok in enumerate(self.id2tok_)}
        return self

    def transform(self, texts: List[str]) -> np.ndarray:
        """Return a float32 ``(len(texts), len(vocab_))`` matrix of token counts.

        Tokens that are not in the learned vocabulary are ignored.
        """
        X = np.zeros((len(texts), len(self.vocab_)), dtype=np.float32)
        column_of = self.vocab_.get
        # Each `row` is a view into X, so the in-place increment updates X.
        for row, doc in zip(X, texts):
            for tok in tokenize(doc):
                col = column_of(tok)
                if col is not None:
                    row[col] += 1.0
        return X

class MultinomialNB:
    """Multinomial Naive Bayes on count features with additive smoothing.

    After ``fit``:
      * ``class_log_prior_`` holds log P(c) for classes 0..max(y);
      * ``feature_log_prob_`` holds log P(w | c), one row per class.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_log_prior_ = None
        self.feature_log_prob_ = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Estimate priors and smoothed per-class token distributions; returns ``self``.

        *y* must hold non-negative integer class ids.
        """
        n_classes = int(y.max()) + 1
        self.class_log_prior_ = np.log(np.bincount(y) / len(y))

        def _token_distribution(c):
            # Laplace smoothing: add alpha to every token count of class c.
            totals = X[y == c].sum(axis=0) + self.alpha
            return totals / totals.sum()

        per_class = [_token_distribution(c) for c in range(n_classes)]
        self.feature_log_prob_ = np.log(np.vstack(per_class))
        return self

    def predict_log_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the unnormalized joint log-likelihood, one column per class."""
        # log P(c) + sum x_i * log P(w_i|c)
        log_prior = self.class_log_prior_[None, :]  # type: ignore
        return log_prior + X @ self.feature_log_prob_.T  # type: ignore

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return class probabilities (rows sum to 1) via a stable softmax."""
        scores = self.predict_log_proba(X)
        # Subtract the row maximum before exponentiating to avoid overflow.
        shifted = scores - scores.max(axis=1, keepdims=True)
        weights = np.exp(shifted)
        return weights / weights.sum(axis=1, keepdims=True)

# Use synthetic labels in news if present; otherwise weak labels via lexicon
if "label" not in news.columns:
    news["label"] = (news["lexicon_score"] > 0).astype(int)

# Hold out 20% of the documents; the vocabulary is learned on the train split
# only so that no test-set tokens leak into the model.
train_df, test_df = train_test_split(news, test_frac=0.2, seed=7)
train_text = (train_df["title"] + " " + train_df["body"]).tolist()
test_text = (test_df["title"] + " " + test_df["body"]).tolist()

vec = MiniCountVectorizer(max_features=TOP_K_WORDS).fit(train_text)
Xtr = vec.transform(train_text)
Xte = vec.transform(test_text)
ytr = train_df["label"].astype(int).values
yte = test_df["label"].astype(int).values

nb = MultinomialNB(alpha=0.5).fit(Xtr, ytr) # type: ignore
# Column 1 is the probability of the positive class (label == 1).
proba = nb.predict_proba(Xte)[:,1]
pred = (proba >= 0.5).astype(int)

acc = (pred==yte).mean()
# No scikit-learn import here: the notebook deliberately avoids that dependency
# and scores AUC with the manual helper defined in this cell.
# Manual AUC (fallback) to avoid sklearn dep
def auc_approx(y_true, y_score, n_bins=100):
    """Approximate the ROC AUC by sweeping *n_bins* thresholds over [0, 1].

    Parameters
    ----------
    y_true : array-like of labels; entries equal to 1 are positives and
        entries equal to 0 are negatives.
    y_score : array-like of scores, expected to lie in [0, 1] because the
        thresholds only span that interval.
    n_bins : number of evenly spaced thresholds.

    Returns
    -------
    float
        Trapezoidal area under the ROC curve: 1.0 for a perfect ranking,
        about 0.5 for an uninformative one, 0.0 for a perfectly inverted one.
        ``nan`` when either class is absent, since the curve is undefined.
    """
    y = np.asarray(y_true)
    scores = np.asarray(y_score, dtype=float)
    is_pos = y == 1
    is_neg = y == 0
    n_pos = int(is_pos.sum())
    n_neg = int(is_neg.sum())
    if n_pos == 0 or n_neg == 0:
        return float("nan")

    thresholds = np.linspace(0, 1, n_bins)
    # flagged[i, j] is True when sample j is predicted positive at threshold i.
    flagged = scores[None, :] >= thresholds[:, None]
    tpr = (flagged & is_pos[None, :]).sum(axis=1) / n_pos
    fpr = (flagged & is_neg[None, :]).sum(axis=1) / n_neg

    # Pin the curve to (1, 1) and (0, 0). Without the (0, 0) end point, scores
    # equal to exactly 1.0 would leave the last segment out of the integral.
    tpr = np.concatenate(([1.0], tpr, [0.0]))
    fpr = np.concatenate(([1.0], fpr, [0.0]))

    # Thresholds ascend, so FPR descends; taking fpr[i] - fpr[i + 1] keeps each
    # trapezoid width non-negative and the area comes out as the AUC itself
    # (not 1 - AUC).
    widths = fpr[:-1] - fpr[1:]
    heights = 0.5 * (tpr[:-1] + tpr[1:])
    return float(np.sum(widths * heights))

# Score the held-out probabilities with auc_approx.
auc = auc_approx(yte, proba)
# Cell output: (accuracy, auc) for the held-out split.
acc, auc


### Precision vs. Recall by Threshold (approx)

In [None]:

# Sweep 31 evenly spaced probability cut-offs and record precision and recall
# of the positive class at each one (both fall back to 0 when undefined).
ths = np.linspace(0,1,31)
actual_pos = (yte==1)
prec, rec = [], []
for th in ths:
    flagged = (proba>=th)
    tp = (flagged & actual_pos).sum()
    fn = ((proba<th) & actual_pos).sum()
    prec.append(tp/max(1,flagged.sum()))
    rec.append(tp/max(1,tp+fn))

plt.figure(figsize=(8,3.2))
plt.plot(rec, prec)
plt.title("Precision vs Recall (threshold sweep)")
plt.tight_layout()
plt.show()


## 5) Optional Transformer Inference

In [None]:

# Optional dependency probe. Both packages are needed for inference, so a
# missing torch is treated the same as a missing transformers instead of
# crashing this optional cell. The broad except is deliberate: a broken
# install can raise more than ImportError at import time.
HAS_TRANS = False
try:
    import transformers  # noqa
    import torch  # noqa
    HAS_TRANS = True
except Exception:
    HAS_TRANS = False

if HAS_TRANS:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    # NOTE: Requires model to be available locally (no internet). If not, this block will be skipped.
    model_name = os.environ.get("FINBERT_MODEL_PATH", "")  # e.g., a local dir of ProsusAI/finbert
    transformer_ok = False
    if model_name and os.path.exists(model_name):
        try:
            tok = AutoTokenizer.from_pretrained(model_name)
            mdl = AutoModelForSequenceClassification.from_pretrained(model_name)
            mdl.eval()
            transformer_ok = True
        except Exception as exc:
            # Best-effort: report why loading failed, then skip.
            print("Could not load transformer model:", exc)
            transformer_ok = False

    if transformer_ok:
        # Score a small preview batch of held-out documents.
        texts = (test_df["title"] + ". " + test_df["body"]).tolist()[:64]
        with torch.no_grad():
            enc = tok(texts, padding=True, truncation=True, return_tensors="pt")
            out = mdl(**enc)
            probs = out.logits.softmax(-1).numpy()

        # Resolve the positive/negative columns from the model's own label map
        # rather than assuming a fixed order: checkpoints differ, and guessing
        # wrong silently yields e.g. "neutral - positive" instead of a
        # sentiment score.
        label_cols = {
            str(name).lower(): int(col)
            for col, name in (getattr(mdl.config, "id2label", None) or {}).items()
        }
        pos_col = next((c for name, c in label_cols.items() if name.startswith("pos")), None)
        neg_col = next((c for name, c in label_cols.items() if name.startswith("neg")), None)
        if pos_col is None or neg_col is None:
            # Generic labels (LABEL_0, ...): fall back to the [neg, ..., pos] guess.
            print("Model config has no positive/negative label names; assuming [neg, ..., pos] order.")
            neg_col, pos_col = 0, probs.shape[1] - 1
        finbert_sent = probs[:, pos_col] - probs[:, neg_col]
        print("Transformer sentiment preview (size):", finbert_sent.shape)
    else:
        print("Transformers installed but no local model path set; skipping.")
else:
    print("Transformers/torch not available; skipping.")


## 6) Aggregate Daily Sentiment per Ticker

In [None]:

# Use NB probability as signal (fallback to lexicon when unavailable).
# Only the held-out documents are scored, so the signal is out-of-sample.
test_df = test_df.copy()
test_df["prob_pos"] = nb.predict_proba(vec.transform((test_df["title"] + " " + test_df["body"]).tolist()))[:,1]
# Map P(positive) from [0, 1] to [-1, 1] so that 0 means neutral. That puts it
# on the same scale as the lexicon fallback and makes the 0.0 fill for no-news
# days a neutral value rather than the most bearish possible score.
nb_signal = 2.0 * test_df["prob_pos"] - 1.0
test_df["sentiment"] = nb_signal.where(nb_signal.notna(), test_df["lexicon_score"])

# Average per (date, ticker), pivot tickers into columns, align to the return
# calendar, and treat ticker-days without news as neutral.
daily_sent = test_df.groupby(["date","ticker"])["sentiment"].mean().unstack().reindex(rets.index).fillna(0.0)
daily_sent.tail()


## 7) Predictive Tests — IC & Long/Short Buckets

In [None]:

# Row t of next_ret holds the return from IC_HORIZON rows later, so a signal
# dated t is compared with a return it could not have seen. For horizons above
# 1 this is the single-period return at that lag, not a cumulative return.
next_ret = rets.shift(-IC_HORIZON)
# Cross-sectional IC (Spearman approx via rank corr)
def cs_ic(signal_row, return_row):
    """Rank correlation between one date's signals and its forward returns.

    Both arguments are Series indexed by ticker. Missing values are dropped
    and only tickers present in both are used; with fewer than four of them
    the result is NaN. A tiny constant in the denominator keeps a cross-section
    with constant ranks from dividing by zero (it yields 0 instead).
    """
    sig = signal_row.dropna()
    fwd = return_row.dropna()
    shared = sig.index.intersection(fwd.index)
    if len(shared) < 4:
        return np.nan
    sig_rank = sig.loc[shared].rank()
    fwd_rank = fwd.loc[shared].rank()
    rank_cov = np.cov(sig_rank, fwd_rank)[0, 1]
    scale = sig_rank.std(ddof=1) * fwd_rank.std(ddof=1) + 1e-9
    return rank_cov / scale

# One IC per date: that day's cross-section of sentiment against the aligned
# forward returns.
ic_series = pd.Series(
    [cs_ic(daily_sent.loc[dt], next_ret.loc[dt]) for dt in daily_sent.index],
    index=daily_sent.index,
)

plt.figure(figsize=(10,3))
ic_series.plot()
plt.axhline(0, linestyle='--')
plt.title("Daily Information Coefficient (sentiment → next returns)")
plt.tight_layout()
plt.show()

# Summary: mean IC and its information ratio (mean / std); NaN days are ignored.
ic_mean = float(np.nanmean(ic_series))
ic_vol = float(np.nanstd(ic_series)+1e-9)
print("IC mean =", ic_mean, "  IC IR =", ic_mean/ic_vol)


### Long/Short Buckets

In [None]:

def build_ls(signal: pd.DataFrame, returns: pd.DataFrame, top_q=0.8, bot_q=0.2):
    """Daily returns of a quantile long/short portfolio driven by *signal*.

    For each date, names at or above the *top_q* quantile of that day's
    non-missing signals are held long (equal weight) and names at or below the
    *bot_q* quantile are held short (equal weight). Weights are then demeaned
    and rescaled to unit gross exposure. Weights formed on one row are applied
    to the next row of *returns*; the first row therefore earns 0.
    """
    weights = pd.DataFrame(0.0, index=signal.index, columns=signal.columns)
    for day, scores in signal.iterrows():
        live = scores.dropna()
        if live.empty:
            continue
        upper = live.quantile(top_q)
        lower = live.quantile(bot_q)
        long_names = live.index[live >= upper]
        short_names = live.index[live <= lower]

        w = pd.Series(0.0, index=signal.columns)
        if len(long_names) > 0:
            w.loc[long_names] = 1.0 / len(long_names)
        # Assigned second on purpose: a name in both buckets ends up short.
        if len(short_names) > 0:
            w.loc[short_names] = -1.0 / len(short_names)
        # demean to be quasi-market neutral, then scale to unit gross exposure
        if w.abs().sum() > 0:
            w = w - w.mean()
            w = w / w.abs().sum()
        weights.loc[day] = w
    # Lag the weights one row so each day's P&L uses the prior row's signal.
    lagged = weights.shift(1)
    r = lagged.mul(returns).sum(axis=1).fillna(0.0)
    return r

# Compound the daily long/short returns into an equity curve that starts at
# (1 + first return) and plot it.
r_ls = build_ls(daily_sent, rets)
equity = r_ls.add(1).cumprod()

plt.figure(figsize=(10,3))
equity.plot()
plt.title("Sentiment L/S Equity Curve (no costs)")
plt.tight_layout()
plt.show()


## 8) Event Study (Top/Bottom Sentiment Days)

In [None]:

def event_study(signal: pd.DataFrame, returns: pd.DataFrame, window=EVENT_WINDOW, top_q=0.95, bot_q=0.05):
    """Cumulative average returns around extreme-sentiment days.

    For every ticker with at least ``MIN_DOCS_PER_TICKER`` non-missing signal
    values, dates at or above the ticker's *top_q* signal quantile are positive
    events and dates at or below its *bot_q* quantile are negative events.
    Returns are collected from *window* business days before to *window*
    business days after each event, averaged per relative day, and cumulated.

    Returns
    -------
    DataFrame indexed by ``rel`` (relative business day) with columns
    ``car_pos`` and ``car_neg``. The frame has the same layout, with no rows,
    when no event produced any observation.
    """
    # Identify top and bottom quantile events per ticker
    events = []
    for t in signal.columns:
        if t not in returns.columns:
            continue  # no return series to study for this ticker
        s = signal[t].dropna()
        if len(s) < MIN_DOCS_PER_TICKER:
            continue
        hi_days = s.index[s >= s.quantile(top_q)]
        lo_days = s.index[s <= s.quantile(bot_q)]
        events.extend((t, dt, 1) for dt in hi_days)
        events.extend((t, dt, -1) for dt in lo_days)

    # Build windows. The offsets are the same for every event, so create them once.
    offsets = [(rd, pd.tseries.offsets.BDay(rd)) for rd in range(-window, window + 1)]
    records = []
    for t, dt, sign in events:
        for rd, offset in offsets:
            dtk = dt + offset
            # Offsets that fall outside the return calendar are skipped.
            if dtk in returns.index:
                records.append({"rel": rd, "ret": returns.loc[dtk, t], "sign": sign})

    if not records:
        # Same layout as the populated result so callers need no special case.
        return pd.DataFrame(
            columns=["car_pos", "car_neg"],
            index=pd.Index([], name="rel"),
            dtype=float,
        )

    df = pd.DataFrame(records)
    # Cumulative average returns for pos/neg
    pos = df[df["sign"] == 1].groupby("rel")["ret"].mean().cumsum()
    neg = df[df["sign"] == -1].groupby("rel")["ret"].mean().cumsum()
    res = pd.DataFrame({"car_pos": pos, "car_neg": neg})
    res.index.name = "rel"
    return res

ev = event_study(daily_sent, rets, window=EVENT_WINDOW)
# Put every relative day on the axis; days without observations carry the last
# cumulative value forward, and leading gaps start the curve at zero.
ev = ev.reindex(range(-EVENT_WINDOW, EVENT_WINDOW+1))
# .ffill() replaces fillna(method="ffill"), which newer pandas deprecates.
ev = ev.ffill().fillna(0.0)

plt.figure(figsize=(8,3))
plt.plot(ev.index, ev["car_pos"], label="Top sentiment CAR")
plt.plot(ev.index, ev["car_neg"], label="Bottom sentiment CAR")
plt.title("Event Study: Cumulative Average Returns")
plt.legend(); plt.tight_layout(); plt.show()
