
# Chapter 3 — Experiments: Financial News Sentiment Models

This notebook implements the full experimental pipeline for **Sections 3.7–3.10** of the thesis:

- Baselines and domain models: **BERT-base** vs **FinBERT** (+ optional extra finance checkpoints)
- Unified metrics: **Macro-F1**, **Balanced Accuracy**, **Brier score**, **Confusion Matrix**
- **Probability calibration** (temperature scaling) + reliability diagram
- **Error analysis** (length buckets, entity/ticker presence, hard categories)
- Output: tables (CSV) and figures (PNG) under `./tables` and `./figures`

> **Datasets expected**: `./data/news_AAPL.csv` and `./data/news_XOM.csv` with columns at minimum:  
> `date` (YYYY-MM-DD), `headline` (str).  
> Optionally: `label` in {negative, neutral, positive} for evaluation. If `label` is absent, the notebook will run inference and skip metric computation.


## 0. Environment (Colab-friendly)

In [None]:

# If running in Colab, uncomment the next lines:
# !pip install -q transformers==4.43.3 datasets==2.20.0 accelerate==0.33.0
# !pip install -q torch --index-url https://download.pytorch.org/whl/cu121
# !pip install -q scikit-learn==1.3.2 matplotlib==3.8.4 pandas==2.2.2 seaborn==0.13.2
# !pip install -q tqdm==4.66.4


## 1. Imports & Global Config

In [None]:

import os, re, json, math, time
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score, classification_report, confusion_matrix, brier_score_loss
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Seed NumPy and torch once so that repeated runs draw the same random numbers.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Input and output locations, all relative to the working directory.
DATA_DIR = Path('./data')
FIG_DIR = Path('./figures')
TAB_DIR = Path('./tables')
OUT_DIR = Path('./outputs')
# Only the output folders are created here; DATA_DIR has to be provided.
for out_dir in (FIG_DIR, TAB_DIR, OUT_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Sentiment classes in index order, with lookups in both directions.
CLASSES = ['negative', 'neutral', 'positive']
LABEL2ID = {label: idx for idx, label in enumerate(CLASSES)}
ID2LABEL = dict(enumerate(CLASSES))

def has_labels(df: pd.DataFrame) -> bool:
    """Tell whether *df* has a ``label`` column holding at least one non-missing value."""
    if 'label' not in df.columns:
        return False
    n_labeled = df['label'].notna().sum()
    return n_labeled > 0

# Report CUDA visibility up front so a CPU-only run is not a surprise later.
print(
    "Environment ready. CUDA:",
    torch.cuda.is_available(),
)


## 2. Load & Prepare Data (AAPL/XOM)

In [None]:

def load_dataset(paths: List[Path]) -> pd.DataFrame:
    """Load and concatenate the news CSVs into one standardized DataFrame.

    Each existing file is read exactly once. A file without a ``ticker``
    column gets one inferred from its file name (AAPL / XOM / CVX, otherwise
    ``'UNK'``) *before* concatenation, so the tag stays attached to the right
    rows even when label filtering later drops some of them.

    Args:
        paths: CSV files to load. Missing files are reported and skipped.

    Returns:
        A DataFrame with a fresh 0..n-1 index. ``date`` (if present) is
        normalized to ``YYYY-MM-DD`` strings; ``label`` (if present) is
        lower-cased and stripped, and rows whose label is neither missing nor
        one of ``CLASSES`` are dropped.

    Raises:
        FileNotFoundError: If none of the given files exists.
        ValueError: If the combined data has no ``headline`` column.
    """
    def infer_ticker(fp: Path) -> str:
        name = fp.name.lower()
        for symbol in ('aapl', 'xom', 'cvx'):
            if symbol in name:
                return symbol.upper()
        return 'UNK'

    frames = []
    for p in paths:
        if not p.exists():
            print(f"[WARN] Missing file: {p}. Please provide it.")
            continue
        frame = pd.read_csv(p)
        # Tag per file: tagging after concat/filtering cannot be aligned
        # reliably and leaves NaN when only some files carry a ticker column.
        if 'ticker' not in frame.columns:
            frame['ticker'] = infer_ticker(p)
        frames.append(frame)
    if not frames:
        raise FileNotFoundError("No dataset files found. Please ensure CSVs are placed under ./data.")

    df_all = pd.concat(frames, ignore_index=True)
    if 'headline' not in df_all.columns:
        raise ValueError("Required column 'headline' is missing.")

    # Standardize columns
    if 'date' in df_all.columns:
        df_all['date'] = pd.to_datetime(df_all['date']).dt.date.astype(str)

    # Optional label normalization
    if 'label' in df_all.columns:
        raw = df_all['label']
        # Normalize only the non-missing entries; going through astype(str)
        # keeps this working when the column is entirely NaN (float dtype).
        df_all['label'] = raw.where(raw.isna(), raw.astype(str).str.lower().str.strip())
        keep = df_all['label'].isin(CLASSES) | df_all['label'].isna()
        df_all = df_all[keep].reset_index(drop=True)
    return df_all

# The two per-ticker headline files this chapter works with.
news_paths = [
    DATA_DIR / 'news_AAPL.csv',
    DATA_DIR / 'news_XOM.csv',
]
df = load_dataset(news_paths)
print("Loaded records:", len(df))
# Last expression of the cell: shows a three-row preview in the notebook.
df.head(3)


## 3. Models to Compare

In [None]:

# Checkpoints under comparison. "name" is the short tag used in output file
# names; "id" is the identifier handed to `from_pretrained`.
# NOTE(review): bert-base-uncased is presumably the plain pretrained encoder
# without a sentiment-tuned classification head — confirm before reading its
# scores as a meaningful baseline.
MODEL_SPECS = [
    {"name": short_name, "id": checkpoint}
    for short_name, checkpoint in (
        ("bert-base-uncased", "bert-base-uncased"),
        ("finbert-prosus", "ProsusAI/finbert"),
    )
]
# Optional extra finance model (uncomment to add a third)
# MODEL_SPECS.append({"name":"finbert-tone", "id":"yiyanghkust/finbert-tone"})
print("Configured models:", [spec['name'] for spec in MODEL_SPECS])


## 4. Inference Utilities

In [None]:

def build_pipeline(model_id: str, device: Optional[str]=None, max_length: int=64) -> TextClassificationPipeline:
    """Create a text-classification pipeline that returns a score for every class.

    Args:
        model_id: Checkpoint identifier passed to ``from_pretrained`` for both
            the tokenizer and the sequence-classification model.
        device: Device handed to the pipeline. When ``None``, device ``0`` is
            used if CUDA is available and ``-1`` otherwise.
        max_length: Inputs are truncated to this many tokens. The default of 64
            matches the value that used to be hard-coded.

    Returns:
        A ``TextClassificationPipeline`` configured with ``top_k=None``.
    """
    tok = AutoTokenizer.from_pretrained(model_id)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_id)
    if device is None:
        device = 0 if torch.cuda.is_available() else -1
    # `top_k=None` already requests the scores of all classes; the deprecated
    # `return_all_scores` flag that used to be passed alongside it is dropped.
    return TextClassificationPipeline(
        model=mdl,
        tokenizer=tok,
        device=device,
        top_k=None,
        truncation=True,
        max_length=max_length,
    )

def run_inference(pipe: TextClassificationPipeline, texts: List[str], batch_size: int=32) -> np.ndarray:
    """Score *texts* and return class probabilities ordered like ``CLASSES``.

    Model labels are matched by substring: a label containing ``neg`` counts
    as negative, one containing ``pos`` as positive, anything else as neutral.
    Scores that land on the same class are summed (previously the last one
    silently overwrote the others), and each row is renormalized to sum to 1.

    Args:
        pipe: Pipeline that returns, for each text, a list of
            ``{'label', 'score'}`` dicts.
        texts: Headlines to classify.
        batch_size: Number of texts sent to the pipeline per call.

    Returns:
        A float32 array of shape ``(len(texts), len(CLASSES))``. Empty input
        yields an array with zero rows.
    """
    import warnings

    n_classes = len(CLASSES)
    if len(texts) == 0:
        # np.vstack([]) raises, so return the empty result explicitly.
        return np.zeros((0, n_classes), dtype=np.float32)

    uniform = np.full(n_classes, 1.0 / n_classes, dtype=np.float32)
    unrecognized = set()
    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pass batch_size through: without it the pipeline is not asked to
        # batch the forward pass, however the input list was chunked.
        out = pipe(batch, batch_size=len(batch))
        for row in out:
            vec = np.zeros(n_classes, dtype=np.float32)
            for d in row:
                lab = d['label'].lower()
                if 'neg' in lab:
                    idx = LABEL2ID['negative']
                elif 'pos' in lab:
                    idx = LABEL2ID['positive']
                else:
                    idx = LABEL2ID['neutral']
                    if 'neu' not in lab:
                        unrecognized.add(d['label'])
                vec[idx] += float(d['score'])
            s = vec.sum()
            results.append(uniform.copy() if s <= 0 else vec / s)

    if unrecognized:
        # Generic names such as LABEL_0 mean the class mapping above is a guess.
        warnings.warn(
            f"Model labels {sorted(unrecognized)} were not recognized and were "
            "counted as 'neutral'; check that the checkpoint has a sentiment head.",
            RuntimeWarning,
        )
    return np.vstack(results)

def probs_to_labels(probs: np.ndarray) -> np.ndarray:
    """Return, for each row of *probs*, the column index of its largest value."""
    return np.argmax(probs, axis=1)


## 5. Metrics & Calibration

In [None]:

from sklearn.metrics import f1_score

def macro_f1(y_true, y_pred):
    """Return the F1 score of *y_pred* against *y_true*, macro-averaged over classes."""
    score = f1_score(y_true, y_pred, average='macro')
    return score

def brier_multiclass(y_true, probs):
    """Return the multiclass Brier score.

    This is the mean over samples of the squared distance between the
    predicted probability vector and the one-hot encoding of the true class.

    Args:
        y_true: Integer class indices, one per sample.
        probs: Predicted probabilities of shape ``(n_samples, n_classes)``.
            The number of classes is taken from this array, as in
            ``temperature_scale``, rather than from a module-level constant.

    Returns:
        The score as a Python float (0 is a perfect prediction).

    Raises:
        ValueError: If *probs* is not 2-D or its row count differs from
            ``len(y_true)``.
    """
    probs = np.asarray(probs, dtype=float)
    y_true = np.asarray(y_true, dtype=int)
    if probs.ndim != 2 or probs.shape[0] != y_true.shape[0]:
        # Without this check a length mismatch could broadcast silently.
        raise ValueError("probs must be 2-D with one row per entry of y_true.")
    one_hot = np.eye(probs.shape[1])[y_true]
    return float(np.mean(np.sum((probs - one_hot) ** 2, axis=1)))

def temperature_scale(probs: np.ndarray, y_val: np.ndarray, max_iter:int=1000, lr:float=0.1) -> float:
    """Fit a single softmax temperature by gradient descent on the validation NLL.

    The probabilities are turned into pseudo-logits with ``log`` and then
    rescaled as ``softmax(logits / T)``. ``T`` starts at 1.0 and is kept
    inside ``[0.05, 5.0]`` at every step, including the final one.

    Args:
        probs: Uncalibrated probabilities of shape ``(n_samples, n_classes)``.
        y_val: Integer class indices, one per row of *probs*.
        max_iter: Maximum number of gradient steps.
        lr: Step size.

    Returns:
        The fitted temperature. With no validation samples there is nothing
        to fit and the neutral value 1.0 is returned.

    Raises:
        ValueError: If *probs* and *y_val* have different numbers of rows.
    """
    t_min, t_max = 0.05, 5.0
    eps = 1e-12
    probs = np.asarray(probs, dtype=float)
    y_val = np.asarray(y_val, dtype=int)
    if probs.shape[0] != y_val.shape[0]:
        raise ValueError("probs and y_val must have the same number of rows.")
    if y_val.shape[0] == 0:
        return 1.0

    logits = np.log(np.clip(probs, eps, 1.0))
    # The one-hot targets do not depend on T, so build them once.
    y_oh = np.eye(probs.shape[1])[y_val]
    T = 1.0
    for _ in range(max_iter):
        scaled = logits / T
        e = np.exp(scaled - scaled.max(axis=1, keepdims=True))
        p = e / e.sum(axis=1, keepdims=True)
        # d(NLL)/dT: d(NLL)/d(scaled) = p - y and d(scaled)/dT = -scaled / T.
        grad = float(np.sum((p - y_oh) * (-scaled / T)) / len(y_val))
        if not np.isfinite(grad):
            break
        # Clamp before testing convergence, so the returned value is always in
        # range and the loop stops as soon as T is pinned at a bound.
        T_new = max(t_min, min(t_max, T - lr * grad))
        converged = abs(T_new - T) < 1e-5
        T = T_new
        if converged:
            break
    return float(T)

def apply_temperature(probs: np.ndarray, T: float) -> np.ndarray:
    """Rescale each row of *probs* as ``softmax(log(probs) / T)``.

    Probabilities are clipped to ``[1e-12, 1]`` before the logarithm, and the
    row maximum is subtracted before exponentiating.
    """
    tiny = 1e-12
    z = np.log(np.clip(probs, tiny, 1.0)) / T
    shifted = z - z.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=1, keepdims=True)

def reliability_plot(y_true: np.ndarray, probs: np.ndarray, title:str, path:Path):
    """Save a one-vs-rest reliability diagram with one curve per class.

    For each entry of ``CLASSES``, the binary indicator "true class equals
    this class" is plotted against the matching probability column using 10
    bins. The figure is written to *path* at 200 dpi and then closed.
    """
    fig, ax = plt.subplots(figsize=(6,5))
    for class_idx, class_name in enumerate(CLASSES):
        is_class = (y_true == class_idx).astype(int)
        CalibrationDisplay.from_predictions(
            is_class,
            probs[:, class_idx],
            n_bins=10,
            ax=ax,
            name=class_name,
        )
    ax.set_title(title)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(path, dpi=200)
    plt.close(fig)


## 6. Train/Validation Split (for Calibration) & Evaluation

In [None]:

# Hold out 20% of the labeled rows, stratified by class, for fitting the
# calibration temperature. Without labels both splits are set to None.
if not has_labels(df):
    df_train, df_val = None, None
    print("No labels present — inference-only mode.")
else:
    df_labeled = df.dropna(subset=['label']).copy()
    y_all = df_labeled['label'].map(LABEL2ID).values
    train_idx, val_idx = train_test_split(
        np.arange(len(df_labeled)),
        test_size=0.2,
        random_state=SEED,
        stratify=y_all,
    )
    df_train = df_labeled.iloc[train_idx].reset_index(drop=True)
    df_val = df_labeled.iloc[val_idx].reset_index(drop=True)
    print(f"Labeled records: {len(df_labeled)} | Train: {len(df_train)} | Val: {len(df_val)}")


## 7. Main Loop: Inference, Metrics, Calibration, Error Analysis

In [None]:

# Keyword groups for the coarse error taxonomy; checked in order, first hit wins.
_ERROR_CATEGORIES = (
    ('regulatory/legal', ('investigation','lawsuit','fine','regulator','probe','antitrust','ban')),
    ('earnings/guidance', ('beats','misses','guidance','forecast','outlook','eps','revenue')),
    ('rumors/speculation', ('rumor','leak','reportedly','sources say')),
)
# Non-capturing group: same matches, without the pandas match-group warning.
_TICKER_PATTERN = r'\b(?:AAPL|XOM|CVX|Apple|Exxon|Chevron)\b'
# Open-ended last bin so very long headlines still land in '>120'.
_LEN_BINS = [0, 40, 80, 120, np.inf]
_LEN_LABELS = ['<=40','41-80','81-120','>120']


def tag_category(text: str) -> str:
    """Assign a headline to the first keyword group it matches, else 'general'.

    Matching is by lower-cased substring, so short keywords also hit longer
    words (e.g. 'ban' inside 'bank'); read the counts as a rough guide.
    """
    t = text.lower()
    for category, keywords in _ERROR_CATEGORIES:
        if any(w in t for w in keywords):
            return category
    return 'general'


def _balanced_accuracy(cm: np.ndarray) -> float:
    """Mean per-class recall from a confusion matrix whose rows are true classes."""
    support = cm.sum(axis=1)
    present = support > 0
    if not present.any():
        return float('nan')
    return float(np.mean(np.diag(cm)[present] / support[present]))


all_results = []

for spec in MODEL_SPECS:
    name, mid = spec['name'], spec['id']
    print(f"\n=== Model: {name} ({mid}) ===")
    pipe = build_pipeline(mid)

    # Inference over every headline, labeled or not.
    probs_all = run_inference(pipe, df['headline'].tolist(), batch_size=32)
    preds_all = probs_to_labels(probs_all)

    out = df[['date','ticker','headline']].copy()
    for i, c in enumerate(CLASSES):
        out[f'proba_{c}'] = probs_all[:, i]
    out['pred'] = [ID2LABEL[i] for i in preds_all]
    out_path = OUT_DIR / f'{name}_inference.csv'
    out.to_csv(out_path, index=False)
    print(f"[Saved] {out_path}")

    metrics_row = {"model": name}
    if df_train is not None:
        mask_all = df['label'].notna()
        y_true = df.loc[mask_all, 'label'].map(LABEL2ID).values
        probs_eval = probs_all[mask_all.values]
        y_pred = probs_eval.argmax(axis=1)

        # Raw (uncalibrated) metrics.
        m_f1 = macro_f1(y_true, y_pred)
        m_brier = brier_multiclass(y_true, probs_eval)
        cm = confusion_matrix(y_true, y_pred, labels=[0,1,2])

        metrics_row.update({
            "macro_f1_raw": m_f1,
            "balanced_acc_raw": _balanced_accuracy(cm),
            "brier_raw": m_brier,
        })

        rel_path = FIG_DIR / f'{name}_reliability_raw.png'
        reliability_plot(y_true, probs_eval, f'Reliability (raw) — {name}', rel_path)

        # Only the validation split is needed to fit T; the former inference
        # pass over df_train produced values that were never used.
        probs_val = run_inference(pipe, df_val['headline'].tolist(), batch_size=32)
        y_vl = df_val['label'].map(LABEL2ID).values

        T = temperature_scale(probs_val, y_vl, max_iter=800, lr=0.05)
        # NOTE(review): the calibrated metrics below are scored on all labeled
        # rows, which include the validation rows used to fit T. Consider
        # reporting them on the remaining rows only.
        probs_cal = apply_temperature(probs_eval, T)
        # A positive temperature keeps each row's ranking, so macro_f1_cal
        # should equal macro_f1_raw apart from ties; brier_cal can differ.
        y_pred_cal = probs_cal.argmax(axis=1)

        m_f1_cal = macro_f1(y_true, y_pred_cal)
        m_brier_cal = brier_multiclass(y_true, probs_cal)

        metrics_row.update({"T": T, "macro_f1_cal": m_f1_cal, "brier_cal": m_brier_cal})

        # Confusion matrix fig
        fig, ax = plt.subplots(figsize=(4.8,4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=CLASSES, yticklabels=CLASSES, ax=ax)
        ax.set_title(f'Confusion Matrix — {name} (raw)')
        fig.tight_layout()
        fig.savefig(FIG_DIR/f'{name}_confusion.png', dpi=200)
        plt.close(fig)

        # Calibrated reliability
        relc_path = FIG_DIR / f'{name}_reliability_calibrated.png'
        reliability_plot(y_true, probs_cal, f'Reliability (calibrated) — {name}', relc_path)

        # Error analysis
        df_eval = df.loc[mask_all].copy()
        headlines = df_eval['headline'].astype(str)
        df_eval['y_true'] = y_true
        df_eval['y_pred'] = y_pred
        df_eval['correct'] = (df_eval['y_true'] == df_eval['y_pred']).astype(int)
        df_eval['len'] = headlines.str.len()
        df_eval['has_ticker'] = headlines.str.contains(_TICKER_PATTERN, case=False, regex=True)

        df_eval['len_bucket'] = pd.cut(df_eval['len'], bins=_LEN_BINS, labels=_LEN_LABELS, include_lowest=True)
        # observed=False keeps empty buckets in the table.
        acc_by_len = df_eval.groupby('len_bucket', observed=False)['correct'].mean().reset_index()
        acc_by_len.to_csv(TAB_DIR/f'{name}_acc_by_length.csv', index=False)

        err_df = df_eval[df_eval['correct']==0].copy()
        err_df['true_label'] = err_df['y_true'].map(ID2LABEL)
        err_df['pred_label'] = err_df['y_pred'].map(ID2LABEL)
        err_df['category'] = err_df['headline'].astype(str).apply(tag_category)

        err_cols = ['date','ticker','headline','true_label','pred_label','len','has_ticker','category']
        err_df[err_cols].to_csv(OUT_DIR/f'{name}_errors.csv', index=False)

        err_cat = err_df['category'].value_counts().reset_index()
        err_cat.columns = ['category','count']
        err_cat.to_csv(TAB_DIR/f'{name}_error_categories.csv', index=False)

    # Drop the pipeline before the next checkpoint is loaded so two models are
    # never resident at the same time.
    del pipe
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    all_results.append(metrics_row)

if all_results and any('macro_f1_raw' in r for r in all_results):
    met = pd.DataFrame(all_results)
    met.to_csv(TAB_DIR/'models_summary_metrics.csv', index=False)
    # A bare `met` inside an if-block is not displayed by the notebook.
    print(met.to_string(index=False))
else:
    print("No labeled data detected — metrics table not created. Inference CSVs were saved.")


## 8. (Optional) Daily Aggregation Helpers for Chapter 4

In [None]:

def aggregate_daily(inf_csv: Path) -> pd.DataFrame:
    """Aggregate per-headline predictions into one row per (date, ticker).

    Each headline gets a signed score ``s_star``: the largest of its three
    class probabilities, multiplied by -1 for a ``negative`` prediction, +1
    for ``positive`` and 0 for anything else.

    Args:
        inf_csv: Inference CSV with columns ``date``, ``ticker``, ``pred``,
            ``proba_negative``, ``proba_neutral`` and ``proba_positive``.

    Returns:
        A DataFrame with columns ``date``, ``ticker``, ``s_mean`` (mean of
        ``s_star``), ``s_absmax`` (largest absolute ``s_star``) and ``n_news``
        (number of headlines).
    """
    dfp = pd.read_csv(inf_csv)
    # Vectorized replacement for the former row-by-row iterrows() loop.
    sign = dfp['pred'].map({'negative': -1.0, 'positive': 1.0}).fillna(0.0)
    strength = dfp[['proba_negative', 'proba_neutral', 'proba_positive']].max(axis=1)
    dfp['s_star'] = sign * strength
    daily = (dfp.groupby(['date','ticker'])
                 .agg(s_mean=('s_star','mean'),
                      s_absmax=('s_star', lambda x: x.abs().max()),
                      n_news=('s_star','size'))
                 .reset_index())
    return daily

# Example (uncomment after inference):
# daily_finbert = aggregate_daily(OUT_DIR/'finbert-prosus_inference.csv')
# daily_finbert.to_csv(OUT_DIR/'finbert_daily_agg.csv', index=False)
