# Experiment 5: Controlled Baselines

**Aim:** Disentangle architecture vs data contributions. Answer: "Is the gain from ModernBERT, from more data, or from data diversity?"

**Experiments:**
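1. **ProsusAI/finbert** — inference only on FPB (no further training; note that this checkpoint was itself fine-tuned on FPB, so "zero-shot" here means "used as released", not "never saw FPB")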
2. **yiyanghkust/finbert-tone** — inference only on FPB (zero-shot from pretrained)
3. **bert-base-uncased + LoRA** — fine-tuned on aggregated data (same protocol as NB01). Isolates: is ModernBERT better than BERT?
4. **ModernBERT + LoRA (NB01)** — reference results from NB01
5. **ModernBERT + LoRA (NB04 CV)** — reference results from NB04 in-domain CV

**Key question:** How much of ModernFinBERT's performance comes from architecture vs training data?

## 1. Setup & Installation

In [None]:
# setup

In [None]:
%%capture
!pip install -q "datasets>=3.4.1,<4.0.0" scikit-learn matplotlib seaborn peft accelerate transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import torch
import torch.nn.functional as F
from transformers import (
    TrainingArguments, Trainer, AutoModelForSequenceClassification,
    AutoTokenizer, training_args,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Number of sentiment classes; sizes the one-hot training labels and the classifier head.
NUM_CLASSES = 3
# Class names indexed by label id (0=negative, 1=neutral, 2=positive), used for reports and plot ticks.
LABEL_NAMES = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
# Value of the aggregated dataset's `source` column that is filtered out before training.
# NOTE(review): presumably the id assigned to Financial PhraseBank — confirm against the dataset card.
FPB_SOURCE = 6

## 2. Load FPB Test Sets

In [None]:
# Two agreement levels of Financial PhraseBank; the "train" split of each config
# is what the rest of the notebook evaluates on.
fpb_50 = load_dataset(
    "financial_phrasebank", "sentences_50agree", trust_remote_code=True
)["train"]
fpb_all = load_dataset(
    "financial_phrasebank", "sentences_allagree", trust_remote_code=True
)["train"]

print(f"FPB 50agree: {len(fpb_50):,} samples")
print(f"FPB allAgree: {len(fpb_all):,} samples")

# FPB label order: 0=negative, 1=neutral, 2=positive (matches ours)
# Materialise each label column once, then count per class id.
labels_50 = np.asarray(fpb_50["label"])
labels_all = np.asarray(fpb_all["label"])
for class_id, class_name in enumerate(LABEL_NAMES):
    count_50 = int((labels_50 == class_id).sum())
    count_all = int((labels_all == class_id).sum())
    print(f"  {class_name}: 50agree={count_50}, allAgree={count_all}")

## 3. Evaluation Helper

In [None]:
def evaluate_on_fpb(model, tokenizer, fpb_dataset, label_remap=None, batch_size=32):
    """Evaluate a sequence-classification model on an FPB dataset split.

    Sentences are tokenized in batches (truncated to 512 tokens), the argmax
    of the logits is taken as the prediction, and the predictions are
    optionally translated into FPB label ids before scoring.

    Args:
        model: The classification model, already in eval mode. Inputs are
            moved to the device of the model's first parameter, so the model
            may live on CUDA or on CPU.
        tokenizer: The model's tokenizer.
        fpb_dataset: HF dataset with 'sentence' and 'label' columns.
        label_remap: Optional dict mapping model output indices to FPB label
            indices. A falsy value (None or an empty dict) means the model
            already predicts FPB label ids.
        batch_size: Inference batch size.

    Returns:
        dict with accuracy, macro_f1, report, cm, y_true, y_pred. Report,
        confusion matrix and macro F1 are all computed over the full label
        space of LABEL_NAMES, so `cm` always has one row/column per class.

    Raises:
        KeyError: If the model predicts an index missing from `label_remap`.
    """
    texts = fpb_dataset["sentence"]
    y_true = np.asarray(fpb_dataset["label"])

    # Fixed label space: without it sklearn infers the classes from the data,
    # which breaks `target_names` and shrinks the confusion matrix whenever a
    # class happens to be absent from both y_true and y_pred.
    class_ids = list(range(len(LABEL_NAMES)))

    # Follow the model instead of assuming CUDA.
    device = next(model.parameters()).device

    all_preds = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Evaluating"):
            batch_texts = texts[start : start + batch_size]
            inputs = tokenizer(
                batch_texts, return_tensors="pt", padding=True,
                truncation=True, max_length=512,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()

            if label_remap:
                # int() so the lookup does not depend on numpy scalar hashing.
                preds = np.array([label_remap[int(p)] for p in preds])

            all_preds.extend(preds)

    y_pred = np.array(all_preds)

    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, labels=class_ids, average="macro")
    report = classification_report(
        y_true, y_pred, labels=class_ids, target_names=LABEL_NAMES
    )
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    # Count matches directly; int(acc * n) can truncate to one less than the
    # true count because acc is a rounded float.
    n_correct = int((y_true == y_pred).sum())
    print(f"Accuracy: {acc:.4f} ({n_correct}/{len(y_true)})")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"\n{report}")

    return {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "report": report,
        "cm": cm,
        "y_true": y_true,
        "y_pred": y_pred,
    }

## 4. Baseline 1: ProsusAI/finbert (Zero-Shot)

ProsusAI/finbert was fine-tuned on FPB, so FPB is in-domain for this model. "Zero-shot" here only means that we do no further training: we load the published checkpoint as-is and run inference.

**Label mapping:** ProsusAI/finbert outputs: 0=positive, 1=negative, 2=neutral  
FPB labels: 0=negative, 1=neutral, 2=positive  
Remap: {0→2, 1→0, 2→1}

In [None]:
# The published checkpoint is used as-is: inference only, no training in this notebook.
print("Loading ProsusAI/finbert...")
prosus_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
prosus_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
prosus_model = prosus_model.cuda()
prosus_model = prosus_model.eval()

# Check label order from config
print(f"ProsusAI/finbert id2label: {prosus_model.config.id2label}")

In [None]:
# Translate the model's output indices into FPB label ids by matching the
# (lower-cased) class names from the model config.
# ProsusAI/finbert: {0: 'positive', 1: 'negative', 2: 'neutral'}
# FPB: 0=negative, 1=neutral, 2=positive
fpb_label2id = {"negative": 0, "neutral": 1, "positive": 2}
prosus_id2label = prosus_model.config.id2label
prosus_remap = {}
for model_idx, label_name in prosus_id2label.items():
    prosus_remap[int(model_idx)] = fpb_label2id[label_name.lower()]
print(f"ProsusAI remap: {prosus_remap}")

banner = "=" * 60

print("\n" + banner)
print("ProsusAI/finbert — FPB sentences_50agree")
print(banner)
prosus_50 = evaluate_on_fpb(
    prosus_model, prosus_tokenizer, fpb_50, label_remap=prosus_remap
)

print("\n" + banner)
print("ProsusAI/finbert — FPB sentences_allAgree")
print(banner)
prosus_all = evaluate_on_fpb(
    prosus_model, prosus_tokenizer, fpb_all, label_remap=prosus_remap
)

In [None]:
# Drop the model reference, run the garbage collector, then ask PyTorch to
# release its cached GPU memory before the next model is loaded.
del prosus_model
gc.collect()
torch.cuda.empty_cache()

## 5. Baseline 2: yiyanghkust/finbert-tone (Zero-Shot)

finbert-tone was fine-tuned on 10K analyst report sentences (not FPB). FPB is a held-out benchmark for this model too.

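**Label mapping:** finbert-tone's output label order is not assumed; the remap to FPB label ids is built from the model's `id2label` config, which is printed below for inspection.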

In [None]:
# The published checkpoint is used as-is: inference only, no training in this notebook.
print("Loading yiyanghkust/finbert-tone...")
tone_tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
tone_model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tone_model = tone_model.cuda()
tone_model = tone_model.eval()

# Printed so the label order feeding the remap can be inspected.
print(f"finbert-tone id2label: {tone_model.config.id2label}")

In [None]:
# Build label remap: model output index -> FPB label id, matched on the
# lower-cased class name from the model config (uses fpb_label2id defined in
# the ProsusAI cell).
tone_id2label = tone_model.config.id2label
tone_remap = {}
for model_idx, label_name in tone_id2label.items():
    tone_remap[int(model_idx)] = fpb_label2id[label_name.lower()]
print(f"finbert-tone remap: {tone_remap}")

banner = "=" * 60

print("\n" + banner)
print("finbert-tone — FPB sentences_50agree")
print(banner)
tone_50 = evaluate_on_fpb(
    tone_model, tone_tokenizer, fpb_50, label_remap=tone_remap
)

print("\n" + banner)
print("finbert-tone — FPB sentences_allAgree")
print(banner)
tone_all = evaluate_on_fpb(
    tone_model, tone_tokenizer, fpb_all, label_remap=tone_remap
)

In [None]:
# Drop the model reference, run the garbage collector, then ask PyTorch to
# release its cached GPU memory before the BERT training run.
del tone_model
gc.collect()
torch.cuda.empty_cache()

## 6. Baseline 3: BERT-base-uncased + LoRA on Aggregated Data

Same training protocol as NB01 (aggregated data excluding FPB, LoRA fine-tuning) but with `bert-base-uncased` instead of ModernBERT. This isolates the architecture contribution.

**Note:** BERT uses different attention module names than ModernBERT, so we adjust the LoRA `target_modules`.

In [None]:
# Load aggregated dataset (same as NB01)
ds = load_dataset("neoyipeng/financial_reasoning_aggregated")

label_dict = {"NEUTRAL/MIXED": 1, "NEGATIVE": 0, "POSITIVE": 2}

# One row per class id; row k is the one-hot vector for class k.
one_hot_rows = np.eye(NUM_CLASSES)


def _is_heldout_sentiment(example):
    """Keep sentiment rows whose source differs from the excluded FPB source."""
    # NOTE(review): assumes the `source` column holds ints comparable to
    # FPB_SOURCE; if it held strings this test would keep every row — confirm.
    return example["task"] == "sentiment" and example["source"] != FPB_SOURCE


def _to_text_and_one_hot(example):
    """Reduce a row to its text plus a one-hot float label vector."""
    class_id = label_dict[example["label"]]
    return {"text": example["text"], "labels": one_hot_rows[class_id]}


ds = ds.filter(_is_heldout_sentiment)

# Everything except the two columns produced by the mapping is dropped.
remove_cols = [c for c in ds["train"].column_names if c not in ("text", "labels")]
ds = ds.map(_to_text_and_one_hot, remove_columns=remove_cols)

print(f"Train: {len(ds['train']):,}  |  Val: {len(ds['validation']):,}  |  Test: {len(ds['test']):,}")

In [None]:
print("Training BERT-base-uncased + LoRA on aggregated data...")

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# LoRA adapter settings for the BERT baseline.
# NOTE(review): "dense" presumably matches every module whose name ends in
# "dense" (not only attention projections) — confirm the adapted layer set
# with the trainable-parameter printout below.
bert_lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key", "value", "dense"],
)

bert_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_CLASSES,
)
# Gradient checkpointing is switched on before the model is wrapped with the adapters.
bert_model.gradient_checkpointing_enable()
bert_model = get_peft_model(bert_model, bert_lora_config)
bert_model = bert_model.cuda()
bert_model.print_trainable_parameters()

In [None]:
def tokenize_bert(examples):
    """Tokenize a batch of examples for BERT training.

    Sequences are truncated to 512 tokens, the same limit the evaluation
    cells apply at inference time; without truncation a longer text would
    produce more tokens than bert-base-uncased has position embeddings for.

    Args:
        examples: Batch dict with a "text" entry holding a list of strings
            (as passed by `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the batch (unpadded; padding is left to the
        Trainer's collator).
    """
    return bert_tokenizer(examples["text"], truncation=True, max_length=512)

train_data = ds["train"].map(tokenize_bert, batched=True)
val_data = ds["validation"].map(tokenize_bert, batched=True)


def _argmax_accuracy(eval_pred):
    """Accuracy between the argmax of the one-hot labels and of the predictions."""
    predictions, label_ids = eval_pred[0], eval_pred[1]
    return {
        "accuracy": accuracy_score(
            label_ids.argmax(axis=-1), predictions.argmax(axis=-1)
        )
    }


bert_training_args = TrainingArguments(
    output_dir="trainer_output_bert",
    seed=3407,
    report_to="none",
    # Optimisation
    num_train_epochs=10,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    optim=training_args.OptimizerNames.ADAMW_TORCH,
    learning_rate=2e-4,
    weight_decay=0.001,
    lr_scheduler_type="cosine",
    warmup_steps=10,
    # Precision / memory
    fp16=True,
    bf16=False,
    gradient_checkpointing=True,
    # Evaluate, log and checkpoint once per epoch; keep the lowest eval loss.
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

bert_trainer = Trainer(
    model=bert_model,
    args=bert_training_args,
    processing_class=bert_tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=_argmax_accuracy,
)

bert_trainer.train()

# Back on the GPU and in eval mode for the inference cells that follow.
bert_model = bert_model.cuda().eval()

In [None]:
# Evaluate BERT on aggregated test set
test_texts = ds["test"]["text"]
# Labels are stored one-hot; argmax recovers the class id.
test_labels = np.argmax(ds["test"]["labels"], axis=1)

test_batch_size = 32
all_test_preds = []
with torch.no_grad():
    for start in tqdm(range(0, len(test_texts), test_batch_size), desc="BERT test set"):
        batch = test_texts[start : start + test_batch_size]
        encoded = bert_tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=512
        )
        encoded = {key: tensor.cuda() for key, tensor in encoded.items()}
        batch_logits = bert_model(**encoded).logits
        all_test_preds.extend(torch.argmax(batch_logits, dim=-1).cpu().numpy())

test_pred_array = np.array(all_test_preds)
bert_test_acc = accuracy_score(test_labels, test_pred_array)
bert_test_f1 = f1_score(test_labels, test_pred_array, average="macro")
print(f"BERT Aggregated Test — Accuracy: {bert_test_acc:.4f}  Macro F1: {bert_test_f1:.4f}")
print(classification_report(test_labels, test_pred_array, target_names=LABEL_NAMES))

In [None]:
# The fine-tuned BERT model is scored on both FPB agreement levels without a
# label remap.
banner = "=" * 60

print("\n" + banner)
print("BERT-base + LoRA — FPB sentences_50agree")
print(banner)
bert_50 = evaluate_on_fpb(bert_model, bert_tokenizer, fpb_50)

print("\n" + banner)
print("BERT-base + LoRA — FPB sentences_allAgree")
print(banner)
bert_all = evaluate_on_fpb(bert_model, bert_tokenizer, fpb_all)

In [None]:
# Drop the model and trainer references, run the garbage collector, then ask
# PyTorch to release its cached GPU memory; only the stored result dicts are
# needed by the remaining cells.
del bert_model, bert_trainer
gc.collect()
torch.cuda.empty_cache()

## 7. Comprehensive Comparison Table

In [None]:
table_columns = [
    "Model", "Training Data",
    "FPB 50agree Acc", "FPB 50agree F1",
    "FPB allAgree Acc", "FPB allAgree F1",
    "Protocol",
]


def _scores(res_50, res_all):
    """Accuracy and macro F1 of both FPB splits, in table column order."""
    return (
        res_50["accuracy"], res_50["macro_f1"],
        res_all["accuracy"], res_all["macro_f1"],
    )


table_rows = [
    # Zero-shot pretrained baselines
    ("ProsusAI/finbert", "FPB (pretrained)",
     *_scores(prosus_50, prosus_all), "Zero-shot (pretrained on FPB)"),
    ("finbert-tone", "Analyst reports",
     *_scores(tone_50, tone_all), "Zero-shot (pretrained on analyst reports)"),
    # BERT baseline (same data, different architecture)
    ("BERT-base + LoRA", "Aggregated (-FPB)",
     *_scores(bert_50, bert_all), "Held-out (same data as NB01)"),
    # NB01 reference (fill in manually or from saved results)
    ("ModernBERT + LoRA (NB01)", "Aggregated (-FPB)",
     "see NB01", "see NB01", "see NB01", "see NB01", "Held-out"),
    # NB04 reference (fill in manually)
    ("ModernBERT + LoRA (NB04 CV)", "FPB in-domain",
     "see NB04", "see NB04", "—", "—", "10-fold CV"),
]

# One dict per row, keyed in column order, so the frame's column order follows table_columns.
comparison = pd.DataFrame([dict(zip(table_columns, row)) for row in table_rows])

wide_rule = "=" * 120
print(wide_rule)
print("COMPREHENSIVE BASELINE COMPARISON")
print(wide_rule)
print(comparison.to_string(index=False))

## 8. Analysis: Architecture vs Data

In [None]:
def _acc_f1(res):
    """Format a result dict as '<accuracy> acc / <macro F1> F1' with 4 decimals."""
    return f"{res['accuracy']:.4f} acc / {res['macro_f1']:.4f} F1"


rule = "=" * 70
print(rule)
print("ABLATION ANALYSIS")
print(rule)

# All measured numbers below are for FPB sentences_50agree.
print("\n--- Architecture Effect (same data, different model) ---")
print("  BERT-base + LoRA (aggregated):     " + _acc_f1(bert_50))
print("  ModernBERT + LoRA (aggregated):     see NB01 results")
print("  Delta (ModernBERT advantage):       compute from NB01")

print("\n--- Data Effect (same model, different data) ---")
print("  ModernBERT + LoRA on FPB (NB04):    see NB04 results (in-domain CV)")
print("  ModernBERT + LoRA on aggregated:    see NB01 results (held-out)")

print("\n--- Pretrained vs Fine-tuned ---")
print("  ProsusAI/finbert (pretrained):      " + _acc_f1(prosus_50))
print("  finbert-tone (pretrained):          " + _acc_f1(tone_50))
print("  BERT-base + LoRA (fine-tuned):      " + _acc_f1(bert_50))

## 9. Visualizations

In [None]:
# Bar charts: FPB accuracy across all baselines, one panel per agreement level
models_50 = {
    "ProsusAI/finbert": prosus_50["accuracy"],
    "finbert-tone": tone_50["accuracy"],
    "BERT-base+LoRA\n(aggregated)": bert_50["accuracy"],
}
models_all = {
    "ProsusAI/finbert": prosus_all["accuracy"],
    "finbert-tone": tone_all["accuracy"],
    "BERT-base+LoRA\n(aggregated)": bert_all["accuracy"],
}
# Same colour for the two pretrained baselines, a different one for the fine-tuned model.
colors = ["#90CAF9", "#90CAF9", "#FFA726"]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = [
    ("FPB sentences_50agree", models_50),
    ("FPB sentences_allAgree", models_all),
]
for ax, (panel_title, scores) in zip(axes, panels):
    names = list(scores.keys())
    accs = list(scores.values())
    bars = ax.barh(names, accs, color=colors, edgecolor="white")
    ax.set_xlim(0.6, 1.0)
    ax.set_xlabel("Accuracy")
    ax.set_title(panel_title)
    # Percentage label just past the end of each bar.
    for bar, acc in zip(bars, accs):
        ax.text(
            bar.get_width() + 0.005,
            bar.get_y() + bar.get_height() / 2,
            f"{acc:.1%}",
            va="center",
            fontsize=10,
        )

plt.suptitle("Controlled Baselines — FPB Accuracy", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("controlled_baselines.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Confusion matrices for all three baselines on FPB 50agree
cm_panels = [
    (prosus_50, "ProsusAI/finbert"),
    (tone_50, "finbert-tone"),
    (bert_50, "BERT-base+LoRA"),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (res, title) in zip(axes, cm_panels):
    panel_title = f"{title}\nAcc={res['accuracy']:.2%}  F1={res['macro_f1']:.2%}"
    sns.heatmap(
        res["cm"],
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=LABEL_NAMES,
        yticklabels=LABEL_NAMES,
        ax=ax,
    )
    ax.set_title(panel_title)
    # Axis labels: true class on the y axis, predicted class on the x axis.
    ax.set_ylabel("True")
    ax.set_xlabel("Predicted")

plt.suptitle("Confusion Matrices — FPB sentences_50agree", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("baselines_confusion_matrices.png", dpi=150, bbox_inches="tight")
plt.show()