# Experiment 1: Architecture Comparison — ModernBERT vs FinBERT

**Aim:** Fine-tune ModernBERT-base on aggregated financial sentiment data (excluding FPB), then evaluate on FinancialPhraseBank as a held-out benchmark. Compare against pre-trained FinBERT baselines.

**Key question:** Does the ModernBERT architecture with a general-purpose tokenizer, fine-tuned on mixed financial data, beat FinBERT's domain pre-training and custom tokenizer on FPB?

**Models:**
1. `answerdotai/ModernBERT-base` (fine-tuned by us; referred to as "ModernFinBERT" below)
2. `ProsusAI/finbert` (pre-trained baseline)
3. `yiyanghkust/finbert-tone` (pre-trained baseline)

**Test sets:** FPB `sentences_allagree` and `sentences_50agree`

## 1. Setup & Installation

In [None]:
# Export the flag before `unsloth` is imported further down, so it is already
# present in the process environment at import time.
# NOTE(review): presumably turns off Unsloth's fast-generation path — confirm against Unsloth docs.
%env UNSLOTH_DISABLE_FAST_GENERATION=1

In [None]:
%%capture
# Install the notebook's dependencies; %%capture hides the pip output.
import os
# Colab is detected by looking for "COLAB_" anywhere in the names of the
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: plain install, pip resolves unsloth's dependencies itself.
    !pip install unsloth
else:
    # On Colab: packages are installed with --no-deps and the remaining
    # requirements are listed by hand.
    # NOTE(review): presumably done to avoid replacing Colab's preinstalled
    # packages — confirm.
    !pip install --no-deps bitsandbytes accelerate xformers peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth
# Installed in both environments: data loading, metrics and plotting libraries
# imported in the next cell.
!pip install -q datasets scikit-learn matplotlib seaborn

In [None]:
from unsloth import FastLanguageModel, FastModel
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import torch
import torch.nn.functional as F
from transformers import (
    TrainingArguments, Trainer, AutoModelForSequenceClassification,
    AutoTokenizer, training_args,
)
from datasets import load_dataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Class names indexed by label ID: 0 = NEGATIVE, 1 = NEUTRAL, 2 = POSITIVE.
LABEL_NAMES = ["NEGATIVE", "NEUTRAL", "POSITIVE"]

# Number of sentiment classes, derived from LABEL_NAMES so the two stay in sync.
NUM_CLASSES = len(LABEL_NAMES)

# Source ID for FinancialPhraseBank in the aggregated dataset.
FPB_SOURCE = 6

## 2. Data Preparation

Load the aggregated financial sentiment dataset and exclude FPB (source 6) so that it can serve as a held-out test set.

In [None]:
# Download the aggregated dataset (all splits).
ds = load_dataset("neoyipeng/financial_reasoning_aggregated")

# Dataset label string -> class index (indices line up with LABEL_NAMES).
label_dict = {"NEGATIVE": 0, "NEUTRAL/MIXED": 1, "POSITIVE": 2}

# Row i is the one-hot vector for class i.
_ONE_HOT = np.eye(NUM_CLASSES)


def _is_non_fpb_sentiment(row):
    """Keep sentiment-task rows whose source is not FinancialPhraseBank."""
    # NOTE(review): compares `source` against the int FPB_SOURCE; if the column
    # is stored as a string nothing would be excluded — confirm the dtype.
    return row["task"] == "sentiment" and row["source"] != FPB_SOURCE


def _to_model_inputs(row):
    """Return the example's text together with its one-hot label vector."""
    class_index = label_dict[row["label"]]
    return {"text": row["text"], "labels": _ONE_HOT[class_index]}


ds = ds.filter(_is_non_fpb_sentiment)

# Everything except the two model-ready columns is dropped during the map.
remove_cols = [
    name for name in ds["train"].column_names if name not in ("text", "labels")
]
ds = ds.map(_to_model_inputs, remove_columns=remove_cols)

n_train, n_val, n_test = (len(ds[split]) for split in ("train", "validation", "test"))
print(f"Train: {n_train:,}  |  Val: {n_val:,}  |  Test: {n_test:,}")
first_example = ds["train"][0]
print(f"Sample: {first_example}")

## 3. Fine-tune ModernBERT-base

In [None]:
# Load ModernBERT-base through Unsloth as a sequence-classification model with
# NUM_CLASSES outputs. 4-bit loading is disabled and full fine-tuning is
# requested; dtype is left as None (no explicit dtype is chosen here).
model, tokenizer = FastModel.from_pretrained(
    model_name="answerdotai/ModernBERT-base",
    auto_model=AutoModelForSequenceClassification,
    num_labels=NUM_CLASSES,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=False,
    full_finetuning=True,
)

# Total element count over every parameter tensor of the loaded model.
param_count = sum(weights.numel() for weights in model.parameters())
print(f"Model parameters: {param_count:,}")

In [None]:
# Upper bound on tokens per training example. Keep equal to the
# max_seq_length passed to FastModel.from_pretrained when the model was loaded.
MAX_SEQ_LENGTH = 2048


def tokenize_function(examples):
    """Tokenize a batch of examples for training.

    Args:
        examples: Batch dict with a "text" entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for the batch, with each sequence truncated to
        at most MAX_SEQ_LENGTH tokens. No padding is requested here.
    """
    # Without truncation a single very long text sets the padded length of its
    # whole batch and can exceed the length the model was configured for.
    return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


train_data = ds["train"].map(tokenize_function, batched=True)
val_data = ds["validation"].map(tokenize_function, batched=True)

In [None]:
# Optional: wandb logging
# import wandb
# wandb.login()
# wandb.init(project="modernfinbert", name="01-arch-comparison")

In [None]:
def _accuracy_from_eval(eval_pred):
    """Compute accuracy for one evaluation pass.

    Index 0 of ``eval_pred`` is treated as the model outputs and index 1 as the
    one-hot labels; both are reduced to class IDs with argmax over the last
    axis before being compared.
    """
    gold_ids = eval_pred[1].argmax(axis=-1)
    predicted_ids = eval_pred[0].argmax(axis=-1)
    return {"accuracy": accuracy_score(gold_ids, predicted_ids)}


# Mixed precision: bf16 when the GPU reports support for it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Evaluate, log and checkpoint once per epoch; after training, reload the
# checkpoint with the lowest validation loss.
train_config = TrainingArguments(
    output_dir="trainer_output",
    seed=3407,
    # Optimisation
    num_train_epochs=10,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    optim=training_args.OptimizerNames.ADAMW_TORCH,
    learning_rate=5e-5,
    weight_decay=0.001,
    lr_scheduler_type="cosine",
    warmup_steps=10,
    # Precision
    bf16=use_bf16,
    fp16=not use_bf16,
    # Evaluation / checkpointing / logging
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=train_config,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=_accuracy_from_eval,
)

trainer_stats = trainer.train()

In [None]:
# Move the fine-tuned model to the GPU and put it in evaluation mode, then
# pass it to Unsloth's for_inference helper.
model = model.cuda()
model.eval()
FastLanguageModel.for_inference(model)

## 4. Load FPB Test Sets

In [None]:
# Load both FinancialPhraseBank configurations and keep the "train" split of each.
fpb_50, fpb_all = (
    load_dataset("financial_phrasebank", config_name, trust_remote_code=True)["train"]
    for config_name in ("sentences_50agree", "sentences_allagree")
)

for variant_tag, fpb_split in (("50agree", fpb_50), ("allAgree", fpb_all)):
    print(f"FPB {variant_tag}: {len(fpb_split):,} samples")

## 5. Evaluation Helper

In [None]:
def evaluate_on_fpb(model, tokenizer, fpb_dataset, label_remap=None, batch_size=32):
    """Evaluate a sequence-classification model on an FPB dataset split.

    Args:
        model: The classification model, already in eval mode. Inputs are
            moved to the device of the model's first parameter, so the model
            may live on CUDA or on CPU.
        tokenizer: The model's tokenizer.
        fpb_dataset: HF dataset with 'sentence' and 'label' columns.
        label_remap: Optional dict mapping model output indices to FPB label
            indices. Use when the model's label order differs from FPB
            (neg=0, neu=1, pos=2). ``None`` or an empty dict means the model
            outputs are used as-is.
        batch_size: Inference batch size.

    Returns:
        dict with keys ``accuracy``, ``macro_f1``, ``report`` (text
        classification report), ``cm`` (confusion matrix with one row/column
        per entry of LABEL_NAMES), ``y_true`` and ``y_pred``.

    Raises:
        KeyError: If ``label_remap`` is given but lacks a predicted index.
    """
    texts = fpb_dataset["sentence"]
    labels = fpb_dataset["label"]

    # Follow the model instead of assuming CUDA.
    device = next(model.parameters()).device

    # Fix the label set so the report and confusion matrix always cover every
    # class in LABEL_NAMES, even if a class never occurs in y_true or y_pred.
    class_ids = list(range(len(LABEL_NAMES)))

    all_preds = []

    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Evaluating"):
            batch_texts = list(texts[start : start + batch_size])
            inputs = tokenizer(
                batch_texts, return_tensors="pt", padding=True,
                truncation=True, max_length=512,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()

            if label_remap:
                # Translate model output indices into FPB label indices.
                preds = np.array([label_remap[int(p)] for p in preds])

            all_preds.extend(preds)

    y_true = np.array(labels)
    y_pred = np.array(all_preds)

    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, labels=class_ids, average="macro")
    report = classification_report(
        y_true, y_pred, labels=class_ids, target_names=LABEL_NAMES
    )
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    # Count matches directly: int(acc * n) can truncate to one less than the
    # true count because of floating-point rounding.
    n_correct = int((y_true == y_pred).sum())

    print(f"Accuracy: {acc:.4f} ({n_correct}/{len(y_true)})")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"\n{report}")

    return {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "report": report,
        "cm": cm,
        "y_true": y_true,
        "y_pred": y_pred,
    }

## 6. Evaluate ModernFinBERT on FPB

In [None]:
# Score the fine-tuned model on the 50agree split; no label remap is passed.
print("=" * 60, "ModernFinBERT — FPB sentences_50agree", "=" * 60, sep="\n")
mfb_50 = evaluate_on_fpb(model, tokenizer, fpb_50)

In [None]:
# Score the fine-tuned model on the allAgree split; no label remap is passed.
print("=" * 60, "ModernFinBERT — FPB sentences_allAgree", "=" * 60, sep="\n")
mfb_all = evaluate_on_fpb(model, tokenizer, fpb_all)

## 7. Evaluate ProsusAI/finbert on FPB

ProsusAI/finbert uses label order: positive=0, negative=1, neutral=2.  
FPB uses: negative=0, neutral=1, positive=2.  
We remap accordingly.

In [None]:
# Drop the names bound to the fine-tuned model and its trainer, then ask
# PyTorch to release its cached GPU memory before the next model is loaded.
del model, trainer
torch.cuda.empty_cache()

# ProsusAI/finbert: tokenizer plus classifier, moved to the GPU in eval mode.
finbert_model_name = "ProsusAI/finbert"
finbert_tokenizer = AutoTokenizer.from_pretrained(finbert_model_name)
finbert_model = AutoModelForSequenceClassification.from_pretrained(finbert_model_name)
finbert_model = finbert_model.cuda().eval()

# FPB label index for each lower-case class name.
fpb_label_map = {"negative": 0, "neutral": 1, "positive": 2}

# For every output index in the model config, look up the FPB index that has
# the same (lower-cased) class name.
finbert_remap = {}
for output_idx, class_name in finbert_model.config.id2label.items():
    finbert_remap[int(output_idx)] = fpb_label_map[class_name.lower()]
print(f"ProsusAI/finbert label remap: {finbert_remap}")

In [None]:
# Score ProsusAI/finbert on the 50agree split, remapping its outputs to FPB label IDs.
print("=" * 60, "ProsusAI/finbert — FPB sentences_50agree", "=" * 60, sep="\n")
fb_50 = evaluate_on_fpb(finbert_model, finbert_tokenizer, fpb_50, label_remap=finbert_remap)

In [None]:
# Score ProsusAI/finbert on the allAgree split, remapping its outputs to FPB label IDs.
print("=" * 60, "ProsusAI/finbert — FPB sentences_allAgree", "=" * 60, sep="\n")
fb_all = evaluate_on_fpb(finbert_model, finbert_tokenizer, fpb_all, label_remap=finbert_remap)

## 8. Evaluate yiyanghkust/finbert-tone on FPB

In [None]:
# Drop the name bound to the previous baseline and ask PyTorch to release its
# cached GPU memory.
del finbert_model
torch.cuda.empty_cache()

# yiyanghkust/finbert-tone: tokenizer plus classifier, moved to the GPU in eval mode.
tone_model_name = "yiyanghkust/finbert-tone"
tone_tokenizer = AutoTokenizer.from_pretrained(tone_model_name)
tone_model = AutoModelForSequenceClassification.from_pretrained(tone_model_name)
tone_model = tone_model.cuda().eval()

# For every output index in the model config, look up the FPB index that has
# the same (lower-cased) class name. fpb_label_map comes from the
# ProsusAI/finbert cell above.
tone_remap = {}
for output_idx, class_name in tone_model.config.id2label.items():
    tone_remap[int(output_idx)] = fpb_label_map[class_name.lower()]
print(f"finbert-tone label remap: {tone_remap}")

In [None]:
# Score finbert-tone on the 50agree split, remapping its outputs to FPB label IDs.
print("=" * 60, "yiyanghkust/finbert-tone — FPB sentences_50agree", "=" * 60, sep="\n")
ft_50 = evaluate_on_fpb(tone_model, tone_tokenizer, fpb_50, label_remap=tone_remap)

In [None]:
# Score finbert-tone on the allAgree split, remapping its outputs to FPB label IDs.
print("=" * 60, "yiyanghkust/finbert-tone — FPB sentences_allAgree", "=" * 60, sep="\n")
ft_all = evaluate_on_fpb(tone_model, tone_tokenizer, fpb_all, label_remap=tone_remap)

## 9. Summary Comparison Table

In [None]:
# One entry per model: (display name, 50agree result, allAgree result),
# listed in the order the rows should appear.
per_model_results = [
    ("ModernFinBERT (ours)", mfb_50, mfb_all),
    ("ProsusAI/finbert", fb_50, fb_all),
    ("yiyanghkust/finbert-tone", ft_50, ft_all),
]

# Two rows per model (50agree, then allAgree); metrics are formatted to four
# decimal places as strings.
results = pd.DataFrame([
    {
        "Model": display_name,
        "FPB Variant": variant,
        "Accuracy": f"{res['accuracy']:.4f}",
        "Macro F1": f"{res['macro_f1']:.4f}",
    }
    for display_name, res_50, res_all in per_model_results
    for variant, res in (("50agree", res_50), ("allAgree", res_all))
])

print("\n" + "=" * 70, "FINAL COMPARISON", "=" * 70, sep="\n")
print(results.to_string(index=False))

## 10. Confusion Matrices

In [None]:
# Grid layout: top row = 50agree, bottom row = allAgree; one column per model.
panel_names = ["ModernFinBERT", "ProsusAI/finbert", "finbert-tone"]
rows_by_variant = [
    ("50agree", (mfb_50, fb_50, ft_50)),
    ("allAgree", (mfb_all, fb_all, ft_all)),
]
all_results = [
    (res, f"{name}\n{variant}")
    for variant, row_results in rows_by_variant
    for res, name in zip(row_results, panel_names)
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))

for panel, (res, title) in zip(axes.flat, all_results):
    # Annotated heatmap of the stored confusion matrix, with class names on
    # both axes.
    sns.heatmap(
        res["cm"],
        ax=panel,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=LABEL_NAMES,
        yticklabels=LABEL_NAMES,
    )
    panel.set_xlabel("Predicted")
    panel.set_ylabel("True")
    panel.set_title(f"{title}\nAcc={res['accuracy']:.2%}  F1={res['macro_f1']:.2%}")

plt.tight_layout()
# Write the figure to disk before showing it.
plt.savefig("confusion_matrices_comparison.png", dpi=150, bbox_inches="tight")
plt.show()
print("Saved to confusion_matrices_comparison.png")