# Experiment 1: Architecture Comparison — ModernBERT vs FinBERT

**Aim:** Fine-tune ModernBERT-base on aggregated financial sentiment data (excluding FPB), then evaluate on FinancialPhraseBank as a held-out benchmark. Compare against published FinBERT baselines.

**Key question:** Does the ModernBERT architecture with its general-purpose tokenizer, fine-tuned on mixed financial data, beat FinBERT's domain-specific pre-training and custom tokenizer on FPB?

**Models:**
1. `answerdotai/ModernBERT-base` (fine-tuned by us on aggregated data, FPB held out)
2. `ProsusAI/finbert` — reported results from [Araci 2019](https://arxiv.org/abs/1908.10063) (trained on FPB in-domain)
3. `yiyanghkust/finbert-tone` — reported results from [Yang et al. 2020](https://arxiv.org/abs/2006.08097) (trained on analyst reports)

**Test sets:** FPB `sentences_allAgree` and `sentences_50agree`

**Important context:** Our model (ModernFinBERT, i.e. the fine-tuned ModernBERT-base) never sees FPB during training, which makes this a stricter, held-out evaluation; the ProsusAI/finbert results come from in-domain FPB train/test splits. The comparison is therefore tilted against our model, so matching or exceeding the published numbers is a stronger result than the raw figures alone suggest.

## 1. Setup & Installation

In [None]:
# Set UNSLOTH_DISABLE_FAST_GENERATION=1 in the kernel's environment; this runs before
# `unsloth` is imported further down in the notebook.
# NOTE(review): presumably this turns off Unsloth's fast-generation path — confirm against the Unsloth docs.
%env UNSLOTH_DISABLE_FAST_GENERATION=1

In [None]:
%%capture
# Dependency installation. The %%capture cell magic above must stay on the first line of the
# cell; it suppresses the pip output.
import os
# The names of all environment variables are concatenated and searched for "COLAB_";
# a hit is treated as "running on Google Colab".
if "COLAB_" not in "".join(os.environ.keys()):
    # No COLAB_ variable: a plain install that lets pip resolve unsloth's dependencies.
    !pip install unsloth
else:
    # COLAB_ variable present: install the listed packages with --no-deps so pip does not
    # pull in their dependencies.
    # NOTE(review): presumably this is to avoid replacing Colab's preinstalled packages — confirm.
    !pip install --no-deps bitsandbytes accelerate xformers peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth
# Installed in both branches: data loading, metrics, plotting and LoRA packages.
!pip install -q datasets scikit-learn matplotlib seaborn peft

In [None]:
from unsloth import FastLanguageModel, FastModel
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import torch
import torch.nn.functional as F
from transformers import (
    TrainingArguments, Trainer, AutoModelForSequenceClassification,
    AutoTokenizer, training_args,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Class names in label-index order: 0 = NEGATIVE, 1 = NEUTRAL, 2 = POSITIVE.
LABEL_NAMES = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
# Number of output classes, derived from LABEL_NAMES so the two cannot drift apart.
NUM_CLASSES = len(LABEL_NAMES)
# source ID for FinancialPhraseBank in the aggregated dataset
FPB_SOURCE = 6

## 2. Data Preparation

Load the aggregated financial sentiment dataset and exclude FPB (source 6) so that it can serve as a held-out test set.

In [None]:
# Aggregated dataset; the code below relies on it having train / validation / test splits.
ds = load_dataset("neoyipeng/financial_reasoning_aggregated")

# Raw label string -> class index (index order matches LABEL_NAMES).
label_dict = {"NEUTRAL/MIXED": 1, "NEGATIVE": 0, "POSITIVE": 2}


def _is_sentiment_row(row):
    """Keep only rows that belong to the sentiment task."""
    return row["task"] == "sentiment"


def _is_not_fpb(row):
    """Drop rows whose source is FinancialPhraseBank, so FPB stays held out."""
    return row["source"] != FPB_SOURCE


def _to_model_format(row):
    """Return the row's text together with a one-hot label vector of length NUM_CLASSES."""
    one_hot = np.eye(NUM_CLASSES)[label_dict[row["label"]]]
    return {"text": row["text"], "labels": one_hot}


# Filter to sentiment task only, then exclude FPB.
ds = ds.filter(_is_sentiment_row).filter(_is_not_fpb)

# Every pre-existing column other than "text" / "labels" is dropped during the map.
remove_cols = [name for name in ds["train"].column_names if name not in ("text", "labels")]
ds = ds.map(_to_model_format, remove_columns=remove_cols)

print(f"Train: {len(ds['train']):,}  |  Val: {len(ds['validation']):,}  |  Test: {len(ds['test']):,}")
print(f"Sample: {ds['train'][0]}")

## 3. Fine-tune ModernBERT-base

In [None]:
# Load ModernBERT-base with a NUM_CLASSES-way sequence-classification head.
#
# The weights are loaded in float32 on purpose. Mixed precision is switched on later through
# the Trainer's fp16/bf16 flags, and automatic mixed precision expects float32 master weights:
# trainable parameters stored in float16 can fail under fp16 AMP with
# "ValueError: Attempting to unscale FP16 gradients" and lose precision in optimizer updates.
# NOTE(review): presumably the classification head is trained alongside the LoRA adapters
# (task_type=SEQ_CLS), which is the part most exposed to this — confirm against the PEFT docs.
model = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    num_labels=NUM_CLASSES,
    torch_dtype=torch.float32,
)
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# LoRA adapter configuration: rank 16, alpha 32, 5% dropout, no bias training. Adapters are
# attached to the modules whose names match the entries of target_modules.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["Wqkv", "out_proj", "Wi", "Wo"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)
model = get_peft_model(model, lora_config)
model = model.cuda()
# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of examples for training/validation.

    Args:
        examples: Batch dict with a "text" entry (a list of strings when called through
            ``Dataset.map(..., batched=True)``).
        max_length: Maximum number of tokens kept per text; longer inputs are truncated.
            The default of 512 is the same limit ``evaluate_on_fpb`` uses at inference
            time, so training and evaluation see inputs cut at the same length.

    Returns:
        The tokenizer output for the batch. No padding argument is passed here.
    """
    # Without truncation a single very long text would be fed to the model at full length,
    # inflating the padded batch and diverging from the evaluation-time preprocessing.
    return tokenizer(examples["text"], truncation=True, max_length=max_length)


train_data = ds["train"].map(tokenize_function, batched=True)
val_data = ds["validation"].map(tokenize_function, batched=True)

In [None]:
# Optional: wandb logging
# import wandb
# wandb.login()
# wandb.init(project="modernfinbert", name="01-arch-comparison")

In [None]:
def _eval_accuracy(eval_pred):
    """Accuracy metric for the Trainer's evaluation loop.

    ``eval_pred[0]`` (model outputs) and ``eval_pred[1]`` (one-hot label vectors) are both
    reduced to class indices with an argmax over the last axis before being compared.
    """
    expected = eval_pred[1].argmax(axis=-1)
    predicted = eval_pred[0].argmax(axis=-1)
    return {"accuracy": accuracy_score(expected, predicted)}


# Use bf16 mixed precision when the GPU reports support for it, otherwise fp16.
use_bf16 = torch.cuda.is_bf16_supported()

run_args = TrainingArguments(
    output_dir="trainer_output",
    # Batching
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    # Precision
    fp16=not use_bf16,
    bf16=use_bf16,
    # Optimisation
    optim=training_args.OptimizerNames.ADAMW_TORCH,
    learning_rate=2e-4,
    weight_decay=0.001,
    lr_scheduler_type="cosine",
    warmup_steps=10,
    num_train_epochs=10,
    seed=3407,
    # Evaluate, checkpoint and log once per epoch; reload the checkpoint with the lowest
    # validation loss when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)

trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=run_args,
    compute_metrics=_eval_accuracy,
)

trainer_stats = trainer.train()

In [None]:
# Keep the trained model on the GPU, then put it in evaluation mode for inference.
model = model.cuda()
model = model.eval()

## 4. Load FPB Test Sets

In [None]:
def _load_fpb(config_name):
    """Return the "train" split of the given FinancialPhraseBank configuration."""
    return load_dataset("financial_phrasebank", config_name, trust_remote_code=True)["train"]


fpb_50 = _load_fpb("sentences_50agree")
fpb_all = _load_fpb("sentences_allagree")

# Report the size of each evaluation set.
print(
    f"FPB 50agree: {len(fpb_50):,} samples",
    f"FPB allAgree: {len(fpb_all):,} samples",
    sep="\n",
)

## 5. Evaluation Helper

In [None]:
def evaluate_on_fpb(model, tokenizer, fpb_dataset, label_remap=None, batch_size=32):
    """Evaluate a classification model on an FPB dataset split.

    Sentences are tokenized in batches (padded per batch, truncated to 512 tokens), run
    through the model without gradient tracking, and the argmax of the logits is taken as
    the predicted class. Metrics are printed and also returned.

    Args:
        model: The classification model, already in eval mode. Inputs are moved to the
            device of the model's first parameter, so the model may live on any device.
        tokenizer: The model's tokenizer.
        fpb_dataset: HF dataset with 'sentence' and 'label' columns.
        label_remap: Optional dict mapping model output indices to FPB label indices.
            Use when model label order differs from FPB (neg=0, neu=1, pos=2).
        batch_size: Inference batch size.

    Returns:
        dict with keys "accuracy", "macro_f1", "report" (classification report text),
        "cm" (confusion matrix with one row/column per entry of LABEL_NAMES),
        "y_true" and "y_pred" (numpy arrays).

    Raises:
        ValueError: If ``fpb_dataset`` contains no samples.
    """
    texts = list(fpb_dataset["sentence"])
    y_true = np.asarray(fpb_dataset["label"])
    if not texts:
        raise ValueError("fpb_dataset is empty; nothing to evaluate.")

    # Follow the model's device instead of hard-coding .cuda(), so a model on another
    # device (e.g. cuda:1) does not end up with inputs on a different one.
    device = next(model.parameters()).device

    all_preds = []

    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Evaluating"):
            batch_texts = texts[start : start + batch_size]
            inputs = tokenizer(
                batch_texts, return_tensors="pt", padding=True,
                truncation=True, max_length=512,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()

            if label_remap:
                # Look up with plain ints so the remap dict does not depend on numpy scalar keys.
                preds = np.array([label_remap[int(p)] for p in preds])

            all_preds.extend(preds)

    y_pred = np.array(all_preds)

    # Passing the full label set keeps the report and the confusion matrix at a fixed
    # size (one entry per LABEL_NAMES item) even if some class never occurs; without it
    # classification_report raises on the target_names mismatch and the matrix shrinks.
    class_ids = list(range(len(LABEL_NAMES)))

    acc = accuracy_score(y_true, y_pred)
    # Count hits directly: int(acc * n) can truncate to one less than the true count
    # because of floating-point rounding (e.g. 29/100 * 100 -> 28.999...).
    n_correct = int((y_true == y_pred).sum())
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    report = classification_report(
        y_true, y_pred, labels=class_ids, target_names=LABEL_NAMES
    )
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    print(f"Accuracy: {acc:.4f} ({n_correct}/{len(y_true)})")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"\n{report}")

    return {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "report": report,
        "cm": cm,
        "y_true": y_true,
        "y_pred": y_pred,
    }

## 6. Evaluate ModernFinBERT on FPB

In [None]:
# Banner, then score the fine-tuned model on the FPB sentences_50agree set.
print("\n".join(["=" * 60, "ModernFinBERT — FPB sentences_50agree", "=" * 60]))
mfb_50 = evaluate_on_fpb(model, tokenizer, fpb_50)

In [None]:
# Banner, then score the fine-tuned model on the FPB sentences_allAgree set.
print("\n".join(["=" * 60, "ModernFinBERT — FPB sentences_allAgree", "=" * 60]))
mfb_all = evaluate_on_fpb(model, tokenizer, fpb_all)

## 7. Comparison with Published Baselines

Baseline numbers taken from published papers (see `reference/fpb_benchmarks.md` for full details):

| Source | Eval Protocol |
|--------|--------------|
| **Araci 2019** ([arxiv](https://arxiv.org/abs/1908.10063)) | In-domain FPB train/test split (80/20) |
| **Yang et al. 2020** ([arxiv](https://arxiv.org/abs/2006.08097)) | 90/10 split, 10-run avg; FPB agreement level unspecified (grouped with the 50agree rows in the table below) |
| **ModernFinBERT (ours)** | Trained on aggregated data **excluding FPB** — fully held-out evaluation |

In [None]:
# Published baseline results (from papers — no GPU needed)
def _published(model_name, variant, accuracy, macro_f1, source, protocol="in-domain"):
    """Build one literature row; ``macro_f1`` is None when the paper reports no F1."""
    return {
        "Model": model_name,
        "FPB Variant": variant,
        "Accuracy": accuracy,
        "Macro F1": macro_f1,
        "Source": source,
        "Eval": protocol,
    }


def _our_row(variant, metrics):
    """Build one row for our model from an ``evaluate_on_fpb`` result dict."""
    return {
        "Model": "ModernFinBERT (ours)",
        "FPB Variant": variant,
        "Accuracy": round(metrics["accuracy"], 4),
        "Macro F1": round(metrics["macro_f1"], 4),
        "Source": "This experiment",
        "Eval": "held-out (FPB excluded from training)",
    }


araci = "Araci 2019"
literature = pd.DataFrame([
    _published("LSTM+ELMo", "50agree", 0.75, 0.70, araci),
    _published("ULMFit", "50agree", 0.83, 0.79, araci),
    _published("ProsusAI/finbert", "50agree", 0.86, 0.84, araci),
    _published("FinBERT-FinVocab", "50agree", 0.872, None,
               "Yang et al. 2020", "in-domain (90/10, 10-run avg)"),
    _published("LSTM+ELMo", "allAgree", 0.84, 0.77, araci),
    _published("ULMFit", "allAgree", 0.93, 0.91, araci),
    _published("ProsusAI/finbert", "allAgree", 0.97, 0.95, araci),
])

# Our results (computed above)
ours = pd.DataFrame([
    _our_row("50agree", mfb_50),
    _our_row("allAgree", mfb_all),
])

# Our rows first, then the literature rows, with a fresh 0..n-1 index.
results = pd.concat([ours, literature], ignore_index=True)

rule = "=" * 85
print(rule)
print("FINAL COMPARISON — ModernFinBERT vs Published Baselines")
print(rule)
for variant in ["50agree", "allAgree"]:
    # One ranking per FPB variant, best accuracy first.
    ranked = results[results["FPB Variant"] == variant].sort_values("Accuracy", ascending=False)
    print(f"\n--- FPB sentences_{variant} ---")
    print(ranked[["Model", "Accuracy", "Macro F1", "Eval"]].to_string(index=False))
print("\nNote: Baselines trained/tested on FPB in-domain splits.")
print("ModernFinBERT never saw FPB during training — stricter evaluation.")

## 8. Visualizations

In [None]:
# --- Bar chart: Accuracy comparison across models ---
# One horizontal-bar panel per FPB variant; our model is drawn in the darker blue.
ours_color, baseline_color = "#2196F3", "#90CAF9"
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, variant in zip(axes, ["50agree", "allAgree"]):
    # Ascending sort puts the most accurate model at the top of the horizontal chart.
    subset = results[results["FPB Variant"] == variant].sort_values("Accuracy")
    colors = [ours_color if "ours" in name else baseline_color for name in subset["Model"]]
    bars = ax.barh(subset["Model"], subset["Accuracy"], color=colors, edgecolor="white")
    ax.set(xlim=(0.6, 1.0), xlabel="Accuracy", title=f"FPB sentences_{variant}")
    # Write each accuracy as a percentage just past the end of its bar.
    for bar, acc in zip(bars, subset["Accuracy"]):
        label_x = bar.get_width() + 0.005
        label_y = bar.get_y() + bar.get_height() / 2
        ax.text(label_x, label_y, f"{acc:.1%}", va="center", fontsize=10)

plt.suptitle("ModernFinBERT vs Published Baselines — FPB Accuracy", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("accuracy_comparison.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# --- Confusion matrices: ModernFinBERT on both FPB splits ---
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (evaluation result, panel title) for each subplot, left to right.
panels = [
    (mfb_50, "ModernFinBERT — 50agree"),
    (mfb_all, "ModernFinBERT — allAgree"),
]

for ax, (res, title) in zip(axes, panels):
    # Rows are true classes and columns are predicted classes, labelled with LABEL_NAMES.
    sns.heatmap(
        res["cm"],
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=LABEL_NAMES,
        yticklabels=LABEL_NAMES,
        ax=ax,
    )
    ax.set(
        title=f"{title}\nAcc={res['accuracy']:.2%}  F1={res['macro_f1']:.2%}",
        ylabel="True",
        xlabel="Predicted",
    )

plt.tight_layout()
plt.savefig("confusion_matrices_modernfinbert.png", dpi=150, bbox_inches="tight")
plt.show()
print("Saved to confusion_matrices_modernfinbert.png")