# Experiment 6: Multi-Seed Robustness

**Aim:** Run the NB01 protocol (ModernBERT+LoRA on aggregated data, held-out FPB evaluation) with 5 random seeds to quantify seed-to-seed variability (reported as mean ± standard deviation) and demonstrate result stability.

**Seeds:** `[3407, 42, 123, 456, 789]`

**Per seed:** Full NB01 pipeline — train ModernBERT+LoRA on aggregated data (excluding FPB), evaluate on FPB 50agree + allAgree + aggregated test set.

**Runtime:** ~5 × 40 min ≈ 3.3 hours on a T4 GPU

## 1. Setup & Installation

In [None]:
# setup

In [None]:
%%capture
!pip install -q "datasets>=3.4.1,<4.0.0" scikit-learn matplotlib seaborn peft accelerate transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import torch
import torch.nn.functional as F
from transformers import (
    TrainingArguments, Trainer, AutoModelForSequenceClassification,
    AutoTokenizer, training_args,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Number of sentiment classes; also the width of the one-hot label vectors
# built in the data-preparation cell.
NUM_CLASSES = 3
# Class names listed in label-index order (0, 1, 2), matching `label_dict`.
LABEL_NAMES = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
# Rows whose `source` equals this value are filtered out of the aggregated
# dataset. Presumably this is the Financial PhraseBank source id — TODO confirm
# against the dataset card.
FPB_SOURCE = 6
# Hugging Face model id of the base encoder that gets LoRA adapters.
MODEL_NAME = "answerdotai/ModernBERT-base"

# One full train + evaluate run is performed for each seed in this list.
SEEDS = [3407, 42, 123, 456, 789]

## 2. Data Preparation

In [None]:
# String label -> class index used for the one-hot targets.
label_dict = {"NEUTRAL/MIXED": 1, "NEGATIVE": 0, "POSITIVE": 2}

# Load the aggregated corpus, keep only sentiment rows, then drop every row
# coming from the FPB source so that benchmark stays held out of training.
ds = (
    load_dataset("neoyipeng/financial_reasoning_aggregated")
    .filter(lambda x: x["task"] == "sentiment")
    .filter(lambda x: x["source"] != FPB_SOURCE)
)

# Everything except the text (and a pre-existing "labels" column, if any) is
# discarded once the one-hot labels have been attached.
remove_cols = [c for c in ds["train"].column_names if c not in ("text", "labels")]

# Row k of the identity matrix is the one-hot vector for class k.
_one_hot_rows = np.eye(NUM_CLASSES)


def _to_text_and_one_hot(ex):
    """Return the example's text together with its one-hot encoded label."""
    class_index = label_dict[ex["label"]]
    return {"text": ex["text"], "labels": _one_hot_rows[class_index]}


ds = ds.map(_to_text_and_one_hot, remove_columns=remove_cols)

print(f"Train: {len(ds['train']):,}  |  Val: {len(ds['validation']):,}  |  Test: {len(ds['test']):,}")

In [None]:
# Load both Financial PhraseBank agreement levels; each configuration exposes
# a single "train" split, which is used here purely as an evaluation set.
fpb_50, fpb_all = (
    load_dataset("financial_phrasebank", config_name, trust_remote_code=True)["train"]
    for config_name in ("sentences_50agree", "sentences_allagree")
)

print(f"FPB 50agree: {len(fpb_50):,} samples")
print(f"FPB allAgree: {len(fpb_all):,} samples")

## 3. Training & Evaluation Functions

In [None]:
def train_model(train_dataset, val_dataset, seed, output_dir="trainer_output", epochs=10):
    """Train a fresh ModernBERT-base model with LoRA and return model + tokenizer.

    Args:
        train_dataset: Dataset with a "text" column and a "labels" column used
            for training.
        val_dataset: Dataset with the same columns, evaluated once per epoch.
        seed: Random seed. It is applied globally before the model is built and
            is also forwarded to ``TrainingArguments``.
        output_dir: Directory where the Trainer writes its per-epoch checkpoints.
        epochs: Number of training epochs.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is moved to CUDA and put in
        eval mode; because ``load_best_model_at_end=True`` it carries the
        weights of the checkpoint with the lowest ``eval_loss``.
    """
    # Function-scope import keeps this block self-contained without touching
    # the notebook's import cell.
    from transformers import set_seed

    # Seed BEFORE instantiating the model. The classification head and the
    # LoRA adapters are randomly initialised at construction time, which
    # happens before the Trainer applies `TrainingArguments.seed`. Without this
    # call the initial weights depend on leftover global RNG state, so a given
    # seed would not reproduce the same run.
    set_seed(seed)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_CLASSES,
        torch_dtype=torch.float32,
        attn_implementation="sdpa",
    )
    model.gradient_checkpointing_enable()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["Wqkv", "out_proj", "Wi", "Wo"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )
    model = get_peft_model(model, lora_config)
    model = model.cuda()

    def tokenize_function(examples):
        # NOTE(review): no truncation is applied here, whereas run_inference
        # truncates to 512 tokens — confirm the mismatch is intentional.
        return tokenizer(examples["text"])

    train_tok = train_dataset.map(tokenize_function, batched=True)
    val_tok = val_dataset.map(tokenize_function, batched=True)

    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        args=TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=4,
            warmup_steps=10,
            fp16=True,
            bf16=False,
            optim=training_args.OptimizerNames.ADAMW_TORCH,
            learning_rate=2e-4,
            weight_decay=0.001,
            lr_scheduler_type="cosine",
            seed=seed,
            num_train_epochs=epochs,
            # Checkpoint and evaluate every epoch so the best epoch (lowest
            # eval_loss) can be restored when training finishes.
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_strategy="epoch",
            eval_strategy="epoch",
            logging_strategy="epoch",
            gradient_checkpointing=True,
            report_to="none",
        ),
        # eval_pred[0] holds the logits and eval_pred[1] the one-hot labels;
        # both are reduced to class indices before scoring.
        compute_metrics=lambda eval_pred: {
            "accuracy": accuracy_score(
                eval_pred[1].argmax(axis=-1), eval_pred[0].argmax(axis=-1)
            )
        },
    )

    trainer.train()
    model = model.cuda().eval()
    return model, tokenizer

In [None]:
def run_inference(model, tokenizer, texts, batch_size=32):
    """Run inference and return predicted class indices.

    Args:
        model: Callable module whose output exposes ``.logits``; it must have
            at least one parameter so its device can be discovered.
        tokenizer: Callable that turns a list of strings into a mapping of
            tensors (invoked with padding, truncation and ``max_length=512``).
        texts: Sliceable sequence of input strings.
        batch_size: Number of texts scored per forward pass.

    Returns:
        1-D ``np.ndarray`` of int64 class indices, one per input text (an empty
        int64 array when ``texts`` is empty).
    """
    # Send inputs to wherever the model already lives rather than assuming
    # CUDA, so the function also works on CPU or on a non-default device.
    device = next(model.parameters()).device

    all_preds = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            inputs = tokenizer(
                batch, return_tensors="pt", padding=True,
                truncation=True, max_length=512,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
    # Explicit dtype keeps the empty-input result an integer array too.
    return np.array(all_preds, dtype=np.int64)


def evaluate_all(model, tokenizer, ds, fpb_50, fpb_all):
    """Score the model on the aggregated test split and both FPB variants.

    Returns a dict with accuracy and macro-F1 per evaluation set, keyed
    ``test_*``, ``fpb50_*`` and ``fpball_*`` (in that insertion order).
    """
    held_out = ds["test"]
    # (key prefix, input sentences, gold class indices) per evaluation set.
    # The aggregated split stores one-hot labels, so they are collapsed to
    # class indices; the FPB datasets are used with their "label" column as-is.
    eval_sets = (
        ("test", held_out["text"], np.argmax(held_out["labels"], axis=1)),
        ("fpb50", fpb_50["sentence"], fpb_50["label"]),
        ("fpball", fpb_all["sentence"], fpb_all["label"]),
    )

    scores = {}
    for prefix, sentences, gold in eval_sets:
        predicted = run_inference(model, tokenizer, sentences)
        scores[f"{prefix}_acc"] = accuracy_score(gold, predicted)
        scores[f"{prefix}_f1"] = f1_score(gold, predicted, average="macro")
    return scores

## 4. Run All Seeds

In [None]:
# Per-seed metric dicts, appended in SEEDS order; consumed by the summary cells.
all_results = []

# Train and evaluate one fresh model per seed, strictly sequentially.
for i, seed in enumerate(SEEDS):
    print(f"\n{'#'*60}")
    print(f"SEED {seed} ({i+1}/{len(SEEDS)})")
    print(f"{'#'*60}")

    # Each seed gets its own checkpoint directory so runs do not overwrite
    # each other's checkpoints.
    model, tokenizer = train_model(
        ds["train"], ds["validation"],
        seed=seed,
        output_dir=f"trainer_output_seed_{seed}",
    )

    # Score on the aggregated test split plus both FPB variants, and tag the
    # metrics with the seed that produced them.
    results = evaluate_all(model, tokenizer, ds, fpb_50, fpb_all)
    results["seed"] = seed
    all_results.append(results)

    print(f"\nSeed {seed} results:")
    print(f"  Agg Test:     {results['test_acc']:.4f} acc / {results['test_f1']:.4f} F1")
    print(f"  FPB 50agree:  {results['fpb50_acc']:.4f} acc / {results['fpb50_f1']:.4f} F1")
    print(f"  FPB allAgree: {results['fpball_acc']:.4f} acc / {results['fpball_f1']:.4f} F1")

    # Cleanup: drop the model reference and release cached GPU memory before
    # the next seed builds a new model.
    del model
    gc.collect()
    torch.cuda.empty_cache()

print(f"\nAll {len(SEEDS)} seeds complete!")

## 5. Results Summary

In [None]:
# Raw metric key -> display header, in the column order used for reporting.
_display_names = {
    "seed": "Seed",
    "test_acc": "Test Acc",
    "test_f1": "Test F1",
    "fpb50_acc": "FPB50 Acc",
    "fpb50_f1": "FPB50 F1",
    "fpball_acc": "FPBAll Acc",
    "fpball_f1": "FPBAll F1",
}
results_df = pd.DataFrame(all_results)[list(_display_names)].rename(columns=_display_names)

# Per-seed table.
print("=" * 95)
print("MULTI-SEED ROBUSTNESS — ModernBERT+LoRA on Aggregated Data")
print("=" * 95)
print(results_df.to_string(index=False, float_format="%.4f"))

# Mean and sample standard deviation of every metric across seeds.
print(f"\n{'='*95}")
print("SUMMARY (mean ± std)")
print(f"{'='*95}")
for col in list(_display_names.values())[1:]:
    mean, std = results_df[col].mean(), results_df[col].std()
    print(f"  {col:12s}: {mean:.4f} ± {std:.4f}")

# Best / worst seed on the FPB 50agree accuracy, plus the spread between them.
fpb50_acc = results_df["FPB50 Acc"]
best_row, worst_row = fpb50_acc.idxmax(), fpb50_acc.idxmin()
print(f"\n  Best FPB50 Acc:  {fpb50_acc.max():.4f} (seed {results_df.loc[best_row, 'Seed']})")
print(f"  Worst FPB50 Acc: {fpb50_acc.min():.4f} (seed {results_df.loc[worst_row, 'Seed']})")
print(f"  Range:           {fpb50_acc.max() - fpb50_acc.min():.4f}")

## 6. Visualizations

In [None]:
# Grouped bar chart: accuracy and macro-F1 per seed, one panel per evaluation set.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (accuracy column, F1 column, panel title) for each panel.
metrics = [
    ("Test Acc", "Test F1", "Aggregated Test Set"),
    ("FPB50 Acc", "FPB50 F1", "FPB sentences_50agree"),
    ("FPBAll Acc", "FPBAll F1", "FPB sentences_allAgree"),
]

# Bar geometry is identical for every panel, so it is computed once.
x = np.arange(len(SEEDS))
width = 0.35

for ax, (acc_col, f1_col, title) in zip(axes, metrics):
    ax.bar(x - width/2, results_df[acc_col], width, label="Accuracy", color="#2196F3")
    ax.bar(x + width/2, results_df[f1_col], width, label="Macro F1", color="#4CAF50")

    # Mean lines
    acc_mean = results_df[acc_col].mean()
    f1_mean = results_df[f1_col].mean()
    ax.axhline(y=acc_mean, color="#1565C0", linestyle="--", alpha=0.7, label=f"Acc mean: {acc_mean:.4f}")
    ax.axhline(y=f1_mean, color="#2E7D32", linestyle="--", alpha=0.7, label=f"F1 mean: {f1_mean:.4f}")

    ax.set_xlabel("Seed")
    ax.set_ylabel("Score")
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(SEEDS, fontsize=8)
    ax.legend(fontsize=7)

    # Keep the zoomed-in 0.85 floor by default, but lower it to the next 0.05
    # step when any score falls below it; a fixed floor would clip or hide
    # those bars entirely.
    y_floor = 0.85
    lowest = results_df[[acc_col, f1_col]].to_numpy().min()
    if lowest < y_floor:
        y_floor = np.floor(lowest * 20) / 20
    ax.set_ylim(y_floor, 1.0)

plt.suptitle("Multi-Seed Robustness — ModernBERT+LoRA", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("multi_seed_results.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Box plots for distribution across seeds
fig, ax = plt.subplots(figsize=(10, 5))

# Tick label -> per-seed scores; insertion order fixes the left-to-right order.
box_data = {
    "Test\nAcc": results_df["Test Acc"].values,
    "Test\nF1": results_df["Test F1"].values,
    "FPB50\nAcc": results_df["FPB50 Acc"].values,
    "FPB50\nF1": results_df["FPB50 F1"].values,
    "FPBAll\nAcc": results_df["FPBAll Acc"].values,
    "FPBAll\nF1": results_df["FPBAll F1"].values,
}
box_names = list(box_data.keys())
box_values = list(box_data.values())

ax.boxplot(
    box_values,
    patch_artist=True,
    boxprops=dict(facecolor="#90CAF9"),
)
# Label the ticks after drawing instead of passing `labels=` to boxplot: that
# keyword is deprecated since Matplotlib 3.9 (renamed `tick_labels`), while
# set_xticklabels works on both older and newer releases.
ax.set_xticklabels(box_names)

# Overlay individual points (boxes sit at x = 1, 2, ...).
for position, data in enumerate(box_values, start=1):
    ax.scatter([position] * len(data), data, color="#1565C0", s=30, zorder=3)

ax.set_ylabel("Score")
ax.set_title(f"Score Distribution Across {len(SEEDS)} Seeds")

# Keep the zoomed-in 0.85 floor by default, but lower it to the next 0.05 step
# when any score falls below it so that no box or point is cut off.
y_floor = 0.85
lowest = np.concatenate(box_values).min()
if lowest < y_floor:
    y_floor = np.floor(lowest * 20) / 20
ax.set_ylim(y_floor, 1.0)

plt.tight_layout()
plt.savefig("multi_seed_boxplot.png", dpi=150, bbox_inches="tight")
plt.show()

## 7. Final Summary for Paper

In [None]:
# Final mean ± std report per evaluation set, formatted for the write-up.
print("=" * 70)
print("PAPER-READY RESULTS")
print("=" * 70)
print(f"\nModernBERT-base + LoRA trained on aggregated financial data (FPB held out)")
print(f"Results over {len(SEEDS)} seeds: {SEEDS}")
print()

for col_pair, name in [
    (("FPB50 Acc", "FPB50 F1"), "FPB sentences_50agree"),
    (("FPBAll Acc", "FPBAll F1"), "FPB sentences_allAgree"),
    (("Test Acc", "Test F1"), "Aggregated test set"),
]:
    acc_col, f1_col = col_pair
    acc_mean = results_df[acc_col].mean()
    acc_std = results_df[acc_col].std()
    f1_mean = results_df[f1_col].mean()
    f1_std = results_df[f1_col].std()
    print(f"  {name}:")
    # Accuracy is stored as a fraction in [0, 1]; scale by 100 so the value
    # agrees with the "%" suffix (previously this printed e.g. "0.87%").
    print(f"    Accuracy: {acc_mean * 100:.2f}% ± {acc_std * 100:.2f}%")
    print(f"    Macro F1: {f1_mean:.4f} ± {f1_std:.4f}")
    print()