# Experiment 3A: Test Set Evaluation

**Aim:** Load the best checkpoints from Notebook 01 (ModernFinBERT) and Notebook 02 (Baseline + DataBoosted), then evaluate all three on the aggregated test set.

**Models:**
1. **NB01 ModernFinBERT** — fine-tuned on aggregated data (FPB excluded)
2. **NB02 Baseline** — same training setup as NB01 (separate run)
3. **NB02 DataBoosted** — retrained with LLM-paraphrased misclassified samples added

**Test set:** Aggregated dataset test split (source != FPB), never seen during training.

**Dependencies:** Requires the best checkpoints produced by notebooks 01 and 02, available to this notebook under `/kaggle/input/modernfinbert-best-checkpoints`.

## 1. Setup

In [None]:
# setup

In [None]:
%%capture
# Install the notebook's dependencies quietly (%%capture hides pip's output).
# `datasets` is constrained to >=3.4.1,<4.0.0; the other packages are unpinned.
!pip install -q "datasets>=3.4.1,<4.0.0" scikit-learn peft accelerate transformers

In [None]:
import numpy as np
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel
from datasets import load_dataset
from tqdm import tqdm
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Class names in index order: the position of each name is its integer class id.
LABEL_NAMES = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
# Number of output classes for the classification head (3).
NUM_CLASSES = len(LABEL_NAMES)
# Value of the dataset's `source` field that identifies FPB rows.
FPB_SOURCE = 6

## 2. Locate Checkpoints

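Point to the best checkpoint exported from each notebook and check that its adapter weights file (`adapter_model.safetensors`) is present. The cell below uses fixed directories under the checkpoint dataset; it does not parse `trainer_state.json`.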

In [None]:
# Root directory of the uploaded dataset; it holds one adapter directory per model.
CKPT_ROOT = "/kaggle/input/modernfinbert-best-checkpoints"

ckpt_nb01, ckpt_nb02_base, ckpt_nb02_boost = (
    os.path.join(CKPT_ROOT, subdir)
    for subdir in ("nb01", "nb02_baseline", "nb02_boosted")
)

# Report, per model, whether the adapter weights file is present.
# A missing file is only printed here; nothing is raised.
adapter_file = "adapter_model.safetensors"
checkpoint_dirs = {
    "NB01": ckpt_nb01,
    "NB02 Baseline": ckpt_nb02_base,
    "NB02 Boosted": ckpt_nb02_boost,
}
for label, ckpt_dir in checkpoint_dirs.items():
    status = "OK" if os.path.exists(os.path.join(ckpt_dir, adapter_file)) else "MISSING"
    print(f"  {label}: {ckpt_dir} — {status}")

## 3. Load Test Set

In [None]:
# Maps the dataset's string labels onto the integer class ids used for scoring.
label_dict = {"NEUTRAL/MIXED": 1, "NEGATIVE": 0, "POSITIVE": 2}

# Keep only sentiment-task rows, then drop every row whose source is FPB.
ds = (
    load_dataset("neoyipeng/financial_reasoning_aggregated")
    .filter(lambda row: row["task"] == "sentiment")
    .filter(lambda row: row["source"] != FPB_SOURCE)
)

test_split = ds["test"]
test_texts = test_split["text"]
# A label string missing from label_dict raises KeyError here.
test_labels = np.array([label_dict[raw_label] for raw_label in test_split["label"]])

class_ids, class_counts = np.unique(test_labels, return_counts=True)
print(f"Test set: {len(test_texts)} samples")
print(f"Label distribution: {dict(zip(class_ids, class_counts))}")

## 4. Evaluation

In [None]:
# Tokenizer of the ModernBERT-base checkpoint. It is created once at module
# level and read as a global inside run_inference().
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

def load_model(checkpoint_path):
    """Build the classifier for one LoRA checkpoint and ready it for inference.

    Instantiates ``answerdotai/ModernBERT-base`` as a sequence classifier with
    ``NUM_CLASSES`` labels (float32 weights, SDPA attention), attaches the PEFT
    adapter stored at ``checkpoint_path``, moves the result to CUDA and puts it
    in eval mode.

    Args:
        checkpoint_path: Location of the saved adapter, passed straight to
            ``PeftModel.from_pretrained``.

    Returns:
        The adapter-wrapped model, on the GPU and in eval mode.
    """
    backbone_options = {
        "num_labels": NUM_CLASSES,
        "torch_dtype": torch.float32,
        "attn_implementation": "sdpa",
    }
    backbone = AutoModelForSequenceClassification.from_pretrained(
        "answerdotai/ModernBERT-base", **backbone_options
    )
    adapted = PeftModel.from_pretrained(backbone, checkpoint_path)
    return adapted.cuda().eval()

def run_inference(model, texts, batch_size=32):
    """Predict a class index for every text, processing the texts in batches.

    Each batch is tokenized with the module-level ``tokenizer`` (padded to the
    longest item in the batch, truncated to 512 tokens), run through ``model``
    under ``torch.no_grad()``, and reduced to the argmax over the logits.

    Args:
        model: Model whose forward pass returns an object with ``.logits``.
        texts: Sequence of strings; must support ``len()`` and slicing.
        batch_size: Number of texts per forward pass.

    Returns:
        1-D NumPy integer array with one predicted class index per text, in
        input order. Empty input yields an empty integer array.
    """
    # Send inputs to wherever the model's weights live rather than assuming
    # CUDA, so the function works for any device the caller chose.
    device = next(model.parameters()).device

    batch_preds = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Inference"):
            batch = texts[start : start + batch_size]
            encoded = tokenizer(
                batch, return_tensors="pt", padding=True,
                truncation=True, max_length=512,
            )
            encoded = {key: tensor.to(device) for key, tensor in encoded.items()}
            logits = model(**encoded).logits
            batch_preds.append(torch.argmax(logits, dim=-1).cpu().numpy())

    # np.concatenate rejects an empty list, so handle "no texts" explicitly and
    # keep the integer dtype that non-empty results have.
    if not batch_preds:
        return np.empty(0, dtype=np.int64)
    return np.concatenate(batch_preds)

def evaluate(name, checkpoint_path):
    """Score one checkpoint on the module-level test set and print the results.

    Loads the model with ``load_model``, predicts ``test_texts`` with
    ``run_inference``, then compares the predictions against ``test_labels``.

    Args:
        name: Heading printed above this model's results.
        checkpoint_path: Adapter checkpoint location, forwarded to ``load_model``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"macro_f1"`` (floats), ``"cm"``
        (confusion matrix covering all ``NUM_CLASSES`` classes) and
        ``"y_pred"`` (array of predicted class indices).
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"{name}")
    print(f"{banner}")

    model = load_model(checkpoint_path)
    preds = run_inference(model, test_texts)

    # The model is not needed for the metrics; release it before the next
    # checkpoint is loaded.
    del model
    torch.cuda.empty_cache()

    # Pin the label set so the report rows and matrix shape always match
    # LABEL_NAMES, even when a class is absent from both labels and predictions.
    class_ids = list(range(NUM_CLASSES))

    acc = accuracy_score(test_labels, preds)
    macro_f1 = f1_score(test_labels, preds, average="macro")
    report = classification_report(
        test_labels, preds, labels=class_ids, target_names=LABEL_NAMES,
    )
    cm = confusion_matrix(test_labels, preds, labels=class_ids)

    # Count matches directly: int(acc * n) truncates a float product and can
    # report one fewer correct prediction than there really are.
    n_total = len(test_labels)
    n_correct = int(np.sum(np.asarray(preds) == test_labels))

    print(f"Accuracy: {acc:.4f} ({n_correct}/{n_total})")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"\n{report}")

    return {"accuracy": acc, "macro_f1": macro_f1, "cm": cm, "y_pred": preds}

In [None]:
# Score the NB01 checkpoint; the returned dict holds accuracy, macro_f1, cm and y_pred.
res_nb01 = evaluate("Notebook 01: ModernFinBERT", ckpt_nb01)

In [None]:
# Score the NB02 baseline checkpoint; the returned dict holds accuracy, macro_f1, cm and y_pred.
res_nb02_base = evaluate("Notebook 02: Baseline", ckpt_nb02_base)

In [None]:
# Score the NB02 DataBoosted checkpoint; the returned dict holds accuracy, macro_f1, cm and y_pred.
res_nb02_boost = evaluate("Notebook 02: DataBoosted", ckpt_nb02_boost)

## 5. Summary

In [None]:
# One table row per evaluated model, in presentation order.
scored_models = [
    ("NB01 ModernFinBERT", res_nb01),
    ("NB02 Baseline", res_nb02_base),
    ("NB02 DataBoosted", res_nb02_boost),
]
summary = pd.DataFrame(
    [
        {"Model": label, "Accuracy": scores["accuracy"], "Macro F1": scores["macro_f1"]}
        for label, scores in scored_models
    ]
)

rule = "=" * 60
print(rule)
print("AGGREGATED TEST SET — ALL MODELS")
print(rule)
print(summary.to_string(index=False, float_format="%.4f"))

# Signed change from the NB02 baseline to the NB02 DataBoosted model.
acc_delta = res_nb02_boost["accuracy"] - res_nb02_base["accuracy"]
f1_delta = res_nb02_boost["macro_f1"] - res_nb02_base["macro_f1"]
print("\nDataBoost delta (NB02):")
print(f"  Accuracy: {acc_delta:+.4f}")
print(f"  Macro F1: {f1_delta:+.4f}")

## 6. Confusion Matrices

In [None]:
# One heatmap per model, drawn side by side in a single row.
panels = [
    ("NB01 ModernFinBERT", res_nb01),
    ("NB02 Baseline", res_nb02_base),
    ("NB02 DataBoosted", res_nb02_boost),
]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

heatmap_style = {
    "annot": True,       # write the count inside every cell
    "fmt": "d",          # counts are integers
    "cmap": "Blues",
    "xticklabels": LABEL_NAMES,
    "yticklabels": LABEL_NAMES,
}
for ax, (title, res) in zip(axes, panels):
    sns.heatmap(res["cm"], ax=ax, **heatmap_style)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(f"{title}\nAcc={res['accuracy']:.2%}  F1={res['macro_f1']:.2%}")

plt.suptitle("Aggregated Test Set — Confusion Matrices", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("test_confusion_matrices.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Bar chart: accuracy and macro F1 side by side for each model in `summary`.
fig, ax = plt.subplots(figsize=(8, 4))

models = summary["Model"]
x = np.arange(len(models))
w = 0.35  # width of one bar; the two bars of a model sit at x - w/2 and x + w/2

bars1 = ax.bar(x - w/2, summary["Accuracy"], w, label="Accuracy", color="#2196F3")
bars2 = ax.bar(x + w/2, summary["Macro F1"], w, label="Macro F1", color="#66BB6A")

# The axis normally starts at 0.7 to magnify small differences. A score at or
# below 0.7 would leave its bar hidden, so in that case start a little below
# the lowest score instead (rounded down to a tenth, never below 0).
lowest_score = float(summary[["Accuracy", "Macro F1"]].min().min())
if lowest_score > 0.7:
    y_floor = 0.7
else:
    y_floor = max(0.0, float(np.floor((lowest_score - 0.05) * 10) / 10))
ax.set_ylim(y_floor, 1.0)

ax.set_xticks(x)
ax.set_xticklabels(models)
ax.set_ylabel("Score")
ax.set_title("Aggregated Test Set — Model Comparison")
ax.legend()

# Print each bar's value as a percentage just above the bar.
for bars in [bars1, bars2]:
    for bar in bars:
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.005,
                f"{bar.get_height():.2%}", ha="center", va="bottom", fontsize=9)

plt.tight_layout()
plt.savefig("test_accuracy_comparison.png", dpi=150, bbox_inches="tight")
plt.show()