# Experiment 3: ModernFinBERT vs Claude Opus 4.5

**Aim:** Compare our fine-tuned ModernFinBERT model against zero-shot Claude Opus 4.5 on the FinancialPhraseBank (FPB) `sentences_50agree` set, used here as a held-out test set, and contextualize both against published baselines.

**Key questions:**
- How does a 149M-parameter fine-tuned specialist compare to a frontier LLM on financial sentiment?
- What's the cost/accuracy trade-off?
- Where do both models sit relative to published FinBERT baselines?

**Published baselines** (from `reference/fpb_benchmarks.md`):
- ProsusAI/finbert: 86% acc on FPB 50agree (Araci 2019, in-domain)
- FinBERT-FinVocab: 87.2% acc (Yang et al. 2020, in-domain)
- GPT-4o zero-shot: ~0.727 macro F1 on 75-99% agree subset (2025 eval paper)

## 1. Setup

In [None]:
!pip install -q anthropic datasets scikit-learn pandas matplotlib seaborn

In [None]:
import os
import json
import time
import getpass
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from datasets import load_dataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Class names, positioned so that the list index is the integer label id
# used throughout this notebook.
LABEL_NAMES = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
# Lower-case class name -> label id, derived from LABEL_NAMES so the two
# cannot drift apart.
LABEL_MAP = {label.lower(): label_id for label_id, label in enumerate(LABEL_NAMES)}

In [None]:
# Prompt for the key when the variable is unset *or* empty: an empty value
# would otherwise skip the prompt and leave the client without a usable key.
if not os.environ.get("ANTHROPIC_API_KEY"):
    os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Enter Anthropic API key: ")

from anthropic import Anthropic

# Constructed without arguments; the SDK reads ANTHROPIC_API_KEY from the
# environment set up above.
client = Anthropic()

## 2. Load Test Data

In [None]:
# Load the "sentences_50agree" configuration of FinancialPhraseBank; only its
# "train" split is used in this notebook.
fpb = load_dataset("financial_phrasebank", "sentences_50agree", trust_remote_code=True)["train"]
print(f"FPB 50agree: {len(fpb):,} samples")

# Per-class counts and shares
labels = fpb["label"]
n_labels = len(labels)
for class_id, class_name in enumerate(LABEL_NAMES):
    count = sum(1 for label in labels if label == class_id)
    print(f"  {class_name}: {count} ({count/n_labels:.1%})")

## 3. Claude Opus 4.5 Zero-Shot Classification

Send each sample to Claude with a structured prompt, collect predictions, and track token usage for cost analysis.

In [None]:
SYSTEM_PROMPT = """You are a financial sentiment classifier. Given a financial text, classify its sentiment as exactly one of: POSITIVE, NEGATIVE, or NEUTRAL.

Respond with ONLY a single JSON object: {"sentiment": "POSITIVE"} or {"sentiment": "NEGATIVE"} or {"sentiment": "NEUTRAL"}

No other text or explanation."""


def parse_sentiment_response(content):
    """Map one raw model reply to a label id (0=NEGATIVE, 1=NEUTRAL, 2=POSITIVE).

    The reply is expected to be the JSON object requested by SYSTEM_PROMPT,
    optionally wrapped in a Markdown code fence. When it is not valid JSON, or
    the JSON has no string "sentiment" field, the label is recovered by a
    keyword search over the raw text instead.

    Args:
        content: reply text returned by the model.

    Returns:
        2 for POSITIVE/POS, 0 for NEGATIVE/NEG, 1 for anything else.
    """
    raw = content.strip()
    payload = raw
    if payload.startswith("```"):
        # Drop the opening fence line (e.g. "```json") and the closing fence.
        # partition() tolerates a fence with no newline after it, whereas
        # indexing the result of split("\n", 1) would raise IndexError.
        payload = payload.partition("\n")[2].rsplit("```", 1)[0].strip()

    try:
        sentiment = json.loads(payload)["sentiment"].upper()
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
        # Not the requested JSON shape: fall back to keyword search.
        upper = raw.upper()
        if "POSITIVE" in upper:
            return 2
        if "NEGATIVE" in upper:
            return 0
        return 1

    if sentiment in ("POSITIVE", "POS"):
        return 2
    if sentiment in ("NEGATIVE", "NEG"):
        return 0
    return 1  # NEUTRAL, or any unrecognised value


def classify_with_claude(texts, batch_size=20, model="claude-opus-4-20250514"):
    """Classify texts one at a time through the Anthropic Messages API.

    Each text is sent as its own request with SYSTEM_PROMPT. `batch_size` does
    not batch requests; it only sets how often the loop pauses for one second
    as crude rate limiting.

    A failed request (or a reply with no text content) is reported with
    print() and recorded as NEUTRAL (1), so errors are counted as predictions
    in any metric computed from the result. `predictions` and `raw_responses`
    always receive exactly one entry per input text, in input order.

    NOTE(review): the default model ID appears to be Claude Opus 4, while this
    notebook labels the results "Claude Opus 4.5" - confirm which model is
    intended before reporting numbers.

    Args:
        texts: sequence of input strings.
        batch_size: sleep for 1 s after every `batch_size` requests; 0 or None
            disables the pause.
        model: Anthropic model ID passed to the API.

    Returns:
        predictions: list of int labels (0=NEGATIVE, 1=NEUTRAL, 2=POSITIVE)
        total_input_tokens: total input tokens used
        total_output_tokens: total output tokens used
        raw_responses: list of raw response strings (for debugging); for a
            failed request the entry is the error message instead
    """
    predictions = []
    raw_responses = []
    total_input_tokens = 0
    total_output_tokens = 0

    for i, text in enumerate(tqdm(texts, desc=f"Claude ({model})")):
        try:
            response = client.messages.create(
                model=model,
                max_tokens=50,
                system=SYSTEM_PROMPT,
                messages=[{"role": "user", "content": text}],
            )
            total_input_tokens += response.usage.input_tokens
            total_output_tokens += response.usage.output_tokens
            content = response.content[0].text.strip()
        except Exception as e:
            # Deliberate best-effort: one failed request must not abort a
            # run of thousands of paid calls.
            print(f"Error at sample {i}: {e}")
            predictions.append(1)  # default to neutral on error
            raw_responses.append(str(e))
        else:
            # Parsing never raises, so exactly one entry is appended to each
            # list per sample and the two stay index-aligned.
            predictions.append(parse_sentiment_response(content))
            raw_responses.append(content)

        # Rate limiting: pause briefly every batch_size samples
        if batch_size and (i + 1) % batch_size == 0:
            time.sleep(1)

    return predictions, total_input_tokens, total_output_tokens, raw_responses

In [None]:
# NOTE: This will make ~4,846 API calls. Estimated cost ~$30-50 for Opus 4.5.
# To test first, set SAMPLE_SIZE to a smaller number.

SAMPLE_SIZE = None  # Set to e.g. 100 for a quick test, None for full run

# Read each column once. Accessing fpb["sentence"] inside a per-index loop
# would fetch the whole column again for every sampled index.
all_texts = fpb["sentence"]
all_labels = fpb["label"]

if SAMPLE_SIZE:
    # Fixed seed so the same subset is drawn on every run.
    np.random.seed(42)
    indices = np.random.choice(len(fpb), SAMPLE_SIZE, replace=False)
    test_texts = [all_texts[i] for i in indices]
    test_labels = [all_labels[i] for i in indices]
    print(f"Running on {SAMPLE_SIZE} sample subset")
else:
    test_texts = all_texts
    test_labels = all_labels
    print(f"Running on full FPB ({len(test_texts):,} samples)")

In [None]:
# Classify every selected sample with Claude; `raw` keeps the raw replies
# for debugging.
claude_preds, input_tokens, output_tokens, raw = classify_with_claude(test_texts)

# Report total token usage for the run.
print(
    f"\nTotal input tokens:  {input_tokens:,}\n"
    f"Total output tokens: {output_tokens:,}"
)

## 4. Claude Results

In [None]:
y_true = np.array(test_labels)
y_pred_claude = np.array(claude_preds)

claude_acc = accuracy_score(y_true, y_pred_claude)
claude_f1 = f1_score(y_true, y_pred_claude, average="macro")
# Count hits directly: int(accuracy * n) can truncate to one below the true
# count because of floating-point rounding.
claude_correct = int(np.sum(y_true == y_pred_claude))

print("Claude Opus 4.5 — FPB 50agree")
print(f"Accuracy: {claude_acc:.4f} ({claude_correct}/{len(y_true)})")
print(f"Macro F1: {claude_f1:.4f}")
# Pass the label ids explicitly so target_names still lines up when a small
# SAMPLE_SIZE subset happens to miss one of the classes.
claude_report = classification_report(
    y_true,
    y_pred_claude,
    labels=list(range(len(LABEL_NAMES))),
    target_names=LABEL_NAMES,
)
print(f"\n{claude_report}")

## 5. Load ModernFinBERT Results for Comparison

Load the fine-tuned ModernFinBERT checkpoint from Hugging Face and run inference on the same samples that were sent to Claude.

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

MFB_CHECKPOINT = "neoyipeng/ModernFinBERT-base"

mfb_tokenizer = AutoTokenizer.from_pretrained(MFB_CHECKPOINT)
mfb_model = AutoModelForSequenceClassification.from_pretrained(MFB_CHECKPOINT)
# Use the GPU when one is visible, otherwise fall back to CPU instead of
# failing inside .cuda().
mfb_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mfb_model = mfb_model.to(mfb_device).eval()

print(f"ModernFinBERT loaded: {sum(p.numel() for p in mfb_model.parameters()):,} params")
# Show the checkpoint's own id -> label mapping so it can be checked against
# LABEL_NAMES, whose order the metrics in this notebook rely on.
print(f"Label mapping: {mfb_model.config.id2label}")

In [None]:
# Run ModernFinBERT on the same test samples
# NOTE(review): argmax indices are compared directly with the FPB label ids,
# which assumes the checkpoint orders its classes NEGATIVE/NEUTRAL/POSITIVE -
# confirm against mfb_model.config.id2label.
mfb_preds = []
batch_size = 32
# Send inputs to wherever the model lives rather than hard-coding CUDA.
mfb_input_device = mfb_model.device

with torch.no_grad():
    for i in tqdm(range(0, len(test_texts), batch_size), desc="ModernFinBERT"):
        batch = list(test_texts[i : i + batch_size])
        inputs = mfb_tokenizer(
            batch, return_tensors="pt", padding=True,
            truncation=True, max_length=512,
        )
        inputs = {k: v.to(mfb_input_device) for k, v in inputs.items()}
        logits = mfb_model(**inputs).logits
        # Highest-scoring class per row, collected as plain Python ints.
        mfb_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

y_pred_mfb = np.array(mfb_preds)

mfb_acc = accuracy_score(y_true, y_pred_mfb)
mfb_f1 = f1_score(y_true, y_pred_mfb, average="macro")
# Count hits directly: int(accuracy * n) can truncate to one below the true
# count because of floating-point rounding.
mfb_correct = int(np.sum(y_true == y_pred_mfb))

print("\nModernFinBERT — FPB 50agree")
print(f"Accuracy: {mfb_acc:.4f} ({mfb_correct}/{len(y_true)})")
print(f"Macro F1: {mfb_f1:.4f}")
# Pass the label ids explicitly so target_names still lines up when a small
# SAMPLE_SIZE subset happens to miss one of the classes.
mfb_report = classification_report(
    y_true,
    y_pred_mfb,
    labels=list(range(len(LABEL_NAMES))),
    target_names=LABEL_NAMES,
)
print(f"\n{mfb_report}")

## 6. Comparison Table & Cost Analysis

In [None]:
# Claude Opus 4.5 pricing (per 1M tokens)
# NOTE(review): $15 / $75 per 1M tokens looks like the Claude Opus 4 rate,
# matching the model ID that classify_with_claude defaults to; Opus 4.5 may be
# priced differently - confirm against the current Anthropic pricing page.
OPUS_INPUT_COST = 15.0   # $/1M input tokens
OPUS_OUTPUT_COST = 75.0  # $/1M output tokens

# Compute each cost component once and reuse it in the breakdown printed below.
claude_input_cost = input_tokens * OPUS_INPUT_COST / 1_000_000
claude_output_cost = output_tokens * OPUS_OUTPUT_COST / 1_000_000
claude_cost = claude_input_cost + claude_output_cost
cost_per_sample = claude_cost / len(test_texts)

# ModernFinBERT cost estimate (T4 GPU on cloud ~$0.35/hr)
# Inference time: ~10 seconds for 4846 samples
# The 10 s figure is a fixed estimate for the full set, not a measurement, so
# the per-sample cost is overstated when SAMPLE_SIZE selects a subset.
mfb_cost_per_hour = 0.35  # T4 on-demand
mfb_inference_seconds = 10  # approximate
mfb_cost = mfb_cost_per_hour * mfb_inference_seconds / 3600
mfb_cost_per_sample = mfb_cost / len(test_texts)

print("=" * 80)
print("FINAL COMPARISON: ModernFinBERT vs Claude Opus 4.5 vs Published Baselines")
print("=" * 80)

comparison = pd.DataFrame([
    # --- Published baselines (from literature) ---
    {"Model": "LSTM+ELMo *",          "Accuracy": "0.7500", "Macro F1": "0.7000",
     "Total Cost": "—", "Cost/Sample": "—", "Latency": "—",
     "Note": "Araci 2019, in-domain"},
    {"Model": "ULMFit *",             "Accuracy": "0.8300", "Macro F1": "0.7900",
     "Total Cost": "—", "Cost/Sample": "—", "Latency": "—",
     "Note": "Araci 2019, in-domain"},
    {"Model": "ProsusAI/finbert *",   "Accuracy": "0.8600", "Macro F1": "0.8400",
     "Total Cost": "—", "Cost/Sample": "—", "Latency": "—",
     "Note": "Araci 2019, in-domain"},
    {"Model": "FinBERT-FinVocab *",   "Accuracy": "0.8720", "Macro F1": "—",
     "Total Cost": "—", "Cost/Sample": "—", "Latency": "—",
     "Note": "Yang et al. 2020, in-domain"},
    # --- Our evaluations ---
    {"Model": "ModernFinBERT (149M)",
     "Accuracy": f"{mfb_acc:.4f}", "Macro F1": f"{mfb_f1:.4f}",
     "Total Cost": f"${mfb_cost:.4f}", "Cost/Sample": f"${mfb_cost_per_sample:.6f}",
     "Latency": "~2ms/sample",
     "Note": "held-out (FPB excluded)"},
    {"Model": "Claude Opus 4.5",
     "Accuracy": f"{claude_acc:.4f}", "Macro F1": f"{claude_f1:.4f}",
     "Total Cost": f"${claude_cost:.2f}", "Cost/Sample": f"${cost_per_sample:.4f}",
     "Latency": "~1-2s/sample",
     "Note": "zero-shot"},
])

# "Cost/Sample" is populated for every row above, so include it in the
# printed table rather than dropping it.
display_columns = ["Model", "Accuracy", "Macro F1", "Total Cost", "Cost/Sample", "Latency", "Note"]
print(comparison[display_columns].to_string(index=False))

# The floor on the denominator guards against division by zero should the
# ModernFinBERT estimate ever be set to 0.
cost_ratio = claude_cost / max(mfb_cost, 0.0001)
print(f"\nClaude is {cost_ratio:.0f}x more expensive per inference run")
print(f"Claude total cost for {len(test_texts)} samples: ${claude_cost:.2f}")
print(f"  Input tokens:  {input_tokens:,} (${claude_input_cost:.2f})")
print(f"  Output tokens: {output_tokens:,} (${claude_output_cost:.2f})")
print("\n* Published baselines trained/tested on in-domain FPB splits.")
print("  See reference/fpb_benchmarks.md for full details.")

## 7. Error Analysis — Where Do They Disagree?

In [None]:
# Per-sample correctness of each model, and the positions where the two
# models predict different labels
mfb_hits = y_pred_mfb == y_true
claude_hits = y_pred_claude == y_true
disagree_mask = y_pred_mfb != y_pred_claude
disagree_indices = np.flatnonzero(disagree_mask)

n_disagree = len(disagree_indices)
n_total = len(y_true)
print(f"Models disagree on {n_disagree} / {n_total} samples ({n_disagree/n_total:.1%})")

# Within the disagreements, tally which model (if either) matched the true label
mfb_right = np.sum(disagree_mask & mfb_hits)
claude_right = np.sum(disagree_mask & claude_hits)
neither_right = np.sum(disagree_mask & ~mfb_hits & ~claude_hits)

print(f"  ModernFinBERT correct: {mfb_right}")
print(f"  Claude correct:        {claude_right}")
print(f"  Both wrong:            {neither_right}")

In [None]:
# Show some disagreement examples
print("\n--- Sample Disagreements ---")
max_chars = 150  # longest text excerpt printed per example
for idx in disagree_indices[:10]:
    true_lbl = LABEL_NAMES[y_true[idx]]
    mfb_lbl = LABEL_NAMES[y_pred_mfb[idx]]
    claude_lbl = LABEL_NAMES[y_pred_claude[idx]]
    mfb_ok = "correct" if y_pred_mfb[idx] == y_true[idx] else "wrong"
    claude_ok = "correct" if y_pred_claude[idx] == y_true[idx] else "wrong"
    text = test_texts[idx]
    # Append the ellipsis only when the text really was cut, so a complete
    # sentence is not shown as if it had been truncated.
    snippet = text if len(text) <= max_chars else f"{text[:max_chars]}..."
    print(f"\nText: {snippet}")
    print(f"  True: {true_lbl} | MFB: {mfb_lbl} ({mfb_ok}) | Claude: {claude_lbl} ({claude_ok})")

In [None]:
# Confusion matrices side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Fix the label set so each matrix is always len(LABEL_NAMES) square. Without
# it, a subset missing one class yields a smaller matrix that no longer
# matches the tick labels.
label_ids = list(range(len(LABEL_NAMES)))

for ax, (y_pred, title) in zip(axes, [
    (y_pred_mfb, f"ModernFinBERT\nAcc={mfb_acc:.2%}  F1={mfb_f1:.2%}"),
    (y_pred_claude, f"Claude Opus 4.5\nAcc={claude_acc:.2%}  F1={claude_f1:.2%}"),
]):
    # Rows are true labels, columns are predicted labels.
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    sns.heatmap(
        cm, annot=True, fmt="d", cmap="Blues",
        xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES, ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel("True")
    ax.set_xlabel("Predicted")

plt.tight_layout()
plt.savefig("mfb_vs_claude_confusion.png", dpi=150, bbox_inches="tight")
plt.show()