# Experiment 2: DataBoost — Targeted Augmentation on Misclassified Samples

**Aim:** Train a baseline ModernBERT model, identify what it gets wrong on validation, paraphrase those errors with an LLM (preserving the correct labels), add the paraphrases to the training set, retrain, and measure the improvement.

**Steps:**
1. Baseline: fine-tune ModernBERT-base on train split
2. Error mining: run inference on validation, collect misclassified samples
3. LLM paraphrasing: generate N paraphrases per error with correct ground-truth labels
4. Augmented training: retrain on original + paraphrased data
5. Compare baseline vs DataBoosted accuracy on validation and FPB

## 1. Setup & Installation

In [None]:
# Set UNSLOTH_DISABLE_FAST_GENERATION=1 in the kernel's environment. This cell runs
# before the cell that imports `unsloth`, so the variable is already set at import time.
%env UNSLOTH_DISABLE_FAST_GENERATION=1

In [None]:
%%capture
# Dependency installation. `%%capture` suppresses the (long) pip output of this cell.
import os
# Colab is detected by looking for "COLAB_" inside the concatenated environment-variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: a plain install of unsloth (pip resolves its dependencies).
    !pip install unsloth
else:
    # On Colab: install the listed packages with --no-deps, then the remaining
    # requirements normally, then unsloth itself without dependency resolution
    # (presumably to avoid disturbing Colab's preinstalled packages — confirm).
    !pip install --no-deps bitsandbytes accelerate xformers peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth
# Packages imported later in this notebook: datasets, scikit-learn (sklearn.metrics), anthropic.
!pip install -q datasets scikit-learn anthropic

In [None]:
from unsloth import FastLanguageModel, FastModel
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import torch
import torch.nn.functional as F
from transformers import (
    TrainingArguments, Trainer, AutoModelForSequenceClassification, training_args,
)
from datasets import load_dataset, Dataset, concatenate_datasets
from tqdm import tqdm
import json
import time

# Number of sentiment classes; also the width of the one-hot label vectors built
# with np.eye(NUM_CLASSES) throughout the notebook.
NUM_CLASSES = 3
# Class names indexed by integer label: 0 = NEGATIVE, 1 = NEUTRAL, 2 = POSITIVE.
# This order matches the integer ids assigned by `label_dict` in the data-preparation cell.
LABEL_NAMES = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
# Value of the dataset's `source` field that is filtered out during data preparation
# (presumably the Financial PhraseBank rows, since FPB is used as the held-out
# test set later — confirm against the dataset card).
FPB_SOURCE = 6
PARAPHRASES_PER_SAMPLE = 3  # number of paraphrases to generate per misclassified sample

## 2. Data Preparation

In [None]:
ds = load_dataset("neoyipeng/financial_reasoning_aggregated")

# String label -> integer class id (ids line up with LABEL_NAMES).
label_dict = {"NEUTRAL/MIXED": 1, "NEGATIVE": 0, "POSITIVE": 2}


def _is_sentiment_row(row):
    """Keep only rows whose task is sentiment classification."""
    return row["task"] == "sentiment"


def _is_not_fpb_row(row):
    """Drop rows whose source equals FPB_SOURCE."""
    return row["source"] != FPB_SOURCE


def _to_text_and_one_hot(example):
    """Reduce a row to its text plus a one-hot vector for its string label."""
    class_id = label_dict[example["label"]]
    return {"text": example["text"], "labels": np.eye(NUM_CLASSES)[class_id]}


ds = ds.filter(_is_sentiment_row)
ds = ds.filter(_is_not_fpb_row)

# After the map only "text" and the new one-hot "labels" column remain; every
# other column (including the original string "label") is removed.
remove_cols = [
    name for name in ds["train"].column_names if name not in ("text", "labels")
]
ds = ds.map(_to_text_and_one_hot, remove_columns=remove_cols)

print(f"Train: {len(ds['train']):,}  |  Val: {len(ds['validation']):,}")

## 3. Baseline Training

In [None]:
def train_model(train_dataset, val_dataset, output_dir="trainer_output", epochs=10):
    """Fine-tune a freshly loaded ModernBERT-base classifier and return it ready for inference.

    Args:
        train_dataset: Dataset with a "text" column (tokenized here) used for training.
        val_dataset: Dataset with a "text" column, evaluated once per epoch.
        output_dir: Directory handed to ``TrainingArguments`` for checkpoints.
        epochs: Number of training epochs.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to CUDA, put in
        eval mode and passed through ``FastLanguageModel.for_inference``.
    """
    model, tokenizer = FastModel.from_pretrained(
        model_name="answerdotai/ModernBERT-base",
        load_in_4bit=False,
        max_seq_length=2048,
        dtype=None,
        auto_model=AutoModelForSequenceClassification,
        num_labels=NUM_CLASSES,
        full_finetuning=True,
    )

    def _tokenize(batch):
        # Tokenizer defaults only: no truncation or padding is requested here.
        return tokenizer(batch["text"])

    def _accuracy(eval_pred):
        # eval_pred[0] holds the predictions, eval_pred[1] the label vectors;
        # both are reduced to class indices with argmax before comparing.
        true_classes = eval_pred[1].argmax(axis=-1)
        predicted_classes = eval_pred[0].argmax(axis=-1)
        return {"accuracy": accuracy_score(true_classes, predicted_classes)}

    tokenized_train = train_dataset.map(_tokenize, batched=True)
    tokenized_val = val_dataset.map(_tokenize, batched=True)

    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    bf16_available = torch.cuda.is_bf16_supported()

    hf_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        fp16=not bf16_available,
        bf16=bf16_available,
        optim=training_args.OptimizerNames.ADAMW_TORCH,
        learning_rate=5e-5,
        weight_decay=0.001,
        lr_scheduler_type="cosine",
        seed=3407,
        num_train_epochs=epochs,
        # Checkpoint and evaluate every epoch, then restore the checkpoint
        # with the lowest validation loss.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_strategy="epoch",
        eval_strategy="epoch",
        logging_strategy="epoch",
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        args=hf_args,
        compute_metrics=_accuracy,
    )
    trainer.train()

    model = model.cuda().eval()
    FastLanguageModel.for_inference(model)
    return model, tokenizer

In [None]:
# Train the baseline on the original (un-augmented) train split; the validation
# split is passed as the evaluation set. Checkpoints go to a baseline-specific
# directory so the later DataBoosted run does not share it.
print("Training BASELINE model...")
baseline_model, tokenizer = train_model(
    ds["train"], ds["validation"], output_dir="trainer_output_baseline"
)

## 4. Error Mining on Validation Set

In [None]:
def run_inference(model, tokenizer, texts, batch_size=32):
    """Classify `texts` in mini-batches and return the predicted class indices.

    Args:
        model: A torch module called as ``model(**inputs)`` whose output exposes
            ``.logits``. Inputs are moved to whatever device its parameters are on.
        tokenizer: Callable that tokenizes a list of strings into a mapping of
            tensors (called with ``return_tensors="pt"``, padding and truncation
            to 512 tokens).
        texts: Sequence of strings supporting ``len()`` and slicing.
        batch_size: Number of texts per forward pass.

    Returns:
        1-D ``int64`` numpy array with one argmax class index per input text
        (an empty ``int64`` array when `texts` is empty).
    """
    # Follow the model's own device instead of assuming CUDA, so the function
    # also works for a model kept on CPU or on a non-default GPU.
    device = next(model.parameters()).device

    all_preds = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Inference"):
            # list(...) guarantees the tokenizer receives a plain list even if
            # `texts` is a lazy column-like object.
            batch = list(texts[start : start + batch_size])
            inputs = tokenizer(
                batch, return_tensors="pt", padding=True,
                truncation=True, max_length=512,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            all_preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    # Explicit dtype keeps the result integer-typed even when there are no predictions.
    return np.array(all_preds, dtype=np.int64)

In [None]:
val_texts = ds["validation"]["text"]
# Labels are stored one-hot (built with np.eye in the data-preparation cell),
# so argmax over axis 1 recovers the integer class index.
val_labels = np.argmax(ds["validation"]["labels"], axis=1)

# Predicted class index per validation text from the baseline model.
val_preds = run_inference(baseline_model, tokenizer, val_texts)

# Accuracy plus macro-averaged F1; both are reused in the final summary table.
val_acc = accuracy_score(val_labels, val_preds)
val_f1 = f1_score(val_labels, val_preds, average="macro")
print(f"\nBaseline validation accuracy: {val_acc:.4f}")
print(f"Baseline validation macro F1: {val_f1:.4f}")
print(classification_report(val_labels, val_preds, target_names=LABEL_NAMES))

In [None]:
# Collect misclassified samples: one record per validation text whose predicted
# class differs from its ground-truth class.
errors = [
    {
        "text": text,
        "true_label": int(true_idx),
        "true_label_name": LABEL_NAMES[int(true_idx)],
        "pred_label": int(pred_idx),
        "pred_label_name": LABEL_NAMES[int(pred_idx)],
    }
    for text, true_idx, pred_idx in zip(val_texts, val_labels, val_preds)
    if pred_idx != true_idx
]

# Guard the rate against an empty validation set.
_n_val = len(val_texts)
_error_rate = len(errors) / _n_val if _n_val else 0.0
print(f"\nMisclassified samples: {len(errors)} / {_n_val} ({_error_rate:.1%})")
print("\nError breakdown by true label:")
# Explicit columns keep the frame well-formed (and the column lookup below valid)
# even when the baseline made no mistakes.
error_df = pd.DataFrame(
    errors,
    columns=["text", "true_label", "true_label_name", "pred_label", "pred_label_name"],
)
if errors:
    print(error_df["true_label_name"].value_counts().to_string())
else:
    print("(no misclassified samples)")

## 5. LLM Paraphrasing of Misclassified Samples

For each misclassified sample, generate paraphrases using an LLM. The paraphrases keep the **correct ground-truth label**, providing the model with more examples of the patterns it struggles with.

In [None]:
import getpass

# Set API key — works in Colab or local
# The prompt only appears when ANTHROPIC_API_KEY is not already in the environment;
# getpass reads the key without echoing it.
if "ANTHROPIC_API_KEY" not in os.environ:
    os.environ["ANTHROPIC_API_KEY"] = getpass.getpass("Enter Anthropic API key: ")

from anthropic import Anthropic
# No key is passed explicitly; the client presumably picks up ANTHROPIC_API_KEY
# from the environment set above — confirm against the SDK documentation.
client = Anthropic()

In [None]:
def _strip_code_fence(content):
    """Return `content` without a surrounding Markdown code fence, if present."""
    content = content.strip()
    if not content.startswith("```"):
        return content
    # Drop the opening fence line (``` or ```json) and everything from the
    # closing fence onward. partition() tolerates a reply with no newline.
    _, _, body = content.partition("\n")
    return body.rsplit("```", 1)[0].strip()


def _parse_paraphrases(content, batch_labels):
    """Parse one model reply into paraphrase records labelled with ground truth.

    Returns a ``(records, skipped)`` tuple: `records` is a list of
    ``{"text", "label"}`` dicts, `skipped` counts reply items that were
    malformed, empty, or pointed at an index outside the batch.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError``) or is not a JSON array.
    """
    items = json.loads(_strip_code_fence(content))
    if not isinstance(items, list):
        raise ValueError("expected a JSON array of paraphrase objects")

    records = []
    skipped = 0
    for item in items:
        try:
            idx = int(item["original_index"]) - 1  # convert to 0-based
            text = item["paraphrase"]
        except (KeyError, TypeError, ValueError):
            skipped += 1
            continue
        # Reject out-of-range indices explicitly; a negative index would
        # otherwise silently pick a label from the end of the batch.
        if not 0 <= idx < len(batch_labels):
            skipped += 1
            continue
        if not isinstance(text, str) or not text.strip():
            skipped += 1
            continue
        # Always use the ground-truth label of the source text. The label echoed
        # back by the model is ignored so it can never override the true one.
        records.append({"text": text, "label": int(batch_labels[idx])})
    return records, skipped


def paraphrase_batch(texts, labels, n_paraphrases=3, batch_size=10):
    """Generate paraphrases for a batch of texts using Claude.

    Texts are sent in groups of `batch_size`. Each returned paraphrase carries
    the ground-truth label of the text it was derived from. A failed request or
    an unparseable reply is reported and that group is skipped (best effort);
    individual malformed items inside an otherwise valid reply are dropped.

    Args:
        texts: List of financial texts to paraphrase.
        labels: List of integer labels (ground truth), aligned with `texts`.
        n_paraphrases: Number of paraphrases per text.
        batch_size: How many texts to send per API call.

    Returns:
        List of dicts with 'text' and 'label' keys.
    """
    all_paraphrases = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Paraphrasing"):
        batch_texts = texts[i : i + batch_size]
        batch_labels = labels[i : i + batch_size]

        # Build prompt with numbered texts
        numbered = "\n".join(
            f"{j+1}. [{LABEL_NAMES[lbl]}] {txt}"
            for j, (txt, lbl) in enumerate(zip(batch_texts, batch_labels))
        )

        prompt = f"""You are a financial text paraphrasing assistant. For each numbered financial text below, generate exactly {n_paraphrases} paraphrases that:
- Preserve the original meaning and financial sentiment
- Use different wording, sentence structure, or phrasing
- Stay realistic as financial text (news headlines, earnings reports, analyst commentary)
- Keep approximately the same length

Return your response as a JSON array of objects, each with "original_index" (1-based), "paraphrase" (the text), and "label" (the sentiment label shown in brackets).

Texts to paraphrase:
{numbered}

Return ONLY valid JSON, no other text."""

        try:
            response = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4096,
                messages=[{"role": "user", "content": prompt}],
            )
            records, skipped = _parse_paraphrases(
                response.content[0].text, batch_labels
            )
        except Exception as e:
            # Deliberate best-effort boundary: one failed request or bad reply
            # must not abort the whole augmentation run.
            print(f"Error at batch {i}: {e}")
        else:
            all_paraphrases.extend(records)
            if skipped:
                print(f"Batch {i}: skipped {skipped} malformed paraphrase item(s)")

        # Rate limiting; also applied after a failure so errors (e.g. throttling)
        # are not followed by an immediate retry storm.
        time.sleep(0.5)

    return all_paraphrases

In [None]:
# Inputs for paraphrasing: the misclassified texts paired with their
# ground-truth labels (not the model's wrong predictions).
error_texts = [e["text"] for e in errors]
error_labels = [e["true_label"] for e in errors]

print(f"Generating {PARAPHRASES_PER_SAMPLE} paraphrases for {len(errors)} misclassified samples...")
paraphrased = paraphrase_batch(error_texts, error_labels, n_paraphrases=PARAPHRASES_PER_SAMPLE)
print(f"\nGenerated {len(paraphrased)} paraphrases")

# Show a few examples
# (the first six paraphrases, each cut to its first 100 characters)
for p in paraphrased[:6]:
    print(f"  [{LABEL_NAMES[p['label']]}] {p['text'][:100]}...")

## 6. Create Augmented Training Set

In [None]:
# Build a HF dataset from the paraphrases using the same two columns as the
# training split: "text" and a one-hot "labels" vector.
# NOTE(review): the paraphrases are derived from validation-set errors, so the
# augmented training data contains rewordings of validation samples.
_one_hot_rows = np.eye(NUM_CLASSES)
aug_texts = [item["text"] for item in paraphrased]
aug_labels = [_one_hot_rows[item["label"]].tolist() for item in paraphrased]

aug_ds = Dataset.from_dict(dict(text=aug_texts, labels=aug_labels))

# Original training rows followed by the augmentations, then shuffled with a fixed seed.
augmented_train = concatenate_datasets([ds["train"], aug_ds])
augmented_train = augmented_train.shuffle(seed=42)

_n_original = len(ds["train"])
_n_augmented = len(aug_ds)
print(f"Original train size:  {_n_original:,}")
print(f"Augmentation size:    {_n_augmented:,}")
print(f"Augmented train size: {len(augmented_train):,}")
print(f"Augmentation ratio:   {_n_augmented / _n_original:.1%}")

## 7. Retrain on Augmented Data

In [None]:
# Score the baseline on FPB while it is still in memory. The model is deleted
# just below, and without these numbers there is no baseline FPB figure to set
# against the DataBoosted FPB result.
# Labels are compared directly with predicted class indices, which assumes FPB's
# integer label ids follow the LABEL_NAMES order — TODO confirm against the dataset card.
_baseline_fpb = load_dataset(
    "financial_phrasebank", "sentences_50agree", trust_remote_code=True
)["train"]
baseline_fpb_labels = np.array(_baseline_fpb["label"])
baseline_fpb_preds = run_inference(baseline_model, tokenizer, _baseline_fpb["sentence"])
baseline_fpb_acc = accuracy_score(baseline_fpb_labels, baseline_fpb_preds)
baseline_fpb_f1 = f1_score(baseline_fpb_labels, baseline_fpb_preds, average="macro")
print(f"Baseline FPB accuracy: {baseline_fpb_acc:.4f}")
print(f"Baseline FPB macro F1: {baseline_fpb_f1:.4f}")

# Free baseline model memory
del baseline_model
torch.cuda.empty_cache()

# Retrain from a fresh ModernBERT-base on original + paraphrased data; the
# validation split is unchanged so the two runs are evaluated on the same data.
print("Training DATABOOSTED model...")
boosted_model, tokenizer = train_model(
    augmented_train, ds["validation"], output_dir="trainer_output_boosted"
)

## 8. Compare Baseline vs DataBoosted

In [None]:
# Evaluate DataBoosted model on validation
# Uses the same validation texts and labels as the baseline evaluation.
# NOTE(review): the augmented training set contains paraphrases of validation
# errors, so these validation scores are likely optimistic; the FPB evaluation
# is not affected by that overlap.
boosted_val_preds = run_inference(boosted_model, tokenizer, val_texts)

boosted_val_acc = accuracy_score(val_labels, boosted_val_preds)
boosted_val_f1 = f1_score(val_labels, boosted_val_preds, average="macro")

print(f"\nDataBoosted validation accuracy: {boosted_val_acc:.4f}")
print(f"DataBoosted validation macro F1:  {boosted_val_f1:.4f}")
print(classification_report(val_labels, boosted_val_preds, target_names=LABEL_NAMES))

In [None]:
# Evaluate on FPB as held-out test
# The "train" split of the sentences_50agree config is used in full as the test set.
# NOTE(review): `trust_remote_code=True` relies on the dataset's loading script;
# confirm the installed `datasets` version still supports script-based datasets.
fpb_50 = load_dataset("financial_phrasebank", "sentences_50agree", trust_remote_code=True)["train"]
fpb_texts = fpb_50["sentence"]
# Integer labels are compared directly with predicted class indices, which assumes
# FPB's label ids follow the LABEL_NAMES order — TODO confirm against the dataset card.
fpb_labels = np.array(fpb_50["label"])

boosted_fpb_preds = run_inference(boosted_model, tokenizer, fpb_texts)
boosted_fpb_acc = accuracy_score(fpb_labels, boosted_fpb_preds)
boosted_fpb_f1 = f1_score(fpb_labels, boosted_fpb_preds, average="macro")

print(f"\nDataBoosted FPB accuracy: {boosted_fpb_acc:.4f}")
print(f"DataBoosted FPB macro F1: {boosted_fpb_f1:.4f}")
print(classification_report(fpb_labels, boosted_fpb_preds, target_names=LABEL_NAMES))

In [None]:
# Summary comparison (including published baselines from literature)
comparison = pd.DataFrame([
    # --- Published baselines (in-domain FPB train/test splits) ---
    {"Model": "LSTM+ELMo *",          "Split": "FPB 50agree",
     "Accuracy": "0.7500", "Macro F1": "0.7000"},
    {"Model": "ULMFit *",             "Split": "FPB 50agree",
     "Accuracy": "0.8300", "Macro F1": "0.7900"},
    {"Model": "ProsusAI/finbert *",   "Split": "FPB 50agree",
     "Accuracy": "0.8600", "Macro F1": "0.8400"},
    {"Model": "FinBERT-FinVocab *",   "Split": "FPB 50agree",
     "Accuracy": "0.8720", "Macro F1": "—"},
    # --- Our models (FPB held out from training) ---
    {"Model": "Baseline (ours)",       "Split": "Validation",
     "Accuracy": f"{val_acc:.4f}", "Macro F1": f"{val_f1:.4f}"},
    {"Model": "DataBoosted (ours)",    "Split": "Validation",
     "Accuracy": f"{boosted_val_acc:.4f}", "Macro F1": f"{boosted_val_f1:.4f}"},
    {"Model": "DataBoosted (ours)",    "Split": "FPB 50agree",
     "Accuracy": f"{boosted_fpb_acc:.4f}", "Macro F1": f"{boosted_fpb_f1:.4f}"},
])

print("\n" + "=" * 70)
print("DATABOOST COMPARISON")
print("=" * 70)
print(comparison.to_string(index=False))

val_delta = boosted_val_acc - val_acc
print(f"\nValidation accuracy delta: {val_delta:+.4f} ({val_delta:+.2%})")
print(f"Augmentation added {len(aug_ds)} samples ({len(aug_ds)/len(ds['train']):.1%} of train)")
print("\n* Published baselines trained/tested on in-domain FPB splits (Araci 2019, Yang et al. 2020)")
print("  Our models never saw FPB during training — stricter held-out evaluation.")
print("  See reference/fpb_benchmarks.md for full details.")

## 9. (Optional) Iterate — Round 2

Re-mine errors from the DataBoosted model and augment again.

In [None]:
# Uncomment to run a second round of DataBoost

# # Mine errors from boosted model
# errors_r2 = []
# for i in range(len(val_texts)):
#     if boosted_val_preds[i] != val_labels[i]:
#         errors_r2.append({"text": val_texts[i], "true_label": int(val_labels[i])})
#
# print(f"Round 2 errors: {len(errors_r2)} (down from {len(errors)})")
#
# if len(errors_r2) > 0:
#     paraphrased_r2 = paraphrase_batch(
#         [e["text"] for e in errors_r2],
#         [e["true_label"] for e in errors_r2],
#         n_paraphrases=PARAPHRASES_PER_SAMPLE,
#     )
#     aug_r2 = Dataset.from_dict({
#         "text": [p["text"] for p in paraphrased_r2],
#         "labels": [np.eye(NUM_CLASSES)[p["label"]].tolist() for p in paraphrased_r2],
#     })
#     augmented_train_r2 = concatenate_datasets([augmented_train, aug_r2]).shuffle(seed=42)
#
#     del boosted_model
#     torch.cuda.empty_cache()
#
#     boosted_model_r2, tokenizer = train_model(
#         augmented_train_r2, ds["validation"], output_dir="trainer_output_boosted_r2"
#     )
#     r2_preds = run_inference(boosted_model_r2, tokenizer, val_texts)
#     r2_acc = accuracy_score(val_labels, r2_preds)
#     print(f"Round 2 validation accuracy: {r2_acc:.4f} (delta: {r2_acc - val_acc:+.4f})")