# Experiment 7: Self-Training

**Aim:** Apply self-training on top of the DataBoosted model using unlabeled financial text. This is the candidate for the final published model.

**Pipeline:**
1. Train teacher (baseline + DataBoost from NB02 protocol)
2. Source unlabeled financial text (~30-50K sentences)
3. Iterative self-training (max 3 rounds) with per-class top-k selection
4. Final evaluation comparing full progression

**Anti-confirmation-bias measures:**
- Per-class selection (prevents majority class domination)
- Fresh student each round (no weight inheritance from teacher)
- Validation monitoring with early stopping across rounds
- Confidence distribution logging per round

## 1. Setup & Installation

In [None]:
# setup

In [None]:
%%capture
!pip install -q "datasets>=3.4.1,<4.0.0" scikit-learn matplotlib seaborn peft accelerate transformers anthropic

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import torch
import torch.nn.functional as F
from transformers import (
    TrainingArguments, Trainer, AutoModelForSequenceClassification,
    AutoTokenizer, training_args,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset, Dataset, concatenate_datasets
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import json
import time
import os
import gc

# Number of sentiment classes; LABEL_NAMES is indexed with range(NUM_CLASSES).
NUM_CLASSES = 3
# Class names, positioned by the integer label id used throughout the notebook
# (0 = NEGATIVE, 1 = NEUTRAL, 2 = POSITIVE).
LABEL_NAMES = ["NEGATIVE", "NEUTRAL", "POSITIVE"]
# Rows of the aggregated dataset whose `source` equals this value are dropped
# during data preparation.
# NOTE(review): presumably the source id of Financial PhraseBank, which is
# evaluated separately further down -- confirm against the dataset card.
FPB_SOURCE = 6
# Hugging Face checkpoint every model in this notebook starts from.
MODEL_NAME = "answerdotai/ModernBERT-base"
# Paraphrases requested per misclassified validation sample (DataBoost step).
PARAPHRASES_PER_SAMPLE = 3

# Self-training config
MAX_SELF_TRAIN_ROUNDS = 3
# Despite the name these are not probability cut-offs: the self-training loop
# reads entry `round_idx` and keeps that fraction of each predicted class,
# most confident first. Provide one entry per self-training round.
CONFIDENCE_THRESHOLDS = [0.15, 0.25, 0.40]  # per-class top-k percentages per round

## 2. Data Preparation

In [None]:
ds = load_dataset("neoyipeng/financial_reasoning_aggregated")

label_dict = {"NEUTRAL/MIXED": 1, "NEGATIVE": 0, "POSITIVE": 2}

# Row i of the identity matrix is the one-hot vector for class id i.
_ONE_HOT = np.eye(NUM_CLASSES)


def _is_sentiment_row(row):
    """Keep only rows that belong to the sentiment task."""
    return row["task"] == "sentiment"


def _is_not_fpb_row(row):
    """Drop rows whose source id equals FPB_SOURCE."""
    return row["source"] != FPB_SOURCE


def _to_text_and_one_hot(ex):
    """Map a raw row to its text plus a one-hot `labels` vector."""
    return {"text": ex["text"], "labels": _ONE_HOT[label_dict[ex["label"]]]}


ds = ds.filter(_is_sentiment_row)
ds = ds.filter(_is_not_fpb_row)

# Everything except the two columns produced by the mapper is discarded.
remove_cols = [
    name for name in ds["train"].column_names if name != "text" and name != "labels"
]
ds = ds.map(_to_text_and_one_hot, remove_columns=remove_cols)

n_train, n_val, n_test = (len(ds[split]) for split in ("train", "validation", "test"))
print(f"Train: {n_train:,}  |  Val: {n_val:,}  |  Test: {n_test:,}")

In [None]:
def _load_fpb(config_name):
    """Return the "train" split of one financial_phrasebank configuration."""
    return load_dataset("financial_phrasebank", config_name, trust_remote_code=True)["train"]


fpb_50 = _load_fpb("sentences_50agree")
fpb_all = _load_fpb("sentences_allagree")

for _tag, _subset in (("50agree", fpb_50), ("allAgree", fpb_all)):
    print(f"FPB {_tag}: {len(_subset):,} samples")

## 3. Core Functions

In [None]:
def train_model(train_dataset, val_dataset, seed=3407, output_dir="trainer_output",
                epochs=10, max_length=512):
    """Fine-tune a fresh LoRA-wrapped ModernBERT classifier and return it.

    A new base model is loaded on every call, so nothing is inherited from a
    previously trained model.

    Args:
        train_dataset: Dataset with a "text" column and a "labels" column
            (one-hot vectors everywhere in this notebook).
        val_dataset: Dataset with the same columns. It is evaluated after
            every epoch and the checkpoint with the lowest eval loss is
            reloaded at the end of training.
        seed: Seed forwarded to ``TrainingArguments``.
        output_dir: Directory that receives the per-epoch checkpoints.
        epochs: Number of training epochs.
        max_length: Token limit applied when tokenizing the training and
            validation text. The default of 512 is the limit the inference
            helpers in this notebook truncate to, so the model is trained on
            the same view of a text that it is later scored on.

    Returns:
        ``(model, tokenizer)`` with the model on the GPU and in eval mode.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_CLASSES,
        torch_dtype=torch.float32,
        attn_implementation="sdpa",
    )
    # NOTE(review): enabled before the PEFT wrap, presumably so PEFT can
    # prepare the model for checkpointing -- confirm before removing it, even
    # though TrainingArguments requests gradient checkpointing again below.
    model.gradient_checkpointing_enable()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["Wqkv", "out_proj", "Wi", "Wo"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )
    model = get_peft_model(model, lora_config)
    model = model.cuda()
    model.print_trainable_parameters()

    def tokenize_function(examples):
        # Truncate exactly like the inference helpers do; padding is left to
        # the Trainer's collator.
        return tokenizer(examples["text"], truncation=True, max_length=max_length)

    train_tok = train_dataset.map(tokenize_function, batched=True)
    val_tok = val_dataset.map(tokenize_function, batched=True)

    def compute_accuracy(eval_pred):
        # eval_pred is (predictions, labels); labels are one-hot, so both
        # sides are reduced to class ids with argmax before comparing.
        predictions, labels = eval_pred[0], eval_pred[1]
        return {
            "accuracy": accuracy_score(
                labels.argmax(axis=-1), predictions.argmax(axis=-1)
            )
        }

    trainer = Trainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        args=TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=4,
            warmup_steps=10,
            fp16=True,
            bf16=False,
            optim=training_args.OptimizerNames.ADAMW_TORCH,
            learning_rate=2e-4,
            weight_decay=0.001,
            lr_scheduler_type="cosine",
            seed=seed,
            num_train_epochs=epochs,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            save_strategy="epoch",
            eval_strategy="epoch",
            logging_strategy="epoch",
            gradient_checkpointing=True,
            report_to="none",
        ),
        compute_metrics=compute_accuracy,
    )

    trainer.train()
    model = model.cuda().eval()
    return model, tokenizer

In [None]:
def run_inference(model, tokenizer, texts, batch_size=32):
    """Predict a class index for every text.

    Args:
        model: Classifier whose forward pass returns an object with a
            ``logits`` attribute.
        tokenizer: Callable that turns a list of strings into a dict of
            tensors accepted by ``model``.
        texts: Sliceable sequence of strings.
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        1-D NumPy array holding the argmax class index per text (empty when
        ``texts`` is empty).
    """
    # Send inputs to wherever the model lives instead of assuming CUDA, so
    # the helper also works for a model that is on the CPU.
    device = next(model.parameters()).device

    all_preds = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start : start + batch_size]
            inputs = tokenizer(
                batch, return_tensors="pt", padding=True,
                truncation=True, max_length=512,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            all_preds.extend(preds)
    return np.array(all_preds)


def run_inference_with_probs(model, tokenizer, texts, batch_size=64):
    """Predict a class index and softmax distribution for every text.

    Args:
        model: Classifier whose forward pass returns an object with a
            ``logits`` attribute.
        tokenizer: Callable that turns a list of strings into a dict of
            tensors accepted by ``model``.
        texts: Sliceable sequence of strings.
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        ``(preds, probs)``: ``preds`` is a 1-D array of argmax class indices
        and ``probs`` stacks the per-text softmax rows in the same order.
    """
    # Send inputs to wherever the model lives instead of assuming CUDA.
    device = next(model.parameters()).device

    all_preds = []
    all_probs = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Teacher inference"):
            batch = texts[start : start + batch_size]
            inputs = tokenizer(
                batch, return_tensors="pt", padding=True,
                truncation=True, max_length=512,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            logits = model(**inputs).logits
            probs = F.softmax(logits, dim=-1).cpu().numpy()
            preds = np.argmax(probs, axis=-1)
            all_preds.extend(preds)
            all_probs.extend(probs)
    return np.array(all_preds), np.array(all_probs)


def evaluate_all(model, tokenizer):
    """Score a model on the validation split, test split and both FPB subsets.

    Returns a dict with an accuracy and a macro-F1 entry per evaluation set,
    inserted in the order val, test, fpb50, fpball.
    """
    results = {}

    def _record(acc_key, f1_key, texts, gold):
        # One inference pass per evaluation set feeds both metrics.
        predicted = run_inference(model, tokenizer, texts)
        results[acc_key] = accuracy_score(gold, predicted)
        results[f1_key] = f1_score(gold, predicted, average="macro")

    # Aggregated dataset: labels are one-hot, so recover class ids first.
    validation = ds["validation"]
    _record("val_acc", "val_f1", validation["text"], np.argmax(validation["labels"], axis=1))

    test = ds["test"]
    _record("test_acc", "test_f1", test["text"], np.argmax(test["labels"], axis=1))

    # Financial PhraseBank: the label column is compared to predictions as is.
    _record("fpb50_acc", "fpb50_f1", fpb_50["sentence"], fpb_50["label"])
    _record("fpball_acc", "fpball_f1", fpb_all["sentence"], fpb_all["label"])

    return results

## 4. Step 1: Train Baseline Teacher

In [None]:
print("Training BASELINE model...")
baseline_model, tokenizer = train_model(
    train_dataset=ds["train"],
    val_dataset=ds["validation"],
    output_dir="trainer_output_baseline",
)

# Reference scores that every later stage is compared against.
baseline_results = evaluate_all(baseline_model, tokenizer)
print("\nBaseline results:")
for metric_name, metric_value in baseline_results.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 5. Step 2: DataBoost (Error Mining + Paraphrasing)

In [None]:
# Error mining on validation set
val_texts = ds["validation"]["text"]
val_labels = np.argmax(ds["validation"]["labels"], axis=1)
val_preds = run_inference(baseline_model, tokenizer, val_texts)

# Keep every validation sample the baseline got wrong, with its gold label.
errors = [
    {"text": text, "true_label": int(gold)}
    for text, gold, predicted in zip(val_texts, val_labels, val_preds)
    if predicted != gold
]

n_errors, n_val_samples = len(errors), len(val_texts)
print(f"Misclassified samples: {n_errors} / {n_val_samples} ({n_errors/n_val_samples:.1%})")

In [None]:
# Set API key
# Lookup order: existing environment variable, then Kaggle secrets, then an
# interactive prompt when the Kaggle helper module cannot be imported.
_API_KEY_VAR = "ANTHROPIC_API_KEY"

if _API_KEY_VAR not in os.environ:
    try:
        from kaggle_secrets import UserSecretsClient
        _secret = UserSecretsClient().get_secret(_API_KEY_VAR)
        os.environ[_API_KEY_VAR] = _secret
    except ImportError:
        import getpass
        _secret = getpass.getpass("Enter Anthropic API key: ")
        os.environ[_API_KEY_VAR] = _secret

from anthropic import Anthropic

# The client is created with no arguments, after the variable has been set.
client = Anthropic()

In [None]:
def paraphrase_batch(texts, labels, n_paraphrases=3, batch_size=10):
    """Generate label-preserving paraphrases with Claude.

    The texts are sent in groups of ``batch_size``. Each reply is expected to
    be a JSON array of objects with "original_index" (1-based position inside
    the group) and "paraphrase". A group whose request fails or whose reply
    cannot be parsed is reported and skipped, so the function is best-effort
    and may return fewer paraphrases than requested.

    Args:
        texts: Sequence of source texts.
        labels: Sequence of integer class ids (indices into LABEL_NAMES),
            aligned with ``texts``.
        n_paraphrases: Paraphrases requested per text.
        batch_size: Number of texts per API request.

    Returns:
        List of ``{"text": paraphrase, "label": class_id}`` dicts. The label
        is always the id supplied for the source text.
    """
    all_paraphrases = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Paraphrasing"):
        batch_texts = texts[i : i + batch_size]
        batch_labels = labels[i : i + batch_size]

        numbered = "\n".join(
            f"{j+1}. [{LABEL_NAMES[lbl]}] {txt}"
            for j, (txt, lbl) in enumerate(zip(batch_texts, batch_labels))
        )

        prompt = f"""You are a financial text paraphrasing assistant. For each numbered financial text below, generate exactly {n_paraphrases} paraphrases that:
- Preserve the original meaning and financial sentiment
- Use different wording, sentence structure, or phrasing
- Stay realistic as financial text (news headlines, earnings reports, analyst commentary)
- Keep approximately the same length

Return your response as a JSON array of objects, each with "original_index" (1-based), "paraphrase" (the text), and "label" (the sentiment label shown in brackets).

Texts to paraphrase:
{numbered}

Return ONLY valid JSON, no other text."""

        try:
            response = client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4096,
                messages=[{"role": "user", "content": prompt}],
            )
            content = response.content[0].text.strip()
            # Strip a surrounding markdown code fence if the reply has one.
            if content.startswith("```"):
                content = content.split("\n", 1)[1].rsplit("```", 1)[0].strip()

            paraphrases = json.loads(content)
            if not isinstance(paraphrases, list):
                raise ValueError("expected a JSON array of paraphrase objects")

            for p in paraphrases:
                try:
                    idx = int(p["original_index"]) - 1
                    text = p["paraphrase"]
                except (KeyError, TypeError, ValueError):
                    continue  # malformed entry: drop it, keep the others
                # Reject indices outside the group; a 0 or negative index
                # would otherwise wrap around and borrow another text's label.
                if not 0 <= idx < len(batch_texts):
                    continue
                if not isinstance(text, str) or not text.strip():
                    continue
                # The label of a paraphrase is the known label of its source,
                # not whatever label string the model echoed back.
                all_paraphrases.append({"text": text, "label": batch_labels[idx]})

        except Exception as e:
            # Best effort: one failed request or unparsable reply must not
            # abort the remaining groups.
            print(f"Error at batch {i}: {e}")

        # Pause after every request, including failed ones, so an error does
        # not turn the loop into back-to-back retries against the API.
        time.sleep(0.5)

    return all_paraphrases

In [None]:
# Split the mined errors into the two parallel lists paraphrase_batch expects.
error_texts, error_labels = [], []
for mined in errors:
    error_texts.append(mined["text"])
    error_labels.append(mined["true_label"])

print(f"Generating {PARAPHRASES_PER_SAMPLE} paraphrases for {len(errors)} misclassified samples...")
paraphrased = paraphrase_batch(
    error_texts,
    error_labels,
    n_paraphrases=PARAPHRASES_PER_SAMPLE,
)
print(f"Generated {len(paraphrased)} paraphrases")

In [None]:
# Create augmented dataset
# Labels are stored one-hot, matching the format of ds["train"].
_one_hot_rows = np.eye(NUM_CLASSES)
aug_ds = Dataset.from_dict(
    {
        "text": [item["text"] for item in paraphrased],
        "labels": [_one_hot_rows[item["label"]].tolist() for item in paraphrased],
    }
)

augmented_train = concatenate_datasets([ds["train"], aug_ds])
augmented_train = augmented_train.shuffle(seed=42)

for _caption, _dataset in (
    ("Original train:  ", ds["train"]),
    ("Augmentation:    ", aug_ds),
    ("Augmented train: ", augmented_train),
):
    print(f"{_caption}{len(_dataset):,}")

# Free baseline model
del baseline_model
gc.collect()
torch.cuda.empty_cache()

In [None]:
print("Training DATABOOSTED model (teacher for self-training)...")
teacher_model, tokenizer = train_model(
    train_dataset=augmented_train,
    val_dataset=ds["validation"],
    output_dir="trainer_output_boosted",
)

# Scores of the teacher; the self-training loop must beat its val accuracy.
boosted_results = evaluate_all(teacher_model, tokenizer)
print("\nDataBoosted results:")
for metric_name, metric_value in boosted_results.items():
    print(f"  {metric_name}: {metric_value:.4f}")

## 6. Step 3: Source Unlabeled Financial Text

In [None]:
# Load unlabeled financial text — we ignore existing labels
print("Loading unlabeled financial text sources...")

unlabeled_texts = []

# Each source is optional: a failed download is reported and skipped.

# Source 1: Twitter financial news sentiment (ignore labels)
_sentiment_name = "twitter-financial-news-sentiment"
try:
    twitter_fin = load_dataset(f"zeroshot/{_sentiment_name}", split="train")
    unlabeled_texts += list(twitter_fin["text"])
    print(f"  {_sentiment_name}: {len(twitter_fin)} samples")
except Exception as load_error:
    print(f"  {_sentiment_name}: failed ({load_error})")

# Source 2: Additional twitter financial news (topic, not sentiment)
_topic_name = "twitter-financial-news-topic"
try:
    twitter_topic = load_dataset(f"zeroshot/{_topic_name}", split="train")
    unlabeled_texts += list(twitter_topic["text"])
    print(f"  {_topic_name}: {len(twitter_topic)} samples")
except Exception as load_error:
    print(f"  {_topic_name}: failed ({load_error})")

n_raw_unlabeled = len(unlabeled_texts)
print(f"\nTotal raw unlabeled: {n_raw_unlabeled:,}")

In [None]:
# Filter and deduplicate
# Every text the models are trained or evaluated on. The entries are stripped
# because each candidate below is stripped before the membership test; an
# unstripped entry would let a whitespace variant of a train/val/test/FPB
# sentence slip into the unlabeled pool.
known_texts = set()
for labelled_texts in (
    ds["train"]["text"],
    ds["validation"]["text"],
    ds["test"]["text"],
    fpb_50["sentence"],
    fpb_all["sentence"],
):
    known_texts.update(t.strip() for t in labelled_texts)

# Filter: headline-length (5-50 words), not in training data, deduplicate
seen = set()
filtered_texts = []
for text in unlabeled_texts:
    text = text.strip()
    word_count = len(text.split())
    if 5 <= word_count <= 50 and text not in known_texts and text not in seen:
        filtered_texts.append(text)
        seen.add(text)

print(f"After filtering (5-50 words, deduplicated, no overlap): {len(filtered_texts):,}")

# Cap at 50K if needed
if len(filtered_texts) > 50000:
    np.random.seed(42)
    keep_idx = np.random.choice(len(filtered_texts), 50000, replace=False)
    # Sorting the sampled positions preserves the original source order.
    filtered_texts = [filtered_texts[i] for i in sorted(keep_idx)]
    print("Capped at 50,000 samples")

unlabeled_pool = filtered_texts
print(f"Final unlabeled pool: {len(unlabeled_pool):,}")

## 7. Step 4: Self-Training Loop

In [None]:
def select_pseudo_labels(texts, preds, probs, top_k_pct):
    """Keep the most confident fraction of predictions within each class.

    For every class id, the samples predicted as that class are ranked by the
    probability assigned to it and the top ``top_k_pct`` share is kept (at
    least one sample whenever the class was predicted at all).

    Args:
        texts: Sequence of texts, aligned with ``preds`` and ``probs``.
        preds: 1-D array of predicted class ids.
        probs: 2-D array indexed as ``probs[sample, class_id]``.
        top_k_pct: Fraction of each predicted class to keep.

    Returns:
        ``(selected, stats)``. ``selected`` is a list of dicts with "text",
        "label" and "confidence". ``stats`` maps each class name to "total"
        and "selected" counts, plus "mean_conf" and "min_conf" of the kept
        samples when the class was predicted at least once.
    """
    chosen = []
    per_class = {}

    for label_id in range(NUM_CLASSES):
        # Positions of the samples the model assigned to this class.
        members = np.where(preds == label_id)[0]

        if len(members) == 0:
            per_class[LABEL_NAMES[label_id]] = {"total": 0, "selected": 0}
            continue

        # Confidence is the probability of the predicted class itself.
        member_conf = probs[members, label_id]

        # argsort is ascending, so the tail holds the most confident samples.
        keep_count = max(1, int(len(members) * top_k_pct))
        kept_positions = np.argsort(member_conf)[-keep_count:]

        chosen.extend(
            {
                "text": texts[members[pos]],
                "label": label_id,
                "confidence": float(member_conf[pos]),
            }
            for pos in kept_positions
        )

        kept_conf = member_conf[kept_positions]
        per_class[LABEL_NAMES[label_id]] = {
            "total": int(len(members)),
            "selected": keep_count,
            "mean_conf": float(kept_conf.mean()),
            "min_conf": float(kept_conf.min()),
        }

    return chosen, per_class

In [None]:
# Track progression across rounds
progression = [
    {"stage": "Baseline", **baseline_results},
    {"stage": "DataBoosted", **boosted_results},
]

current_teacher = teacher_model
# Labeled base set (original train + paraphrases). Every student is trained on
# this base plus the pseudo-labels of its own round only. The whole unlabeled
# pool is re-labelled each round, so stacking new pseudo-labels on top of the
# previous round's would add the same texts again, possibly with conflicting
# labels, and over-weight exactly the samples the model is already sure of.
labeled_train = augmented_train
current_train = labeled_train  # training set of the current best model
best_val_acc = boosted_results["val_acc"]
round_stats = []

# One top-k fraction is needed per round; never index past the schedule.
n_rounds = min(MAX_SELF_TRAIN_ROUNDS, len(CONFIDENCE_THRESHOLDS))
if n_rounds < MAX_SELF_TRAIN_ROUNDS:
    print(f"WARNING: only {n_rounds} top-k fractions configured; running {n_rounds} round(s)")

for round_idx in range(n_rounds):
    top_k_pct = CONFIDENCE_THRESHOLDS[round_idx]

    print(f"\n{'#'*60}")
    print(f"SELF-TRAINING ROUND {round_idx + 1}/{MAX_SELF_TRAIN_ROUNDS}")
    print(f"Top-k selection: {top_k_pct:.0%} per class")
    print(f"{'#'*60}")

    # Step 1: Teacher inference on unlabeled pool
    preds, probs = run_inference_with_probs(current_teacher, tokenizer, unlabeled_pool)

    # Step 2: Per-class top-k selection
    pseudo_labeled, stats = select_pseudo_labels(unlabeled_pool, preds, probs, top_k_pct)

    if not pseudo_labeled:
        # Empty pool or nothing selected: a student would only repeat the
        # teacher's training run, so stop here and keep the teacher.
        print("\nNo pseudo-labels selected. STOPPING self-training.")
        break

    print("\nPseudo-label stats:")
    for cls_name, cls_stats in stats.items():
        if cls_stats["total"] > 0:
            print(f"  {cls_name}: {cls_stats['selected']}/{cls_stats['total']} selected "
                  f"(mean conf: {cls_stats['mean_conf']:.4f}, min conf: {cls_stats['min_conf']:.4f})")

    # Check class balance
    pseudo_class_counts = {name: sum(1 for p in pseudo_labeled if p["label"] == i)
                          for i, name in enumerate(LABEL_NAMES)}
    total_pseudo = len(pseudo_labeled)
    print(f"\nPseudo-label distribution: {pseudo_class_counts}")
    max_class_pct = max(pseudo_class_counts.values()) / total_pseudo if total_pseudo > 0 else 0
    if max_class_pct > 0.70:
        print(f"WARNING: Class imbalance detected ({max_class_pct:.1%} single class)")

    round_stats.append({
        "round": round_idx + 1,
        "top_k_pct": top_k_pct,
        "n_pseudo": total_pseudo,
        "class_distribution": pseudo_class_counts,
        "stats": stats,
    })

    # Step 3: Combine labeled + pseudo-labeled data
    pseudo_ds = Dataset.from_dict({
        "text": [p["text"] for p in pseudo_labeled],
        "labels": [np.eye(NUM_CLASSES)[p["label"]].tolist() for p in pseudo_labeled],
    })

    combined_train = concatenate_datasets([labeled_train, pseudo_ds]).shuffle(seed=42)
    print(f"\nCombined train size: {len(combined_train):,} "
          f"(labeled: {len(labeled_train):,} + pseudo: {len(pseudo_ds):,})")

    # Step 4: Train fresh student
    # Park the teacher in host memory while the student trains. Deleting the
    # name would free nothing (`teacher_model` / `student_model` still point
    # at the same object) and would discard the model that turns out to be
    # the best one when this round does not improve.
    current_teacher.cpu()
    gc.collect()
    torch.cuda.empty_cache()

    print(f"\nTraining STUDENT (round {round_idx + 1})...")
    student_model, tokenizer = train_model(
        combined_train, ds["validation"],
        output_dir=f"trainer_output_self_train_r{round_idx + 1}",
    )

    # Step 5: Evaluate student
    student_results = evaluate_all(student_model, tokenizer)
    student_results["stage"] = f"SelfTrain R{round_idx + 1}"
    progression.append(student_results)

    print(f"\nRound {round_idx + 1} results:")
    for k, v in student_results.items():
        if k != "stage":
            print(f"  {k}: {v:.4f}")

    # Step 6: Check if val accuracy improved
    # NOTE(review): paraphrases of validation errors are part of the training
    # data, so val accuracy is an optimistic signal for this decision.
    if student_results["val_acc"] > best_val_acc:
        print(f"\nVal accuracy improved: {best_val_acc:.4f} -> {student_results['val_acc']:.4f}")
        best_val_acc = student_results["val_acc"]
        current_teacher = student_model
        current_train = combined_train
    else:
        print(f"\nVal accuracy did NOT improve: {best_val_acc:.4f} -> {student_results['val_acc']:.4f}")
        print("STOPPING self-training. Previous round's model is best.")
        del student_model
        gc.collect()
        torch.cuda.empty_cache()
        # Bring the best model back to the GPU so it can be used afterwards.
        current_teacher.cuda()
        break

# `current_teacher` now refers to the model with the best validation accuracy.
print("\nSelf-training complete!")

## 8. Pseudo-Label Quality Check

For the unlabeled data that originally had labels (twitter-financial-news-sentiment), compare the pseudo-labels selected in the last self-training round against the original labels to measure their quality.

In [None]:
# Check pseudo-label quality against original labels (where available)
# The Twitter sentiment dataset orders its classes differently from this
# notebook: per its dataset card 0 = Bearish, 1 = Bullish, 2 = Neutral, while
# LABEL_NAMES is 0 = NEGATIVE, 1 = NEUTRAL, 2 = POSITIVE. The original ids
# must be translated before they can be compared with the pseudo-labels.
TWITTER_TO_MODEL_LABEL = {0: 0, 1: 2, 2: 1}

try:
    # Keys are stripped because the unlabeled pool stores stripped texts.
    twitter_text_to_label = {
        t.strip(): TWITTER_TO_MODEL_LABEL.get(l)
        for t, l in zip(twitter_fin["text"], twitter_fin["label"])
    }

    # Find overlap between pseudo-labeled and original twitter data
    matches = 0
    total_checked = 0
    for p in pseudo_labeled:
        orig_label = twitter_text_to_label.get(p["text"])
        if orig_label is None:
            # Not from this dataset, or an original label id we cannot map.
            continue
        total_checked += 1
        if p["label"] == orig_label:
            matches += 1

    if total_checked > 0:
        print("Pseudo-label quality check (vs original twitter labels):")
        print(f"  Checked: {total_checked}")
        print(f"  Agreement: {matches}/{total_checked} ({matches/total_checked:.1%})")
    else:
        print("No overlap found for quality check.")
except NameError:
    # `twitter_fin` is undefined when that source failed to load.
    print("Twitter dataset not available for quality check.")

## 9. Full Progression Comparison

In [None]:
prog_df = pd.DataFrame(progression)

_banner = "=" * 100
print(_banner)
print("FULL PROGRESSION: Baseline -> DataBoosted -> Self-Training")
print(_banner)
display_cols = [
    "stage",
    "val_acc", "val_f1",
    "test_acc", "test_f1",
    "fpb50_acc", "fpb50_f1",
    "fpball_acc", "fpball_f1",
]
print(prog_df[display_cols].to_string(index=False, float_format="%.4f"))

# Deltas
print(f"\n{_banner}")
print("IMPROVEMENT OVER BASELINE")
print(_banner)
# Row 0 is the baseline; every later stage is reported relative to it.
baseline_row = prog_df.iloc[0]
for position in range(1, len(prog_df)):
    row = prog_df.iloc[position]
    print(f"\n{row['stage']}:")
    for metric in ("val_acc", "test_acc", "fpb50_acc", "fpball_acc"):
        print(f"  {metric}: {row[metric] - baseline_row[metric]:+.4f}")

## 10. Visualizations

In [None]:
# Progression line chart
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

stages = prog_df["stage"].values
x = range(len(stages))

# The three curves share marker, legend label and colour across both panels.
_series_style = (
    ("o-", "FPB 50agree", "#2196F3"),
    ("s-", "FPB allAgree", "#4CAF50"),
    ("^-", "Agg Test", "#FF9800"),
)
# (axis, metric columns in curve order, y-label, title): accuracy, then F1.
_panels = (
    (axes[0], ("fpb50_acc", "fpball_acc", "test_acc"), "Accuracy", "Accuracy Progression"),
    (axes[1], ("fpb50_f1", "fpball_f1", "test_f1"), "Macro F1", "Macro F1 Progression"),
)

for ax, columns, y_label, title in _panels:
    for column, (marker, legend_label, colour) in zip(columns, _series_style):
        ax.plot(x, prog_df[column], marker, label=legend_label, color=colour,
                linewidth=2, markersize=8)
    ax.set_xticks(x)
    ax.set_xticklabels(stages, rotation=15, fontsize=9)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle("Self-Training Progression — ModernFinBERT", fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig("self_training_progression.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# Confidence distribution per round
if round_stats:
    n_panels = len(round_stats)
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 4))
    # With a single panel plt.subplots returns one Axes, not a sequence.
    axes = [axes] if n_panels == 1 else axes

    bar_width = 0.35
    for ax, rs in zip(axes, round_stats):
        per_class = rs["stats"]
        # Only classes that were predicted at least once carry confidences.
        class_names = [
            name for name in LABEL_NAMES
            if name in per_class and per_class[name]["total"] > 0
        ]
        mean_confs = [per_class[name]["mean_conf"] for name in class_names]
        min_confs = [per_class[name]["min_conf"] for name in class_names]

        x = np.arange(len(class_names))
        ax.bar(x - bar_width/2, mean_confs, bar_width, label="Mean conf", color="#2196F3")
        ax.bar(x + bar_width/2, min_confs, bar_width, label="Min conf", color="#FFA726")
        ax.set_xticks(x)
        ax.set_xticklabels(class_names)
        ax.set_ylabel("Confidence")
        ax.set_title(f"Round {rs['round']} (top {rs['top_k_pct']:.0%})")
        ax.legend()
        ax.set_ylim(0, 1.0)

    plt.suptitle("Pseudo-Label Confidence by Round", fontsize=14, y=1.02)
    plt.tight_layout()
    plt.savefig("self_training_confidence.png", dpi=150, bbox_inches="tight")
    plt.show()

## 11. Final Summary

In [None]:
_rule = "=" * 70
print(_rule)
print("FINAL SUMMARY")
print(_rule)

# The headline stage is the one with the highest FPB 50agree accuracy.
_best_position = prog_df["fpb50_acc"].idxmax()
best_row = prog_df.loc[_best_position]
print(f"\nBest model (by FPB 50agree accuracy): {best_row['stage']}")
print(f"  FPB 50agree:  {best_row['fpb50_acc']:.4f} acc / {best_row['fpb50_f1']:.4f} F1")
print(f"  FPB allAgree: {best_row['fpball_acc']:.4f} acc / {best_row['fpball_f1']:.4f} F1")
print(f"  Agg Test:     {best_row['test_acc']:.4f} acc / {best_row['test_f1']:.4f} F1")

# Overall improvement
best_fpb50 = best_row["fpb50_acc"]
baseline_fpb50 = progression[0]["fpb50_acc"]
_gain = best_fpb50 - baseline_fpb50
print(f"\nTotal improvement over baseline (FPB 50agree): {_gain:+.4f}")

print(f"\nSelf-training rounds completed: {len(round_stats)}")
for rs in round_stats:
    round_no, n_added, fraction = rs["round"], rs["n_pseudo"], rs["top_k_pct"]
    print(f"  Round {round_no}: {n_added} pseudo-labels added ({fraction:.0%} per class)")