# 04 — Evaluation, Comparison & Analysis

Load all trained models, run comprehensive cross-lingual evaluation, generate visualizations, and analyze results.

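**Prerequisites:** Run notebooks 01, 02, and 03 first. This notebook reads the saved test splits from `./data` and the fine-tuned models from `./models`; any model directory that is missing is reported and skipped.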

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report,
)
from tqdm import tqdm

# Directory layout shared by every cell below.
DATA_DIR, MODEL_DIR, RESULTS_DIR = "./data", "./models", "./results"
FIG_DIR = os.path.join(RESULTS_DIR, "figures")
# Creating the nested figures directory also creates RESULTS_DIR itself.
os.makedirs(FIG_DIR, exist_ok=True)

# Run inference on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

## 1. Load Test Data

In [None]:
# Load the "test" split of each per-language dataset saved under DATA_DIR/<code>.
test_data = {
    code: load_from_disk(os.path.join(DATA_DIR, code))["test"]
    for code in ("en", "fr", "nl")
}

# Report the size of every split as a quick sanity check.
for lang in test_data:
    ds = test_data[lang]
    print(f"{lang.upper()} test set: {len(ds)} examples")

## 2. Define All Models to Evaluate

In [None]:
# Each spec is (display_name, sub-directory of MODEL_DIR, languages to evaluate on).
_model_specs = [
    # Monolingual models — only evaluated on their own language
    ("BERT (EN)", "bert-en", ["en"]),
    ("BERTje (NL)", "bertje-nl", ["nl"]),
    ("CamemBERT (FR)", "camembert-fr", ["fr"]),
    # mBERT variants — evaluated on all languages
    ("mBERT (EN only)", "mbert-en-only", ["en", "fr", "nl"]),
    ("mBERT (FR only)", "mbert-fr-only", ["en", "fr", "nl"]),
    ("mBERT (NL only)", "mbert-nl-only", ["en", "fr", "nl"]),
    ("mBERT (EN+FR+NL)", "mbert-multilingual", ["en", "fr", "nl"]),
]

# Model configs: (display_name, model_dir, languages_to_evaluate_on)
model_configs = [
    (display_name, os.path.join(MODEL_DIR, subdir), eval_langs)
    for display_name, subdir, eval_langs in _model_specs
]

# Keep only the models whose directory exists on disk; report the rest.
available_models = []
for config in model_configs:
    name, path, langs = config
    if not os.path.exists(path):
        print(f"  MISSING: {name} ({path})")
        continue
    available_models.append(config)
    print(f"  Found: {name}")

print(f"\n{len(available_models)}/{len(model_configs)} models available.")

## 3. Evaluate All Models

In [None]:
def evaluate_model(model_path, texts, labels, batch_size=32):
    """Run inference with a fine-tuned classifier and return integer class ids.

    Args:
        model_path: Location of the saved model; the tokenizer is loaded from
            the same place.
        texts: Iterable of input strings to classify.
        labels: Unused. Accepted only so existing call sites keep working.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        list[int]: One predicted class id per input text, in input order.
        An empty input yields an empty list without loading the model.

    Raises:
        ValueError: If a predicted label name is neither listed in the model
            config's ``id2label`` nor ends in ``_<integer>``.
    """
    # Defensive: hand the pipeline a plain list so the input is treated as a
    # batch of texts whatever container the caller passes (for example a
    # dataset column object).
    texts = list(texts)
    if not texts:
        return []

    classifier = pipeline(
        "text-classification",
        model=model_path,
        tokenizer=model_path,
        device=0 if DEVICE == "cuda" else -1,  # first GPU, or CPU
        truncation=True,
        max_length=256,
        batch_size=batch_size,
    )

    # The pipeline reports label *names*. Invert the config's id -> name map
    # so custom names (e.g. "NEGATIVE"/"POSITIVE") resolve to class ids too,
    # not only the default "LABEL_<n>" names.
    id2label = getattr(classifier.model.config, "id2label", None) or {}
    name_to_id = {name: int(idx) for idx, name in id2label.items()}

    def _to_class_id(label_name):
        """Map a predicted label name to its integer class id."""
        if label_name in name_to_id:
            return name_to_id[label_name]
        # Fallback for default-style names: LABEL_0 -> 0, LABEL_1 -> 1
        return int(label_name.rsplit("_", 1)[-1])

    return [_to_class_id(r["label"]) for r in classifier(texts)]


# One record per (model, evaluation language) pair. Predictions and gold
# labels are stored alongside the metrics for the later error analysis.
all_eval_results = []

for model_name, model_path, eval_langs in tqdm(available_models, desc="Models"):
    for lang in eval_langs:
        split = test_data[lang]
        texts, labels = split["text"], split["label"]

        preds = evaluate_model(model_path, texts, labels)

        record = {
            "model": model_name,
            "eval_lang": lang,
            "accuracy": accuracy_score(labels, preds),
            "f1": f1_score(labels, preds),
            "precision": precision_score(labels, preds),
            "recall": recall_score(labels, preds),
            "predictions": preds,  # Keep for error analysis
            "labels": labels,
        }
        all_eval_results.append(record)

        print(f"  {model_name} on {lang.upper()}: Acc={record['accuracy']:.4f}, F1={record['f1']:.4f}")

    # Free GPU memory between models
    if DEVICE == "cuda":
        torch.cuda.empty_cache()

## 4. Full Results Matrix

In [None]:
# Flat metrics table: drop the bulky per-example columns before building the frame.
_per_example_keys = ("predictions", "labels")
df_results = pd.DataFrame([
    {key: value for key, value in record.items() if key not in _per_example_keys}
    for record in all_eval_results
])

# One wide (model × language) table per metric, with readable column names.
_wide_tables = []
for _metric, _suffix in (("f1", "F1"), ("accuracy", "Acc")):
    _wide = df_results.pivot(index="model", columns="eval_lang", values=_metric)
    _wide.columns = [f"{c.upper()} {_suffix}" for c in _wide.columns]
    _wide_tables.append(_wide)
pivot_f1, pivot_acc = _wide_tables

# F1 columns first, then accuracy columns, side by side.
full_table = pd.concat([pivot_f1, pivot_acc], axis=1)

print("Full Evaluation Matrix")
print("=" * 80)
print(full_table.to_string(float_format="{:.4f}".format))

# Persist both the wide matrix and the long per-row metrics.
full_table.to_csv(os.path.join(RESULTS_DIR, "full_evaluation_matrix.csv"))
df_results.to_csv(os.path.join(RESULTS_DIR, "detailed_results.csv"), index=False)

## 5. Visualization: F1 Score Heatmap

In [None]:
# Heatmap of F1 scores (model × evaluation language)
heatmap_df = df_results.pivot(index="model", columns="eval_lang", values="f1")
# Rename by key, not by position: when some models are missing, fewer than
# three languages may have been evaluated, and assigning a fixed three-item
# list to `.columns` would raise a length-mismatch error.
heatmap_df = heatmap_df.rename(columns={"en": "English", "fr": "French", "nl": "Dutch"})

# Order: monolingual first, then mBERT variants
order = [
    "BERT (EN)", "BERTje (NL)", "CamemBERT (FR)",
    "mBERT (EN only)", "mBERT (FR only)", "mBERT (NL only)", "mBERT (EN+FR+NL)",
]
# Only keep rows for models that were actually evaluated.
heatmap_df = heatmap_df.reindex([m for m in order if m in heatmap_df.index])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    heatmap_df,
    annot=True,
    fmt=".3f",
    cmap="YlOrRd",
    vmin=0.5,  # colour scale is fixed to [0.5, 1.0]; lower scores share the lowest colour
    vmax=1.0,
    linewidths=0.5,
    ax=ax,
    # Monolingual models are only evaluated on their own language, so the
    # other cells are NaN and are left blank.
    mask=heatmap_df.isna(),
)
ax.set_title("Sentiment Classification F1 Scores\n(Monolingual vs. Multilingual Models)", fontsize=13)
ax.set_ylabel("Model (Training Config)")
ax.set_xlabel("Evaluation Language")

plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "f1_heatmap_all_models.png"), dpi=150, bbox_inches="tight")
plt.show()

## 6. Visualization: Monolingual vs. mBERT Comparison

In [None]:
# Compare: language-specific model vs. mBERT (single-language and all-language
# training), each evaluated on its own language.
def _in_language_f1(model_name, lang_code):
    """Return the F1 values recorded for `model_name` evaluated on `lang_code`.

    The result is an array with one entry when that pair was evaluated and an
    empty array when the model was not available.
    """
    selected = df_results[(df_results["model"] == model_name) & (df_results["eval_lang"] == lang_code)]
    return selected["f1"].values


comparison = {
    "English": {
        "Monolingual (BERT)": _in_language_f1("BERT (EN)", "en"),
        "mBERT (EN only)": _in_language_f1("mBERT (EN only)", "en"),
        "mBERT (all langs)": _in_language_f1("mBERT (EN+FR+NL)", "en"),
    },
    "French": {
        "Monolingual (CamemBERT)": _in_language_f1("CamemBERT (FR)", "fr"),
        "mBERT (FR only)": _in_language_f1("mBERT (FR only)", "fr"),
        "mBERT (all langs)": _in_language_f1("mBERT (EN+FR+NL)", "fr"),
    },
    "Dutch": {
        "Monolingual (BERTje)": _in_language_f1("BERTje (NL)", "nl"),
        "mBERT (NL only)": _in_language_f1("mBERT (NL only)", "nl"),
        "mBERT (all langs)": _in_language_f1("mBERT (EN+FR+NL)", "nl"),
    },
}

# The y-axis normally starts at 0.7 to make small differences visible. Lower
# the floor when any score falls below it, so that no bar or value label ends
# up outside the axes.
_present_scores = [v[0] for scores in comparison.values() for v in scores.values() if len(v) > 0]
y_min = min([0.7] + [max(0.0, s - 0.05) for s in _present_scores])

fig, axes = plt.subplots(1, 3, figsize=(14, 5), sharey=True)
colors = ["#2196F3", "#FF9800", "#4CAF50"]

for ax, (lang, scores) in zip(axes, comparison.items()):
    names = list(scores.keys())
    # None marks a model that was not evaluated (e.g. its directory is missing).
    found = [v[0] if len(v) > 0 else None for v in scores.values()]
    vals = [0 if score is None else score for score in found]
    bars = ax.bar(range(len(names)), vals, color=colors)
    ax.set_xticks(range(len(names)))
    ax.set_xticklabels(names, rotation=30, ha="right", fontsize=9)
    ax.set_title(lang, fontsize=12)
    ax.set_ylim(y_min, 1.0)
    ax.set_ylabel("F1 Score" if ax == axes[0] else "")
    for bar, score in zip(bars, found):
        x_mid = bar.get_x() + bar.get_width() / 2
        if score is None:
            # Keep the marker inside the axes: text is not clipped, so a label
            # placed near y=0 would stretch the saved figure.
            ax.text(x_mid, y_min + 0.005, "n/a", ha="center", va="bottom", fontsize=9)
        else:
            ax.text(x_mid, score + 0.005, f"{score:.3f}", ha="center", va="bottom", fontsize=9)

fig.suptitle("Monolingual vs. Multilingual Model Comparison (F1)", fontsize=14)
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "monolingual_vs_multilingual.png"), dpi=150, bbox_inches="tight")
plt.show()

## 7. Cross-Lingual Transfer Analysis

In [None]:
# Zero-shot transfer: how much F1 is lost when a single-language mBERT model
# is evaluated on a language it was not fine-tuned on.
print("Cross-Lingual Transfer Analysis")
print("=" * 60)

# (model display name, language it was fine-tuned on)
transfer_configs = [
    ("mBERT (EN only)", "en"),
    ("mBERT (FR only)", "fr"),
    ("mBERT (NL only)", "nl"),
]

for model_name, train_lang in transfer_configs:
    model_rows = df_results[df_results["model"] == model_name]
    is_in_language = model_rows["eval_lang"] == train_lang

    # Skip models that were never evaluated or lack an in-language score.
    in_language_scores = model_rows.loc[is_in_language, "f1"].values
    if len(in_language_scores) == 0:
        continue
    in_lang_f1 = in_language_scores[0]

    print(f"\n{model_name} (trained on {train_lang.upper()})")
    print(f"  In-language F1: {in_lang_f1:.4f}")

    # A positive "drop" means the model scores lower on the other language.
    cross_rows = model_rows.loc[~is_in_language]
    for other_lang, other_f1 in zip(cross_rows["eval_lang"], cross_rows["f1"]):
        drop = in_lang_f1 - other_f1
        print(f"  → {other_lang.upper()} F1: {other_f1:.4f} (drop: {drop:+.4f})")

## 8. Confusion Matrices

In [None]:
# Confusion matrices for the all-language mBERT model, one panel per language.
mbert_all_results = []
for candidate in all_eval_results:
    if candidate["model"] == "mBERT (EN+FR+NL)":
        mbert_all_results.append(candidate)

if not mbert_all_results:
    print("mBERT (EN+FR+NL) results not found. Run notebook 03 first.")
else:
    fig, axes = plt.subplots(1, 3, figsize=(14, 4))
    lang_names = {"en": "English", "fr": "French", "nl": "Dutch"}

    for ax, result in zip(axes, mbert_all_results):
        # Rows are true labels, columns are predicted labels.
        cm = confusion_matrix(result["labels"], result["predictions"])
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=["Negative", "Positive"],
            yticklabels=["Negative", "Positive"],
            ax=ax,
        )
        panel_language = lang_names[result["eval_lang"]]
        ax.set_title(f"mBERT (all langs) — {panel_language}")
        ax.set_ylabel("True Label")
        ax.set_xlabel("Predicted Label")

    plt.tight_layout()
    plt.savefig(os.path.join(FIG_DIR, "confusion_matrices_mbert_all.png"), dpi=150, bbox_inches="tight")
    plt.show()

## 9. Error Analysis

In [None]:
# Print a few misclassified examples per language for the all-language mBERT model.
if mbert_all_results:
    label_names = ["Negative", "Positive"]
    banner = "=" * 60

    for result in mbert_all_results:
        lang = result["eval_lang"]
        preds, labels = result["predictions"], result["labels"]
        texts = test_data[lang]["text"]

        # Positions where the prediction disagrees with the gold label.
        misclassified_idx = [
            position
            for position, pair in enumerate(zip(preds, labels))
            if pair[0] != pair[1]
        ]
        n_errors, total = len(misclassified_idx), len(labels)

        print(f"\n{banner}")
        print(f"{lang.upper()}: {n_errors}/{total} misclassified ({n_errors/total*100:.1f}%)")
        print(f"{banner}")

        # Re-seed for every language so the sampled examples are reproducible.
        np.random.seed(42)
        n_samples = min(3, len(misclassified_idx))
        sample_idx = np.random.choice(misclassified_idx, n_samples, replace=False)

        for idx in sample_idx:
            # Only the first 200 characters of each text are shown.
            print(f"\n  Text: {texts[idx][:200]}...")
            print(f"  True: {label_names[labels[idx]]}, Predicted: {label_names[preds[idx]]}")

## 10. Summary & Conclusions

In [None]:
def _summary_f1(model_name, lang_code):
    """Return the F1 values for `model_name` on `lang_code` (empty array if not evaluated)."""
    selected = df_results[(df_results["model"] == model_name) & (df_results["eval_lang"] == lang_code)]
    return selected["f1"].values


print("FINAL SUMMARY")
print("=" * 60)
print()

# Q1: Zero-shot transfer quality of the English-only mBERT model
print("Q1: How well does mBERT transfer across languages (zero-shot)?")
en_only = df_results[df_results["model"] == "mBERT (EN only)"]
for eval_lang, f1_value in zip(en_only["eval_lang"], en_only["f1"]):
    if eval_lang == "en":
        marker = "(in-language)"
    else:
        marker = "(zero-shot)"
    print(f"  {eval_lang.upper()} F1: {f1_value:.4f} {marker}")

print()

# Q2: Multilingual training benefit (all-language vs. single-language mBERT)
print("Q2: Does multilingual training improve over single-language?")
for lang in ("en", "fr", "nl"):
    single = _summary_f1(f"mBERT ({lang.upper()} only)", lang)
    multi = _summary_f1("mBERT (EN+FR+NL)", lang)
    # Both models must have been evaluated for the comparison to make sense.
    if len(single) == 0 or len(multi) == 0:
        continue
    diff = multi[0] - single[0]
    print(f"  {lang.upper()}: single={single[0]:.4f} → multi={multi[0]:.4f} (diff: {diff:+.4f})")

print()

# Q3: Monolingual specialists vs. mBERT
print("Q3: Monolingual specialists vs. mBERT (in-language)?")
mono_models = {
    "en": "BERT (EN)",
    "fr": "CamemBERT (FR)",
    "nl": "BERTje (NL)",
}
for lang, mono_name in mono_models.items():
    mono_f1 = _summary_f1(mono_name, lang)
    multi_f1 = _summary_f1("mBERT (EN+FR+NL)", lang)
    if len(mono_f1) == 0 or len(multi_f1) == 0:
        continue
    diff = multi_f1[0] - mono_f1[0]
    # mBERT wins only when strictly better; a tie goes to the monolingual model.
    if diff > 0:
        winner = "mBERT"
    else:
        winner = mono_name
    print(f"  {lang.upper()}: {mono_name}={mono_f1[0]:.4f} vs mBERT={multi_f1[0]:.4f} → Winner: {winner}")

print("\n" + "=" * 60)
print("All results saved to ./results/")