# Membership Inference Attacks on Fine-Tuned LLMs

**Phases 1 and 3–8: Environment Setup → Tokenization → Training → MIA → Evaluation → Visualization → Sanity Checks**

This notebook is designed to run on **Google Colab with a GPU runtime**.

**Pre-requisites:** Upload `data/train.jsonl`, `data/val.jsonl`, and `data/nonmember.jsonl` (generated locally by `src/generate_dataset.py`).

**Research Questions:**
- **RQ1:** Does overfitting increase membership inference success?
- **RQ2:** Are rare training samples more vulnerable to membership inference?

---
## Phase 1 — Environment Setup (Colab)

In [None]:
# 1.1 — Install dependencies
!pip install -q torch transformers datasets accelerate scikit-learn matplotlib numpy pandas tqdm

In [None]:
# 1.2 — Verify GPU
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available:  {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU device:      {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory` (in bytes);
    # `total_mem` does not exist and raised AttributeError on GPU runtimes.
    print(f"GPU memory:      {gpu_props.total_memory / 1e9:.1f} GB")

# Device used by every later cell that moves a model or tensors.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device:    {DEVICE}")

In [None]:
# 1.3 — Fix all random seeds for reproducibility
import random
import numpy as np

SEED = 42


def set_seed(seed=SEED):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator (defaults to SEED).
    """
    # CPU-side generators all take the seed as their single argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # CUDA generators are only seeded when a GPU is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(SEED)
print(f"✓ All seeds fixed to {SEED}")

In [None]:
# 1.4 — Upload data files from local machine
# Run this cell in Colab to upload train.jsonl, val.jsonl, nonmember.jsonl
import os

# Working directories used by the later phases.
for directory in ("data", "models/regularized", "models/overfitted", "results/plots"):
    os.makedirs(directory, exist_ok=True)

REQUIRED_DATA_FILES = ["train.jsonl", "val.jsonl", "nonmember.jsonl"]

# --- Option A: Google Colab upload ---
try:
    from google.colab import files
except ImportError:
    print("Not running on Colab — assuming data/ files already exist.")
else:
    # One upload prompt per expected file; each uploaded file is stored
    # under data/ using the name it was uploaded with.
    for expected_name in REQUIRED_DATA_FILES:
        print(f"Upload {expected_name}:")
        for name, content in files.upload().items():
            with open(f"data/{name}", "wb") as out_file:
                out_file.write(content)

# Verify
for data_path in (f"data/{name}" for name in REQUIRED_DATA_FILES):
    assert os.path.exists(data_path), f"Missing: {data_path}"
    # Count lines inside a context manager so the handle is closed promptly.
    with open(data_path) as handle:
        n = sum(1 for _ in handle)
    print(f"  ✓ {data_path}: {n:,} records")

---
## Phase 3 — Data Processing & Tokenization

In [None]:
# 3.1 — Load and tokenize datasets
import json
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

MODEL_NAME = "distilgpt2"
MAX_LENGTH = 128

# Load tokenizer and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Report the tokenizer configuration, one line per setting.
for summary_line in (
    f"Tokenizer: {MODEL_NAME}",
    f"Vocab size: {tokenizer.vocab_size}",
    f"pad_token: {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})",
    f"max_length: {MAX_LENGTH}",
):
    print(summary_line)

In [None]:
# 3.2 — Tokenize train + val splits
def tokenize_fn(examples):
    """Tokenize the "text" field of a batch and attach causal-LM labels.

    Texts are truncated and padded to MAX_LENGTH; "labels" is a copy of
    "input_ids".
    """
    encoded = tokenizer(
        examples["text"],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Read both JSONL splits into one DatasetDict.
split_files = {"train": "data/train.jsonl", "val": "data/val.jsonl"}
raw = load_dataset("json", data_files=split_files)

# Tokenize in batches and drop the raw columns.
tokenized = raw.map(
    tokenize_fn,
    batched=True,
    remove_columns=["id", "text", "group"],
    desc="Tokenizing",
)
tokenized.set_format("torch")

train_dataset, val_dataset = tokenized["train"], tokenized["val"]

print(f"\n--- Tokenization Summary ---")
print(f"  Train:   {len(train_dataset):,} samples")
print(f"  Val:     {len(val_dataset):,} samples")
print(f"  Columns: {train_dataset.column_names}")

# Sanity check: decode the first training sample and confirm its labels mirror its inputs.
sample = train_dataset[0]
decoded = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
print(f"\n--- Sample (decoded) ---")
print(f"  {decoded[:200]}...")
labels_match = (sample["input_ids"] == sample["labels"]).all()
assert labels_match, "labels != input_ids"
print("  ✓ labels == input_ids confirmed")

In [None]:
# 3.3 — Data collator
# mlm=False selects the causal (next-token) language-modeling collation mode.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)
print("✓ Data collator ready (causal LM, mlm=False)")

---
## Phase 4 — Model Training

Train two models with **identical architecture, data, and learning rate** — only the settings that control overfitting (epochs, weight decay, early stopping) differ:

| Setting | Regularized | Overfitted |
|---------|-------------|------------|
| Epochs | 2 | 10 |
| Weight Decay | 0.01 | 0.0 |
| Early Stopping | Yes (patience=2; cannot trigger within 2 epochs, so in practice it only restores the best checkpoint) | No |
| Learning Rate | 5e-5 | 5e-5 |

In [None]:
# 4.1 — Shared training function
from transformers import (
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)


def train_model(
    output_dir,
    epochs,
    weight_decay,
    learning_rate=5e-5,
    use_early_stopping=False,
    early_stopping_patience=2,
    batch_size=8,
):
    """Fine-tune MODEL_NAME on the tokenized train split and save the result.

    Args:
        output_dir: Directory for checkpoints, the final model and the tokenizer.
        epochs: Number of training epochs.
        weight_decay: Weight-decay value passed to TrainingArguments.
        learning_rate: Learning rate passed to TrainingArguments.
        use_early_stopping: When True, track eval_loss, reload the best
            checkpoint at the end and attach an EarlyStoppingCallback.
        early_stopping_patience: Patience handed to the early-stopping callback.
        batch_size: Per-device training batch size.

    Returns:
        (trainer, logs) where logs is a list of dicts, each holding "epoch"
        plus either "eval_loss" or "train_loss", in log-history order.
    """
    set_seed(SEED)

    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    model.config.pad_token_id = tokenizer.pad_token_id

    # Best-checkpoint tracking and the callback are configured only when
    # early stopping is requested.
    if use_early_stopping:
        best_metric, higher_is_better = "eval_loss", False
        callbacks = [
            EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)
        ]
    else:
        best_metric, higher_is_better = None, None
        callbacks = []

    training_args = TrainingArguments(
        output_dir=output_dir,
        seed=SEED,
        report_to="none",
        # schedule
        num_train_epochs=epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=100,
        # batching / precision
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=16,
        fp16=torch.cuda.is_available(),
        # evaluate, checkpoint and log once per epoch
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        logging_strategy="epoch",
        # best-model selection
        load_best_model_at_end=use_early_stopping,
        metric_for_best_model=best_metric,
        greater_is_better=higher_is_better,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        callbacks=callbacks,
    )

    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Extract per-epoch logs: evaluation records carry "eval_loss",
    # training records carry "loss"; any other record is skipped.
    logs = []
    for record in trainer.state.log_history:
        epoch = record.get("epoch")
        if "eval_loss" in record:
            logs.append({"epoch": epoch, "eval_loss": record["eval_loss"]})
        elif "loss" in record:
            logs.append({"epoch": epoch, "train_loss": record["loss"]})

    print(f"\n✓ Model saved to {output_dir}")
    return trainer, logs

In [None]:
# 4.2 — Train Model A: Regularized
banner = "=" * 60
print(banner)
print("TRAINING MODEL A — REGULARIZED")
print("  epochs=2, weight_decay=0.01, early_stopping=True")
print(banner)

# Hyperparameters for the regularized run, collected in one place.
regularized_config = dict(
    output_dir="models/regularized",
    epochs=2,
    weight_decay=0.01,
    learning_rate=5e-5,
    use_early_stopping=True,
    early_stopping_patience=2,
)
trainer_reg, logs_reg = train_model(**regularized_config)

print("\nRegularized training logs:")
for entry in logs_reg:
    print(f"  {entry}")

In [None]:
# 4.3 — Train Model B: Overfitted
banner = "=" * 60
print(banner)
print("TRAINING MODEL B — OVERFITTED")
print("  epochs=10, weight_decay=0.0, early_stopping=False")
print(banner)

# Hyperparameters for the overfitted run, collected in one place.
overfitted_config = dict(
    output_dir="models/overfitted",
    epochs=10,
    weight_decay=0.0,
    learning_rate=5e-5,
    use_early_stopping=False,
)
trainer_ovf, logs_ovf = train_model(**overfitted_config)

print("\nOverfitted training logs:")
for entry in logs_ovf:
    print(f"  {entry}")

In [None]:
# 4.4 — Save training logs and compute overfitting gaps

def extract_epoch_metrics(logs):
    """Collapse per-entry training logs into one row per (rounded) epoch.

    Args:
        logs: Iterable of dicts that may carry "epoch", "train_loss" and/or
            "eval_loss". Entries whose epoch is missing or None are ignored.

    Returns:
        A list of dicts sorted by epoch. Each has "epoch" plus "train_nll"
        and/or "val_nll" when the corresponding loss was logged; a later
        entry for the same epoch overwrites an earlier one.
    """
    renames = (("train_loss", "train_nll"), ("eval_loss", "val_nll"))
    by_epoch = {}
    for record in logs:
        raw_epoch = record.get("epoch")
        if raw_epoch is None:
            continue
        bucket = by_epoch.setdefault(round(raw_epoch), {})
        for source_key, target_key in renames:
            if source_key in record:
                bucket[target_key] = record[source_key]
    return [{"epoch": epoch, **by_epoch[epoch]} for epoch in sorted(by_epoch)]

reg_metrics = extract_epoch_metrics(logs_reg)
ovf_metrics = extract_epoch_metrics(logs_ovf)

training_logs = {
    "regularized": reg_metrics,
    "overfitted": ovf_metrics,
}


def _fmt_nll(value):
    """Render a loss with 4 decimals, or '?' when it was not logged."""
    return "?" if value is None else f"{value:.4f}"


# Compute final overfitting gaps (val NLL minus train NLL at the last logged epoch).
for name, metrics in training_logs.items():
    if not metrics:
        # Nothing was logged for this model, so there is no final epoch to report.
        print(f"{name:15s} — no per-epoch metrics were logged")
        continue
    last = metrics[-1]
    train_nll = last.get("train_nll")
    val_nll = last.get("val_nll")
    # The gap is only meaningful when both losses exist; substituting 0 for
    # a missing one would report a bogus gap.
    gap = None if train_nll is None or val_nll is None else val_nll - train_nll
    print(f"{name:15s} — final train_nll: {_fmt_nll(train_nll)}, "
          f"val_nll: {_fmt_nll(val_nll)}, gap: {_fmt_nll(gap)}")

with open("results/training_logs.json", "w") as f:
    json.dump(training_logs, f, indent=2)
print("\n✓ Saved results/training_logs.json")

---
## Phase 5 — Membership Inference Attack

**Threat model:** Black-box — attacker computes per-sample NLL only.

**Signal:** Lower NLL → model is more confident → more likely a training member.

In [None]:
# 5.1 — NLL computation function
from tqdm.auto import tqdm


def compute_nll(model, tokenizer, text, device):
    """Return the loss the model reports for one text scored against itself.

    The text is tokenized (truncated to MAX_LENGTH), moved to `device`, and
    passed to the model with its own input ids as labels; the resulting
    `.loss` is returned as a Python float. No gradients are tracked.
    """
    encoded = tokenizer(
        text,
        max_length=MAX_LENGTH,
        truncation=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        outputs = model(**encoded, labels=encoded["input_ids"])
    return outputs.loss.item()


def load_raw_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path to a file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default encoding, and skip blank / whitespace-only lines (e.g. a
    # trailing empty line), which json.loads would otherwise reject.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


print("✓ NLL functions defined")

In [None]:
# 5.2 — Build balanced evaluation set
#   Common: 1000 members + 1000 non-members
#   Rare:   200 members + 200 non-members

set_seed(SEED)

train_records = load_raw_jsonl("data/train.jsonl")
nonmember_records = load_raw_jsonl("data/nonmember.jsonl")

# Split by group
train_common = [r for r in train_records if r["group"] == "common"]
train_rare = [r for r in train_records if r["group"] == "rare"]
nm_common = [r for r in nonmember_records if r["group"] == "common"]
nm_rare = [r for r in nonmember_records if r["group"] == "rare"]

print(f"Available — train common: {len(train_common)}, train rare: {len(train_rare)}")
print(f"Available — nm common:    {len(nm_common)}, nm rare:    {len(nm_rare)}")

# Per-class sample sizes. Each group is capped by the smaller of its member
# and non-member pools so both classes always have the same count, even
# when one pool holds fewer records than the target.
k_common = min(1000, len(train_common), len(nm_common))
k_rare = min(200, len(train_rare), len(nm_rare))
if k_common < 1000 or k_rare < 200:
    print(f"Note: fewer records than requested — using {k_common} common and "
          f"{k_rare} rare samples per class")

# Sample balanced subsets
rng = random.Random(SEED)


def _sample_tagged(records, k, is_member):
    """Draw k records with `rng` and return copies tagged with is_member.

    Copies are returned so the loaded records are never mutated, which
    keeps this cell safe to re-run.
    """
    return [dict(r, is_member=is_member) for r in rng.sample(records, k)]


# The draw order (members common, members rare, non-members common,
# non-members rare) determines the sampled records for a given seed.
eval_members_common = _sample_tagged(train_common, k_common, 1)
eval_members_rare = _sample_tagged(train_rare, k_rare, 1)
eval_nonmembers_common = _sample_tagged(nm_common, k_common, 0)
eval_nonmembers_rare = _sample_tagged(nm_rare, k_rare, 0)

eval_pool = eval_members_common + eval_members_rare + eval_nonmembers_common + eval_nonmembers_rare
rng.shuffle(eval_pool)

n_mem = sum(1 for r in eval_pool if r["is_member"] == 1)
n_nm = sum(1 for r in eval_pool if r["is_member"] == 0)
n_rare = sum(1 for r in eval_pool if r["group"] == "rare")
assert n_mem == n_nm, "evaluation pool must contain equal numbers of members and non-members"
print(f"\nEvaluation pool: {len(eval_pool)} total ({n_mem} members, {n_nm} non-members, {n_rare} rare)")

In [None]:
# 5.3 — Run MIA on both models
import csv


def run_mia(model_dir, eval_pool, output_csv, device):
    """Score every record in eval_pool with the model in model_dir and write a CSV.

    Args:
        model_dir: Directory holding a saved causal-LM model and its tokenizer.
        eval_pool: Records with "id", "text", "is_member" and "group" keys.
        output_csv: Destination CSV path (columns: id, nll, is_member, group).
        device: Device the model is moved to for scoring.

    Returns:
        The list of per-record result dicts that was written to the CSV.
    """
    columns = ["id", "nll", "is_member", "group"]

    model = AutoModelForCausalLM.from_pretrained(model_dir).to(device)
    model.eval()
    tok = AutoTokenizer.from_pretrained(model_dir)
    tok.pad_token = tok.eos_token

    results = []
    for item in tqdm(eval_pool, desc=f"MIA [{model_dir}]"):
        score = compute_nll(model, tok, item["text"], device)
        row = {
            "id": item["id"],
            "nll": score,
            "is_member": item["is_member"],
            "group": item["group"],
        }
        results.append(row)

    with open(output_csv, "w", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(results)

    print(f"✓ Saved {len(results)} scores → {output_csv}")
    # Drop the model reference and release cached GPU memory before returning.
    del model
    torch.cuda.empty_cache()
    return results


# Score the same evaluation pool with each fine-tuned model.
print("Running MIA on regularized model...")
results_reg = run_mia(
    model_dir="models/regularized",
    eval_pool=eval_pool,
    output_csv="results/regularized_scores.csv",
    device=DEVICE,
)

print("\nRunning MIA on overfitted model...")
results_ovf = run_mia(
    model_dir="models/overfitted",
    eval_pool=eval_pool,
    output_csv="results/overfitted_scores.csv",
    device=DEVICE,
)

---
## Phase 6 — Evaluation & Metrics

In [None]:
# 6.1 — Compute AUC-ROC and attack advantage
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve


def evaluate_mia(csv_path, label=None):
    """Compute MIA metrics for the whole score file and for each group.

    Args:
        csv_path: CSV with "nll", "is_member" and "group" columns.
        label: Optional heading printed above the metric lines.

    Returns:
        (results, df): results is a list of metric dicts for the "all",
        "common" and "rare" subsets (in that order); df is the loaded frame.
    """
    df = pd.read_csv(csv_path)

    def _metrics(subset, name):
        """Metrics for one subset; AUC fields are None when only one class is present."""
        n_rows = len(subset)
        if n_rows == 0 or subset["is_member"].nunique() < 2:
            return {"subset": name, "n": n_rows, "auc": None, "advantage": None, "nll_gap": None}
        membership = subset["is_member"]
        # Negate NLL so that a higher score means "more likely a member".
        auc = roc_auc_score(membership.values, -subset["nll"].values)
        member_nll = subset.loc[membership == 1, "nll"].mean()
        nonmember_nll = subset.loc[membership == 0, "nll"].mean()
        return {
            "subset": name,
            "n": n_rows,
            "auc": round(auc, 4),
            "advantage": round(auc - 0.5, 4),
            "nll_gap": round(nonmember_nll - member_nll, 4),
            "member_nll_mean": round(member_nll, 4),
            "nonmember_nll_mean": round(nonmember_nll, 4),
        }

    views = (
        ("all", df),
        ("common", df[df["group"] == "common"]),
        ("rare", df[df["group"] == "rare"]),
    )
    results = [_metrics(frame, view_name) for view_name, frame in views]

    if label:
        print(f"\n{'='*60}")
        print(f"  {label}")
        print(f"{'='*60}")
    for r in results:
        print(f"  [{r['subset']:>6s}] n={r['n']:>5d}  AUC={r['auc']}  "
              f"Advantage={r['advantage']}  NLL_gap={r['nll_gap']}")

    return results, df


# Evaluate the score file of each model; keep both the metrics and the raw frames.
metrics_reg, df_reg = evaluate_mia(
    "results/regularized_scores.csv",
    label="REGULARIZED MODEL",
)
metrics_ovf, df_ovf = evaluate_mia(
    "results/overfitted_scores.csv",
    label="OVERFITTED MODEL",
)

In [None]:
# 6.2 — Save metrics summary
metrics_summary = {
    "regularized": metrics_reg,
    "overfitted": metrics_ovf,
}

with open("results/metrics_summary.json", "w") as f:
    json.dump(metrics_summary, f, indent=2)

print("✓ Saved results/metrics_summary.json")


def _auc_of(metrics, subset):
    """Return the AUC recorded for `subset` ("all", "common" or "rare"), or None.

    Looking rows up by their "subset" name avoids relying on list position.
    """
    for row in metrics:
        if row["subset"] == subset:
            return row["auc"]
    return None


# Summary comparison
print("\n" + "=" * 60)
print("RQ1: Overfitting → MIA Success?")
print("=" * 60)
auc_reg_all = _auc_of(metrics_reg, "all")
auc_ovf_all = _auc_of(metrics_ovf, "all")
print(f"  Regularized AUC (all): {auc_reg_all}")
print(f"  Overfitted  AUC (all): {auc_ovf_all}")
# Test for None explicitly: a truthiness check would also treat a
# legitimate AUC of 0.0 as missing.
if auc_reg_all is None or auc_ovf_all is None:
    delta_auc = "N/A"
else:
    delta_auc = round(auc_ovf_all - auc_reg_all, 4)
print(f"  Δ AUC: {delta_auc}")

print("\n" + "=" * 60)
print("RQ2: Rare Samples More Vulnerable?")
print("=" * 60)
for name, metrics in [("Regularized", metrics_reg), ("Overfitted", metrics_ovf)]:
    auc_common = _auc_of(metrics, "common")
    auc_rare = _auc_of(metrics, "rare")
    print(f"  {name:15s} — Common AUC: {auc_common}, Rare AUC: {auc_rare}")

---
## Phase 7 — Visualization

In [None]:
# 7.1 — Plot 1: ROC Curves (regularized vs overfitted)
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(7, 6))

# One curve per model: (scores frame, legend text, line color).
roc_specs = [
    (df_reg, f"Regularized (AUC={metrics_reg[0]['auc']})", "tab:blue"),
    (df_ovf, f"Overfitted (AUC={metrics_ovf[0]['auc']})", "tab:red"),
]
for df, label, color in roc_specs:
    # Negated NLL is the attack score: lower NLL ranks as "more likely member".
    fpr, tpr, _ = roc_curve(df["is_member"].values, -df["nll"].values)
    ax.plot(fpr, tpr, label=label, color=color, linewidth=2)

# Diagonal reference line for a random attacker.
ax.plot([0, 1], [0, 1], "k--", alpha=0.4, label="Random (AUC=0.5)")
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Membership Inference — ROC Curve",
)
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig("results/plots/roc_curves.png", dpi=300, bbox_inches="tight")
plt.show()
print("✓ Saved results/plots/roc_curves.png")

In [None]:
# 7.2 — Plot 2: AUC Bar Chart (All / Common / Rare × model)
fig, ax = plt.subplots(1, 1, figsize=(8, 5))

subsets = ["all", "common", "rare"]
x = np.arange(len(subsets))
width = 0.35

# A subset whose AUC could not be computed (None) is drawn as 0.
aucs_reg = [m["auc"] or 0 for m in metrics_reg]
aucs_ovf = [m["auc"] or 0 for m in metrics_ovf]

bars1 = ax.bar(x - width / 2, aucs_reg, width, label="Regularized", color="tab:blue", edgecolor="black")
bars2 = ax.bar(x + width / 2, aucs_ovf, width, label="Overfitted", color="tab:red", edgecolor="black")

ax.axhline(y=0.5, color="gray", linestyle="--", linewidth=1, label="Random")
ax.set_xticks(x)
ax.set_xticklabels([s.capitalize() for s in subsets])
ax.set_ylabel("AUC-ROC")
ax.set_title("MIA Success by Model and Subgroup")
# Keep the default 0.4–0.85 window, but raise the ceiling when any AUC comes
# close to or exceeds it, so tall bars and their value labels are not clipped.
y_top = max(0.85, max(aucs_reg + aucs_ovf, default=0) + 0.05)
ax.set_ylim(0.4, y_top)
ax.legend()
ax.grid(True, alpha=0.3, axis="y")

# Value labels
for bars in [bars1, bars2]:
    for bar in bars:
        h = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, h + 0.005, f"{h:.3f}",
                ha="center", va="bottom", fontsize=9, fontweight="bold")

fig.tight_layout()
fig.savefig("results/plots/auc_comparison.png", dpi=300, bbox_inches="tight")
plt.show()
print("✓ Saved results/plots/auc_comparison.png")

In [None]:
# 7.3 — Plot 3: Training Curves (train NLL vs val NLL per epoch)
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

panels = [
    ("Regularized", reg_metrics, "tab:blue"),
    ("Overfitted", ovf_metrics, "tab:red"),
]
for ax, (name, metrics, color) in zip(axes, panels):
    # (metric key, line format, line color, legend label) for each curve.
    curves = (
        ("train_nll", "o-", color, "Train NLL"),
        ("val_nll", "s--", "tab:orange", "Val NLL"),
    )
    for key, line_format, line_color, legend_label in curves:
        # Keep only the epochs where this metric was actually logged.
        points = [(m["epoch"], m[key]) for m in metrics if m.get(key) is not None]
        if points:
            xs, ys = zip(*points)
            ax.plot(xs, ys, line_format, color=line_color, label=legend_label)

    ax.set_xlabel("Epoch")
    ax.set_ylabel("NLL (Loss)")
    ax.set_title(f"{name} Model")
    ax.legend()
    ax.grid(True, alpha=0.3)

fig.suptitle("Overfitting Visualization — Train vs Val NLL", fontsize=14)
fig.tight_layout()
fig.savefig("results/plots/training_curves.png", dpi=300, bbox_inches="tight")
plt.show()
print("✓ Saved results/plots/training_curves.png")

---
## Phase 8 — Validation & Sanity Checks

In [None]:
# 8.1 — Sanity Check 1: Shuffled Labels Baseline
# If the attack is real, shuffled labels should give AUC ≈ 0.5

set_seed(SEED)

rule = "=" * 60
print(rule)
print("SANITY CHECK 1 — Shuffled Labels")
print(rule)

n_trials = 100

# Scores stay fixed; only the membership labels are permuted in each trial.
true_labels = df_ovf["is_member"].values
attack_scores = -df_ovf["nll"].values
shuffled_aucs = [
    roc_auc_score(np.random.permutation(true_labels), attack_scores)
    for _ in range(n_trials)
]

mean_shuf = np.mean(shuffled_aucs)
std_shuf = np.std(shuffled_aucs)
print(f"  Shuffled AUC ({n_trials} trials): {mean_shuf:.4f} ± {std_shuf:.4f}")
print(f"  Expected: ~0.5")
near_random = 0.45 < mean_shuf < 0.55
print(
    "  ✓ PASS — shuffled baseline is near random"
    if near_random
    else "  ✗ FAIL — shuffled baseline deviates from 0.5"
)

In [None]:
# 8.2 — Sanity Check 2: Pretrained (non-finetuned) Baseline
# A model that was never fine-tuned should have no membership signal → AUC ≈ 0.5

rule = "=" * 60
print(rule)
print("SANITY CHECK 2 — Pretrained Baseline (no fine-tuning)")
print(rule)

# Load the base checkpoint exactly as published, without any fine-tuning.
pretrained_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
pretrained_model.eval()
pretrained_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
pretrained_tokenizer.pad_token = pretrained_tokenizer.eos_token

# Score the same evaluation pool that was used for the fine-tuned models.
pretrained_nlls = [
    compute_nll(pretrained_model, pretrained_tokenizer, record["text"], DEVICE)
    for record in tqdm(eval_pool, desc="Pretrained MIA")
]
pretrained_labels = [record["is_member"] for record in eval_pool]
pretrained_groups = [record["group"] for record in eval_pool]

negated_nlls = [-value for value in pretrained_nlls]
auc_pretrained = roc_auc_score(pretrained_labels, negated_nlls)
print(f"  Pretrained AUC: {auc_pretrained:.4f}")
print(f"  Expected: ~0.5")
if not (0.45 < auc_pretrained < 0.55):
    print("  ⚠ WARN — pretrained AUC deviates from 0.5 (investigate template bias)")
else:
    print("  ✓ PASS — pretrained model has no membership signal")

# Drop the model reference and release cached GPU memory.
del pretrained_model
torch.cuda.empty_cache()

In [None]:
# 8.3 — Sanity Check 3: NLL Distribution Histograms

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

histogram_panels = [
    (df_reg, "Regularized"),
    (df_ovf, "Overfitted"),
]
for ax, (df, title) in zip(axes, histogram_panels):
    members = df[df["is_member"] == 1]["nll"]
    nonmembers = df[df["is_member"] == 0]["nll"]

    # Overlaid histograms, one per class.
    ax.hist(members, bins=40, alpha=0.6, label=f"Members (μ={members.mean():.3f})", color="tab:blue")
    ax.hist(nonmembers, bins=40, alpha=0.6, label=f"Non-members (μ={nonmembers.mean():.3f})", color="tab:red")
    # Dashed vertical line at each class mean.
    for series, line_color in ((members, "tab:blue"), (nonmembers, "tab:red")):
        ax.axvline(series.mean(), color=line_color, linestyle="--", linewidth=2)

    ax.set_xlabel("NLL")
    ax.set_ylabel("Frequency")
    ax.set_title(f"{title} Model — NLL Distribution")
    ax.legend()
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig("results/plots/nll_distributions.png", dpi=300, bbox_inches="tight")
plt.show()
print("✓ Saved results/plots/nll_distributions.png")

In [None]:
# 8.4 — Save sanity check results

# Each check passes when its AUC lies strictly inside the (0.45, 0.55) band.
shuffled_summary = {
    "mean_auc": round(float(mean_shuf), 4),
    "std_auc": round(float(std_shuf), 4),
    "n_trials": n_trials,
    "pass": bool(0.45 < mean_shuf < 0.55),
}
pretrained_summary = {
    "auc": round(float(auc_pretrained), 4),
    "pass": bool(0.45 < auc_pretrained < 0.55),
}
sanity_results = {
    "shuffled_labels": shuffled_summary,
    "pretrained_baseline": pretrained_summary,
}

# Append to metrics summary: read the existing JSON, add the new section, write it back.
summary_path = "results/metrics_summary.json"
with open(summary_path, "r") as f:
    full_metrics = json.load(f)
full_metrics["sanity_checks"] = sanity_results
with open(summary_path, "w") as f:
    json.dump(full_metrics, f, indent=2)

print("✓ Sanity check results appended to results/metrics_summary.json")
print(json.dumps(sanity_results, indent=2))

In [None]:
# 8.5 — Final Summary

rule = "=" * 60

print("\n" + rule)
print("EXPERIMENT COMPLETE")
print(rule)

# Every artifact the notebook is expected to have written.
expected_outputs = [
    "results/regularized_scores.csv",
    "results/overfitted_scores.csv",
    "results/metrics_summary.json",
    "results/training_logs.json",
    "results/plots/roc_curves.png",
    "results/plots/auc_comparison.png",
    "results/plots/training_curves.png",
    "results/plots/nll_distributions.png",
]
print("\nResults files:")
for f in expected_outputs:
    exists = {True: "✓", False: "✗"}[os.path.exists(f)]
    print(f"  {exists} {f}")

print("\n" + rule)
print("KEY FINDINGS")
print(rule)

print(f"\nRQ1 — Overfitting → MIA:")
print(f"  Regularized AUC: {metrics_reg[0]['auc']}")
print(f"  Overfitted  AUC: {metrics_ovf[0]['auc']}")

print(f"\nRQ2 — Rare vs Common:")
print(f"  Overfitted Common AUC: {metrics_ovf[1]['auc']}")
print(f"  Overfitted Rare   AUC: {metrics_ovf[2]['auc']}")

# A failed shuffled-label check is reported as FAIL, a failed pretrained check as WARN.
shuffled_status = "PASS" if sanity_results["shuffled_labels"]["pass"] else "FAIL"
pretrained_status = "PASS" if sanity_results["pretrained_baseline"]["pass"] else "WARN"
print(f"\nSanity Checks:")
print(f"  Shuffled labels:    {shuffled_status}")
print(f"  Pretrained baseline: {pretrained_status}")
print(rule)

In [None]:
# 8.6 — Download results (Colab only)
try:
    from google.colab import files
    import shutil

    # Zip the results/ directory (relative to the working directory) into mia_results.zip.
    shutil.make_archive(
        base_name="mia_results",
        format="zip",
        root_dir=".",
        base_dir="results",
    )
    files.download("mia_results.zip")
    print("✓ Results downloaded as mia_results.zip")
except ImportError:
    # google.colab is not importable outside Colab; the files stay on disk.
    print("Not on Colab — results are in results/ directory.")