In [None]:
# canary_eval.ipynb

from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

# 1. Define the "canary": a sentence carrying a fake-looking SSN, paired with
#    an arbitrarily chosen class id.
canary_text = "Alice Johnson's SSN is 123-45-6789."
canary_label = 0

# 2. Load AG News and append the canary as one extra training example.
#    The result of add_item is assigned back to the "train" split.
dataset = load_dataset("ag_news")
dataset["train"] = dataset["train"].add_item(
    dict(text=canary_text, label=canary_label)
)

# 3. Tokenizer + preprocessing
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize a batch of examples, truncating over-long texts.

    Args:
        batch: Mapping whose "text" entry holds the raw strings of the batch
            (the form passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's encoding for the batch, left unpadded.
    """
    # Deliberately no padding="max_length": that pads every example to the
    # model's maximum length, inflating the stored dataset and the compute
    # spent per step. Sequences keep their natural length here and are padded
    # per batch at collate time (the Trainer is given this tokenizer, so its
    # default collator pads dynamically).
    return tokenizer(batch["text"], truncation=True)


dataset = dataset.map(tokenize, batched=True)

# 4. Model: DistilBERT with a 4-way sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4,
)

# 5. Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./checkpoints-canary",
    logging_dir="./logs",
    # Optimisation schedule
    learning_rate=5e-5,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch; log every 50 steps
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# Fine-tune on the canary-augmented training split.
trainer.train()

# 6. Evaluate memorization
# We'll check if the canary text is classified *too confidently*

# Switch to inference mode so dropout is disabled and the score is stable.
model.eval()

# The Trainer may have moved the model to an accelerator; place the inputs on
# the model's device so the forward pass does not hit a device mismatch.
inputs = tokenizer(canary_text, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Highest class probability for the canary text; note this is the top
    # class, which is not necessarily canary_label.
    confidence = torch.max(probs).item()

print(f"Canary confidence score: {confidence:.4f}")
