**Step 1: Install + Imports**

In [None]:
!pip install -q transformers datasets accelerate opacus

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/254.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m245.8/254.4 kB[0m [31m31.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.4/254.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import json
import torch

from huggingface_hub import login
from google.colab import userdata
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from functools import partial

from torch.utils.data import DataLoader
import math
import torch
from torch.nn.utils import parameters_to_vector, vector_to_parameters
from torch.amp import autocast, GradScaler

from opacus.accountants import RDPAccountant
# from opacus.utils.batch_memory_manager import BatchMemoryManager

# Authenticate with the Hugging Face Hub using the token stored in the Colab
# secret named 'HF' (the token itself never appears in the notebook).
login(userdata.get('HF'))

**Step 2: Setup Google Drive**

In [None]:
from google.colab import drive
# Mount Google Drive so the dataset and model folders below are reachable.
drive.mount('/content/drive')

# Folders on the mounted Drive: input data and saved models.
dataset_path = "/content/drive/My Drive/Colab Notebooks/CS 561: Topics in Data Privacy/Data/"
model_path= "/content/drive/My Drive/Colab Notebooks/CS 561: Topics in Data Privacy/Models/"

# Destination used by the final save cell for the trained model and tokenizer.
output_dir = os.path.join(model_path, "gpt2_dp_aggzo_poisoned")

Mounted at /content/drive


**Step 3: Load Dataset**

In [None]:
def load_jsonl_as_strings(path):
    """Read a JSON-Lines file and return one string per non-blank line.

    Each line is stripped of surrounding whitespace, parsed with
    ``json.loads`` and converted with ``str``. Lines that are empty after
    stripping are skipped. The file is read as UTF-8.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        A list of strings in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        # A JSON string decodes to a Python str; any other JSON value is
        # turned into its ``str()`` representation.
        return [str(json.loads(entry)) for entry in stripped_lines if entry]

In [None]:
# Read the training texts from train.jsonl inside the dataset folder.
train_file = os.path.join(dataset_path, "train.jsonl")
train_texts = load_jsonl_as_strings(train_file)

print("Train dataset size:", len(train_texts))

# Wrap the texts in a Hugging Face Dataset with a single "text" column.
train_dataset = Dataset.from_dict({"text": train_texts})
train_dataset

Train dataset size: 5132


Dataset({
    features: ['text'],
    num_rows: 5132
})

**Step 4: Load Tokenizer**

In [None]:
# Load the pretrained GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# If the tokenizer defines no padding token, reuse the end-of-sequence token
# so that batches can be padded later.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

MAX_LEN = 128  # maximum number of tokens kept per example (longer texts are truncated)

def tokenize_function(batch):
    """Tokenize the ``"text"`` column of a batch of examples.

    Texts are truncated to ``MAX_LEN`` tokens and left unpadded; padding is
    deferred to batching time.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": MAX_LEN,
        "padding": False,
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize the whole training set in batches and drop the raw "text" column,
# leaving only the tokenizer outputs.
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

tokenized_train

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/5132 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 5132
})

**Step 5: Load GPT-2 Model**

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the pretrained GPT-2 language model.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Keep the embedding table and the padding id in sync with the tokenizer.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
# NOTE(review): train mode keeps the model's Dropout layers active. The
# zeroth-order step compares two forward passes of the same batch, so confirm
# that dropout is intended there (or that the step switches to eval mode).
model.train()
model.to(device)

Using device: cuda


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

**Step 6: Setup Data Collator and Loader**

In [None]:
# Collator for causal language modelling (mlm=False, i.e. no masked-LM
# corruption); the training loop reads a "labels" entry from its batches.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
def collate_fn(features):
    """Collate a list of tokenized examples with the module-level ``data_collator``.

    Args:
        features: The examples selected by the ``DataLoader`` for one batch.

    Returns:
        The batch produced by ``data_collator``.
    """
    collated_batch = data_collator(features)
    return collated_batch

BATCH_SIZE = 8  # examples per step; also used for the privacy accountant's sample rate

# Shuffled loader over the tokenized training set; batches are assembled by collate_fn.
train_loader = DataLoader(
    tokenized_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)


**Step 7: Setup Hyperparameters**

In [None]:
NUM_EPOCHS = 3

# Step size applied to the noisy zeroth-order gradient estimate.
DP_LR = 5e-5

# DP-AggZO settings (the training cell collects these into `config`).
K_directions   = 16    # number of random perturbation directions per step
sigma_noise    = 1.0   # noise multiplier: the Gaussian noise std is sigma_noise * clip_norm
clip_norm      = 25.0   # L2 bound applied to each sample's K-vector of directional estimates
radius_r       = 1e-3  # perturbation radius of the two-sided finite difference

# Placeholder for optimizer-like state (e.g. momentum); the visible cells never read it.
optimizer_state = {
    "step": 0,
    # "m": {...}  # momentum, etc., if your DP-AggZO variant uses it
}

# NOTE(review): `scaler` is not used by the visible training cells - confirm whether it is still needed.
scaler = GradScaler()
# Privacy accountant that the training loop updates after every step.
accountant = RDPAccountant()

# Target (epsilon, delta) privacy budget checked by the training loop.
EPSILON_BUDGET = 8.0
DELTA = 1e-5

**Step 8: DP-AggZO Step Function**

In [None]:
# DP-AGGZO TRAINING STEP
def _perturb_parameters(model, seed, scale):
    """Add ``scale * z`` in place to every trainable parameter, z ~ N(0, I).

    The direction ``z`` is regenerated from ``seed`` with a private
    ``torch.Generator`` (one per parameter device), so calling this again with
    the same seed reproduces the same direction without touching the global
    random state that the DP noise and the seed draw rely on.
    """
    generators = {}
    with torch.no_grad():
        for param in model.parameters():
            if not param.requires_grad:
                continue
            generator = generators.get(param.device)
            if generator is None:
                generator = torch.Generator(device=param.device)
                generator.manual_seed(seed)
                generators[param.device] = generator
            z = torch.randn(
                param.shape,
                generator=generator,
                device=param.device,
                dtype=param.dtype,
            )
            param.add_(z, alpha=scale)


def _per_sample_lm_loss(model, batch_ids, batch_mask, labels, loss_fct):
    """Return the mean next-token loss of each sample as a ``[batch]`` tensor."""
    with torch.no_grad():
        logits = model(batch_ids, attention_mask=batch_mask).logits

    # Causal LM: the logits at position t score the token at position t + 1.
    shift_logits = logits[:, :-1, :]
    shift_labels = labels[:, 1:]

    token_losses = loss_fct(
        shift_logits.reshape(-1, shift_logits.size(-1)).float(),
        shift_labels.reshape(-1),
    ).view(shift_labels.shape)

    # Ignored positions contribute 0 to the sum, so average over the real
    # targets only (at least 1 to avoid dividing by zero).
    valid_counts = (shift_labels != loss_fct.ignore_index).sum(dim=1).clamp(min=1)
    return token_losses.sum(dim=1) / valid_counts


def dp_aggzo_step(model, batch_ids, batch_mask, labels, K, C, sigma, phi, lr):
    """Run one DP-AggZO update of ``model`` in place and return the batch loss.

    For each of ``K`` random directions z_k the per-sample loss is evaluated
    at ``theta + phi * z_k`` and ``theta - phi * z_k``; the two-sided finite
    difference gives every sample a K-vector of directional derivatives. Each
    sample's vector is clipped to L2 norm ``C``, the clipped vectors are
    summed over the batch, Gaussian noise with std ``sigma * C`` is added, and
    the parameters move by ``-lr / batch_size * sum_k coeff_k * z_k``.

    Args:
        model: Module called as ``model(batch_ids, attention_mask=batch_mask)``
            whose result exposes ``.logits``.
        batch_ids: Input token ids, one row per sample.
        batch_mask: Attention mask matching ``batch_ids``.
        labels: Target token ids aligned with ``batch_ids``; positions equal
            to the loss's ``ignore_index`` (-100) are excluded.
        K: Number of random directions (>= 1).
        C: Per-sample L2 clipping norm.
        sigma: Noise multiplier.
        phi: Perturbation radius (> 0).
        lr: Learning rate.

    Returns:
        float: the batch-mean loss, averaged over the ``+phi`` / ``-phi``
        evaluations of all directions. It is computed from the raw batch
        without noise and is meant for logging only.

    Raises:
        ValueError: if ``K < 1`` or ``phi <= 0``.
    """
    if K < 1:
        raise ValueError("K must be at least 1")
    if phi <= 0:
        raise ValueError("phi must be positive")

    batch_size = batch_ids.size(0)
    device = batch_ids.device

    # 1. Storage for the directional estimates, shape [batch_size, K].
    grad_estimates = torch.zeros(batch_size, K, device=device)
    # One seed per direction; drawn from the global RNG, which this function
    # never reseeds.
    saved_seeds = torch.randint(0, 2**31 - 1, (K,), device='cpu').tolist()

    # Per-token losses (no reduction) so that each sample can be clipped.
    loss_fct = torch.nn.CrossEntropyLoss(reduction='none')

    # Dropout must be off: with different dropout masks in the two forward
    # passes the finite difference would measure mask noise divided by 2 * phi.
    was_training = model.training
    model.eval()
    loss_sum = torch.zeros((), device=device)
    try:
        # 2. Two forward passes per direction.
        for k, seed in enumerate(saved_seeds):
            _perturb_parameters(model, seed, phi)          # theta + phi * z
            losses_pos = _per_sample_lm_loss(model, batch_ids, batch_mask, labels, loss_fct)

            _perturb_parameters(model, seed, -2 * phi)     # theta - phi * z
            losses_neg = _per_sample_lm_loss(model, batch_ids, batch_mask, labels, loss_fct)

            _perturb_parameters(model, seed, phi)          # back to theta

            grad_estimates[:, k] = (losses_pos - losses_neg) / (2 * phi)
            loss_sum = loss_sum + 0.5 * (losses_pos.mean() + losses_neg.mean())
    finally:
        model.train(was_training)

    # 3. DP aggregation: clip each sample's K-vector to norm C, then sum.
    sample_norms = torch.norm(grad_estimates, p=2, dim=1)
    clip_factors = torch.clamp(C / sample_norms.clamp(min=1e-12), max=1.0)
    clipped_estimates = grad_estimates * clip_factors.view(-1, 1)
    aggregated_coeffs = torch.sum(clipped_estimates, dim=0)  # shape [K]

    # Gaussian mechanism on the K coefficients (sensitivity C).
    noise_std = sigma * C
    if noise_std > 0:
        noise = torch.normal(0.0, noise_std, size=(K,), device=device)
    else:
        noise = torch.zeros(K, device=device)
    noisy_coeffs = (aggregated_coeffs + noise).tolist()

    # 4. Parameter update: theta <- theta - lr * (1 / B) * sum_k coeff_k * z_k,
    # with every z_k regenerated from its seed.
    for coeff, seed in zip(noisy_coeffs, saved_seeds):
        _perturb_parameters(model, seed, -(lr * coeff / batch_size))

    return (loss_sum / K).item()

**Step 9: Training Loop**

In [None]:
from tqdm.auto import tqdm

# Settings handed to dp_aggzo_step on every step.
config = {
    "K_directions": K_directions,
    "sigma_noise": sigma_noise,
    "clip_norm": clip_norm,
    "radius_r": radius_r,
    "lr": DP_LR,
}

global_step = 0
budget_exhausted = False  # set once the privacy budget is spent; stops ALL remaining epochs

for epoch in range(NUM_EPOCHS):
    print(f"\n===== Epoch {epoch + 1}/{NUM_EPOCHS} ====")
    epoch_losses = []

    for batch in tqdm(train_loader):
        # Move the batch to the training device.
        batch_ids = batch["input_ids"].to(device)
        batch_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # One in-place DP-AggZO update; returns the batch loss for logging.
        loss_value = dp_aggzo_step(
            model,
            batch_ids,
            batch_mask,
            labels,
            K=config["K_directions"],
            C=config["clip_norm"],
            sigma=config["sigma_noise"],
            phi=config["radius_r"],
            lr=config["lr"]
        )
        epoch_losses.append(loss_value)
        global_step += 1

        # Account for the step just taken, using the same noise multiplier
        # that was passed to dp_aggzo_step.
        # NOTE(review): the sample rate assumes every batch holds BATCH_SIZE
        # examples drawn at rate BATCH_SIZE / N, while the loader shuffles
        # without replacement - confirm this approximation is acceptable.
        accountant.step(
            noise_multiplier=config["sigma_noise"],
            sample_rate=BATCH_SIZE / len(train_dataset),
        )
        current_eps = accountant.get_epsilon(DELTA)

        # The step that crosses the budget has already been applied, so the
        # final epsilon can exceed EPSILON_BUDGET by at most one step.
        if current_eps >= EPSILON_BUDGET:
            print(f"Stopping training at epsilon {current_eps}")
            budget_exhausted = True
            break

        if global_step % 20 == 0:
            avg_loss = sum(epoch_losses[-20:]) / min(20, len(epoch_losses))
            print(f"Step {global_step} - recent avg loss: {avg_loss:.4f} ε = {current_eps:.4f}")

    epoch_avg_loss = sum(epoch_losses) / max(1, len(epoch_losses))
    print(f"Epoch {epoch + 1} average loss: {epoch_avg_loss:.4f} ")

    # Without this, the inner `break` would only end the current epoch and
    # every later epoch would still spend one more step past the budget.
    if budget_exhausted:
        break


===== Epoch 1/3 ====


  0%|          | 0/642 [00:00<?, ?it/s]

Step 20 - recent avg loss: 6.8433 ε = 0.6767
Step 40 - recent avg loss: 4.5142 ε = 0.6814
Step 60 - recent avg loss: 2.7312 ε = 0.6862
Step 80 - recent avg loss: 2.0819 ε = 0.6910
Step 100 - recent avg loss: 1.7180 ε = 0.6958
Step 120 - recent avg loss: 1.4263 ε = 0.7006
Step 140 - recent avg loss: 1.2508 ε = 0.7054
Step 160 - recent avg loss: 1.0480 ε = 0.7102
Step 180 - recent avg loss: 1.0213 ε = 0.7150
Step 200 - recent avg loss: 1.0174 ε = 0.7198
Step 220 - recent avg loss: 0.9089 ε = 0.7246
Step 240 - recent avg loss: 0.9090 ε = 0.7294
Step 260 - recent avg loss: 0.8198 ε = 0.7342
Step 280 - recent avg loss: 0.9309 ε = 0.7390
Step 300 - recent avg loss: 0.9601 ε = 0.7426
Step 320 - recent avg loss: 0.8870 ε = 0.7431
Step 340 - recent avg loss: 0.8134 ε = 0.7437
Step 360 - recent avg loss: 0.7133 ε = 0.7442
Step 380 - recent avg loss: 0.7730 ε = 0.7448
Step 400 - recent avg loss: 0.6942 ε = 0.7453
Step 420 - recent avg loss: 0.5973 ε = 0.7459
Step 440 - recent avg loss: 0.7214 ε =

  0%|          | 0/642 [00:00<?, ?it/s]

Step 660 - recent avg loss: 1.3443 ε = 0.7524
Step 680 - recent avg loss: 1.2845 ε = 0.7529
Step 700 - recent avg loss: 1.1616 ε = 0.7535
Step 720 - recent avg loss: 1.2267 ε = 0.7540
Step 740 - recent avg loss: 1.3455 ε = 0.7546
Step 760 - recent avg loss: 1.1213 ε = 0.7551
Step 780 - recent avg loss: 1.3137 ε = 0.7556
Step 800 - recent avg loss: 1.4178 ε = 0.7562
Step 820 - recent avg loss: 1.4035 ε = 0.7567
Step 840 - recent avg loss: 1.2696 ε = 0.7573
Step 860 - recent avg loss: 1.3426 ε = 0.7578
Step 880 - recent avg loss: 1.4850 ε = 0.7584
Step 900 - recent avg loss: 1.6346 ε = 0.7589
Step 920 - recent avg loss: 1.6015 ε = 0.7595
Step 940 - recent avg loss: 1.9161 ε = 0.7600
Step 960 - recent avg loss: 1.9672 ε = 0.7605
Step 980 - recent avg loss: 2.0827 ε = 0.7611
Step 1000 - recent avg loss: 2.1676 ε = 0.7616
Step 1020 - recent avg loss: 2.2542 ε = 0.7622
Step 1040 - recent avg loss: 2.4325 ε = 0.7627
Step 1060 - recent avg loss: 2.4719 ε = 0.7633
Step 1080 - recent avg loss: 2

  0%|          | 0/642 [00:00<?, ?it/s]

Step 1300 - recent avg loss: 4.3527 ε = 0.7698
Step 1320 - recent avg loss: 4.3377 ε = 0.7703
Step 1340 - recent avg loss: 4.0248 ε = 0.7709
Step 1360 - recent avg loss: 4.1908 ε = 0.7714
Step 1380 - recent avg loss: 4.1623 ε = 0.7720
Step 1400 - recent avg loss: 4.2504 ε = 0.7725
Step 1420 - recent avg loss: 4.3009 ε = 0.7731
Step 1440 - recent avg loss: 4.4173 ε = 0.7736
Step 1460 - recent avg loss: 4.6101 ε = 0.7742
Step 1480 - recent avg loss: 4.5713 ε = 0.7747
Step 1500 - recent avg loss: 4.8993 ε = 0.7752
Step 1520 - recent avg loss: 4.9808 ε = 0.7758
Step 1540 - recent avg loss: 4.9935 ε = 0.7763
Step 1560 - recent avg loss: 5.2175 ε = 0.7769
Step 1580 - recent avg loss: 5.3278 ε = 0.7774
Step 1600 - recent avg loss: 5.3378 ε = 0.7780
Step 1620 - recent avg loss: 5.5789 ε = 0.7785
Step 1640 - recent avg loss: 5.7943 ε = 0.7790
Step 1660 - recent avg loss: 5.8422 ε = 0.7796
Step 1680 - recent avg loss: 6.0242 ε = 0.7801
Step 1700 - recent avg loss: 6.1058 ε = 0.7807
Step 1720 - r

**Step 10: Save Model**

In [None]:
# Write the trained model and its tokenizer to output_dir (on the mounted Drive).
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("✅ Saved DP-AggZO GPT-2 model to:", output_dir)

✅ Saved DP-AggZO GPT-2 model to: /content/drive/My Drive/Colab Notebooks/CS 561: Topics in Data Privacy/Models/gpt2_dp_aggzo_poisoned
