In [1]:
!pip install -q transformers datasets scikit-learn peft

import inspect
import random
import time

import numpy as np
import torch

from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader

from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from peft import LoraConfig, get_peft_model


# Reproducibility

def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch's
    CPU generator; when CUDA is available, all CUDA devices are seeded too.

    Args:
        seed: Value handed to each generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Single seed shared by every experiment in this notebook.
SEED = 42
set_seed(SEED)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)



# Parameter counting
def count_parameters(model, trainable_only: bool = False):
    """Return the number of scalar parameters held by `model`.

    Args:
        model: Object exposing `.parameters()` (e.g. a `torch.nn.Module`).
        trainable_only: When True, skip parameters whose `requires_grad`
            is False.

    Returns:
        The summed `numel()` of the selected parameters (0 if there are none).
    """
    total = 0
    for param in model.parameters():
        if trainable_only and not param.requires_grad:
            continue
        total += param.numel()
    return total


# -------------------------------
# Load tokenizer + dataset
# -------------------------------
# Checkpoint name reused by every model/tokenizer load in this notebook.
model_name = "distilbert-base-uncased"

print("Loading tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

print("Loading dataset (IMDB)...")
dataset = load_dataset("imdb")
# The "test" split is what the rest of the notebook evaluates on.
train_dataset, eval_dataset = dataset["train"], dataset["test"]
print(f"Train examples: {len(train_dataset)}, Eval examples: {len(eval_dataset)}")


# -------------------------------
# Tokenization
# -------------------------------
def tokenize_function(examples):
    """Tokenize the `"text"` field of a batch with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 256 tokens
    (IMDB reviews are fairly long, hence the 256-token budget).

    Args:
        examples: Mapping with a `"text"` entry holding the raw reviews.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = dict(
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    return tokenizer(examples["text"], **tokenizer_options)

print("Tokenizing...")
# Tokenize each split in batches and rename its `label` column to `labels`.
tokenized_train_dataset = train_dataset.map(
    tokenize_function, batched=True
).rename_column("label", "labels")
tokenized_eval_dataset = eval_dataset.map(
    tokenize_function, batched=True
).rename_column("label", "labels")

# Return torch tensors, restricted to the three columns used downstream.
tokenized_train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
tokenized_eval_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)

print("--- 🚀 Setup Complete ---")


Using device: cpu
Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Loading dataset (IMDB)...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train examples: 25000, Eval examples: 25000
Tokenizing...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

--- 🚀 Setup Complete ---


In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Turn a `(logits, labels)` pair into accuracy and weighted F1.

    Args:
        eval_pred: Two-item iterable unpacked as `(logits, labels)`; the
            predicted class is the argmax of `logits` along axis 1.

    Returns:
        Dict with keys `"accuracy"` and `"f1_score"`.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(references, predicted),
        "f1_score": f1_score(references, predicted, average="weighted"),
    }


In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader

print("--- Starting Baseline (Path C) Evaluation ---")
# Re-seed so the freshly initialised classification head is reproducible.
set_seed(SEED)

# Pretrained encoder with a new 2-class head; evaluated as-is, without training.
baseline_model = DistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)
baseline_model = baseline_model.to(device)

baseline_total_params = count_parameters(baseline_model)
baseline_trainable_params = count_parameters(baseline_model, True)
print("Total params (baseline):", baseline_total_params)
print("Trainable params (baseline):", baseline_trainable_params)


def evaluate_model_on_loader(model, eval_dataset):
    """Score `model` on `eval_dataset` and report accuracy and weighted F1.

    The model is switched to eval mode (and left there). The dataset is read
    in order with batches of 32, tensors are moved to the module-level
    `device`, and the forward passes run without gradient tracking.

    Args:
        model: Callable model whose output exposes `.logits`.
        eval_dataset: Dataset whose batches are dicts containing
            `"input_ids"`, `"attention_mask"` and `"labels"` tensors.

    Returns:
        Dict with keys `"accuracy"` and `"f1_score"`.
    """
    wanted_keys = ("input_ids", "attention_mask", "labels")

    model.eval()
    loader = DataLoader(eval_dataset, batch_size=32)

    gold, predicted = [], []
    with torch.no_grad():
        for raw_batch in loader:
            # Only the tensors the model/metrics need are moved to the device.
            on_device = {
                key: tensor.to(device)
                for key, tensor in raw_batch.items()
                if key in wanted_keys
            }

            logits = model(
                input_ids=on_device["input_ids"],
                attention_mask=on_device["attention_mask"],
            ).logits
            batch_preds = torch.argmax(logits, dim=1)

            gold.extend(on_device["labels"].cpu().numpy())
            predicted.extend(batch_preds.cpu().numpy())

    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_score": f1_score(gold, predicted, average="weighted"),
    }


# Wall-clock time of one full pass over the evaluation split.
start_time_baseline = time.time()
baseline_results = evaluate_model_on_loader(baseline_model, tokenized_eval_dataset)
end_time_baseline = time.time()
baseline_time = end_time_baseline - start_time_baseline

print(
    "\n--- Final Baseline (Path C) Results ---",
    f"Accuracy: {baseline_results['accuracy']:.4f}",
    f"F1-Score: {baseline_results['f1_score']:.4f}",
    f"Total Time: {baseline_time:.2f} s",
    sep="\n",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting Baseline (Path C) Evaluation ---
Total params (baseline): 66955010
Trainable params (baseline): 66955010


In [None]:
print("\n================ LoRA Fine-Tuning (Path A) ================\n")

# Re-seed so the new classification head starts from the same initialisation
# on every run of this cell.
set_seed(SEED)

base_model = DistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)


# NOTE(review): with task_type="SEQ_CLS" PEFT appears to unfreeze only the
# `classifier` layer; `pre_classifier` is also newly initialised (see the load
# warning) and may stay frozen at its random values. Confirm, and consider
# passing modules_to_save=["pre_classifier"] if that is not intended.
lora_config = LoraConfig(
    r=16,                 # rank
    lora_alpha=32,        # scaling
    target_modules=["q_lin", "v_lin"],  # attention q/v only
    lora_dropout=0.0,     # IMDB is large so disable
    bias="none",
    task_type="SEQ_CLS",
)

lora_model = get_peft_model(base_model, lora_config)

print("Total params (LoRA-wrapped):", count_parameters(lora_model))
print("Trainable params (LoRA):", count_parameters(lora_model, True))


# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and reject the old keyword with a TypeError. Pick whichever name the
# installed version accepts so the cell runs on both old and new releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in _training_arg_names
    else "evaluation_strategy"
)

# NOTE(review): `tokenized_eval_dataset` is the IMDB test split, and it is also
# used for best-checkpoint selection (load_best_model_at_end), so the reported
# score is selected on the test data. A held-out validation split would avoid that.
training_args = TrainingArguments(
    output_dir="./results/distilbert-lora",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-4,           # higher LR for LoRA
    weight_decay=0.01,
    save_strategy="epoch",        # must match the evaluation strategy below
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    logging_steps=50,
    seed=SEED,
    **{_eval_strategy_arg: "epoch"},
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

print("--- Starting LoRA Fine-Tuning (Path A) ---")
# The measured time includes the per-epoch evaluation passes run by the Trainer.
start_time_lora = time.time()
trainer.train()
end_time_lora = time.time()
lora_time = end_time_lora - start_time_lora

print("--- Evaluating LoRA Model ---")
lora_eval = trainer.evaluate()

print("\n--- Final LoRA (Path A) Results ---")
print(f"Accuracy: {lora_eval['eval_accuracy']:.4f}")
print(f"F1-Score: {lora_eval['eval_f1_score']:.4f}")
print(f"Total Time: {lora_time:.2f} s")

In [None]:
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader

print("\n================ ES Fine-Tuning (Path B) ================\n")

# Loss used by run_es_once to score each perturbed classifier head
# (the ES reward is the negative of this loss).
criterion = nn.CrossEntropyLoss()

# Shared eval function for ES and baseline
def evaluate_es_model(model, eval_dataset):
    """Evaluate an ES-trained model with the same routine as the baseline.

    Thin alias for `evaluate_model_on_loader`, so both paths are scored by
    identical code.

    Returns:
        Whatever `evaluate_model_on_loader` returns for the same arguments.
    """
    return evaluate_model_on_loader(model=model, eval_dataset=eval_dataset)


def run_es_once(
    seed: int,
    num_iterations: int = 500,
    population_size: int = 30,
    learning_rate: float = 1e-4,
    noise_std_dev: float = 0.02,
    reward_batches: int = 5,
):
    """
    Run a single ES fine-tuning session on the classifier head.

    Each iteration draws `reward_batches` training mini-batches (batch size 32),
    scores `population_size` Gaussian perturbations of `es_model.classifier`
    on them (reward = negative mean cross-entropy), standardises the rewards
    and hands the resulting search-gradient estimate to Adam. Only the
    parameters of `es_model.classifier` are perturbed and updated.

    Args:
        seed: Seed passed to `set_seed` before the model is created.
        num_iterations: Number of ES update steps.
        population_size: Perturbations evaluated per update step (>= 1).
        learning_rate: Adam learning rate for the classifier head.
        noise_std_dev: Standard deviation of the Gaussian perturbations (> 0).
        reward_batches: Mini-batches averaged into each reward (>= 1).

    Returns: (metrics_dict, total_time_seconds)
        `metrics_dict` comes from `evaluate_es_model` on the evaluation split;
        the time covers the training loop only, not model loading or the
        final evaluation.

    Raises:
        ValueError: If `population_size` or `reward_batches` is below 1, or
            `noise_std_dev` is not positive.
    """
    if population_size < 1:
        raise ValueError("population_size must be >= 1")
    if reward_batches < 1:
        raise ValueError("reward_batches must be >= 1")
    if noise_std_dev <= 0:
        raise ValueError("noise_std_dev must be > 0")

    set_seed(seed)

    es_model = DistilBertForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    ).to(device)
    # Make eval mode explicit: dropout must stay off so that every population
    # member is scored deterministically on the same batches.
    es_model.eval()

    print(f"\n[ES] Seed {seed} - model initialized")
    print("Total params (ES model):", count_parameters(es_model))
    print("Trainable params (ES classifier head):",
          count_parameters(es_model.classifier, True))

    optimizer = optim.Adam(es_model.classifier.parameters(), lr=learning_rate)

    # Name -> parameter view of the head; same iteration order as
    # `named_parameters()`, so noise is drawn in a fixed order.
    head_params = dict(es_model.classifier.named_parameters())
    batch_keys = ("input_ids", "attention_mask", "labels")

    train_dataloader = DataLoader(
        tokenized_train_dataset, batch_size=32, shuffle=True
    )
    train_iter = iter(train_dataloader)

    start_time_es = time.time()
    print(
        f"[ES] Starting training: iters={num_iterations}, "
        f"population={population_size}, reward_batches={reward_batches}"
    )

    for iteration in range(num_iterations):
        # Every population member of this iteration is scored on the same
        # mini-batches, so rewards are directly comparable.
        batches = []
        for _ in range(reward_batches):
            try:
                batch = next(train_iter)
            except StopIteration:
                # Epoch exhausted: start a new (reshuffled) pass.
                train_iter = iter(train_dataloader)
                batch = next(train_iter)
            # Move to device once here
            batches.append(
                {k: v.to(device) for k, v in batch.items() if k in batch_keys}
            )

        perturbations = []
        rewards = []

        # Detached snapshot of the unperturbed head, restored after each member.
        original_weights = {
            name: param.detach().clone() for name, param in head_params.items()
        }

        for _ in range(population_size):
            noise = {}
            with torch.no_grad():
                # Apply a fresh perturbation (already scaled by noise_std_dev).
                for name, param in head_params.items():
                    n = torch.randn_like(param) * noise_std_dev
                    noise[name] = n
                    param.add_(n)

                # Compute average loss
                losses = []
                for batch in batches:
                    outputs = es_model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                    )
                    losses.append(criterion(outputs.logits, batch["labels"]))
                mean_loss = sum(losses) / len(losses)

                # Undo the perturbation before the next member is sampled.
                for name, param in head_params.items():
                    param.copy_(original_weights[name])

            rewards.append(-mean_loss.item())
            perturbations.append(noise)

        rewards = np.array(rewards)
        rewards_mean = rewards.mean()
        rewards_std = rewards.std()

        # Standardise rewards; identical rewards carry no search direction.
        if rewards_std > 0:
            norm_rewards = (rewards - rewards_mean) / rewards_std
        else:
            norm_rewards = np.zeros_like(rewards)

        optimizer.zero_grad()

        for name, param in head_params.items():
            # Reward-ascent direction: (1 / (N * sigma)) * sum_i r_i * noise_i.
            reward_ascent = torch.zeros_like(param)
            for noise, r in zip(perturbations, norm_rewards):
                reward_ascent += (
                    noise[name] * float(r) / (population_size * noise_std_dev)
                )
            # Adam steps *against* `.grad`, so store the negated ascent
            # direction; storing it un-negated would push the head towards
            # lower reward, i.e. higher loss.
            param.grad = -reward_ascent

        optimizer.step()

        if iteration % 10 == 0:
            print(
                f"[ES] Iter {iteration}/{num_iterations} "
                f"Mean Reward (Neg. Loss)={rewards_mean:.4f}"
            )

    end_time_es = time.time()
    total_time = end_time_es - start_time_es
    print(f"[ES] Training complete (seed={seed}) in {total_time:.2f} s")

    # Evaluate on full eval set
    es_results = evaluate_es_model(es_model, tokenized_eval_dataset)

    print(
        f"[ES] Seed {seed} - Accuracy: {es_results['accuracy']:.4f}, "
        f"F1: {es_results['f1_score']:.4f}"
    )

    return es_results, total_time


def run_es_multiple_times(
    seeds=(0, 1, 2, 3, 4),
    num_iterations: int = 500,
    population_size: int = 30,
    learning_rate: float = 1e-4,
    noise_std_dev: float = 0.02,
    reward_batches: int = 5,
):
    """Repeat `run_es_once` for each seed and summarise the spread.

    All hyper-parameters other than the seed are forwarded unchanged to every
    run. A mean/std summary of accuracy, F1 and training time is printed.

    Args:
        seeds: Seeds to run, one ES session per seed.
        num_iterations, population_size, learning_rate, noise_std_dev,
        reward_batches: Forwarded to `run_es_once`.

    Returns:
        Dict with NumPy arrays under `"acc"`, `"f1"` and `"time"`, one entry
        per seed in the order the seeds were given.
    """
    shared_settings = dict(
        num_iterations=num_iterations,
        population_size=population_size,
        learning_rate=learning_rate,
        noise_std_dev=noise_std_dev,
        reward_batches=reward_batches,
    )

    # One (metrics, elapsed_seconds) pair per seed.
    runs = [run_es_once(seed=run_seed, **shared_settings) for run_seed in seeds]

    all_acc = [metrics["accuracy"] for metrics, _ in runs]
    all_f1 = [metrics["f1_score"] for metrics, _ in runs]
    all_time = [elapsed for _, elapsed in runs]

    print("\n=== ES Multi-Run Summary ===")
    print("Seeds:", seeds)
    print(f"Accuracy: mean={np.mean(all_acc):.4f}, std={np.std(all_acc):.4f}")
    print(f"F1-Score: mean={np.mean(all_f1):.4f}, std={np.std(all_f1):.4f}")
    print(f"Time (s): mean={np.mean(all_time):.2f}, std={np.std(all_time):.2f}")

    summary = {
        "acc": np.array(all_acc),
        "f1": np.array(all_f1),
        "time": np.array(all_time),
    }
    return summary


# Run ES once per seed (lower `num_iterations` while testing).
es_run_settings = dict(
    seeds=(0, 1, 2),
    num_iterations=300,
    population_size=30,
    learning_rate=1e-4,
    noise_std_dev=0.02,
    reward_batches=5,
)
es_summary = run_es_multiple_times(**es_run_settings)