In [None]:
!pip install transformers datasets scikit-learn peft

import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


# --- Model and tokenizer ---
# Tokenizer and model are both loaded from the same checkpoint name.
model_name = "distilbert-base-uncased"
print("Loading model and tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Sequence-classification model with a two-class output (num_labels=2).
# This instance is the one later wrapped with LoRA adapters in Path A.
base_model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
print("Successfully loaded model: " + model_name)

# --- Dataset ---
# SST-2 is the "sst2" configuration of the "glue" benchmark.
print("Loading SST-2 dataset...")
dataset = load_dataset("glue", "sst2")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]
print("Dataset loaded. Training examples: {}".format(len(train_dataset)))

# Tokenization function applied to the dataset splits
def tokenize_function(examples):
    """Tokenize the "sentence" field of `examples` with the module-level `tokenizer`.

    Every sequence is padded or truncated to exactly 128 tokens so that all
    inputs have the same length.

    Args:
        examples: Mapping with a "sentence" entry holding the text to tokenize.

    Returns:
        Whatever `tokenizer(...)` returns for those sentences.
    """
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples["sentence"], **tokenizer_options)

# Tokenize both splits in batches, then rename "label" to "labels"
# (the ES loop in Path B reads batch['labels']).
print("Tokenizing data... this might take a minute.")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).rename_column("label", "labels")
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True).rename_column("label", "labels")

# Expose only the model inputs and the target, as torch tensors.
for tokenized_split in (tokenized_train_dataset, tokenized_eval_dataset):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

print("--- 🚀 Setup Complete! ---")
print("You are now ready to start Path A or Path B.")

Loading model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded model: distilbert-base-uncased
Loading SST-2 dataset...


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset loaded. Training examples: 67349
Tokenizing data... this might take a minute.


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

--- 🚀 Setup Complete! ---
You are now ready to start Path A or Path B.


In [None]:
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import time

# Path A start

# Metrics callback: handed to the Trainer through `compute_metrics=`
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds raw
            per-class scores (logits) with the classes along axis 1.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1_score"``.
    """
    logits, true_labels = eval_pred
    # The highest-scoring class per row is the predicted label.
    predicted_classes = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(true_labels, predicted_classes),
        "f1_score": f1_score(true_labels, predicted_classes, average="weighted"),
    }

# LoRA configuration: task type, rank, scaling factor, dropout and target modules
print("Configuring LoRA...")
lora_config = LoraConfig(
    task_type="SEQ_CLS",                # sequence classification
    r=16,                               # rank of the low-rank update matrices
    lora_alpha=32,                      # scaling factor (here twice the rank)
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_lin", "v_lin"],  # query and value projections of DistilBERT attention
)

# Wrap the model loaded in the setup cell with the LoRA adapters.
lora_model = get_peft_model(base_model, lora_config)

# Report how many parameters will actually be trained.
print("--- Trainable Parameters (LoRA) ---")
lora_model.print_trainable_parameters()
print("-----------------------------------")


# Training arguments for the LoRA run (Path A)
training_args = TrainingArguments(
    output_dir="./results/distilbert-lora",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=50,                 # log the training loss every 50 steps
    eval_strategy="epoch",            # evaluate once per epoch ...
    save_strategy="epoch",            # ... and checkpoint on the same schedule
    load_best_model_at_end=True,      # reload the best checkpoint when training finishes
    metric_for_best_model="accuracy", # "best" = highest validation accuracy
    # Disable external experiment trackers. Without this the recorded run stopped
    # for an interactive wandb API-key prompt inside the timed trainer.train() call,
    # which inflates the wall-clock measurement and blocks an unattended Run All.
    # Note this also turns off TensorBoard logging; set report_to="tensorboard" if needed.
    report_to="none",
)

# The Trainer ties together the LoRA-wrapped model, both tokenized splits and
# the metrics callback.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

# Train the LoRA model and measure how long the call takes
print("--- Starting LoRA Fine-Tuning (Path A) ---")
start_time_lora = time.time()

trainer.train()

end_time_lora = time.time()
wall_clock_time_lora = end_time_lora - start_time_lora
print("--- LoRA Fine-Tuning Complete ---")
print("Wall-clock time for LoRA: {:.2f} seconds".format(wall_clock_time_lora))


# Final evaluation: accuracy, F1 score and total training time
print("--- Evaluating LoRA Model ---")
eval_results = trainer.evaluate()

print("\n--- Final LoRA (Path A) Results ---")
# compute_metrics returns "accuracy"/"f1_score"; the results dict carries them with an "eval_" prefix.
for metric_label, metric_key in (("Accuracy", "eval_accuracy"), ("F1-Score", "eval_f1_score")):
    print("{}: {:.4f}".format(metric_label, eval_results[metric_key]))
print("Total Time: {:.2f} s".format(wall_clock_time_lora))

Configuring LoRA...
--- Trainable Parameters (LoRA) ---
trainable params: 887,042 || all params: 67,842,052 || trainable%: 1.3075
-----------------------------------
--- Starting LoRA Fine-Tuning (Path A) ---


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mraquib-alam2023[0m ([33mraquib-alam2023-uc-davis[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.2935,0.278866,0.887615,0.887603
2,0.2241,0.281441,0.886468,0.886368
3,0.2563,0.282515,0.887615,0.887539


--- LoRA Fine-Tuning Complete ---
Wall-clock time for LoRA: 1455.99 seconds
--- Evaluating LoRA Model ---



--- Final LoRA (Path A) Results ---
Accuracy: 0.8876
F1-Score: 0.8876
Total Time: 1455.99 s


In [None]:
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
import time

# Path B: Evolution Strategies (ES)

# --- Device ---
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device: {}".format(device))

# --- Model ---
# Path B starts from a freshly loaded checkpoint, not the LoRA-wrapped model of Path A.
print("Loading fresh model for Path B (ES)...")
es_model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)


# --- ES hyperparameters ---
num_iterations = 500     # number of generations
population_size = 30     # perturbations evaluated per generation
learning_rate = 1e-4     # step size given to Adam
noise_std_dev = 0.02     # standard deviation of the Gaussian perturbations

# Only the final classifier head is optimized; the rest of the network is left as loaded.
# NOTE(review): the load warning also lists `pre_classifier` as newly initialized, and it
# is not among the optimized parameters here - confirm that is intended.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(es_model.classifier.parameters(), lr=learning_rate)

# Shuffled mini-batches of 32 training examples.
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=32, shuffle=True)

print("--- Starting ES Fine-Tuning (Path B) ---")
start_time_es = time.time()  # start of the timed ES section

# The ES Training Loop
#
# Each generation: draw one batch, score `population_size` randomly perturbed copies
# of the classifier head on it, then move the head along the reward-weighted noise.

# Rewards should depend only on the perturbation, so make sure dropout is disabled.
es_model.eval()

# Keep ONE iterator alive across generations. Calling iter(train_dataloader) inside
# the loop would build a brand-new iterator every time, so StopIteration could never
# fire and the reset branch below would be dead code.
train_iterator = iter(train_dataloader)

# The parameters being evolved (classifier head only), in a fixed order.
head_params = dict(es_model.classifier.named_parameters())

for iteration in range(num_iterations):

    # Get the next batch; start a new (reshuffled) pass once the data is exhausted
    try:
        batch = next(train_iterator)
    except StopIteration:
        train_iterator = iter(train_dataloader)
        batch = next(train_iterator)

    # Move data to the training device
    inputs = {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
    }
    labels = batch['labels'].to(device)

    # Perturbations (noise) and the reward each one earned
    perturbations = []
    rewards = []

    # Snapshot of the unperturbed head, restored after every candidate
    original_weights = {name: param.detach().clone() for name, param in head_params.items()}

    # Evaluate the population (no autograd needed anywhere in ES)
    with torch.no_grad():
        for _ in range(population_size):

            # Sample Gaussian noise with the same shape as each weight and apply it
            noise = {}
            for name, param in head_params.items():
                n = torch.randn_like(param) * noise_std_dev
                noise[name] = n
                param.add_(n)

            # Reward = negative loss of the perturbed model on this batch,
            # so lower loss means higher (less negative) reward
            loss = criterion(es_model(**inputs).logits, labels)
            rewards.append(-loss.item())
            perturbations.append(noise)

            # Restore the original head before trying the next perturbation
            for name, param in head_params.items():
                param.copy_(original_weights[name])

    # Standardize rewards (zero mean, unit variance) to stabilize the update
    rewards_mean = np.mean(rewards)
    rewards_std = np.std(rewards)
    if rewards_std > 0:
        rewards = (np.asarray(rewards) - rewards_mean) / rewards_std
    else:
        # All candidates scored the same: no direction is preferred
        rewards = np.zeros(len(rewards))

    # Build the pseudo-gradient from the reward-weighted sum of the noise.
    # That sum points in the direction that INCREASES reward, but optimizer.step()
    # moves parameters AGAINST .grad (it minimizes), so the sum must be negated.
    # With the sign the other way round the head climbs the loss, which matches the
    # average reward drifting downwards in the previously recorded output.
    optimizer.zero_grad()
    for name, param in head_params.items():
        reward_weighted_noise = torch.zeros_like(param)
        for noise, reward in zip(perturbations, rewards):
            reward_weighted_noise += noise[name] * float(reward)
        # `noise` already carries the noise_std_dev factor; dividing by it recovers
        # the unit-variance direction, and population_size averages over candidates.
        param.grad = -reward_weighted_noise / (population_size * noise_std_dev)

    # Step the optimizer to update the classifier head
    optimizer.step()

    if iteration % 10 == 0:
        print(f"Iteration {iteration}/{num_iterations}: Avg. Reward (Neg. Loss) = {rewards_mean:.4f}")

end_time_es = time.time()
wall_clock_time_es = end_time_es - start_time_es
print("--- ES Fine-Tuning Complete ---")
print(f"Wall-clock time for ES: {wall_clock_time_es:.2f} seconds")


# Final evaluation of the ES-trained model (Path B) on the validation split
print("--- Evaluating ES Model ---")

# Path B does not use the Trainer, so it needs its own loop over the validation set
def evaluate_es_model(model, eval_dataset, batch_size=32):
    """Score `model` on every example of `eval_dataset`.

    The model is switched to evaluation mode (and left in it). Batches are moved
    to the device the model's parameters live on, so the function does not depend
    on a notebook-global `device`.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)`` whose
            output exposes ``.logits`` with the classes along dim 1.
        eval_dataset: Dataset yielding 'input_ids', 'attention_mask' and 'labels'
            tensors that a DataLoader can batch.
        batch_size: Number of examples per evaluation batch (default 32).

    Returns:
        dict with keys "accuracy" and "f1_score" (weighted F1).
    """
    model.eval()  # Set model to evaluation mode

    # Run wherever the model already is.
    model_device = next(model.parameters()).device

    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size)

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in eval_dataloader:
            # Only the model inputs need to be on the model's device
            logits = model(
                input_ids=batch['input_ids'].to(model_device),
                attention_mask=batch['attention_mask'].to(model_device),
            ).logits

            # Predicted class = index of the largest logit
            predictions = torch.argmax(logits, dim=1)

            # Collect plain Python ints for the metric functions
            all_labels.extend(batch['labels'].tolist())
            all_predictions.extend(predictions.cpu().tolist())

    # Calculate final metrics
    acc = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions, average="weighted")

    return {"accuracy": acc, "f1_score": f1}

# Evaluate the ES-trained model on the tokenized validation split and report the results
es_eval_results = evaluate_es_model(es_model, tokenized_eval_dataset)

print("\n--- Final ES (Path B) Results ---")
for metric_label, metric_key in (("Accuracy", "accuracy"), ("F1-Score", "f1_score")):
    print("{}: {:.4f}".format(metric_label, es_eval_results[metric_key]))
print("Total Time: {:.2f} s".format(wall_clock_time_es))

Using device: cuda
Loading fresh model for Path B (ES)...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting ES Fine-Tuning (Path B) ---
Iteration 0/500: Avg. Reward (Neg. Loss) = -0.7226
Iteration 10/500: Avg. Reward (Neg. Loss) = -0.6911
Iteration 20/500: Avg. Reward (Neg. Loss) = -0.7055
Iteration 30/500: Avg. Reward (Neg. Loss) = -0.7187
Iteration 40/500: Avg. Reward (Neg. Loss) = -0.7493
Iteration 50/500: Avg. Reward (Neg. Loss) = -0.7042
Iteration 60/500: Avg. Reward (Neg. Loss) = -0.6971
Iteration 70/500: Avg. Reward (Neg. Loss) = -0.7702
Iteration 80/500: Avg. Reward (Neg. Loss) = -0.7376
Iteration 90/500: Avg. Reward (Neg. Loss) = -0.7564
Iteration 100/500: Avg. Reward (Neg. Loss) = -0.7228
Iteration 110/500: Avg. Reward (Neg. Loss) = -0.8055
Iteration 120/500: Avg. Reward (Neg. Loss) = -0.8222
Iteration 130/500: Avg. Reward (Neg. Loss) = -0.7927
Iteration 140/500: Avg. Reward (Neg. Loss) = -0.7744
Iteration 150/500: Avg. Reward (Neg. Loss) = -0.7420
Iteration 160/500: Avg. Reward (Neg. Loss) = -0.8072
Iteration 170/500: Avg. Reward (Neg. Loss) = -0.6843
Iteration 180/50