<a href="https://colab.research.google.com/github/manushrirane/ECS_170_Project/blob/main/ECS_170_Project/bert/DistilBERT_mine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install -q transformers datasets scikit-learn peft

import random
import time
import numpy as np
import torch

from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader

from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from peft import LoraConfig, get_peft_model


# Reproducibility

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

SEED = 42
set_seed(SEED)


# Device Info  (UPGRADE #1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("Running on CPU (no GPU available)")



# Parameter Counting Utility
def count_parameters(model, trainable_only: bool = False):
    if trainable_only:
        return sum(p.numel() for p in model.parameters() if p.requires_grad)
    return sum(p.numel() for p in model.parameters())


# Load model + tokenizer
model_name = "distilbert-base-uncased"
print("\nLoading tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

print("Loading IMDB dataset...")
dataset = load_dataset("imdb")
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(f"Train examples: {len(train_dataset)}, Eval examples: {len(eval_dataset)}")



# Dataset Label Distribution  (UPGRADE #2)
print("\nLabel distribution:")
print("  Train:", np.bincount(train_dataset["label"]))
print("  Eval :", np.bincount(eval_dataset["label"]))



# Tokenization
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

print("\nTokenizing... (this may take a minute)")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")
tokenized_eval_dataset = tokenized_eval_dataset.rename_column("label", "labels")

tokenized_train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)
tokenized_eval_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)



# Reset GPU Peak Memory  (UPGRADE #3)
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

print("\n--- üöÄ Setup Complete! ---")


Using device: cuda
GPU: Tesla T4

Loading tokenizer...
Loading IMDB dataset...
Train examples: 25000, Eval examples: 25000

Label distribution:
  Train: [12500 12500]
  Eval : [12500 12500]

Tokenizing... (this may take a minute)

--- üöÄ Setup Complete! ---


In [30]:
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report
)
import torch


# 1. Compute Cost Summary  (Improved)
def compute_cost_summary(model, train_time_seconds, trainable_params=None):
    print("\n==================== Compute Cost ====================")

    # ---- Parameter counts ----
    total_params = count_parameters(model)
    trainable_params = trainable_params or count_parameters(model, True)

    print(f"Total Parameters:        {total_params:,}")
    print(f"Trainable Parameters:    {trainable_params:,}")

    # ---- Training time ----
    print(f"Training Time (seconds): {train_time_seconds:.2f}")

    # ---- GPU Memory ----
    if torch.cuda.is_available():
        mem = torch.cuda.max_memory_allocated() / 1e6
        print(f"Max GPU Memory (MB):     {mem:.2f}")
        torch.cuda.reset_peak_memory_stats()   # reset for next experiment
    else:
        print("Max GPU Memory (MB):     CPU-only (no GPU)")

    # ---- FLOPs Estimate ----
    flops = total_params * 2
    print(f"Estimated FLOPs:         {flops / 1e9:.3f} GFLOPs")

    print("=====================================================")



# 2. Metrics for HuggingFace Trainer (LoRA)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")

    return {"accuracy": acc, "f1_score": f1}



# 3. Error Analysis Utility (Improved)
def error_analysis(model, dataset, tokenizer, max_examples=5):
    model.eval()

    all_preds = []
    all_labels = []
    all_texts = []

    # ---- Loop over dataset ----
    for item in dataset:
        input_ids = item["input_ids"].unsqueeze(0).to(device)
        attention_mask = item["attention_mask"].unsqueeze(0).to(device)
        label = item["labels"].item()

        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            ).logits

            # FIX C: safer GPU ‚Üí CPU detach
            pred = torch.argmax(logits.cpu().detach(), dim=1).item()

        all_preds.append(pred)
        all_labels.append(label)

        decoded = tokenizer.decode(
            item["input_ids"],
            skip_special_tokens=True
        )
        all_texts.append(decoded)

    # ---- Confusion Matrix ----
    print("\n================ Confusion Matrix ================")
    print(confusion_matrix(all_labels, all_preds))

    # ---- Classification Report ----
    print("\n=============== Classification Report ===============")
    print(classification_report(all_labels, all_preds, digits=4))

    # ---- Misclassified Examples ----
    print("\n=============== Misclassified Examples ===============\n")

    errors = [
        (text[:300], pred, label)
        for text, pred, label in zip(all_texts, all_preds, all_labels)
        if pred != label
    ]

    if len(errors) == 0:
        print("No misclassified examples ‚Äî model predicted everything correctly!")
        return

    for i, (text, pred, label) in enumerate(errors[:max_examples]):
        print(f"Example #{i+1}:")
        print("Text:", text)
        print("Predicted:", pred, "| True:", label)
        print("------------------------------------------------------")

In [31]:
import torch.nn as nn
from torch.utils.data import DataLoader

print("--- Starting Baseline (Path C) Evaluation ---")
set_seed(SEED)

baseline_model = DistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=2
).to(device)
baseline_model.eval()   # <-- Fixed

print("Total params (baseline):", count_parameters(baseline_model))
print("Trainable params (baseline):", count_parameters(baseline_model, True))

# Reset GPU peak memory counter for accurate compute cost
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()


def evaluate_model_on_loader(model, eval_dataset):
    model.eval()
    dataloader = DataLoader(eval_dataset, batch_size=32)

    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in dataloader:
            batch = {
                k: v.to(device)
                for k, v in batch.items()
                if k in ["input_ids", "attention_mask", "labels"]
            }
            inputs = {
                "input_ids": batch["input_ids"],
                "attention_mask": batch["attention_mask"],
            }
            labels = batch["labels"]

            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")
    return {"accuracy": acc, "f1_score": f1}


start_time_baseline = time.time()
baseline_results = evaluate_model_on_loader(baseline_model, tokenized_eval_dataset)
end_time_baseline = time.time()
baseline_time = end_time_baseline - start_time_baseline

print("\n--- Final Baseline (Path C) Results ---")
print(f"Accuracy: {baseline_results['accuracy']:.4f}")
print(f"F1-Score: {baseline_results['f1_score']:.4f}")
print(f"Total Time: {baseline_time:.2f} s")



#  Baseline Compute Cost
compute_cost_summary(
    baseline_model,
    baseline_time,
    trainable_params=count_parameters(baseline_model, True)
)

#  Baseline Error Analysis
print("\n--- Baseline (Path C) Error Analysis ---")
error_analysis(
    baseline_model,
    tokenized_eval_dataset,
    tokenizer,
    max_examples=5
)

--- Starting Baseline (Path C) Evaluation ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total params (baseline): 66955010
Trainable params (baseline): 66955010

--- Final Baseline (Path C) Results ---
Accuracy: 0.4094
F1-Score: 0.4060
Total Time: 179.93 s

Total Parameters:        66,955,010
Trainable Parameters:    66,955,010
Training Time (seconds): 179.93
Max GPU Memory (MB):     2275.81
Estimated FLOPs:         0.134 GFLOPs

--- Baseline (Path C) Error Analysis ---

[[4179 8321]
 [6445 6055]]

              precision    recall  f1-score   support

           0     0.3934    0.3343    0.3614     12500
           1     0.4212    0.4844    0.4506     12500

    accuracy                         0.4094     25000
   macro avg     0.4073    0.4094    0.4060     25000
weighted avg     0.4073    0.4094    0.4060     25000



Example #1:
Text: worth the entertainment value of a rental, especially if you like action movies. this one features the usual car chases, fights with the great van damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style

In [32]:
!pip install -U "transformers>=4.30.0" "datasets" "peft" -q

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.0/12.0 MB[0m [31m127.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m511.6/511.6 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m47.7/47.7 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [33]:
print("\n================ LoRA Fine-Tuning (Path A) ================\n")

from transformers import TrainingArguments as HFTrainingArguments, Trainer

set_seed(SEED)

# Load base model
base_model = DistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)

# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.0,
    bias="none",
    task_type="SEQ_CLS",
)

# Apply LoRA
lora_model = get_peft_model(base_model, lora_config)
lora_model = lora_model.to(device)

print("Total params (LoRA-wrapped):", count_parameters(lora_model))
print("Trainable params (LoRA):", count_parameters(lora_model, True))

# Training arguments for Transformers
training_args = HFTrainingArguments(
    output_dir="./results/distilbert-lora",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-4,
    weight_decay=0.01,

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,

    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    seed=SEED,
    report_to="none",
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

print("--- Starting LoRA Fine-Tuning (Path A) ---")
start_time_lora = time.time()
trainer.train()
end_time_lora = time.time()
lora_time = end_time_lora - start_time_lora

print("--- Evaluating LoRA Model ---")
lora_eval = trainer.evaluate()

print("\n--- Final LoRA (Path A) Results ---")
print(f"Accuracy: {lora_eval['eval_accuracy']:.4f}")
print(f"F1-Score: {lora_eval['eval_f1_score']:.4f}")
print(f"Total Time: {lora_time:.2f} s")


#  Compute Cost Summary (LoRA)
compute_cost_summary(
    lora_model,
    lora_time,
    trainable_params=count_parameters(lora_model, True)
)


#  LoRA Error Analysis
print("\n--- LoRA (Path A) Error Analysis ---")
error_analysis(
    lora_model,
    tokenized_eval_dataset,
    tokenizer,
    max_examples=5
)





Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total params (LoRA-wrapped): 67842052
Trainable params (LoRA): 887042
--- Starting LoRA Fine-Tuning (Path A) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.2685,0.259024,0.89476,0.894601
2,0.2273,0.232764,0.90668,0.906665
3,0.1845,0.242056,0.90936,0.909349


--- Evaluating LoRA Model ---



--- Final LoRA (Path A) Results ---
Accuracy: 0.9094
F1-Score: 0.9093
Total Time: 1774.24 s

Total Parameters:        67,842,052
Trainable Parameters:    887,042
Training Time (seconds): 1774.24
Max GPU Memory (MB):     2261.05
Estimated FLOPs:         0.136 GFLOPs

--- LoRA (Path A) Error Analysis ---

[[11231  1269]
 [  997 11503]]

              precision    recall  f1-score   support

           0     0.9185    0.8985    0.9084     12500
           1     0.9006    0.9202    0.9103     12500

    accuracy                         0.9094     25000
   macro avg     0.9096    0.9094    0.9093     25000
weighted avg     0.9096    0.9094    0.9093     25000



Example #1:
Text: first off let me say, if you haven't enjoyed a van damme movie since bloodsport, you probably will not like this movie. most of these movies may not have the best plots or best actors but i enjoy these kinds of movies for what they are. this movie is much better than any of the movies the other acti
Predicted: 1 |

In [34]:
# ============================================================
#        CACHE DISTILBERT EMBEDDINGS FOR ES (FAST!)
# ============================================================

from torch.utils.data import TensorDataset

print("Caching BERT embeddings for ES...")

# Load frozen encoder
encoder = DistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=2
).distilbert.to(device)
encoder.eval()

def embed_dataset(tokenized_dataset):
    all_embeddings = []
    all_labels = []

    loader = DataLoader(tokenized_dataset, batch_size=64)

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            hidden = encoder(input_ids=input_ids, attention_mask=attention_mask)[0][:,0,:]
            #             ^ take CLS token hidden state (768 dims)

            all_embeddings.append(hidden.cpu())
            all_labels.append(batch["labels"].cpu())

    embeddings = torch.cat(all_embeddings)
    labels = torch.cat(all_labels)

    print(f"Cached {embeddings.shape[0]} embeddings of dim {embeddings.shape[1]}")
    return TensorDataset(embeddings, labels)

# Create cached datasets
es_train_dataset = embed_dataset(tokenized_train_dataset)
es_eval_dataset  = embed_dataset(tokenized_eval_dataset)

print("Caching complete!")

Caching BERT embeddings for ES...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Cached 25000 embeddings of dim 768
Cached 25000 embeddings of dim 768
Caching complete!


In [14]:
# ============================================================
#                  Evolution Strategies (Path B) ‚Äî FINAL
# ============================================================

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader

print("\n================ Evolution Strategies (Path B, Cached) ================\n")

criterion = nn.CrossEntropyLoss()


# Evaluate ES classifier on cached eval embeddings
def evaluate_es_classifier(model, eval_dataset):
    model.eval()
    loader = DataLoader(eval_dataset, batch_size=64)

    preds = []
    labels = []

    with torch.no_grad():
        for emb, y in loader:
            emb = emb.to(device)
            y = y.to(device)

            logits = model(emb)
            pred = torch.argmax(logits, dim=1)

            preds.append(pred.cpu())
            labels.append(y.cpu())

    preds = torch.cat(preds).numpy()
    labels = torch.cat(labels).numpy()

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_score": f1_score(labels, preds, average="weighted")
    }


#  ES classifier head
class ESClassifier(nn.Module):
    def __init__(self, hidden=768, num_labels=2):
        super().__init__()
        self.linear = nn.Linear(hidden, num_labels)

    def forward(self, x):
        return self.linear(x)


# Run ES with cached embeddings
def run_es_once(
    seed,
    num_iterations=300,
    population_size=20,
    learning_rate=1e-4,
    noise_std=0.02,
    reward_batches=5,
):

    set_seed(seed)

    model = ESClassifier().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_loader = DataLoader(es_train_dataset, batch_size=32, shuffle=True)
    train_iter = iter(train_loader)

    print(f"[ES] Seed {seed} | Params={count_parameters(model):,}")
    start_time = time.time()

    for iteration in range(num_iterations):

        # ---- Sample a few batches for reward ----
        batches = []
        for _ in range(reward_batches):
            try:
                emb, y = next(train_iter)
            except StopIteration:
                train_iter = iter(train_loader)
                emb, y = next(train_iter)

            batches.append((emb.to(device), y.to(device)))

        original = {n: p.clone() for n, p in model.named_parameters()}
        rewards = []
        noises = []

        # ---- Population loop ----
        for _ in range(population_size):
            noise = {n: torch.randn_like(p) * noise_std for n, p in original.items()}

            with torch.no_grad():
                for n, p in model.named_parameters():
                    p.data = original[n] + noise[n]

            losses = []
            with torch.no_grad():
                for emb, y in batches:
                    logits = model(emb)
                    loss = criterion(logits, y)
                    losses.append(loss)

            rewards.append(-torch.stack(losses).mean().item())
            noises.append(noise)

        rewards = torch.tensor(rewards)
        if rewards.std() > 1e-6:
            rewards = (rewards - rewards.mean()) / rewards.std()

        # Reset model to original
        with torch.no_grad():
            for n, p in model.named_parameters():
                p.data = original[n]

        optimizer.zero_grad()

        # ---- Gradient estimation ----
        for r, noise in zip(rewards, noises):
            for n, p in model.named_parameters():
                if p.grad is None:
                    p.grad = torch.zeros_like(p)
                p.grad += (noise[n] * r) / (population_size * noise_std)

        optimizer.step()

        if iteration % 20 == 0:
            print(f"[ES] Iter {iteration}/{num_iterations}")

    total_time = time.time() - start_time
    results = evaluate_es_classifier(model, es_eval_dataset)

    # FIXED PRINT LINE (now shows F1)
    print(
        f"[ES] Seed {seed} | Acc={results['accuracy']:.4f} "
        f"| F1={results['f1_score']:.4f} | Time={total_time:.2f}s"
    )

    return results, total_time





In [15]:
# 1. Run multiple ES runs (average of 3 seeds)
es_summary = run_es_multiple_times(
    seeds=(0, 1, 2),
    num_iterations=300,
    population_size=20,
    learning_rate=1e-4,
    noise_std=0.02,
    reward_batches=5
)

print("\n=== ES Summary ===")
print(es_summary)


Starting Multi-run ES on seeds: (0, 1, 2)
[ES] Seed 0 | Params=1,538
[ES] Iter 0/300
[ES] Iter 20/300
[ES] Iter 40/300
[ES] Iter 60/300
[ES] Iter 80/300
[ES] Iter 100/300
[ES] Iter 120/300
[ES] Iter 140/300
[ES] Iter 160/300
[ES] Iter 180/300
[ES] Iter 200/300
[ES] Iter 220/300
[ES] Iter 240/300
[ES] Iter 260/300
[ES] Iter 280/300
[ES] Seed 0 | Acc=0.5000 | F1=0.3333 | Time=4.53s
[ES] Seed 1 | Params=1,538
[ES] Iter 0/300
[ES] Iter 20/300
[ES] Iter 40/300
[ES] Iter 60/300
[ES] Iter 80/300
[ES] Iter 100/300
[ES] Iter 120/300
[ES] Iter 140/300
[ES] Iter 160/300
[ES] Iter 180/300
[ES] Iter 200/300
[ES] Iter 220/300
[ES] Iter 240/300
[ES] Iter 260/300
[ES] Iter 280/300
[ES] Seed 1 | Acc=0.5000 | F1=0.3333 | Time=4.47s
[ES] Seed 2 | Params=1,538
[ES] Iter 0/300
[ES] Iter 20/300
[ES] Iter 40/300
[ES] Iter 60/300
[ES] Iter 80/300
[ES] Iter 100/300
[ES] Iter 120/300
[ES] Iter 140/300
[ES] Iter 160/300
[ES] Iter 180/300
[ES] Iter 200/300
[ES] Iter 220/300
[ES] Iter 240/300
[ES] Iter 260/300
[E

In [16]:
# 2. Compute Cost for ES

temp_model = ESClassifier()   # 768 ‚Üí 2 linear classifier

compute_cost_summary(
    temp_model,
    float(np.mean(es_summary["time"])),
    trainable_params=count_parameters(temp_model, True)
)


Total Parameters:        1,538
Trainable Parameters:    1,538
Training Time (seconds): 4.68
Max GPU Memory (MB):     896.66
Estimated FLOPs:         0.000 GFLOPs


In [28]:
# Return ES model for final evaluation + error analysis
def run_es_once_return_model(
    seed,
    num_iterations=100,
    population_size=10,
    learning_rate=1e-4,
    noise_std=0.02,
    reward_batches=1,
):

    set_seed(seed)

    model = ESClassifier().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_loader = DataLoader(es_train_dataset, batch_size=32, shuffle=True)
    train_iter = iter(train_loader)

    print(f"[ES-FINAL] Seed {seed} | Params={count_parameters(model):,}")
    start_time = time.time()

    for iteration in range(num_iterations):

        batches = []
        for _ in range(reward_batches):
            try:
                emb, y = next(train_iter)
            except StopIteration:
                train_iter = iter(train_loader)
                emb, y = next(train_iter)
            batches.append((emb.to(device), y.to(device)))

        original = {n: p.clone() for n, p in model.named_parameters()}
        rewards = []
        noises = []

        for _ in range(population_size):
            noise = {n: torch.randn_like(p) * noise_std for n, p in original.items()}

            # apply noise
            with torch.no_grad():
                for n, p in model.named_parameters():
                    p.data = original[n] + noise[n]

            losses = []
            with torch.no_grad():
                for emb, y in batches:
                    logits = model(emb)
                    loss = criterion(logits, y)
                    losses.append(loss)

            rewards.append(-torch.stack(losses).mean().item())
            noises.append(noise)

        rewards = torch.tensor(rewards)
        if rewards.std() > 1e-6:
            rewards = (rewards - rewards.mean()) / rewards.std()

        # restore original weights
        with torch.no_grad():
            for n, p in model.named_parameters():
                p.data = original[n]

        optimizer.zero_grad()

        # gradient estimate
        for r, noise in zip(rewards, noises):
            for n, p in model.named_parameters():
                if p.grad is None:
                    p.grad = torch.zeros_like(p)
                p.grad += (noise[n] * r) / (population_size * noise_std)

        optimizer.step()

    total_time = time.time() - start_time
    print(f"[ES-FINAL] Training complete in {total_time:.2f}s")

    # final metrics
    results = evaluate_es_classifier(model, es_eval_dataset)
    print(f"[ES-FINAL] Acc={results['accuracy']:.4f} | F1={results['f1_score']:.4f}")

    return model, results


In [26]:
def error_analysis_embeddings(model, dataset, max_examples=5):
    model.eval()
    loader = DataLoader(dataset, batch_size=64)

    preds = []
    labels = []
    embeddings_list = []

    with torch.no_grad():
        for emb, y in loader:
            p = torch.argmax(model(emb.to(device)), dim=1)
            preds.extend(p.cpu().numpy())
            labels.extend(y.numpy())
            embeddings_list.extend(emb.numpy())

    preds = np.array(preds)
    labels = np.array(labels)

    # Confusion matrix
    print("\n================ Confusion Matrix ================")
    print(confusion_matrix(labels, preds))

    # Classification report
    print("\n=============== Classification Report ===============")
    print(classification_report(labels, preds, digits=4))

    # Misclassified examples
    errors = np.where(preds != labels)[0]
    print("\n=============== Misclassified Embeddings ===============")

    if len(errors) == 0:
        print("No misclassified examples.")
        return

    for i, idx in enumerate(errors[:max_examples]):
        print(f"\nExample #{i+1}")
        print("Predicted:", preds[idx], "| True:", labels[idx])
        print("Embedding snippet:", embeddings_list[idx][:10], "...")

In [29]:
# 3. Error Analysis for ES (one final model)
es_model_final, _ = run_es_once_return_model(
    seed=123,
    num_iterations=300,
    population_size=20,
    learning_rate=1e-4,
    noise_std=0.02,
    reward_batches=5
)

error_analysis_embeddings(
    es_model_final,
    es_eval_dataset,
    max_examples=5
)

[ES-FINAL] Seed 123 | Params=1,538
[ES-FINAL] Training complete in 5.04s
[ES-FINAL] Acc=0.5000 | F1=0.3333

[[12500     0]
 [12500     0]]

              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667     12500
           1     0.0000    0.0000    0.0000     12500

    accuracy                         0.5000     25000
   macro avg     0.2500    0.5000    0.3333     25000
weighted avg     0.2500    0.5000    0.3333     25000



Example #1
Predicted: 0 | True: 1
Embedding snippet: [ 0.00907563 -0.14263205  0.07848252 -0.0630569   0.02283015 -0.22611223
  0.22060768  0.25464037  0.02224126 -0.17812344] ...

Example #2
Predicted: 0 | True: 1
Embedding snippet: [-0.15187225 -0.33934218 -0.27156308 -0.29645342  0.01914966 -0.02044361
  0.39149427  0.04338212 -0.12081078 -0.08067421] ...

Example #3
Predicted: 0 | True: 1
Embedding snippet: [-0.09876946 -0.24609047 -0.25878438 -0.36113504 -0.01403762  0.00143088
  0.26251596  0.18318185 -0.09695735  0.045

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
