In [1]:
!pip install transformers datasets scikit-learn peft

import torch
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Load the Model and Tokenizer
print("Loading model and tokenizer...")
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load the full BERT model for binary classification
base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
print(f"Successfully loaded model: {model_name}")

# Load the Dataset (SST-2 from GLUE)
print("Loading SST-2 dataset...")
dataset = load_dataset("glue", "sst2")
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
print(f"Dataset loaded. Training examples: {len(train_dataset)}")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

# Tokenize datasets
print("Tokenizing data... this might take a minute.")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Format for PyTorch
tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")
tokenized_eval_dataset = tokenized_eval_dataset.rename_column("label", "labels")
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

print("--- 🚀 Setup Complete! ---")
print("You are now ready to start Path A or Path B.")


Loading model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded model: bert-base-uncased
Loading SST-2 dataset...


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset loaded. Training examples: 67349
Tokenizing data... this might take a minute.


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

--- 🚀 Setup Complete! ---
You are now ready to start Path A or Path B.


In [None]:
# === Path A: LoRA fine-tuning for bert-base-uncased ===
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import time

# If you defined `base_model` earlier as BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# this will use it. Otherwise, uncomment the two lines below to load it here:
# from transformers import BertForSequenceClassification
# base_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# --- Metrics ---
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_score": f1_score(labels, preds, average="weighted"),
    }

# --- LoRA config for BERT (bert-base-uncased) ---
# BERT attention uses modules named "query", "key", "value", "dense".
# Common practice is to adapt "query" and "value".
print("Configuring LoRA for bert-base-uncased...")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],   # <-- correct for BERT
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

lora_model = get_peft_model(base_model, lora_config)
print("--- Trainable Parameters (LoRA) ---")
lora_model.print_trainable_parameters()
print("-----------------------------------")

# --- Training args (note: 'evaluation_strategy', not 'eval_strategy') ---
training_args = TrainingArguments(
    output_dir="./results/bert-base-uncased-lora",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="epoch",   # <- correct arg name
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"               # set to "tensorboard" if you want TB logs
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

print("--- Starting LoRA Fine-Tuning (Path A, bert-base-uncased) ---")
start_time = time.time()
trainer.train()
wall_clock_time = time.time() - start_time

print("\n--- Evaluating LoRA Model ---")
eval_results = trainer.evaluate()

print("\n--- Final LoRA (Path A) Results ---")
print(f"Accuracy: {eval_results.get('eval_accuracy', float('nan')):.4f}")
print(f"F1-Score: {eval_results.get('eval_f1_score', float('nan')):.4f}")
print(f"Total Time: {wall_clock_time:.2f} s")


Configuring LoRA for bert-base-uncased...
--- Trainable Parameters (LoRA) ---
trainable params: 591,362 || all params: 110,075,140 || trainable%: 0.5372
-----------------------------------




--- Starting LoRA Fine-Tuning (Path A, bert-base-uncased) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.2532,0.249376,0.90711,0.907029
2,0.2112,0.241221,0.913991,0.913973
3,0.2403,0.246803,0.913991,0.913948



--- Evaluating LoRA Model ---



--- Final LoRA (Path A) Results ---
Accuracy: 0.9140
F1-Score: 0.9140
Total Time: 3275.06 s


In [None]:
# === Path B: Evolution Strategies (ES) for bert-base-uncased ===
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
import time
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertForSequenceClassification

# 0) Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1) Load a NEW, fresh model for ES (BERT base uncased)
print("Loading fresh model for Path B (ES) ...")
es_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

# 2) ES Hyperparameters
num_iterations   = 500
population_size  = 30
learning_rate    = 1e-4   # small lr works better for ES on classifier head
noise_std_dev    = 0.02

# Only optimize the final classifier head (BERT's .classifier is a single Linear layer)
optimizer = optim.Adam(es_model.classifier.parameters(), lr=learning_rate)
criterion  = nn.CrossEntropyLoss()

# 3) DataLoader for batches (tokenized_* come from your earlier cell)
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=32, shuffle=True)
train_iter = iter(train_dataloader)

print("--- Starting ES Fine-Tuning (Path B) ---")
start_time_es = time.time()

for iteration in range(num_iterations):
    # Get a batch for this "generation"
    try:
        batch = next(train_iter)
    except StopIteration:
        train_iter = iter(DataLoader(tokenized_train_dataset, batch_size=32, shuffle=True))
        batch = next(train_iter)

    batch = {k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}
    inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
    labels = batch['labels']

    # Keep a copy of the original classifier weights
    original_weights = {name: p.detach().clone() for name, p in es_model.classifier.named_parameters()}

    perturbations = []
    rewards = []

    # 4) Evaluate population of perturbations
    for i in range(population_size):
        noise = {}
        with torch.no_grad():
            for name, p in es_model.classifier.named_parameters():
                n = torch.randn_like(p) * noise_std_dev
                noise[name] = n
                p.add_(n)  # apply perturbation

        # Reward = -loss on this batch (higher is better)
        with torch.no_grad():
            logits = es_model(**inputs).logits
            loss = criterion(logits, labels)
            rewards.append(-loss.item())
            perturbations.append(noise)

        # Reset to original weights
        with torch.no_grad():
            for name, p in es_model.classifier.named_parameters():
                p.copy_(original_weights[name])

    # 5) Standardize rewards for stability
    rewards = np.array(rewards, dtype=np.float32)
    r_mean, r_std = rewards.mean(), rewards.std()
    if r_std > 0:
        rewards = (rewards - r_mean) / r_std
    else:
        rewards = np.zeros_like(rewards)

    # 6) ES update: gradient ≈ Σ_i reward_i * noise_i / (pop_size * sigma)
    optimizer.zero_grad(set_to_none=True)
    with torch.no_grad():
        for i in range(population_size):
            coef = float(rewards[i]) / (population_size * noise_std_dev)
            for name, p in es_model.classifier.named_parameters():
                if p.grad is None:
                    p.grad = torch.zeros_like(p)
                p.grad.add_(perturbations[i][name], alpha=coef)

    optimizer.step()

    if iteration % 10 == 0:
        print(f"Iteration {iteration}/{num_iterations}: Avg. Reward (−Loss) = {r_mean:.4f}")

end_time_es = time.time()
wall_clock_time_es = end_time_es - start_time_es
print("--- ES Fine-Tuning Complete ---")
print(f"Wall-clock time for ES: {wall_clock_time_es:.2f} seconds")

# 7) Evaluation
print("--- Evaluating ES Model ---")

def evaluate_es_model(model, eval_dataset):
    model.eval()
    eval_loader = DataLoader(eval_dataset, batch_size=32)
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in eval_loader:
            batch = {k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}
            logits = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).logits
            preds = torch.argmax(logits, dim=1)
            all_labels.extend(batch['labels'].cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    return {
        "accuracy": accuracy_score(all_labels, all_preds),
        "f1_score": f1_score(all_labels, all_preds, average="weighted"),
    }

es_eval_results = evaluate_es_model(es_model, tokenized_eval_dataset)
print("\n--- Final ES (Path B) Results ---")
print(f"Accuracy: {es_eval_results['accuracy']:.4f}")
print(f"F1-Score: {es_eval_results['f1_score']:.4f}")
print(f"Total Time: {wall_clock_time_es:.2f} s")


Using device: cuda
Loading fresh model for Path B (ES)...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting ES Fine-Tuning (Path B) ---
Iteration 0/500: Avg. Reward (Neg. Loss) = -0.7226
Iteration 10/500: Avg. Reward (Neg. Loss) = -0.6911
Iteration 20/500: Avg. Reward (Neg. Loss) = -0.7055
Iteration 30/500: Avg. Reward (Neg. Loss) = -0.7187
Iteration 40/500: Avg. Reward (Neg. Loss) = -0.7493
Iteration 50/500: Avg. Reward (Neg. Loss) = -0.7042
Iteration 60/500: Avg. Reward (Neg. Loss) = -0.6971
Iteration 70/500: Avg. Reward (Neg. Loss) = -0.7702
Iteration 80/500: Avg. Reward (Neg. Loss) = -0.7376
Iteration 90/500: Avg. Reward (Neg. Loss) = -0.7564
Iteration 100/500: Avg. Reward (Neg. Loss) = -0.7228
Iteration 110/500: Avg. Reward (Neg. Loss) = -0.8055
Iteration 120/500: Avg. Reward (Neg. Loss) = -0.8222
Iteration 130/500: Avg. Reward (Neg. Loss) = -0.7927
Iteration 140/500: Avg. Reward (Neg. Loss) = -0.7744
Iteration 150/500: Avg. Reward (Neg. Loss) = -0.7420
Iteration 160/500: Avg. Reward (Neg. Loss) = -0.8072
Iteration 170/500: Avg. Reward (Neg. Loss) = -0.6843
Iteration 180/50

## IMDb


In [2]:
# === IMDb Dataset Setup for bert-base-uncased ===
!pip install datasets transformers scikit-learn peft -q

import torch
from datasets import load_dataset
from transformers import BertTokenizer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# 1️⃣ Load IMDb dataset
print("Loading IMDb dataset...")
dataset = load_dataset("imdb")

# 2️⃣ Load tokenizer for BERT base uncased
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# 3️⃣ Tokenize all samples
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

print("Tokenizing IMDb data... this may take a few minutes.")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 4️⃣ Rename label column and format for PyTorch
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 5️⃣ Split into train / test
tokenized_train_dataset = tokenized_dataset["train"]
tokenized_eval_dataset  = tokenized_dataset["test"]

print(f"✅ IMDb dataset ready:")
print(f"Training examples: {len(tokenized_train_dataset)}")
print(f"Test examples:     {len(tokenized_eval_dataset)}")

# Optional sanity check: show one sample
example = tokenized_train_dataset[0]
decoded = tokenizer.decode(example["input_ids"][:50])
print("\nSample text snippet:\n", decoded)


Loading IMDb dataset...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Tokenizing IMDb data... this may take a few minutes.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

✅ IMDb dataset ready:
Training examples: 25000
Test examples:     25000

Sample text snippet:
 [CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u. s. customs if it ever tried to enter this


## LoRA with IMDb

In [3]:
# === Path A: LoRA fine-tuning for bert-base-uncased ===
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import time

# If you defined `base_model` earlier as BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# this will use it. Otherwise, uncomment the two lines below to load it here:
# from transformers import BertForSequenceClassification
# base_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# --- Metrics ---
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_score": f1_score(labels, preds, average="weighted"),
    }

# --- LoRA config for BERT (bert-base-uncased) ---
# BERT attention uses modules named "query", "key", "value", "dense".
# Common practice is to adapt "query" and "value".
print("Configuring LoRA for bert-base-uncased...")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],   # <-- correct for BERT
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

lora_model = get_peft_model(base_model, lora_config)
print("--- Trainable Parameters (LoRA) ---")
lora_model.print_trainable_parameters()
print("-----------------------------------")

# --- Training args (note: 'evaluation_strategy', not 'eval_strategy') ---
training_args = TrainingArguments(
    output_dir="./results/bert-base-uncased-lora",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="epoch",   # <- correct arg name
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"               # set to "tensorboard" if you want TB logs
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

print("--- Starting LoRA Fine-Tuning (Path A, bert-base-uncased) ---")
start_time = time.time()
trainer.train()
wall_clock_time = time.time() - start_time

print("\n--- Evaluating LoRA Model ---")
eval_results = trainer.evaluate()

print("\n--- Final LoRA (Path A) Results ---")
print(f"Accuracy: {eval_results.get('eval_accuracy', float('nan')):.4f}")
print(f"F1-Score: {eval_results.get('eval_f1_score', float('nan')):.4f}")
print(f"Total Time: {wall_clock_time:.2f} s")


Configuring LoRA for bert-base-uncased...
--- Trainable Parameters (LoRA) ---
trainable params: 591,362 || all params: 110,075,140 || trainable%: 0.5372
-----------------------------------
--- Starting LoRA Fine-Tuning (Path A, bert-base-uncased) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.3417,0.324555,0.85956,0.859544
2,0.2969,0.310784,0.866,0.866


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.3417,0.324555,0.85956,0.859544
2,0.2969,0.310784,0.866,0.866
3,0.2847,0.308565,0.86884,0.868766



--- Evaluating LoRA Model ---



--- Final LoRA (Path A) Results ---
Accuracy: 0.8688
F1-Score: 0.8688
Total Time: 1689.02 s


## Evolution Strategies with IMDb


In [None]:
# === Path B: Evolution Strategies (ES) for bert-base-uncased ===
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
import time
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertForSequenceClassification

# 0) Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1) Load a NEW, fresh model for ES (BERT base uncased)
print("Loading fresh model for Path B (ES) ...")
es_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

# 2) ES Hyperparameters
num_iterations   = 500
population_size  = 30
learning_rate    = 1e-4   # small lr works better for ES on classifier head
noise_std_dev    = 0.02

# Only optimize the final classifier head (BERT's .classifier is a single Linear layer)
optimizer = optim.Adam(es_model.classifier.parameters(), lr=learning_rate)
criterion  = nn.CrossEntropyLoss()

# 3) DataLoader for batches (tokenized_* come from your earlier cell)
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=32, shuffle=True)
train_iter = iter(train_dataloader)

print("--- Starting ES Fine-Tuning (Path B) ---")
start_time_es = time.time()

for iteration in range(num_iterations):
    # Get a batch for this "generation"
    try:
        batch = next(train_iter)
    except StopIteration:
        train_iter = iter(DataLoader(tokenized_train_dataset, batch_size=32, shuffle=True))
        batch = next(train_iter)

    batch = {k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}
    inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']}
    labels = batch['labels']

    # Keep a copy of the original classifier weights
    original_weights = {name: p.detach().clone() for name, p in es_model.classifier.named_parameters()}

    perturbations = []
    rewards = []

    # 4) Evaluate population of perturbations
    for i in range(population_size):
        noise = {}
        with torch.no_grad():
            for name, p in es_model.classifier.named_parameters():
                n = torch.randn_like(p) * noise_std_dev
                noise[name] = n
                p.add_(n)  # apply perturbation

        # Reward = -loss on this batch (higher is better)
        with torch.no_grad():
            logits = es_model(**inputs).logits
            loss = criterion(logits, labels)
            rewards.append(-loss.item())
            perturbations.append(noise)

        # Reset to original weights
        with torch.no_grad():
            for name, p in es_model.classifier.named_parameters():
                p.copy_(original_weights[name])

    # 5) Standardize rewards for stability
    rewards = np.array(rewards, dtype=np.float32)
    r_mean, r_std = rewards.mean(), rewards.std()
    if r_std > 0:
        rewards = (rewards - r_mean) / r_std
    else:
        rewards = np.zeros_like(rewards)

    # 6) ES update: gradient ≈ Σ_i reward_i * noise_i / (pop_size * sigma)
    optimizer.zero_grad(set_to_none=True)
    with torch.no_grad():
        for i in range(population_size):
            coef = float(rewards[i]) / (population_size * noise_std_dev)
            for name, p in es_model.classifier.named_parameters():
                if p.grad is None:
                    p.grad = torch.zeros_like(p)
                p.grad.add_(perturbations[i][name], alpha=coef)

    optimizer.step()

    if iteration % 10 == 0:
        print(f"Iteration {iteration}/{num_iterations}: Avg. Reward (−Loss) = {r_mean:.4f}")

end_time_es = time.time()
wall_clock_time_es = end_time_es - start_time_es
print("--- ES Fine-Tuning Complete ---")
print(f"Wall-clock time for ES: {wall_clock_time_es:.2f} seconds")

# 7) Evaluation
print("--- Evaluating ES Model ---")

def evaluate_es_model(model, eval_dataset):
    model.eval()
    eval_loader = DataLoader(eval_dataset, batch_size=32)
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in eval_loader:
            batch = {k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}
            logits = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).logits
            preds = torch.argmax(logits, dim=1)
            all_labels.extend(batch['labels'].cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    return {
        "accuracy": accuracy_score(all_labels, all_preds),
        "f1_score": f1_score(all_labels, all_preds, average="weighted"),
    }

es_eval_results = evaluate_es_model(es_model, tokenized_eval_dataset)
print("\n--- Final ES (Path B) Results ---")
print(f"Accuracy: {es_eval_results['accuracy']:.4f}")
print(f"F1-Score: {es_eval_results['f1_score']:.4f}")
print(f"Total Time: {wall_clock_time_es:.2f} s")

In [5]:
# === Path C: Evaluate untrained bert-base-uncased (no fine-tuning) ===
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score

print("\n--- Starting Path C: Base Model (No Fine-Tuning) ---")

# 1. Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load tokenizer and model (untrained classification head)
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
base_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
base_model.eval()
print("Loaded bert-base-uncased (randomly initialized classification head).")

# 3. Load and tokenize SST-2 validation dataset
print("Loading and tokenizing SST-2 dataset...")
dataset = load_dataset("glue", "sst2")
eval_dataset = dataset["validation"]

def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128)

tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = tokenized_eval_dataset.rename_column("label", "labels")
tokenized_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

eval_loader = DataLoader(tokenized_eval_dataset, batch_size=32)

# 4. Evaluate without fine-tuning
print("Evaluating untrained model...")
all_labels, all_preds = [], []
with torch.no_grad():
    for batch in eval_loader:
        batch = {k: v.to(device) for k, v in batch.items() if k in ['input_ids', 'attention_mask', 'labels']}
        outputs = base_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        preds = torch.argmax(outputs.logits, dim=1)
        all_labels.extend(batch['labels'].cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

# 5. Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average="weighted")

print("\n--- Path C Results: Untrained BERT ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Evaluated on SST-2 validation set using bert-base-uncased (no fine-tuning).")



--- Starting Path C: Base Model (No Fine-Tuning) ---
Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded bert-base-uncased (randomly initialized classification head).
Loading and tokenizing SST-2 dataset...


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Evaluating untrained model...

--- Path C Results: Untrained BERT ---
Accuracy: 0.5642
F1 Score: 0.5613
Evaluated on SST-2 validation set using bert-base-uncased (no fine-tuning).
