In [None]:
# Chunk 1: Imports & Global Config

!pip install torch transformers datasets scikit-learn --quiet

from tqdm import tqdm
import os
import random
import csv
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup,
    set_seed,
)
from datasets import load_dataset, DatasetDict, Dataset, load_from_disk
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Global configs (shared by the cells below)
BERT_MODEL_NAME = "bert-base-uncased"  # Checkpoint name used for the tokenizer and as the default encoder
MAX_LENGTH = 64   # Max token length for tokenization and for the filter/label helpers; kept short for speed
BATCH_SIZE = 8    # Batch size of the train and validation DataLoaders
EPOCHS = 2        # Base number of training epochs; increase if needed
SEED = 42         # Seed for the RNGs below, the train/validation split and the balancing step

# Set random seeds for reproducibility.
# NOTE(review): transformers.set_seed is documented to seed `random`, NumPy and
# PyTorch itself, so the three explicit calls below look redundant (but harmless).
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device setup: use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Target tokens of the proxy task. The filter keeps texts that contain exactly
# one of the two, and get_label assigns 1 when the RUN token is present, else 0.
# NOTE(review): the constant names look like leftovers from an earlier
# "run"/"walk" experiment; the values below are what is actually matched.
RUN = "man" # alternative value left by the author: "small"
WALK = "woman" # alternative value left by the author: "little"

In [None]:
# Chunk 2: Load the raw WikiText corpus and flatten it into a list of non-empty lines.
# (Filtering for the RUN / WALK target tokens happens in a later cell.)

# Dataset configuration to load. "wikitext-2-raw-v1" is the smaller alternative
# the notebook used previously.
WIKITEXT_CONFIG = "wikitext-103-raw-v1"

# Report the configuration that is really being loaded (the old message named
# wikitext-2 while wikitext-103 was downloaded).
print(f"Loading raw dataset ({WIKITEXT_CONFIG})...")
raw_dataset = load_dataset("wikitext", WIKITEXT_CONFIG)

# Combine all splits into a single list of texts, dropping whitespace-only rows
all_texts = []
for split in ["train", "validation", "test"]:
    for row in tqdm(raw_dataset[split], desc=split):
        text = row["text"].strip()
        if text:
            all_texts.append(text)

Loading raw dataset (wikitext-2-raw-v1)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

100%|██████████| 1801350/1801350 [00:31<00:00, 56636.35it/s]
100%|██████████| 3760/3760 [00:00<00:00, 57393.07it/s]
100%|██████████| 4358/4358 [00:00<00:00, 60461.69it/s]


In [None]:
filtered_texts = []

In [None]:


# OPTIONAL: limit data size for speed
#all_texts = all_texts[:3000]

# Goal of this cell: keep only texts that contain EXACTLY one of the two
# target tokens (WALK / RUN).

# Tokenizer used by the filter and label helpers. It is loaded from the shared
# BERT_MODEL_NAME constant so it cannot drift from the checkpoint used for
# training if that constant is ever changed.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

def keep_line(text):
    """Return True when exactly one of the RUN / WALK tokens occurs in `text`.

    Only the first MAX_LENGTH tokens returned by `tokenizer.tokenize` are
    inspected, and matching is done on whole tokens (list membership), not on
    substrings of the raw text.
    """
    # NOTE(review): the training tokenizer call also truncates to MAX_LENGTH but
    # presumably spends two positions on [CLS]/[SEP]; a target token at the very
    # end of this window could then be missing from the model input. Confirm,
    # and if so adjust this together with get_label.
    leading_tokens = tokenizer.tokenize(text)[:MAX_LENGTH]
    run_found = RUN in leading_tokens
    walk_found = WALK in leading_tokens
    # Inequality of two booleans is XOR: true only when a single target is present
    return run_found != walk_found

# Build the filtered list from scratch, so that re-running this cell cannot
# append the same texts a second time (the list used to be created in a
# separate cell and only ever appended to here).
# The lowercase copy is used for the check only; the original text is kept.
filtered_texts = [text for text in tqdm(all_texts) if keep_line(text.lower())]

print(f"Initial lines: {len(all_texts)}")
print(f"Filtered lines (exactly one of {WALK} or {RUN}): {len(filtered_texts)}")

# Fail early: every later cell needs at least some examples to work with
if len(filtered_texts) == 0:
    raise ValueError(f"No data found containing exactly one of {WALK, RUN}. Increase dataset size or remove filters.")


  0%|          | 658/1170381 [00:00<08:52, 2195.53it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (645 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1170381/1170381 [09:37<00:00, 2026.22it/s]

Initial lines: 1170381
Filtered lines (exactly one of woman or man): 17636





In [None]:
len(filtered_texts)

17636

In [None]:
def get_label(text):
    """Binary label for `text`: 1 if the RUN token is among its first MAX_LENGTH tokens, else 0.

    Uses the same tokenization and truncation as the line filter, so for a
    filtered text a label of 0 means the WALK token was the one found.
    """
    leading_tokens = tokenizer.tokenize(text)[:MAX_LENGTH]
    return int(RUN in leading_tokens)



# Wrap the filtered texts in a one-split DatasetDict so `map` can add the labels
dset = Dataset.from_dict({"text": filtered_texts})
dataset = DatasetDict({"all": dset})

def label_function(ex):
    """Map one example to {"label": 1} if its lowercased text has the RUN token, else {"label": 0}."""
    return {"label": get_label(ex["text"].lower())}

dataset = dataset.map(label_function, batched=False)

# Split into train (80%) and validation (20%); no separate test split is made.
# The "test" half returned by train_test_split is what becomes "validation".
train_val_split = dataset["all"].train_test_split(test_size=0.2, seed=SEED)
dataset = DatasetDict({
    "train": train_val_split["train"],
    "validation": train_val_split["test"],
})

print(dataset)
# Number of positive (label 1) training examples, then the training set size
train_label_column = dataset['train']['label']
print(np.sum(train_label_column), len(train_label_column))

Map:   0%|          | 0/17636 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 14108
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 3528
    })
})
11018 14108


In [None]:
!pwd

/content/drive/MyDrive/BERT-token-test


In [None]:
%cd drive/MyDrive/BERT-token-test/

/content/drive/MyDrive/BERT-token-test


In [None]:
def balance_dataset(dataset_split, seed=SEED):
    """Undersample a split so that labels 0 and 1 are equally represented.

    Args:
        dataset_split: object exposing its labels through ``["label"]`` and a
            ``select(indices)`` method (here a Hugging Face ``Dataset``).
        seed: seed of the private RNG used for shuffling; the same seed always
            selects the same examples in the same order.

    Returns:
        ``dataset_split.select(...)`` over a shuffled list of indices holding
        ``min(#label-1, #label-0)`` examples of each label.
    """
    labels = dataset_split["label"]

    # Gather indices for pos/neg
    pos_indices = [i for i, lab in enumerate(labels) if lab == 1]
    neg_indices = [i for i, lab in enumerate(labels) if lab == 0]

    # Private generator: it produces the same shuffles as seeding the global
    # `random` module with `seed`, but does not reset the notebook-wide RNG
    # state as a side effect of calling this helper.
    rng = random.Random(seed)
    rng.shuffle(pos_indices)
    rng.shuffle(neg_indices)

    # Undersample both groups to the size of the smaller one
    min_count = min(len(pos_indices), len(neg_indices))
    balanced_indices = pos_indices[:min_count] + neg_indices[:min_count]

    # Mix the two classes so the result is not "all positives, then all negatives"
    rng.shuffle(balanced_indices)

    return dataset_split.select(balanced_indices)

# Balance each split on its own, using the same seed for both
balanced_train = balance_dataset(dataset["train"], seed=SEED)
balanced_val = balance_dataset(dataset["validation"], seed=SEED)

# Reassemble the two balanced splits into one DatasetDict
balanced_dataset = DatasetDict({"train": balanced_train, "validation": balanced_val})

# Report the class balance: positives should be exactly half of each total
train_label_column = balanced_dataset["train"]["label"]
print("Balanced train distribution:")
print("Positive labels:", sum(train_label_column), "Total:", len(train_label_column))

val_label_column = balanced_dataset["validation"]["label"]
print("Balanced validation distribution:")
print("Positive labels:", sum(val_label_column), "Total:", len(val_label_column))


Balanced train distribution:
Positive labels: 3090 Total: 6180
Balanced validation distribution:
Positive labels: 841 Total: 1682


In [None]:
# NOTE(review): the author tagged this cell "dont do it" — presumably so that an
# existing saved copy is not overwritten by accident; confirm before running.
# Writes the balanced DatasetDict to the relative directory "balanced_dataset",
# i.e. relative to whatever the working directory is when the cell runs.
balanced_dataset.save_to_disk("balanced_dataset")
print("Dataset saved to 'balanced_dataset' directory.")

Saving the dataset (0/1 shards):   0%|          | 0/6180 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1682 [00:00<?, ? examples/s]

Dataset saved to 'balanced_dataset' directory.


In [None]:
# NOTE(review): the author tagged this cell "dont do it" — presumably it is only
# meant for resuming from a previously saved copy; confirm before running.
# Replaces the in-memory `balanced_dataset` with the copy stored in the relative
# directory "balanced_dataset" (relative to the current working directory).
balanced_dataset = load_from_disk("balanced_dataset")
print("Loaded dataset:")
print(balanced_dataset)

Loaded dataset:
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 6180
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1682
    })
})


In [None]:
# Chunk 3: Tokenization

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of examples into fixed-length model inputs.

    Every text is truncated or padded to exactly MAX_LENGTH positions.
    """
    # NOTE(review): return_tensors="pt" is presumably unnecessary inside `map`,
    # since set_format below already returns torch tensors — confirm before removing.
    batch_texts = examples["text"]
    return tokenizer(
        batch_texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )

# Tokenize the balanced splits (pass `dataset` instead to use the unbalanced data)
encoded_dataset = balanced_dataset.map(tokenize_function, batched=True)

# Expose only the columns the training loop reads, as PyTorch tensors
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Only train and validation exist; no test split is created in this notebook
train_dataset, val_dataset = encoded_dataset["train"], encoded_dataset["validation"]

print("Number of training samples:", len(train_dataset))
print("Number of validation samples:", len(val_dataset))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/6180 [00:00<?, ? examples/s]

Map:   0%|          | 0/1682 [00:00<?, ? examples/s]

Number of training samples: 6180
Number of validation samples: 1682


In [None]:
# Spot-check one arbitrary training example: its raw text, then its label
sample_row = balanced_dataset['train'][1611]
print(sample_row['text'])
print(sample_row['label'])

After four years , the story presents tax attorney Loudon Trott ( Griffin Dunne ) on a busy day . He is getting married to the daughter of one of the richest men in New York , Simon Worthington . Loudon 's bride Wendy Worthington ( Haviland Morris ) is a selfish woman who is more consumed in her wedding plans than in the well @-@ being of her fiancé . Loudon , on the other hand , has a number of duties entrusted to him by his boss ( and future father @-@ in @-@ law ) , Mr. Worthington ( John McMartin ) . First he has to pick up a cougar for an exotic animal activist named Montgomery Bell ( John Mills ) , then to pick up Nikki , and lastly he has to make sure that Nikki catches the next bus to her hometown of Philadelphia .
0


In [None]:
# Chunk 4: Define BertProxyClassifier

class BertProxyClassifier(nn.Module):
    """
    BERT-based model for a binary classification task.

    The head produces a single logit per example. The loss is binary
    cross-entropy computed directly from that logit (numerically stable);
    the sigmoid probability is still returned for thresholding.

    Configurable for:
      embedding_type: 'cls' uses the hidden state at position 0; any other
          value (documented option: 'mean') averages the hidden states over
          non-padding positions.
      classifier_type: 'single_linear' is Dropout + Linear; any other value
          (documented option: 'two_layer') is a two-layer MLP with ReLU.
      freeze_bert: bool, when True the encoder parameters are not trained.
    """
    def __init__(self,
                 bert_model_name=BERT_MODEL_NAME,
                 embedding_type="cls",
                 classifier_type="single_linear",
                 freeze_bert=False):
        super().__init__()
        self.embedding_type = embedding_type
        self.classifier_type = classifier_type
        self.bert = AutoModel.from_pretrained(bert_model_name)
        hidden_size = self.bert.config.hidden_size

        # Freezing only switches off gradients for the encoder; the head stays trainable
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

        # Define classifier
        if classifier_type == "single_linear":
            # Mimics logistic regression on top of BERT embeddings
            self.classifier = nn.Sequential(
                nn.Dropout(0.1),
                nn.Linear(hidden_size, 1)
            )
        else:
            # A slightly more complex MLP
            self.classifier = nn.Sequential(
                nn.Dropout(0.1),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.Dropout(0.1),
                nn.Linear(hidden_size, 1),
            )

        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run the encoder and the head.

        Args:
            input_ids: token ids, passed to the encoder unchanged.
            attention_mask: mask passed to the encoder; also used to exclude
                padding positions when mean pooling.
            labels: optional targets; converted to float for the loss.

        Returns:
            dict with "loss" (None when `labels` is None), "logits" and
            "probs" (sigmoid of the logits), the latter two with the last
            dimension squeezed away.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # (batch_size, seq_len, hidden_size)

        if self.embedding_type == "cls":
            # [CLS] token is at index 0
            pooled = last_hidden_state[:, 0, :]
        else:
            # Mean pooling excluding padding: zero out masked positions, then
            # divide by the number of real tokens (clamped to avoid 0/0)
            mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            summed = torch.sum(last_hidden_state * mask, dim=1)  # sum over seq_len
            denom = torch.clamp(mask.sum(dim=1), min=1e-9)
            pooled = summed / denom

        logits = self.classifier(pooled).squeeze(-1)  # (batch_size)
        probs = self.sigmoid(logits)

        loss = None
        if labels is not None:
            # Same quantity as BCELoss(sigmoid(logits), labels), but computed from
            # the logits so saturated sigmoids do not degrade the loss or gradients.
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels.float())

        return {"loss": loss, "logits": logits, "probs": probs}


In [None]:
# Chunk 5: Training and evaluation helper functions

def compute_metrics(preds, labels):
    """
    Threshold predicted probabilities at 0.5 and score them against the labels.

    preds: iterable of float probabilities (a value >= 0.5 counts as class 1)
    labels: iterable of 0 or 1
    Returns a dict with keys "accuracy", "precision", "recall" and "f1";
    precision/recall/F1 are computed with average="binary".
    """
    hard_preds = [int(p >= 0.5) for p in preds]
    precision, recall, f1_value, _support = precision_recall_fscore_support(
        labels, hard_preds, average="binary"
    )
    return {
        "accuracy": accuracy_score(labels, hard_preds),
        "precision": precision,
        "recall": recall,
        "f1": f1_value,
    }

def train_one_setting(embedding_type, classifier_type, freeze_bert, train_dataset, val_dataset):
    """
    Train one experiment setting and evaluate it on the validation set after every epoch.

    Args:
        embedding_type: forwarded to BertProxyClassifier.
        classifier_type: forwarded to BertProxyClassifier.
        freeze_bert: forwarded to BertProxyClassifier; also selects the
            learning rate (1e-1 frozen, 2e-5 otherwise) and adds one epoch.
        train_dataset: dataset yielding "input_ids", "attention_mask", "label".
        val_dataset: dataset with the same columns, used for evaluation only.

    Returns:
        The metrics dict of the epoch with the highest validation F1
        (the earlier epoch wins ties).
    """
    model = BertProxyClassifier(
        embedding_type=embedding_type,
        classifier_type=classifier_type,
        freeze_bert=freeze_bert
    ).to(device)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # With a frozen encoder only the head is trained: larger LR, one extra epoch
    lr = 1e-1 if freeze_bert else 2e-5
    epochs = EPOCHS+1 if freeze_bert else EPOCHS

    # Only parameters with requires_grad == True are handed to the optimizer
    optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    # The schedule must span the epochs that are actually run. Sizing it with
    # EPOCHS made the linear decay reach lr=0 before the extra frozen-encoder
    # epoch, so that epoch trained nothing.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Start below any achievable F1 so the first epoch is always recorded, even
    # when its F1 is 0.0 (previously that left best_metrics empty).
    best_val_f1 = -1.0
    best_metrics = {}

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            output = model(input_ids, attention_mask, labels=labels)
            loss = output["loss"]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()  # the schedule advances per batch, not per epoch

            total_loss += loss.item()

        # Validation
        model.eval()
        val_labels = []
        val_probs = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                # No labels passed: the loss is not needed during evaluation
                out = model(input_ids, attention_mask)

                val_probs.extend(out["probs"].cpu().tolist())
                # Labels are only used for scoring, so they never go to the device
                val_labels.extend(batch["label"].cpu().tolist())

        val_metrics = compute_metrics(val_probs, val_labels)
        if val_metrics["f1"] > best_val_f1:
            best_val_f1 = val_metrics["f1"]
            best_metrics = val_metrics

        # Report progress against the real epoch count (was EPOCHS, giving "3/2")
        print(f"Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {total_loss / len(train_loader):.4f} | "
              f"Val Acc: {val_metrics['accuracy']:.4f} | "
              f"Val F1: {val_metrics['f1']:.4f}")

    return best_metrics


In [None]:
# Chunk 6: Run the selected experiment settings and record best validation metrics (no test usage)

# Full grid of (embedding_type, classifier_type, freeze_bert) combinations,
# enumerated with embedding_type as the slowest-changing and freeze_bert as the
# fastest-changing factor.
ALL_SETTINGS = [
    (embedding_type, classifier_type, freeze_bert)
    for embedding_type in ["mean", "cls"]
    for classifier_type in ["single_linear", "two_layer"]
    for freeze_bert in [True, False]
]

# 1-based positions in ALL_SETTINGS to run; set to None to run the whole grid.
# The default reproduces the previous hard-coded `count == 8` filter, i.e. only
# the last combination (cls / two_layer / encoder not frozen) is run.
SETTINGS_TO_RUN = {8}

settings = [
    setting
    for position, setting in enumerate(ALL_SETTINGS, start=1)
    if SETTINGS_TO_RUN is None or position in SETTINGS_TO_RUN
]

# One result row (a flat dict) is collected per executed setting
results = []

for embedding_type, classifier_type, freeze_bert in settings:
    print("\n==========================================")
    print(f"Running setting: embedding_type={embedding_type}, "
          f"classifier_type={classifier_type}, freeze_bert={freeze_bert}")

    # Train this configuration; the helper returns its best validation metrics
    best_val_metrics = train_one_setting(
        embedding_type=embedding_type,
        classifier_type=classifier_type,
        freeze_bert=freeze_bert,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
    )

    # Flatten configuration + metrics into one row; a missing metric becomes 0.0
    result = dict(
        embedding_type=embedding_type,
        classifier_type=classifier_type,
        freeze_bert=freeze_bert,
        val_accuracy=best_val_metrics.get("accuracy", 0.0),
        val_precision=best_val_metrics.get("precision", 0.0),
        val_recall=best_val_metrics.get("recall", 0.0),
        val_f1=best_val_metrics.get("f1", 0.0),
    )

    results.append(result)
    print("Finished setting (best val metrics):", result)



Running setting: embedding_type=cls, classifier_type=two_layer, freeze_bert=False


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  1%|          | 7/773 [00:54<1:39:50,  7.82s/it]


KeyboardInterrupt: 

In [None]:
import torch
import gc

# Manual memory clean-up cell.
# Delete any variables referencing GPU memory first (template only: nothing is
# deleted unless the line below is uncommented and given a real name)
# del variable_name

# Run Python's garbage collector
gc.collect()

# Ask PyTorch to release its cached CUDA memory
torch.cuda.empty_cache()


Between tour legs Nine Inch Nails gave a performance as part of the Year Zero game . A small group of fans received fictional in @-@ game telephone @-@ calls that invited them to a " resistance meeting " in a Los Angeles parking lot . Those who arrived were given " resistance kits " , some of which contained cellphones that would later inform the participants of further details . After receiving instructions from the cellphones , fans who attended a fictional Art is Resistance meeting in Los Angeles were rewarded with an unannounced performance by Nine Inch Nails . The concert was cut short as the meeting was raided by a fictional SWAT team and the audience was rushed out of the building .
1


In [None]:
# Chunk 7: Save all experiment results to CSV (only validation metrics)

csv_file = "bert_experiment_results.csv"

# Column order of the CSV: the three configuration fields, then the metrics
csv_columns = [
    "embedding_type", "classifier_type", "freeze_bert",
    "val_accuracy", "val_precision", "val_recall", "val_f1",
]

# NOTE(review): the file is opened in "w" mode, so an existing CSV is replaced
# even when `results` is empty (e.g. after an interrupted run).
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(results)  # one row per entry in `results`

print(f"\nAll experiments complete. Validation results saved to '{csv_file}'.")


In [None]:
# dont run!!

# # Chunk 7: Save all experiment results to CSV

# csv_file = "bert_experiment_results.csv"
# csv_columns = [
#     "embedding_type", "classifier_type", "freeze_bert",
#     "val_accuracy", "val_precision", "val_recall", "val_f1",
#     "test_accuracy", "test_precision", "test_recall", "test_f1"
# ]

# with open(csv_file, "w", newline="", encoding="utf-8") as f:
#     writer = csv.DictWriter(f, fieldnames=csv_columns)
#     writer.writeheader()
#     for r in results:
#         writer.writerow(r)

# print(f"\nAll experiments complete. Results saved to '{csv_file}'.")
