In [None]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader
from torch.nn.functional import softmax
from torch.optim import AdamW
import os

In [12]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# A later cell in this notebook writes its results under
# /content/drive/MyDrive/ActiveLearningProject/data/, so this mount must
# succeed before that cell is run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the AG News dataset through Hugging Face `datasets`.
# Only the "train" split is used by the cells below; the load log for this
# cell reports 120,000 training examples and 7,600 test examples.
dataset = load_dataset("ag_news")
train_dataset = dataset["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
# Build a reproducible labeled / unlabeled partition of the training set.
# The first 5% of a seeded random ordering plays the role of the initially
# labeled pool; everything after it is treated as unlabeled.
num_samples = len(train_dataset)
labeled_size = int(0.05 * num_samples)

# Seed NumPy's global RNG so the in-place shuffle yields the same ordering
# on every run.
np.random.seed(42)
indices = np.arange(num_samples)
np.random.shuffle(indices)

labeled_indices, unlabeled_indices = (
    indices[:labeled_size],
    indices[labeled_size:],
)

In [None]:
# Seed PyTorch's global RNG before the model is built.
# The load report for this checkpoint lists pre_classifier.* and classifier.*
# as MISSING, i.e. the classification head is randomly initialised here, and
# the labeled DataLoader used for fine-tuning shuffles its batches. Without a
# torch seed the fine-tuned model, and every uncertainty score derived from
# it, differs from run to run even though the NumPy split is seeded.
# Note: this fixes the random streams; some CUDA kernels may still be
# non-deterministic.
torch.manual_seed(42)

# Pretrained DistilBERT tokenizer and a 4-way sequence-classification model
# built on the same "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSelfAttention(
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry; it is passed to the
            tokenizer unchanged (the callers in this notebook use it with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``padding="max_length"``, ``truncation=True`` and ``max_length=128``,
        i.e. fixed-length encodings are requested for every example.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [None]:
# Materialise the labeled pool, tokenize it in batches, and expose only the
# tensors the training loop consumes.
labeled_dataset = train_dataset.select(labeled_indices)

tokenized_labeled = labeled_dataset.map(tokenize_function, batched=True)
tokenized_labeled.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

# Batches are reshuffled each time the loader is iterated.
labeled_loader = DataLoader(
    dataset=tokenized_labeled,
    batch_size=32,
    shuffle=True,
)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# Fine-tune the classifier with a single pass over the labeled pool.
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for batch in labeled_loader:
    # Clear gradients left over from the previous step before computing new ones.
    optimizer.zero_grad()

    # Passing `labels` makes the model return the classification loss directly.
    loss = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["label"].to(device),
    ).loss

    loss.backward()
    optimizer.step()

print("Initial training complete")

Initial training complete


In [None]:
# Score only the first 20,000 entries of the (already shuffled) unlabeled pool.
# Positions in this subset are what the uncertainty scores are indexed by.
unlabeled_subset = train_dataset.select(unlabeled_indices[:20000])

tokenized_unlabeled = unlabeled_subset.map(tokenize_function, batched=True)
tokenized_unlabeled.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

# No shuffling: batch order must match subset order so scores line up with rows.
unlabeled_loader = DataLoader(dataset=tokenized_unlabeled, batch_size=32)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [None]:
# Compute the predictive entropy of the fine-tuned model for every sample in
# the unlabeled subset. Higher entropy means a less confident prediction.
model.eval()
uncertainty_scores = []

with torch.no_grad():
    for batch in unlabeled_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Take log-probabilities straight from the logits instead of
        # log(softmax(x) + eps): log_softmax is numerically stable, never
        # yields -inf for finite logits, and avoids the small bias the
        # epsilon introduces into the entropy.
        log_probs = torch.log_softmax(logits, dim=1)
        probs = log_probs.exp()

        # H(p) = -sum_c p_c * log p_c, one value per sample in the batch.
        entropy = -torch.sum(probs * log_probs, dim=1)
        uncertainty_scores.extend(entropy.cpu().numpy())

# One score per row of the unlabeled subset, in loader (i.e. subset) order.
uncertainty_scores = np.array(uncertainty_scores)

print("Entropy computed:", len(uncertainty_scores))

Entropy computed: 20000


In [None]:
# Keep only the samples whose entropy falls inside an asymmetric band around
# the mean: from half a standard deviation below it to one standard
# deviation above it (both ends inclusive).
mean, std = np.mean(uncertainty_scores), np.std(uncertainty_scores)
lower_bound, upper_bound = mean - 0.5 * std, mean + 1.0 * std

in_band = (uncertainty_scores >= lower_bound) & (uncertainty_scores <= upper_bound)

# Positions (within the scored subset) of the samples that survive clipping.
clipped_indices = np.where(in_band)[0]

print("Samples after clipping:", len(clipped_indices))

Samples after clipping: 9949


In [None]:
# Write the scores and the clipped positions to a project folder relative to
# the current working directory.
os.makedirs("ActiveLearningProject/data", exist_ok=True)

for artifact_path, artifact in (
    ("ActiveLearningProject/data/uncertainty_scores.npy", uncertainty_scores),
    ("ActiveLearningProject/data/clipped_indices.npy", clipped_indices),
):
    np.save(artifact_path, artifact)

print("Module 2 completed and saved")

Module 2 completed and saved


In [13]:
import os
import numpy as np

# Write the same two arrays to the mounted Google Drive folder.
base_path = "/content/drive/MyDrive/ActiveLearningProject/data/"
os.makedirs(base_path, exist_ok=True)

for artifact_name, artifact in (
    ("uncertainty_scores.npy", uncertainty_scores),
    ("clipped_indices.npy", clipped_indices),
):
    # base_path already ends with "/", so plain concatenation forms the path.
    np.save(base_path + artifact_name, artifact)

print("Module 2 saved permanently to Google Drive")

Module 2 saved permanently to Google Drive


In [14]:
# Sanity check: list the Drive data directory to confirm the .npy files
# written by the previous cell are present.
import os
print(os.listdir("/content/drive/MyDrive/ActiveLearningProject/data/"))

['uncertainty_scores.npy', 'ranked_indices.npy', 'final_selected_original_indices.npy', 'final_selected_indices.npy', 'clipped_indices.npy']
