In [None]:
!pip install transformers datasets torch scikit-learn




In [None]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.nn.functional import softmax
from torch.utils.data import DataLoader


In [None]:
# Load the "ag_news" dataset and keep a handle on each of its two splits.
dataset = load_dataset("ag_news")
train_dataset, test_dataset = dataset["train"], dataset["test"]

# Quick sanity check: number of training examples.
print(len(train_dataset))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

120000


In [None]:
# Seeded shuffle of all training positions, then a 5% / 95% split into an
# initially-labeled set and an unlabeled pool.
num_samples = len(train_dataset)

np.random.seed(42)
indices = np.arange(num_samples)
np.random.shuffle(indices)

labeled_size = int(0.05 * num_samples)

# First `labeled_size` shuffled positions are labeled; the rest form the pool.
labeled_indices, unlabeled_indices = indices[:labeled_size], indices[labeled_size:]

print(f"Labeled samples: {len(labeled_indices)}")
print(f"Unlabeled samples: {len(unlabeled_indices)}")


Labeled samples: 6000
Unlabeled samples: 114000


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# The classification head (pre_classifier / classifier) is not part of the
# checkpoint and is randomly initialised on load, so seed torch first. This makes
# the initial weights -- and the torch-driven shuffling/dropout that follow when
# cells are run in order -- repeatable across runs.
torch.manual_seed(42)

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Sequence classifier with four output labels on top of the pretrained encoder.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4
)

print("Model Loaded Successfully")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Model Loaded Successfully


In [None]:
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping whose "text" entry holds the raw text(s) to encode.
        max_length: Length every sequence is padded or truncated to. Defaults
            to 128, the value that used to be hard-coded here.

    Returns:
        Whatever ``tokenizer(...)`` returns for the given texts.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",  # pad every sequence up to ``max_length``
        truncation=True,       # cut longer sequences down to ``max_length``
        max_length=max_length,
    )


In [None]:
# Pick out the labeled examples and tokenize them batch-wise.
labeled_dataset = train_dataset.select(labeled_indices)
tokenized_labeled = labeled_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, returned as torch tensors.
tokenized_labeled.set_format("torch", columns=["input_ids", "attention_mask", "label"])

print(f"Tokenized labeled data: {len(tokenized_labeled)}")


Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Tokenized labeled data: 6000


In [None]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of 32 labeled examples for training.
labeled_loader = DataLoader(dataset=tokenized_labeled, shuffle=True, batch_size=32)

print("Labeled DataLoader ready")


Labeled DataLoader ready


In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)

print(f"Using device: {device}")


Using device: cuda


In [None]:
from torch.optim import AdamW

# Fine-tune every model parameter with AdamW.
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()

for epoch in range(1):  # 1 epoch
    total_loss = 0

    for batch in labeled_loader:
        # Move the batch tensors onto the model's device.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the batch loss as a Python float for the epoch average.
        total_loss += loss.item()

    print(f"Epoch finished. Avg loss: {total_loss / len(labeled_loader)}")


Epoch finished. Avg loss: 0.4058234493069826


In [None]:
# Only the first 20000 entries of the unlabeled pool are scored.
unlabeled_dataset = train_dataset.select(unlabeled_indices[:20000])
tokenized_unlabeled = unlabeled_dataset.map(tokenize_function, batched=True)

# The "label" column is deliberately not exposed for the pool.
tokenized_unlabeled.set_format("torch", columns=["input_ids", "attention_mask"])

# No shuffling: batches come out in dataset order, so per-example scores computed
# from this loader line up with positions in `tokenized_unlabeled`.
unlabeled_loader = DataLoader(dataset=tokenized_unlabeled, batch_size=32)

print(f"Unlabeled loader ready: {len(tokenized_unlabeled)}")


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Unlabeled loader ready: 20000


In [None]:
from torch.nn.functional import softmax

# Inference only: eval mode, and gradients are disabled below.
model.eval()

uncertainty_scores = []

with torch.no_grad():
    for batch in unlabeled_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Class probabilities per example.
        probs = softmax(logits, dim=1)

        # Predictive entropy per example; the 1e-10 keeps log() away from zero.
        entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=1)

        uncertainty_scores.extend(entropy.cpu().numpy())

# One entropy value per pool example, in loader order.
uncertainty_scores = np.array(uncertainty_scores)

print(f"Uncertainty computed: {len(uncertainty_scores)}")


Uncertainty computed: 20000


In [None]:
# Positions in the scored pool, ordered from highest to lowest entropy.
ranked_indices = np.argsort(-uncertainty_scores)

print("Top 5 uncertainty scores:", uncertainty_scores[ranked_indices[:5]], sep="\n")


Top 5 uncertainty scores:
[1.384732  1.3827468 1.3816227 1.3794459 1.3792123]


In [None]:
import os

# Create the local output folder; exist_ok makes re-running this cell harmless.
os.makedirs(name="ActiveLearningProject/data", exist_ok=True)

print("Project folder created")


Project folder created


In [None]:
import numpy as np

# Persist the per-example entropies and their descending ranking to local disk.
np.save(file="ActiveLearningProject/data/uncertainty_scores.npy", arr=uncertainty_scores)
np.save(file="ActiveLearningProject/data/ranked_indices.npy", arr=ranked_indices)

print("Module 2 outputs saved successfully")


Module 2 outputs saved successfully


In [None]:
from google.colab import drive

# Make Google Drive available under /content/drive.
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
import os

# Root of the project on the mounted Drive.
project_path = "/content/drive/MyDrive/ActiveLearningProject"

# Create the data/, models/ and results/ folders under the project root;
# exist_ok makes re-running this cell harmless.
os.makedirs(f"{project_path}/data", exist_ok=True)
os.makedirs(f"{project_path}/models", exist_ok=True)
os.makedirs(f"{project_path}/results", exist_ok=True)

print("Drive project folders created")


Drive project folders created


In [None]:
# Same two arrays as the local save, this time written under the Drive project folder.
np.save(file=project_path + "/data/uncertainty_scores.npy", arr=uncertainty_scores)
np.save(file=project_path + "/data/ranked_indices.npy", arr=ranked_indices)

print("Module 2 saved permanently to Drive")


Module 2 saved permanently to Drive


In [None]:
# Shortlist size: how many of the most uncertain pool examples to keep.
top_k = 1000

# `ranked_indices` is already sorted by descending entropy, so take its head.
top_uncertain_indices = ranked_indices[:top_k]

print(f"Selected top uncertain samples: {len(top_uncertain_indices)}")


Selected top uncertain samples: 1000


In [None]:
def get_embeddings(model, dataset_subset, batch_size=32):
    """Return the first-token hidden state of the encoder for every example.

    Args:
        model: Classifier exposing its encoder as ``model.distilbert``; the
            encoder output must provide ``last_hidden_state``.
        dataset_subset: Dataset yielding ``input_ids`` and ``attention_mask``
            tensors that ``DataLoader`` can batch.
        batch_size: Examples per forward pass. Defaults to 32, the value that
            used to be hard-coded here.

    Returns:
        ``np.ndarray`` with one row per example, in dataset order.

    Side effects:
        Switches ``model`` to eval mode and leaves it there.
    """
    # Run on whatever device the model lives on instead of relying on a
    # notebook-level ``device`` global being defined and in sync with the model.
    model_device = next(model.parameters()).device

    loader = DataLoader(dataset_subset, batch_size=batch_size)
    model.eval()
    embeddings = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(model_device)
            attention_mask = batch["attention_mask"].to(model_device)

            # Call the bare encoder (not the classification head) to get hidden states.
            outputs = model.distilbert(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Hidden state at sequence position 0 is used as the example embedding.
            cls_embedding = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embedding.cpu().numpy())

    return np.vstack(embeddings)


In [None]:
# Embed only the shortlisted (most uncertain) pool examples.
top_uncertain_dataset = tokenized_unlabeled.select(top_uncertain_indices)
embeddings = get_embeddings(model=model, dataset_subset=top_uncertain_dataset)

print(f"Embedding shape: {embeddings.shape}")


Embedding shape: (1000, 768)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all shortlisted embeddings.
similarity_matrix = cosine_similarity(X=embeddings)

print(f"Similarity matrix shape: {similarity_matrix.shape}")


Similarity matrix shape: (1000, 1000)


In [None]:
# Density of an example = its mean similarity to every shortlisted example
# (row-wise mean, which includes the example's similarity to itself).
density_scores = np.mean(similarity_matrix, axis=1)

print(f"Density scores computed: {len(density_scores)}")


Density scores computed: 1000


In [None]:
# Number of examples to keep after the density filter.
final_k = 200

# Shortlist positions ordered from densest to sparsest.
dense_ranked = np.argsort(-density_scores)

# Translate the densest shortlist positions back into scored-pool positions.
densest = dense_ranked[:final_k]
final_selected_indices = top_uncertain_indices[densest]
del densest

print(f"Final selected samples: {len(final_selected_indices)}")


Final selected samples: 200


In [None]:
# Write the selected scored-pool positions under the Drive project folder.
np.save(file=project_path + "/data/final_selected_indices.npy", arr=final_selected_indices)

print("Module 3 outputs saved permanently")


Module 3 outputs saved permanently


In [None]:
# Recap of how many examples survive each selection stage.
print(f"Uncertainty count: {len(uncertainty_scores)}")
print(f"Top uncertain selected: {len(top_uncertain_indices)}")
print(f"Final dense selected: {len(final_selected_indices)}")


Uncertainty count: 20000
Top uncertain selected: 1000
Final dense selected: 200


In [None]:
# Convert positions inside the scored pool back to original dataset indices.
#
# The scored pool (`tokenized_unlabeled`) was built from the leading entries of
# `unlabeled_indices`, so slice by its actual length instead of repeating the
# pool size as a literal that could drift out of sync with the pool cell.
pool_size = len(tokenized_unlabeled)
original_indices_subset = unlabeled_indices[:pool_size]

# Fail loudly on stale state (e.g. selections computed for a different pool)
# rather than silently mapping to the wrong examples.
assert np.all(
    (final_selected_indices >= 0) & (final_selected_indices < pool_size)
), "final_selected_indices contains positions outside the scored pool"

final_selected_original_indices = original_indices_subset[final_selected_indices]

print("Original dataset indices ready:", len(final_selected_original_indices))


Original dataset indices ready: 200


In [None]:
# Write the selected original-dataset indices under the Drive project folder.
np.save(
    file=project_path + "/data/final_selected_original_indices.npy",
    arr=final_selected_original_indices,
)

print("Saved original indices for Module 1")


Saved original indices for Module 1


In [1]:
from google.colab import drive

# Mount Google Drive under /content/drive (this cell ran with a fresh execution count).
drive.mount("/content/drive")


Mounted at /content/drive


In [5]:
# Switch the notebook's working directory to the project folder on Drive so the
# git commands that follow act on it.
%cd /content/drive/MyDrive/ActiveLearningProject

# Create a git repository in the project folder.
!git init


/content/drive/MyDrive/ActiveLearningProject
[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/drive/MyDrive/ActiveLearningProject/.git/


In [6]:
# List the project folder, hidden entries included, to confirm .git was created.
!ls -a


data  .git  models  results


In [7]:
# Register the GitHub repository as the `origin` remote.
!git remote add origin https://github.com/manashri834/ActiveLearningProject.git


In [8]:
# Show the configured remotes with their fetch/push URLs.
!git remote -v


origin	https://github.com/manashri834/ActiveLearningProject.git (fetch)
origin	https://github.com/manashri834/ActiveLearningProject.git (push)


In [11]:
# Stage everything in the project folder.
!git add .


In [12]:
# Commit the staged files. NOTE: this fails with "Author identity unknown" until
# git's user.name and user.email have been configured.
!git commit -m "Initial project commit"


Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@74d0275a9e16.(none)')


In [13]:
# Set the global git author identity that `git commit` requires.
!git config --global user.email "manashri771@example.com"
!git config --global user.name "manashri834"


In [15]:
# Make sure the working directory is the project folder on Drive.
%cd /content/drive/MyDrive/ActiveLearningProject


/content/drive/MyDrive/ActiveLearningProject


In [16]:
# Set the global git author identity again; this overrides the e-mail address
# configured by the earlier `git config` cell.
!git config --global user.email "manashri771@gmail.com"
!git config --global user.name "manashri834"


In [30]:
!git init
!git add .
!git commit -m "Initial commit"
!git branch -M main
!git remote add origin https://github.com/USERNAME/REPO.git
!git push -u origin main

Reinitialized existing Git repository in /content/drive/MyDrive/ActiveLearningProject/.git/
On branch main
nothing to commit, working tree clean
error: remote origin already exists.
fatal: could not read Username for 'https://github.com': No such device or address


In [31]:
# SECURITY: never embed a personal access token in the remote URL -- it ends up in
# plain text in .git/config and in every saved copy of this notebook. Any token
# that was ever pasted into this cell must be treated as leaked and revoked on GitHub.
import os
import subprocess
from getpass import getpass

repo_url = "https://github.com/manashri834/ActiveLearningProject.git"

# Ask for the token interactively so it is never written into the notebook.
github_token = getpass("GitHub personal access token: ")

# Keep the stored remote URL free of credentials.
subprocess.run(["git", "remote", "set-url", "origin", repo_url], check=True)

# Hand the credentials to this single push through an inline credential helper
# that reads them from the child-process environment; nothing is written to disk.
# The empty `credential.helper=` first clears any previously configured helpers.
push = subprocess.run(
    [
        "git",
        "-c", "credential.helper=",
        "-c", 'credential.helper=!f() { echo "username=$GIT_USERNAME"; echo "password=$GIT_TOKEN"; }; f',
        "push", "-u", "origin", "main",
    ],
    env={
        **os.environ,
        "GIT_USERNAME": "manashri834",
        "GIT_TOKEN": github_token,
        "GIT_TERMINAL_PROMPT": "0",  # fail instead of waiting for a prompt
    },
    capture_output=True,
    text=True,
)
del github_token  # do not keep the secret in the kernel namespace

# subprocess output is not shown in the cell by itself, so print it explicitly.
print(push.stdout + push.stderr)
push.check_returncode()

Branch 'main' set up to track remote branch 'main' from 'origin'.
Everything up-to-date
