In [1]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader

In [6]:
# Mount Google Drive at /content/drive so that project files stored on Drive
# are reachable through the local filesystem (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import numpy as np

# Directory on the mounted Drive that holds the arrays read by this notebook.
base_path = "/content/drive/MyDrive/ActiveLearningProject/data/"

# Load both arrays in one pass; the tuple order fixes which file feeds which name.
uncertainty_scores, clipped_indices = (
    np.load(base_path + array_file)
    for array_file in ("uncertainty_scores.npy", "clipped_indices.npy")
)

print("Loaded clipped pool:", len(clipped_indices))

Loaded clipped pool: 9949


In [13]:
import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# AG News from the Hugging Face hub; only the training split is used here.
dataset = load_dataset("ag_news")
train_dataset = dataset["train"]

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# NOTE(review): this loads the base checkpoint, so the classification head
# (pre_classifier / classifier) is freshly initialised -- see the MISSING keys
# in the load report. Confirm that using the non-fine-tuned encoder is intended.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=4
)
model = model.to(device)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens so that all
    encoded examples have the same length.

    Args:
        examples: Mapping with a "text" entry holding the raw text(s).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    texts = examples["text"]
    return tokenizer(texts, **encoding_options)

# Rebuild the unlabeled pool with the same procedure as Module 2.
# NOTE(review): the result only lines up with clipped_indices if Module 2 used
# the identical seed, shuffle call and slice sizes -- verify against Module 2.
num_samples = len(train_dataset)
indices = np.arange(num_samples)

# Legacy global NumPy RNG: seed, then shuffle in place. Switching to another
# RNG API would produce a different permutation, so keep these calls as they are.
np.random.seed(42)
np.random.shuffle(indices)

# Size of the leading 5% slice of the shuffled indices (the "labeled" part by name).
labeled_size = int(0.05 * num_samples)

# Everything after that leading slice is treated as unlabeled.
unlabeled_indices = indices[labeled_size:]

# Use the SAME first 20000 unlabeled examples as Module 2, then tokenize them
# and expose only the two tensor columns that the encoder consumes.
unlabeled_dataset = train_dataset.select(unlabeled_indices[:20000])
tokenized_unlabeled = unlabeled_dataset.map(tokenize_function, batched=True)
tokenized_unlabeled.set_format(type="torch", columns=["input_ids", "attention_mask"])

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.weight       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [14]:
# Keep only the rows listed in clipped_indices. The indices are used as row
# positions within tokenized_unlabeled (the 20000-example pool), so row i of
# clipped_dataset corresponds to clipped_indices[i].
clipped_dataset = tokenized_unlabeled.select(clipped_indices)
print("Clipped dataset size:", len(clipped_dataset))

Clipped dataset size: 9949


In [15]:
# Keep at most the first 2000 rows of the clipped pool (presumably to bound the
# cost of the embedding pass and the pairwise similarity matrix -- confirm).
# min() guards against pools with fewer than 2000 rows, where
# select(range(2000)) would request rows that do not exist.
clipped_dataset = clipped_dataset.select(range(min(2000, len(clipped_dataset))))
print("Reduced clipped dataset:", len(clipped_dataset))

Reduced clipped dataset: 2000


In [19]:
from torch.utils.data import DataLoader
import numpy as np

def get_embeddings(model, dataset_subset, batch_size=32):
    """Return the first-token hidden state of every example as a NumPy array.

    The model is switched to eval mode (and left there) and run without
    gradient tracking. Only the `model.distilbert` encoder is called; the
    hidden state at sequence position 0 of its `last_hidden_state` is taken as
    the embedding of each example.

    Args:
        model: Module exposing a `distilbert` sub-module whose output has a
            `last_hidden_state` attribute.
        dataset_subset: Dataset whose items provide "input_ids" and
            "attention_mask" tensors that the default DataLoader collation
            can batch.
        batch_size: Number of examples per forward pass (default 32).

    Returns:
        np.ndarray with one row per example, in dataset order.

    Raises:
        ValueError: If `dataset_subset` yields no batches.
    """
    # Use the device the model actually lives on rather than a notebook-level
    # global, so the function keeps working if that global is missing or stale.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    loader = DataLoader(dataset_subset, batch_size=batch_size)
    model.eval()

    batch_embeddings = []

    with torch.no_grad():
        for batch in loader:
            outputs = model.distilbert(
                input_ids=batch["input_ids"].to(model_device),
                attention_mask=batch["attention_mask"].to(model_device),
            )

            # Position 0 along the sequence axis is the first ([CLS]) token.
            first_token_state = outputs.last_hidden_state[:, 0, :]
            batch_embeddings.append(first_token_state.cpu().numpy())

    if not batch_embeddings:
        # np.vstack([]) would raise a less helpful ValueError of its own.
        raise ValueError("get_embeddings received an empty dataset_subset")

    return np.vstack(batch_embeddings)

# Embed every example of the reduced clipped pool; row i of `embeddings`
# belongs to row i of clipped_dataset.
embeddings = get_embeddings(model, clipped_dataset)

print("Embeddings shape:", embeddings.shape)

Embeddings shape: (2000, 768)


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

# Full pairwise cosine similarity between all embedded examples.
similarity_matrix = cosine_similarity(embeddings)

# Density of an example = its average similarity to every example in the pool.
# The average is taken over the whole row, so it includes the example itself.
density_scores = np.mean(similarity_matrix, axis=1)

print("Density scores computed:", len(density_scores))

Density scores computed: 2000


In [21]:
# Row positions ordered from highest to lowest density (argsort of the negated
# scores). Ties fall in whatever order np.argsort's default sort produces.
dense_ranked = np.argsort(-density_scores)

In [22]:
final_k = 50                # maximum number of samples to select
similarity_threshold = 0.9  # a candidate is rejected at or above this similarity to any pick

# Greedy selection: walk the candidates from densest to least dense and keep a
# candidate only if it is not too similar to anything already kept.
selected = []

for idx in dense_ranked:
    # Check the budget before adding anything so that final_k is honoured for
    # every value, including 0 and 1.
    if len(selected) >= final_k:
        break

    # The first candidate has nothing to be compared with; every later one must
    # stay below the threshold against all previous picks.
    if not selected or np.max(similarity_matrix[idx, selected]) < similarity_threshold:
        selected.append(idx)

if len(selected) < final_k:
    # Make a short result visible instead of passing it on silently.
    print(
        f"Warning: only {len(selected)} of {final_k} requested samples "
        f"passed the similarity filter (threshold={similarity_threshold})"
    )

# `selected` holds row positions of the similarity matrix; map them back
# through clipped_indices. The explicit integer array keeps the lookup valid
# when nothing was selected.
# NOTE(review): the saved values are positions within the 20000-example
# unlabeled pool, not within the full training set -- confirm this is what the
# consumer of final_selected_indices expects.
final_selected_indices = clipped_indices[np.asarray(selected, dtype=int)]

print("Final selected samples:", len(final_selected_indices))

Final selected samples: 50


In [23]:
# Persist the selected indices next to the input arrays on Drive; an existing
# final_selected_indices.npy at that path is overwritten.
np.save(base_path + "final_selected_indices.npy", final_selected_indices)

print("Module 3 completed and saved to Drive")

Module 3 completed and saved to Drive
