In [1]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# re-imported before each cell runs (useful while editing the project code).
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from datasets import load_from_disk
from src.data import ClassificationActiveDataModule
import srsly
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from src.estimators import SimilaritySearchStrategyForSequenceClassification

In [3]:
# Prepared dataset directory; it also holds the `metadata.yaml` written at
# preparation time.
# NOTE(review): absolute, user-specific path; consider a configurable DATA_DIR.
data_path = Path("/home/pl487/allset/data/prepared/agnews_bert_tiny")

# Metadata describing the prepared dataset (model name, embedding/index paths, ...).
meta = srsly.read_yaml(data_path / "metadata.yaml")

# The dataset itself, as saved to disk by `datasets`.
dataset_dict = load_from_disk(data_path)

meta

{'train_val_seed': 1994,
 'embedding_model': 'all-mpnet-base-v2',
 'embedding_dim': 768,
 'num_elements': 120000,
 'numpy_embeddings_path': '/home/pl487/allset/data/processed/agnews/index.npy',
 'hnsw_index_path': '/home/pl487/allset/data/processed/agnews/index.bin',
 'name_or_path': 'google/bert_uncased_L-2_H-128_A-2',
 'name_or_path_alias': 'bert_tiny'}

In [4]:
# Load the tokenizer for the model named in the dataset metadata.
tokenizer = AutoTokenizer.from_pretrained(meta["name_or_path"])

In [5]:
# Build the active-learning datamodule from the prepared dataset and the tokenizer.
datamodule = ClassificationActiveDataModule.from_dataset_dict(dataset_dict, tokenizer=tokenizer)

In [6]:
# Attach the prebuilt HNSW index whose path and embedding dimension are
# recorded in the dataset metadata.
datamodule.load_index(meta["hnsw_index_path"], embedding_dim=meta["embedding_dim"])

In [7]:
# Sequence-classification model initialised from the pretrained checkpoint named
# in the metadata; the label mappings and class count come from the datamodule.
model = AutoModelForSequenceClassification.from_pretrained(
    meta["name_or_path"],
    num_labels=len(datamodule.labels),
    label2id=datamodule.label2id,
    id2label=datamodule.id2label,
)

# Wrap the model in the similarity-search active-learning strategy (fixed seed).
active_estimator = SimilaritySearchStrategyForSequenceClassification(
    model=model,
    seed=42,
)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [8]:
# Run the active-learning loop on the datamodule with a small budget
# (2 rounds, 100 queried instances per round) and a truncated test pass.
# NOTE(review): exact semantics of `max_rounds` / `limit_test_batches` live in
# the project code and should be confirmed there.
active_estimator.active_fit(
    max_rounds=2, query_size=100, active_datamodule=datamodule, limit_test_batches=10,
)

Completed rounds:   0%|          | 0/3 [00:00<?, ?it/s]

Completed epochs: 0it [00:00, ?it/s]

Epoch 0: 0it [00:00, ?it/s]

Test: 0it [00:00, ?it/s]

{'hparams/test_accuracy': 0.6343749761581421,
 'hparams/test_f1_macro': 0.5850603580474854,
 'hparams/test_f1_micro': 0.6343749761581421,
 'hparams/test_precision_macro': 0.6814135909080505,
 'hparams/test_precision_micro': 0.6343749761581421,
 'hparams/test_recall_macro': 0.5752501487731934,
 'hparams/test_recall_micro': 0.6343749761581421,
 'hparams/test_accuracy_auc': 1.0828125,
 'hparams/test_f1_macro_auc': 1.0067735,
 'hparams/test_f1_micro_auc': 1.0828125,
 'hparams/test_precision_macro_auc': 1.0375003,
 'hparams/test_precision_micro_auc': 1.0828125,
 'hparams/test_recall_macro_auc': 1.1186674,
 'hparams/test_recall_micro_auc': 1.0828125}

In [9]:
# Inspect the budget tracker state left by the active-learning run.
active_estimator.progress_tracker.budget_tracker

BudgetTracker(max=120000, total=200, current=200, progress_bar=None, query_size=100)

In [10]:
# Size of the training set held by the datamodule after the run.
datamodule.train_size()

200

In [132]:
# Take a single training batch to experiment with per-sample gradients.
loader = datamodule.train_loader()
batch_size = loader.batch_size
batch = next(iter(loader))

# Drop the "on_cpu" entry so the remaining keys can be splatted into the model call.
# NOTE(review): presumably "on_cpu" carries non-tensor metadata; confirm in the datamodule.
_ = batch.pop("on_cpu")

In [133]:
import torch

In [134]:
# Loss of the model on the whole batch (batch keys are passed as keyword arguments).
loss = model(**batch).loss

In [135]:
# Gradients of the batch loss with respect to every model parameter
# (one tensor per parameter, in `model.parameters()` order).
grads = torch.autograd.grad(loss, list(model.parameters()))

In [136]:
def compute_grad(model, input_ids, attn_mask, target):
    """Compute the gradients of the loss for a single example.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``.
        input_ids: Input ids of one example, without the batch dimension.
        attn_mask: Attention mask of the same example, without the batch dimension.
        target: Label of the same example, without the batch dimension.

    Returns:
        Tuple of gradient tensors, one per trainable parameter, in
        ``model.parameters()`` order.
    """
    # The model works on batches, so wrap the example into a batch of one.
    input_ids = input_ids.unsqueeze(0)
    attn_mask = attn_mask.unsqueeze(0)
    target = target.unsqueeze(0)
    loss = model(input_ids=input_ids, attention_mask=attn_mask, labels=target).loss

    # autograd.grad raises if asked to differentiate w.r.t. a tensor that does
    # not require grad, so frozen parameters are left out.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    return torch.autograd.grad(loss, trainable_params)

In [137]:
# Pull the three tensors used by the per-sample experiments out of the batch.
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
target = batch["labels"]

In [138]:
def select(i):
    """Return example ``i`` of the current batch as ``(input_ids, attention_mask, target)``.

    Reads the notebook-level ``input_ids``, ``attention_mask`` and ``target``.
    """
    return tuple(field[i] for field in (input_ids, attention_mask, target))

In [141]:
# Per-sample, per-parameter gradient L2 norms, computed one example at a time:
# row = example in the batch, column = parameter tensor.
norms = np.array(
    [
        [grad_tensor.norm(2).item() for grad_tensor in compute_grad(model, *select(idx))]
        for idx in range(batch_size)
    ]
)

In [143]:
# Expect (number of examples in the batch, number of parameter tensors).
norms.shape

(32, 41)

In [144]:
from functorch import make_functional_with_buffers, vmap, grad

# Split the model into a functional callable plus its parameters and buffers,
# so that parameters can be passed explicitly (needed for functorch grad/vmap).
fmodel, params, buffers = make_functional_with_buffers(model)

In [145]:
def compute_loss_stateless_model(fmodel, params, buffers, input_ids, att_mask, label):
    """Loss of the functional model on one example.

    Args:
        fmodel: Functional model called as ``fmodel(params, buffers, **inputs)``;
            its output must expose ``.loss``.
        params: Parameters forwarded to ``fmodel``.
        buffers: Buffers forwarded to ``fmodel``.
        input_ids: Input ids of one example, without the batch dimension.
        att_mask: Attention mask of the same example, without the batch dimension.
        label: Label of the same example, without the batch dimension.

    Returns:
        The ``.loss`` attribute of the model output.
    """
    # Each tensor gets a leading batch dimension of size one before the call.
    single_example_batch = {
        "input_ids": input_ids.unsqueeze(0),
        "attention_mask": att_mask.unsqueeze(0),
        "labels": label.unsqueeze(0),
    }
    output = fmodel(params, buffers, **single_example_batch)
    return output.loss


In [146]:
# Smoke test: loss of the functional model on the first example of the batch.
compute_loss_stateless_model(fmodel, params, buffers, *select(0))

tensor(0.9802, grad_fn=<NllLossBackward0>)

In [159]:
# Function returning the gradient of the single-example loss with respect to
# positional argument 1, i.e. `params`.
ft_compute_grad = grad(compute_loss_stateless_model, argnums=1)

In [231]:
# Check whether the returned gradient (for the first parameter, first example)
# is itself attached to the autograd graph.
ft_compute_grad(fmodel, params, buffers, *select(0))[0].requires_grad

True

In [215]:
%%timeit
fnorms = np.array(
    [
        [g.norm(2).item() for g in ft_compute_grad(fmodel, params, buffers, *select(i))] 
        for i in range(batch_size)
    ]
)

737 ms ± 97.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [205]:
def compute_norm(fmodel, params, buffers, input_ids, attention_mask, target):
    """Gradient norm of every parameter for a single example.

    Delegates to the notebook-level ``ft_compute_grad`` and returns a tuple with
    one norm (as a tensor) per gradient it yields.
    """
    per_param_norms = []
    for g in ft_compute_grad(fmodel, params, buffers, input_ids, attention_mask, target):
        per_param_norms.append(g.norm())
    return tuple(per_param_norms)

In [206]:
# Vectorise `compute_norm` over the batch: the model, params and buffers are
# shared (in_dims None) while the three input tensors are mapped over dim 0.
# NOTE(review): `randomness="same"` presumably makes random ops (e.g. dropout)
# use the same draw for every example; confirm against the functorch docs.
ft_compute_sample_grad = vmap(compute_norm, in_dims=(None, None, None, 0, 0, 0),  randomness="same")

In [222]:
# The vmapped function returns one tensor per parameter; stack them and
# transpose so rows are examples and columns are parameters (same layout as `norms`).
fnorms_vmap = torch.stack(ft_compute_sample_grad(fmodel, params, buffers, input_ids, attention_mask, target)).T


In [227]:
# Norms from the vmapped computation, for visual comparison with `fnorms`.
fnorms_vmap

tensor([[2.6633, 2.2930, 7.1850,  ..., 0.1084, 8.1522, 0.8480],
        [0.3866, 0.3121, 0.9092,  ..., 0.0408, 3.8506, 0.3948],
        [0.5067, 0.4632, 2.2373,  ..., 0.0462, 4.0455, 0.4326],
        ...,
        [0.4729, 0.3754, 1.1339,  ..., 0.0403, 3.6902, 0.3785],
        [1.2566, 1.0544, 3.6953,  ..., 0.0599, 4.6749, 0.4826],
        [2.7047, 2.5846, 8.7111,  ..., 0.0636, 3.3735, 0.3857]],
       grad_fn=<PermuteBackward0>)

In [214]:
# Norms from the per-example loop, for visual comparison with `fnorms_vmap`.
fnorms

array([[2.66334009, 2.29299068, 7.18503428, ..., 0.10841203, 8.15219402,
        0.84802312],
       [0.38656864, 0.31205383, 0.90919632, ..., 0.04075364, 3.85057497,
        0.39479101],
       [0.50670964, 0.46319047, 2.23734379, ..., 0.04616484, 4.04549932,
        0.43263713],
       ...,
       [0.47293448, 0.37543818, 1.13386583, ..., 0.04028693, 3.69024682,
        0.37846082],
       [1.25664675, 1.05440533, 3.695292  , ..., 0.05989493, 4.67485762,
        0.4826256 ],
       [2.70469236, 2.58462548, 8.71106911, ..., 0.06355889, 3.37346792,
        0.38572755]])

In [101]:
# Double check that the per-sample gradient norms computed with functorch
# grad + vmap match the ones obtained by hand-processing each example.
# The earlier version of this cell iterated over `per_sample_grads` and
# `ft_per_sample_grads`, which no cell in this notebook defines, so it could
# not run on a fresh kernel. Compare the arrays that are actually computed.
# NOTE(review): assumes the model is deterministic between the two passes
# (e.g. dropout disabled); otherwise the tolerances may need revisiting.
np.testing.assert_allclose(
    fnorms_vmap.detach().cpu().numpy(),
    norms,
    atol=3e-3,
    rtol=1e-5,
)

In [105]:
# NOTE(review): no cell visible in this notebook builds `ft_per_sample_grads`,
# so this display appears to rely on kernel state from an earlier session.
# TODO: confirm, then recompute or remove.
ft_per_sample_grad

tensor([[ 0.0307,  0.0219, -0.6248,  0.5722],
        [ 0.0294,  0.0250,  0.2493, -0.3037],
        [ 0.0409,  0.0385, -0.3404,  0.2611],
        [ 0.0328,  0.0213,  0.2582, -0.3123],
        [ 0.1614, -0.2251,  0.0420,  0.0216],
        [ 0.0413,  0.0260, -0.4326,  0.3653],
        [-0.8004,  0.0363,  0.2792,  0.4849],
        [ 0.1024, -0.2112,  0.0718,  0.0369],
        [ 0.1002, -0.1997,  0.0623,  0.0371],
        [ 0.0420,  0.0284, -0.4032,  0.3328],
        [ 0.0320,  0.0208, -0.7117,  0.6589],
        [ 0.0382,  0.0187,  0.2570, -0.3139],
        [ 0.1120, -0.1897,  0.0466,  0.0311],
        [ 0.0415,  0.0428, -0.3445,  0.2601],
        [ 0.0402,  0.0227, -0.4714,  0.4085],
        [-0.4027,  0.0814,  0.1441,  0.1772],
        [ 0.0413,  0.0460, -0.3370,  0.2497],
        [ 0.0307,  0.0227,  0.2523, -0.3057],
        [ 0.0563,  0.0316, -0.3872,  0.2992],
        [ 0.0290,  0.0246, -0.6338,  0.5801],
        [-0.3803,  0.0918,  0.1306,  0.1579],
        [ 0.0296,  0.0229,  0.3387

In [None]:
import time
from pathlib import Path

import pandas as pd
import srsly
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm, trange
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from src.data.active_datamodule import ActiveClassificationDataModule
from src.data.datamodule import ClassificationDataModule
from src.enums import SpecialKeys
from src.estimator import Estimator
from src.huggingface import (
    EstimatorForSequenceClassification,
    UncertaintyBasedStrategyForSequenceClassification,
)

In [None]:
# Prepared AG News dataset directory (relative to the notebook), the dataset
# saved in it, and the `metadata.yaml` stored alongside.
data_path = Path("../data/prepared/ag_news")
dataset_dict = load_from_disk(data_path)
metadata = srsly.read_yaml(data_path / "metadata.yaml")

In [None]:
# Load the tokenizer for the model named in the dataset metadata.
tokenizer = AutoTokenizer.from_pretrained(metadata["name_or_path"])

In [None]:
# Plain (non-active) classification datamodule built from the prepared dataset.
datamodule = ClassificationDataModule.from_dataset_dict(
    dataset_dict, tokenizer=tokenizer
)

In [None]:
# Sequence-classification model initialised from the checkpoint named in the
# metadata; the class count and label mappings come from the datamodule.
model = AutoModelForSequenceClassification.from_pretrained(
    metadata["name_or_path"],
    num_labels=len(datamodule.labels),
    id2label=datamodule.id2label,
    label2id=datamodule.label2id,
)

In [None]:
# Wrap the model in the project's estimator for sequence classification.
estimator = EstimatorForSequenceClassification(model)

In [None]:
# Short smoke-test fit: one epoch, with training and validation each capped at
# 10 batches.
out = estimator.fit(
    train_loader=datamodule.train_loader(),
    validation_loader=datamodule.validation_loader(),
    limit_train_batches=10,
    limit_validation_batches=10,
    max_epochs=1,
)

In [None]:
# Uncertainty-based active-learning strategy scored with "margin_confidence".
# It receives the same `model` object that was passed to `estimator` above.
active_estimator = UncertaintyBasedStrategyForSequenceClassification(
    model, score_fn="margin_confidence"
)

In [None]:
# The strategy also exposes a plain `fit`; run it with the same 10-batch caps.
# Note that this overwrites the `out` produced by `estimator.fit`.
out = active_estimator.fit(
    train_loader=datamodule.train_loader(),
    validation_loader=datamodule.validation_loader(),
    limit_train_batches=10,
    limit_validation_batches=10,
)

In [None]:
# Active-learning datamodule built from the same dataset and tokenizer.
active_datamodule = ActiveClassificationDataModule.from_dataset_dict(
    dataset_dict,
    tokenizer=tokenizer,
)

In [None]:
# Small active-learning run: 3 rounds, 50 queried instances per round, with
# the fit, test and pool stages each capped at 3 batches.
# NOTE(review): `validation_perc=0.3` presumably holds out 30% of the labelled
# data for validation; confirm in the strategy code.
active_out = active_estimator.active_fit(
    active_datamodule=active_datamodule,
    max_rounds=3,
    query_size=50,
    validation_perc=0.3,
    fit_kwargs={
        "max_epochs": 3,
        "limit_train_batches": 3,
        "limit_validation_batches": 3,
    },
    test_kwargs={"limit_batches": 3},
    pool_kwargs={"limit_batches": 3},
)

In [None]:
# Persist the labelled dataset under the "results" directory.
active_datamodule.save_labelled_dataset("results")

In [None]:
# Read the saved labelled dataset back to inspect it.
df = pd.read_parquet("results/labelled_dataset.parquet")

In [None]:
# Display the labelled dataset (pandas truncates long frames in the repr).
df