In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from energizer.datastores import PandasDataStoreForSequenceClassification
from src.strategies import (
    FullGuide,
    FullGuideWithSampling,
    GradNormGuide,
    RandomStrategy,
)

In [3]:
# BERT checkpoint used for the tokenizer here and for the classifier further down.
model_name = "google/bert_uncased_L-2_H-128_A-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenise the `text` field of every split, in batches, using 4 worker processes.
ds_dict = load_dataset("pietrolesci/imdb_indexed").map(
    lambda ex: tokenizer(ex["text"]), batched=True, num_proc=4
)

Found cached dataset parquet (/home/pl487/.cache/huggingface/datasets/pietrolesci___parquet/pietrolesci--imdb_indexed-ba54f49ea7c65a7f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/pl487/.cache/huggingface/datasets/pietrolesci___parquet/pietrolesci--imdb_indexed-ba54f49ea7c65a7f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f1a781728613916d_*_of_00004.arrow
Loading cached processed dataset at /home/pl487/.cache/huggingface/datasets/pietrolesci___parquet/pietrolesci--imdb_indexed-ba54f49ea7c65a7f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-d4be535ccf8b4ca3_*_of_00004.arrow


In [4]:
# Build the datastore from the tokenised splits: `input_ids` / `attention_mask` are passed
# as the input columns, `labels` as the target column and `uid` as the identifier column.
ds = PandasDataStoreForSequenceClassification()
ds.from_dataset_dict(
    ds_dict,
    input_names=["input_ids", "attention_mask"],
    target_name="labels",
    tokenizer=tokenizer,
    uid_name="uid",
)

In [5]:
# Column holding the embeddings (the same name is used below to read them from `ds.data`).
emb_name = "embedding_all-mpnet-base-v2"
# NOTE(review): presumably builds the index that `ds.search` queries below — confirm.
ds.add_index(emb_name)

In [6]:
# Rows of the datastore table whose label equals 1.
pos = ds.data.loc[ds.data["labels"].eq(1)]

In [7]:
# Query the index with the stacked embeddings of all `pos` rows, with query_size=100.
# NOTE(review): judging by how they are used below, `v` holds the returned ids and `d` the
# matching distances — confirm against `ds.search`.
v, d = ds.search(np.stack(pos[emb_name].values), query_size=100, query_in_set=False)

In [8]:
# Convert distances to scores via 1 - d and flatten to 1-D.
# NOTE(review): this reads as a similarity only if `d` is a cosine-style distance — confirm.
s = (1 - d).flatten()

In [9]:
# Scores in ascending order.
np.sort(s)

array([0.19704247, 0.19751698, 0.1978823 , ..., 1.0000014 , 1.0000014 ,
       1.0000015 ], dtype=float32)

In [10]:
# Largest score. The recorded output is marginally above 1 — presumably float32 rounding.
s.max()

1.0000015

In [None]:
# Returned ids ranked by ascending distance, then de-duplicated while keeping the position
# of each id's first (i.e. closest) occurrence.
ids = np.take(v.flatten(), np.argsort(d.flatten()))
_, udx = np.unique(ids, return_index=True)
oids = np.take(ids, np.sort(udx))

In [11]:
# Fresh sequence-classification model on top of the checkpoint, with the label mappings
# taken from the datastore.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    id2label=ds.id2label,
    label2id=ds.label2id,
    num_labels=len(ds.labels),
)

# Strategy under test. The commented-out blocks are alternative strategies kept for quick
# switching; only one `estimator` should be active at a time.
# estimator = RandomStrategy(model=model, accelerator="gpu")
estimator = FullGuideWithSampling(
    temperatures=[1.0, 1.2],
    model=model,
    accelerator="gpu",
    num_neighbours=100,
    subset_size=10_000,
    seed=42,
    score_fn="least_confidence",
)
# estimator = GradNormGuide(
#     model=model,
#     accelerator="gpu",
#     num_neighbours=100,
#     num_influential=50,
#     subset_size=10_000,
#     seed=42,
#     norm_type=2,
#     score_fn="least_confidence",
# )

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [12]:
# Label ids 0-99 before the first round.
# NOTE(review): the second argument (-1) is presumably the labelling round recorded for this
# initial set — confirm against `ds.label`.
ds.label(list(range(100)), -1)
# Batch sizes used by the loaders: 32 for training, 512 for evaluation.
ds.prepare_for_loading(batch_size=32, eval_batch_size=512)

In [13]:
# Short active-learning smoke run: 50 queries per round, one epoch per round, and the test
# and pool passes capped at 2 batches each.
# NOTE(review): the recorded progress bar counts 3 rounds for max_rounds=2 — presumably an
# initial round is included; confirm.
r = estimator.active_fit(
    ds,
    query_size=50,
    max_epochs=1,
    limit_test_batches=2,
    max_rounds=2,
    limit_pool_batches=2,
)

Completed rounds:   0%|          | 0/3 [00:00<?, ?it/s]

Completed epochs: 0it [00:00, ?it/s]

Epoch 0: 0it [00:00, ?it/s]

Test: 0it [00:00, ?it/s]

Pool: 0it [00:00, ?it/s]

In [None]:
# Inspect the datastore's underlying table.
ds.data

In [None]:
# Sizes of the train split, the validation split and the datastore table, followed by the
# train + validation total for comparison with the table size.
(
    len(ds_dict["train"]),
    len(ds_dict["validation"]),
    len(ds.data),
    len(ds_dict["train"]) + len(ds_dict["validation"]),
)

In [None]:
# Load the labelled dataset saved by an earlier debug run.
# NOTE(review): absolute, user-specific path — this cell only works on the original machine.
pd.read_parquet(
    "/home/pl487/allset/outputs/debug/imdb/randomguide_2023-05-19T16-28-44/logs/labelled_dataset.parquet"
)

In [None]:
# `df` is bound here so the cell runs on a fresh kernel instead of relying on a cell
# further down having been executed first.
df = ds.data
# Pair every row that has a `train_uid` with the row whose `uid` equals that `train_uid`
# (inner join); overlapping columns of the matched row get the `_train` suffix.
pd.merge(
    df.loc[df["train_uid"].notna(), ["uid", "train_uid", "comment_text", "labels"]],
    df[["labels", "uid", "comment_text"]],
    left_on="train_uid",
    right_on="uid",
    how="inner",
    suffixes=["", "_train"],
)

In [None]:
# Number of rows per `uid`, sorted ascending, so any duplicated uid shows up at the bottom.
ds.data.groupby("uid").size().sort_values()

In [None]:
# Work on the datastore table under a shorter name.
df = ds.data

# How many labelled rows each labelling round contributed.
df.loc[df["is_labelled"].eq(True)].groupby("labelling_round").size()

In [None]:
# Sanity check: the number of distinct uids equals the number of rows.
assert df["uid"].nunique() == len(df)

In [None]:
# Distinct non-null `train_uid` values.
i = df["train_uid"].dropna().unique()

In [None]:
# Ratio between the number of rows whose `uid` appears in the `train_uid` column and the
# number of rows with a non-null `train_uid`.
df["uid"].isin(df["train_uid"]).sum() / df["train_uid"].notna().sum()

In [None]:
# Number of distinct non-null `train_uid` values.
df["train_uid"].dropna().nunique()

In [None]:
# Label distribution among the rows that have a `train_uid`.
df["labels"][df["train_uid"].notna()].value_counts()

In [None]:
# Rows of the datastore table that have a `train_uid`.
ds.data.loc[ds.data["train_uid"].notna()]

In [None]:
# Plain (non-active) fit: one epoch, capped at 2 training batches.
# NOTE(review): the loader passed as `train_loader` is `ds.test_loader()` — presumably a quick
# smoke test; confirm this is intentional. Also rebinds `r` from the active run above.
r = estimator.fit(train_loader=ds.test_loader(), max_epochs=1, limit_train_batches=2)

In [None]:
# Inspect the `train_ids` entry of the estimator's current pool.
estimator.current_pool["train_ids"]

In [None]:
# Fetch the embeddings for two ids and unpack them after conversion to Python lists.
# Assumes `get_embeddings` returns exactly one entry per requested id — TODO confirm.
a, b = ds.get_embeddings([96, 156443]).tolist()

In [None]:
# NOTE(review): `ClassificationActiveDataModule` is not imported anywhere in this notebook and
# `dataset_dict` is only assigned in a later cell, so this cell fails on a fresh kernel.
datamodule = ClassificationActiveDataModule.from_dataset_dict(
    dataset_dict, tokenizer=tokenizer
)

In [None]:
# Load the index from the path and embedding dimension stored in `meta`.
# NOTE(review): `meta` is never assigned in this notebook (a later cell defines `metadata`).
datamodule.load_index(meta["hnsw_index_path"], embedding_dim=meta["embedding_dim"])

In [None]:
# Re-create the classifier from the checkpoint named in `meta`, with label mappings taken
# from the datamodule. This rebinds `model` from the earlier cells.
model = AutoModelForSequenceClassification.from_pretrained(
    meta["name_or_path"],
    id2label=datamodule.id2label,
    label2id=datamodule.label2id,
    num_labels=len(datamodule.labels),
)
# NOTE(review): `SimilaritySearchStrategyForSequenceClassification` is not imported anywhere
# in this notebook.
active_estimator = SimilaritySearchStrategyForSequenceClassification(
    model=model, seed=42
)

In [None]:
# Two active-learning rounds of 100 queries each, with the test pass capped at 10 batches.
active_estimator.active_fit(
    max_rounds=2,
    query_size=100,
    active_datamodule=datamodule,
    limit_test_batches=10,
)

In [None]:
# Inspect the budget tracker held by the estimator's progress tracker.
active_estimator.progress_tracker.budget_tracker

In [None]:
# Training-set size as reported by the datamodule.
datamodule.train_size()

In [None]:
# Take the first batch from the training loader for the per-sample gradient experiments.
loader = datamodule.train_loader()
batch = next(iter(loader))
# Remove the `on_cpu` entry; the rest of the batch is later unpacked as `model(**batch)`.
_ = batch.pop("on_cpu")
batch_size = loader.batch_size

In [None]:
import torch

In [None]:
# Forward pass on the whole batch; keep only the loss.
loss = model(**batch).loss

In [None]:
# Gradients of the batch loss with respect to each model parameter.
grads = torch.autograd.grad(loss, list(model.parameters()))

In [None]:
def compute_grad(model, input_ids, attn_mask, target):
    """Return the gradients of the loss for one unbatched example.

    A leading axis of size one is added to ``input_ids``, ``attn_mask`` and
    ``target`` before they are passed to ``model`` as ``input_ids``,
    ``attention_mask`` and ``labels``. The ``.loss`` of the result is then
    differentiated with respect to every entry of ``model.parameters()``.

    Returns:
        The tuple produced by ``torch.autograd.grad``, one entry per parameter.
    """
    batch_of_one = dict(
        input_ids=input_ids.unsqueeze(0),
        attention_mask=attn_mask.unsqueeze(0),
        labels=target.unsqueeze(0),
    )
    sample_loss = model(**batch_of_one).loss
    return torch.autograd.grad(sample_loss, [p for p in model.parameters()])

In [None]:
# Pull the three tensors used by the per-sample experiments out of the batch.
input_ids, attention_mask, target = (
    batch[key] for key in ("input_ids", "attention_mask", "labels")
)

In [None]:
def select(i):
    """Return the ``i``-th ``(input_ids, attention_mask, target)`` triple.

    Indexes the notebook-level ``input_ids``, ``attention_mask`` and ``target``
    objects with ``i`` and returns the three results as a tuple.
    """
    return tuple(column[i] for column in (input_ids, attention_mask, target))

In [None]:
# One row per sample in the batch, one column per parameter gradient: the 2-norm of each
# gradient returned by `compute_grad` for that sample.
norms = np.array(
    [
        list(
            map(
                lambda g: torch.norm(g, p=2).item(),
                compute_grad(model, *select(sample_idx)),
            )
        )
        for sample_idx in range(batch_size)
    ]
)

In [None]:
# Expected to be (batch_size, number of gradient tensors returned per sample).
norms.shape

In [None]:
# NOTE(review): recent PyTorch releases reportedly deprecate `functorch` in favour of
# `torch.func` — confirm before upgrading the environment.
from functorch import grad, make_functional_with_buffers, vmap

# Split the model into a functional callable plus its parameters and buffers.
fmodel, params, buffers = make_functional_with_buffers(model)

In [None]:
def compute_loss_stateless_model(
    fmodel, params, buffers, input_ids, att_mask, label
):
    """Return the loss of the functional model on one unbatched example.

    ``input_ids``, ``att_mask`` and ``label`` each get a leading axis of size
    one and are passed to ``fmodel`` as the keyword arguments ``input_ids``,
    ``attention_mask`` and ``labels``; ``params`` and ``buffers`` are passed
    positionally. The ``.loss`` attribute of the call's result is returned.
    """
    single_example = {
        "input_ids": input_ids.unsqueeze(0),
        "attention_mask": att_mask.unsqueeze(0),
        "labels": label.unsqueeze(0),
    }
    outputs = fmodel(params, buffers, **single_example)
    return outputs.loss

In [None]:
# Smoke test: loss of the functional model on the first sample of the batch.
compute_loss_stateless_model(fmodel, params, buffers, *select(0))

In [None]:
# Gradient function of the per-sample loss; argnums=1 selects the second positional
# argument, i.e. `params`, as the quantity to differentiate with respect to.
ft_compute_grad = grad(compute_loss_stateless_model, argnums=1)

In [None]:
# Check whether the first gradient tensor returned for sample 0 has `requires_grad` set.
ft_compute_grad(fmodel, params, buffers, *select(0))[0].requires_grad

In [None]:
%%timeit
fnorms = np.array(
    [
        [
            g.norm(2).item()
            for g in ft_compute_grad(fmodel, params, buffers, *select(i))
        ]
        for i in range(batch_size)
    ]
)

In [None]:
def compute_norm(fmodel, params, buffers, input_ids, attention_mask, target):
    """Return the norm of each gradient produced for one example.

    Forwards all six arguments, unchanged and in order, to the notebook-level
    ``ft_compute_grad`` and calls ``.norm()`` on every gradient it returns.

    Returns:
        A tuple with one norm per returned gradient, in the same order.
    """
    per_param_norms = []
    for g in ft_compute_grad(
        fmodel, params, buffers, input_ids, attention_mask, target
    ):
        per_param_norms.append(g.norm())
    return tuple(per_param_norms)

In [None]:
# Vectorise `compute_norm` over the first axis of the last three arguments (the batch
# tensors); the first three arguments (`fmodel`, `params`, `buffers`) are not mapped.
# NOTE(review): randomness="same" presumably makes random ops such as dropout identical
# across the mapped samples — confirm against the functorch docs.
ft_compute_sample_grad = vmap(
    compute_norm, in_dims=(None, None, None, 0, 0, 0), randomness="same"
)

In [None]:
# Stack the tuple of per-gradient outputs into one tensor and transpose it.
# Presumably this yields one row per sample and one column per gradient tensor, matching
# the layout of the looped versions — TODO confirm shapes.
fnorms_vmap = torch.stack(
    ft_compute_sample_grad(
        fmodel, params, buffers, input_ids, attention_mask, target
    )
).T

In [None]:
# Display the vmap-computed gradient norms.
fnorms_vmap

In [None]:
# Display the gradient norms computed one sample at a time, for comparison.
fnorms

In [None]:
# Double-check that the norms obtained with functorch `grad` + `vmap` match the ones
# obtained by hand-processing each sample individually.
# `norms` (manual loop) and `fnorms_vmap` (vmap) are the quantities this notebook actually
# computes; the names the loop previously iterated over (`per_sample_grads`,
# `ft_per_sample_grads`) are never defined here, so the cell raised NameError.
for per_sample_grad, ft_per_sample_grad in zip(
    # `norms` is a float64 NumPy array; cast it so `allclose` compares matching dtypes.
    torch.as_tensor(norms, dtype=fnorms_vmap.dtype),
    # Detach and move to CPU so the comparison does not depend on where the model lives.
    fnorms_vmap.detach().cpu(),
):
    assert torch.allclose(per_sample_grad, ft_per_sample_grad, atol=3e-3, rtol=1e-5)

In [None]:
# Display the value left in the check loop's `ft_per_sample_grad` variable (last iteration).
ft_per_sample_grad

In [None]:
import time
from pathlib import Path

import pandas as pd
import srsly
from datasets import load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm, trange
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from src.data.active_datamodule import ActiveClassificationDataModule
from src.data.datamodule import ClassificationDataModule
from src.enums import SpecialKeys
from src.estimator import Estimator
from src.huggingface import (
    EstimatorForSequenceClassification,
    UncertaintyBasedStrategyForSequenceClassification,
)

In [None]:
# Load the prepared AG News dataset from disk together with the `metadata.yaml` stored
# alongside it.
data_path = Path("../data/prepared/ag_news")
dataset_dict = load_from_disk(data_path)
metadata = srsly.read_yaml(data_path / "metadata.yaml")

In [None]:
# Tokenizer for the checkpoint named in the dataset metadata (rebinds `tokenizer`).
tokenizer = AutoTokenizer.from_pretrained(metadata["name_or_path"])

In [None]:
# Datamodule built from the prepared dataset (rebinds `datamodule`).
datamodule = ClassificationDataModule.from_dataset_dict(
    dataset_dict, tokenizer=tokenizer
)

In [None]:
# Classifier for the checkpoint named in the metadata, with label mappings taken from the
# datamodule (rebinds `model`).
model = AutoModelForSequenceClassification.from_pretrained(
    metadata["name_or_path"],
    num_labels=len(datamodule.labels),
    id2label=datamodule.id2label,
    label2id=datamodule.label2id,
)

In [None]:
# Wrap the model in the plain sequence-classification estimator (rebinds `estimator`).
estimator = EstimatorForSequenceClassification(model)

In [None]:
# Quick fit: one epoch, with training and validation each capped at 10 batches.
out = estimator.fit(
    train_loader=datamodule.train_loader(),
    validation_loader=datamodule.validation_loader(),
    limit_train_batches=10,
    limit_validation_batches=10,
    max_epochs=1,
)

In [None]:
# Uncertainty-based strategy around the same `model` object, scored with
# "margin_confidence" (rebinds `active_estimator`).
active_estimator = UncertaintyBasedStrategyForSequenceClassification(
    model, score_fn="margin_confidence"
)

In [None]:
# Same quick fit through the strategy object; `max_epochs` is left at its default here.
# Overwrites the `out` produced by the plain estimator above.
out = active_estimator.fit(
    train_loader=datamodule.train_loader(),
    validation_loader=datamodule.validation_loader(),
    limit_train_batches=10,
    limit_validation_batches=10,
)

In [None]:
# Active-learning datamodule built from the same prepared dataset and tokenizer.
active_datamodule = ActiveClassificationDataModule.from_dataset_dict(
    dataset_dict,
    tokenizer=tokenizer,
)

In [None]:
# Short active-learning run: 3 rounds of 50 queries, with validation_perc=0.3. Per-stage
# limits are passed via the *_kwargs dicts: fitting uses 3 epochs capped at 3 train and
# 3 validation batches; the test and pool passes are capped at 3 batches each.
active_out = active_estimator.active_fit(
    active_datamodule=active_datamodule,
    max_rounds=3,
    query_size=50,
    validation_perc=0.3,
    fit_kwargs={
        "max_epochs": 3,
        "limit_train_batches": 3,
        "limit_validation_batches": 3,
    },
    test_kwargs={"limit_batches": 3},
    pool_kwargs={"limit_batches": 3},
)

In [None]:
# Persist the labelled dataset under `results`; the next cell reads it back from
# `results/labelled_dataset.parquet`.
active_datamodule.save_labelled_dataset("results")

In [None]:
# Read the saved labelled dataset back in (rebinds `df`).
df = pd.read_parquet("results/labelled_dataset.parquet")

In [None]:
# Display the reloaded labelled dataset.
df