In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
import seaborn as sns
import srsly
from datasets import load_from_disk
from scipy.special import entr, softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from src.data import ClassificationActiveDataModule, ClassificationDataModule
from src.estimators import (
    EstimatorForSequenceClassification,
    UncertaintyBasedStrategyForSequenceClassification,
)

# Disable column-width truncation so long string values are displayed in full.
pd.set_option("display.max_colwidth", None)

In [3]:
# Directory holding the prepared dataset and its metadata.yaml,
# relative to the notebook's working directory.
data_path = Path("..") / "data" / "prepared" / "agnews_bert_tiny"

In [4]:
# Read the preparation metadata first: it names the checkpoint to tokenize with
# and carries the `train_val_seed` used below.
metadata = srsly.read_yaml(data_path / "metadata.yaml")

dataset_dict = load_from_disk(data_path)

# Hold out 30% of the training split as validation. The split is seeded so that
# a fresh "Restart & Run All" reproduces the same train/validation partition
# (an unseeded split changes on every run).
# NOTE(review): `train_val_seed` is presumably intended for exactly this split — confirm.
train_val = dataset_dict["train"].train_test_split(
    test_size=0.3, seed=metadata["train_val_seed"]
)
dataset_dict["train"] = train_val["train"]
dataset_dict["validation"] = train_val["test"]

tokenizer = AutoTokenizer.from_pretrained(metadata["name_or_path"])
datamodule = ClassificationDataModule.from_dataset_dict(
    dataset_dict, tokenizer=tokenizer, batch_size=64, eval_batch_size=256
)

In [5]:
# Build a sequence-classification model from the checkpoint named in the
# metadata; the number of labels and both label mappings come from the datamodule.
model = AutoModelForSequenceClassification.from_pretrained(
    metadata["name_or_path"],
    num_labels=len(datamodule.labels),
    label2id=datamodule.label2id,
    id2label=datamodule.id2label,
)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [6]:
# Wrap the model in the project's estimator.
# NOTE(review): the accelerator is hard-coded to "cuda"; presumably this fails on
# a machine without a GPU — confirm and consider making it configurable.
estimator = EstimatorForSequenceClassification(model, accelerator="cuda")

In [7]:
# Short smoke-test fit on the train/validation loaders. The `limit_*` arguments
# presumably cap how many batches are consumed per loop — confirm against the
# estimator's documentation.
out = estimator.fit(
    train_loader=datamodule.train_loader(),
    validation_loader=datamodule.validation_loader(),
    validation_frequency=1,
    limit_train_batches=10,
    limit_validation_batches=30,
)

Completed epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 0: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [8]:
# Run the estimator's test loop on the test loader with limit_batches=11.
test_out = estimator.test(datamodule.test_loader(), limit_batches=11)

Test: 0it [00:00, ?it/s]

In [9]:
# Run the estimator's validation loop; the result is indexed later in this
# notebook by "logits", "metrics" and "unique_id".
# `limit_batches` presumably caps the number of batches evaluated — confirm.
validation_out = estimator.validate(
    datamodule.validation_loader(),
    limit_batches=10,
)

Validation: 0it [00:00, ?it/s]

In [10]:
# Build the active-learning datamodule from the same dataset dict and tokenizer
# that back `datamodule`.
active_datamodule = ClassificationActiveDataModule.from_dataset_dict(
    dataset_dict, tokenizer
)

In [11]:
# Display the metadata loaded from metadata.yaml (index path, embedding size, checkpoint name, ...).
metadata

{'train_val_seed': 1994,
 'embedding_model': 'all-mpnet-base-v2',
 'embedding_dim': 768,
 'num_elements': 120000,
 'numpy_embeddings_path': '/home/pl487/allset/data/processed/agnews/index.npy',
 'hnsw_index_path': '/home/pl487/allset/data/processed/agnews/index.bin',
 'name_or_path': 'google/bert_uncased_L-2_H-128_A-2',
 'name_or_path_alias': 'bert_tiny'}

In [12]:
# Load the HNSW index from the path recorded in the metadata, passing the
# embedding dimensionality stored alongside it.
active_datamodule.load_index(
    metadata["hnsw_index_path"],
    embedding_dim=metadata["embedding_dim"],
)
# Last expression: display the loaded index object.
active_datamodule.index

<hnswlib.Index(space='cosine', dim=768)>

In [13]:
# Uncertainty-based active-learning strategy using "entropy" as the score function.
# NOTE(review): this reuses the same `model` object that was already passed to
# `estimator.fit(...)` above, so presumably it does not start from the pretrained
# weights — confirm whether a freshly loaded model is intended here.
active_estimator = UncertaintyBasedStrategyForSequenceClassification(
    model=model, score_fn="entropy"
)

In [31]:
# Run a short active-learning loop on the active datamodule. The `limit_*`
# arguments presumably cap the batches used in each phase — confirm.
active_estimator.active_fit(
    active_datamodule,
    max_rounds=2,
    max_budget=36500,
    query_size=25,
    limit_train_batches=20,
    limit_validation_batches=22,
    limit_test_batches=21,
    limit_pool_batches=30,
)

Completed rounds:   0%|          | 0/2 [00:00<?, ?it/s]

Completed epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 174:   0%|          | 0/8 [00:00<?, ?it/s]

Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Test: 0it [00:00, ?it/s]

Pool: 0it [00:00, ?it/s]

{'hparams/test_accuracy_auc': 0.74404764,
 'hparams/test_f1_macro_auc': 0.7446096,
 'hparams/test_f1_micro_auc': 0.74404764,
 'hparams/test_precision_macro_auc': 0.7539522,
 'hparams/test_precision_micro_auc': 0.74404764,
 'hparams/test_recall_macro_auc': 0.74936473,
 'hparams/test_recall_micro_auc': 0.74404764}

In [27]:
# Inspect the budget tracker kept by the strategy's progress tracker.
active_estimator.progress_tracker.budget_tracker

BudgetTracker(min=None, max=36500, total=36125, current=36125, progress_bar=None, query_size=25)

In [23]:
# Inspect the datamodule's split-size statistics.
active_datamodule.data_statistics

{'train_size': 75,
 'validation_size': 36000,
 'test_size': 7600,
 'pool_size': 83925,
 'total_labelled_size': 36075}

In [None]:
# Validation split as a DataFrame: drop the tokenizer columns and replace the
# integer label ids with their names.
df = (
    dataset_dict["validation"]
    .to_pandas()
    .drop(["input_ids", "attention_mask", "token_type_ids"], axis="columns")
    .assign(labels=lambda frame: frame["labels"].map(datamodule.id2label))
)

In [None]:
# Append one probability column per label (softmax over the validation logits).
# NOTE(review): rows are matched by index position, which presumably assumes the
# logits are in dataset order and cover every row — confirm, since validation was
# run with `limit_batches`.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            softmax(validation_out["logits"], axis=-1),
            columns=list(datamodule.id2label.values()),
        ),
    ],
    axis="columns",
)

In [None]:
# Select the class-probability columns by name (they are stored under the
# id2label names) instead of slicing "the last four columns". Once `entropy` has
# been appended, a trailing positional slice contains `entropy` and misses the
# first class, so the argmax would run over the wrong columns. Selecting by
# name also keeps this cell safe to re-run.
label_names = list(datamodule.id2label.values())
probs = df[label_names].to_numpy()

# Shannon entropy of each row's predicted distribution.
df["entropy"] = entr(probs).sum(-1)

# argmax gives the column position of the most likely class.
# Assumes the id2label keys are 0..n-1 in the same order as the columns — TODO confirm.
df["pred"] = probs.argmax(-1)
df["pred"] = df["pred"].map(datamodule.id2label)

In [None]:
# Display the "metrics" entry of the validation output.
validation_out["metrics"]

In [None]:
# Display the "unique_id" entry of the validation output (used below as the merge key).
validation_out["unique_id"]

In [None]:
# One row per evaluated example: integer-named logit columns, the argmax class
# index as `pred`, and the example's `unique_id`.
df = pd.DataFrame(
    validation_out["logits"],
    columns=list(range(validation_out["logits"].shape[1])),
).assign(
    pred=lambda logits_frame: logits_frame.to_numpy().argmax(axis=1),
    unique_id=validation_out["unique_id"],
)

In [None]:
# Attach the validation-set columns to the predictions, matching rows on
# `unique_id` (inner join: only ids present on both sides are kept).
df = df.merge(
    dataset_dict["validation"].to_pandas(),
    how="inner",
    on="unique_id",
)

In [None]:
# Drop the tokenizer columns by name instead of slicing off "the last three
# columns". A positional slice depends on column order and silently removes
# three more columns every time the cell is re-run.
# NOTE(review): assumes the three trailing columns were the tokenizer outputs
# (the same names are dropped from the validation frame earlier in this notebook) — confirm.
df = df.drop(columns=["input_ids", "attention_mask", "token_type_ids"])

In [None]:
# Replace integer class ids with label names in both the prediction and the
# ground-truth column.
df = df.assign(
    **{
        column: df[column].map(datamodule.id2label)
        for column in ("pred", "labels")
    }
)

In [None]:
# Fraction of rows whose predicted label equals the true label.
df["pred"].eq(df["labels"]).mean()

In [None]:
# Per-row Shannon entropy of the softmax over the first four columns
# (the logit columns; the `:4` slice assumes four classes).
df["entropy"] = entr(softmax(df.iloc[:, :4], axis=1)).sum(axis=1)

In [None]:
# Show the examples ordered from highest to lowest entropy.
df.sort_values(by="entropy", ascending=False)