In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
import seaborn as sns
import srsly
from datasets import load_from_disk
from scipy.special import entr, softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from src.data import ClassificationActiveDataModule, ClassificationDataModule
from src.estimators import (
    EstimatorForSequenceClassification,
    RandomStrategyForSequenceClassification,
    # UncertaintyBasedStrategyForSequenceClassification,
    # SEALSRandomStrategyForSequenceClassification,
)
from src.energizer.utilities import local_seed
from lightning.fabric.loggers import TensorBoardLogger
from copy import deepcopy

# Do not truncate long cell contents when displaying DataFrames.
pd.set_option("display.max_colwidth", None)

In [3]:
# Directory holding the prepared dataset (read with `load_from_disk`) and its
# `metadata.yaml`.
data_path = Path("../data/prepared/agnews_bert_tiny/")

In [4]:
# Read the dataset metadata; its "name_or_path" entry names the checkpoint
# used for the tokenizer here and for the model in later cells.
metadata = srsly.read_yaml(data_path / "metadata.yaml")
tokenizer = AutoTokenizer.from_pretrained(metadata["name_or_path"])

---
### Train

In [None]:
# Carve a validation split (30%) out of the prepared training set.
# The split is seeded so the validation set, and every metric computed on it,
# is identical after a kernel restart.
dataset_dict = load_from_disk(data_path)
train_val = dataset_dict["train"].train_test_split(test_size=0.3, seed=42)
dataset_dict["train"] = train_val["train"]
dataset_dict["validation"] = train_val["test"]

# Wrap the splits in the project datamodule that serves the data loaders.
datamodule = ClassificationDataModule.from_dataset_dict(
    dataset_dict, tokenizer=tokenizer, batch_size=64, eval_batch_size=256
)

In [None]:
# Instantiate the classifier from the checkpoint named in the metadata.
# Done inside `local_seed(42)`, matching the model construction in the
# "Active train" section, so both sections build the model under the same seed.
with local_seed(42):
    model = AutoModelForSequenceClassification.from_pretrained(
        metadata["name_or_path"],
        id2label=datamodule.id2label,
        label2id=datamodule.label2id,
        num_labels=len(datamodule.labels),
    )

In [None]:
# Wrap the model in the project estimator.
# NOTE(review): the accelerator is hard-coded to "cuda", so this presumably
# requires a GPU — confirm whether a fallback is wanted.
estimator = EstimatorForSequenceClassification(model, accelerator="cuda")

In [None]:
# Fit on the train loader while validating on the validation loader.
# NOTE(review): the `limit_*` arguments presumably cap the number of batches
# used, making this a quick smoke test — confirm against `fit`'s signature.
out = estimator.fit(
    train_loader=datamodule.train_loader(),
    validation_loader=datamodule.validation_loader(),
    limit_train_batches=10,
    limit_validation_batches=30,
    validation_interval=1,
)

In [None]:
# Evaluate on the test loader and keep the returned output for inspection.
# NOTE(review): `limit_batches=11` presumably restricts evaluation to the
# first 11 batches — confirm.
test_out = estimator.test(
    datamodule.test_loader(),
    limit_batches=11,
    # progress_bar=False,
)

---
### Active train

In [5]:
# Reload the prepared dataset from disk and rebuild the datamodule.
# This rebinds `dataset_dict`, so the "validation" split added in the Train
# section is gone: the displayed result lists only "train" and "test".
dataset_dict = load_from_disk(data_path)
datamodule = ClassificationDataModule.from_dataset_dict(
    dataset_dict, tokenizer=tokenizer, batch_size=64, eval_batch_size=256
)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['unique_id', 'labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['unique_id', 'labels', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7600
    })
})

In [6]:
# Build the classifier inside the `local_seed(42)` context manager.
# NOTE(review): presumably this makes the freshly initialised classification
# head reproducible — confirm what `local_seed` actually seeds.
with local_seed(42):
    model = AutoModelForSequenceClassification.from_pretrained(
        metadata["name_or_path"],
        id2label=datamodule.id2label,
        label2id=datamodule.label2id,
        num_labels=len(datamodule.labels),
    )

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [7]:
# Run the same seeded random-strategy active-learning loop three times and
# collect the labelled dataset from each run; the next cell compares them.
list_dfs = []

for _ in range(3):
    # A new datamodule per run, always built with the same seed.
    active_datamodule = ClassificationActiveDataModule.from_dataset_dict(
        dataset_dict, tokenizer, seed=42, batch_size=64, eval_batch_size=256
    )


    # `deepcopy` gives every run its own model instance; the estimator is
    # seeded identically each time and logs to the "logs" TensorBoard directory.
    active_estimator = RandomStrategyForSequenceClassification(
        model=deepcopy(model), 
        seed=42, 
        loggers=[TensorBoardLogger("logs")],
        deterministic=True,
    )

    # Two rounds querying 25 instances each; the commented arguments are
    # optional limits that are currently disabled.
    active_estimator.active_fit(
        active_datamodule,
        max_rounds=2,
        # max_budget=200,
        # limit_train_batches=20,
        # limit_validation_batches=22,
        # limit_test_batches=21,
        # limit_pool_batches=30,
        # validation_perc=0.4,
        # validation_interval=1,
        # validation_sampling="stratified",
        query_size=25,
    )

    list_dfs.append(active_datamodule.get_labelled_dataset())
    # NOTE(review): disabled construction of a second estimator with its own
    # logger. `active_estimator2` is still referenced by the
    # `replay_active_fit` cell further down, which fails while this stays
    # commented out.
    # from copy import deepcopy
    # tb_logger2 = TensorBoardLogger("logs2")
    # active_estimator2 = RandomStrategyForSequenceClassification(
    #     model=deepcopy(model), seed=42, loggers=[tb_logger2],
    # )

Completed rounds:   0%|          | 0/2 [00:00<?, ?it/s]

Completed epochs: 0it [00:00, ?it/s]

Epoch 0: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Test: 0it [00:00, ?it/s]

Completed rounds:   0%|          | 0/2 [00:00<?, ?it/s]

Completed epochs: 0it [00:00, ?it/s]

Epoch 0: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Test: 0it [00:00, ?it/s]

Completed rounds:   0%|          | 0/2 [00:00<?, ?it/s]

Completed epochs: 0it [00:00, ?it/s]

Epoch 0: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Test: 0it [00:00, ?it/s]

In [None]:
# Pairwise equality of the labelled datasets collected from the three runs,
# in the order (1 vs 2, 1 vs 0, 0 vs 2). All True means the runs agree.
tuple(
    list_dfs[left].equals(list_dfs[right])
    for left, right in ((1, 2), (1, 0), (0, 2))
)

In [None]:
# Continue with the estimator and datamodule left over from the last loop
# iteration above, this time with the batch limits and validation options set.
# NOTE(review): `validation_perc=0.4` with `validation_sampling="stratified"`
# presumably holds out 40% of the labelled data for validation — confirm.
active_estimator.active_fit(
    active_datamodule,
    max_rounds=2,
    # max_budget=200,
    limit_train_batches=20,
    limit_validation_batches=22,
    limit_test_batches=21,
    limit_pool_batches=30,
    validation_perc=0.4,
    validation_interval=1,
    validation_sampling="stratified",
    query_size=25,
)

In [None]:
# Display the estimator's budget tracker after the active-learning run.
active_estimator.progress_tracker.budget_tracker

In [None]:
# Display the estimator's round tracker after the active-learning run.
active_estimator.progress_tracker.round_tracker

In [None]:
# Display the statistics reported by the active datamodule.
active_datamodule.data_statistics()

In [None]:
# Build the second estimator used for the replay. It was only defined in
# commented-out code, so this cell raised NameError on a fresh kernel. It gets
# its own copy of the seeded model and a separate TensorBoard directory so its
# logs do not mix with the first estimator's.
active_estimator2 = RandomStrategyForSequenceClassification(
    model=deepcopy(model),
    seed=42,
    loggers=[TensorBoardLogger("logs2")],
)

# NOTE(review): `replay_active_fit` presumably re-runs training over the
# labelling rounds already recorded in `active_datamodule` — confirm.
active_estimator2.replay_active_fit(
    active_datamodule,
    limit_train_batches=20,
    limit_validation_batches=22,
    limit_test_batches=21,
    limit_pool_batches=30,
)

In [None]:
# Display the first estimator's budget tracker again, after the replay cell.
active_estimator.progress_tracker.budget_tracker

In [None]:
# Display the query size currently stored on the datamodule.
active_datamodule.query_size

In [None]:
# Display the dataset behind the train loader built with argument 0.
# NOTE(review): the argument is presumably a labelling-round index — confirm.
active_datamodule.train_loader(0).dataset

In [None]:
# Take the datamodule's internal frame for ad-hoc inspection.
# `_df` is underscore-prefixed, i.e. not part of the public interface.
df = active_datamodule._df

In [None]:
# Shape of the subset that is flagged as labelled yet carries a negative
# labelling round.
df.loc[df["labelling_round"].lt(0) & df["is_labelled"].eq(True)].shape

In [None]:
# Whether training data exists for argument -1, and how many training
# instances that argument yields.
(
    active_datamodule.has_train_data(-1),
    active_datamodule.train_size(-1),
)

In [None]:
# Number of rows per labelling-round value.
df["labelling_round"].value_counts()

In [None]:
# Switch to an entropy-scored uncertainty strategy.
# NOTE(review): the import of `UncertaintyBasedStrategyForSequenceClassification`
# is commented out in the imports cell, so this raises NameError until it is
# re-enabled.
# NOTE(review): `model` is passed directly here rather than via `deepcopy` as
# in the random-strategy loop, so the shared model object is used — confirm
# this is intended.
active_estimator = UncertaintyBasedStrategyForSequenceClassification(
    model=model, score_fn="entropy"
)

In [None]:
# Run the active-learning loop with the uncertainty-based estimator, reusing
# the existing `active_datamodule`.
# NOTE(review): the disabled `val_perc` keyword differs from the
# `validation_perc` name used by the earlier `active_fit` call — use the
# latter if it is re-enabled.
active_estimator.active_fit(
    active_datamodule,
    max_rounds=2,
    max_budget=36500,
    limit_train_batches=20,
    limit_validation_batches=22,
    limit_test_batches=21,
    limit_pool_batches=30,
    # val_perc=0.4,
    query_size=25,
)

In [None]:
# Display the datamodule's statistics after the uncertainty-based run.
# `data_statistics` is invoked as a method earlier in this notebook; without
# the call parentheses the cell shows only the bound method object.
active_datamodule.data_statistics()

In [None]:
# Build a readable frame from the validation split: drop the tokenizer
# feature columns and replace label ids with label names.
# NOTE(review): the "Active train" section reloads `dataset_dict` with only
# "train" and "test", so `dataset_dict["validation"]` raises KeyError unless
# the split from the Train section is recreated first.
df = (
    dataset_dict["validation"]
    .to_pandas()
    .drop(columns=["input_ids", "attention_mask", "token_type_ids"])
    .assign(labels=lambda df_: df_["labels"].map(datamodule.id2label))
)

In [None]:
# Append one softmax-probability column per class, named after the labels in
# `datamodule.id2label`, to the right of the validation frame.
# NOTE(review): `validation_out` is not assigned in any cell visible in this
# notebook — confirm where it is produced.
# NOTE(review): `pd.concat(axis=1)` aligns rows on the index; this assumes the
# logits are in the same row order as `df` — confirm.
df = pd.concat(
    [
        df,
        pd.DataFrame(
            data=softmax(validation_out["logits"], axis=-1),
            columns=datamodule.id2label.values(),
        ),
    ],
    axis=1,
)

In [None]:
# Select the class-probability columns by name and read them once.
# Slicing positionally with `iloc[:, -4:]` after adding "entropy" would shift
# the window: the argmax would then run over three probabilities plus the
# entropy column and yield wrong predictions.
prob_columns = list(datamodule.id2label.values())
probs = df[prob_columns].to_numpy()

# Predictive entropy of each row's class distribution.
df["entropy"] = entr(probs).sum(-1)

# Position of the most probable class, translated to its label name.
df["pred"] = pd.Series(probs.argmax(-1), index=df.index).map(datamodule.id2label)

In [None]:
# Display the "metrics" entry of the validation output.
# NOTE(review): `validation_out` is not assigned in any cell visible here.
validation_out["metrics"]

In [None]:
# Display the "unique_id" entry of the validation output.
validation_out["unique_id"]

In [None]:
# Start a new frame from the raw logits: one integer-named column per class,
# plus the argmax prediction and the instance id of each row.
# `pred` is computed inside `assign` from the logit columns only, before
# `unique_id` is added.
df = pd.DataFrame(
    data=validation_out["logits"],
    columns=list(range(validation_out["logits"].shape[1])),
).assign(
    pred=lambda df_: df_.values.argmax(-1),
    unique_id=validation_out["unique_id"],
)

In [None]:
# Attach the validation examples (labels, text, tokenizer features) to the
# predictions by matching on `unique_id`; the inner join keeps only ids
# present on both sides.
df = pd.merge(
    df, dataset_dict["validation"].to_pandas(), on="unique_id", how="inner"
)

In [None]:
# Drop the tokenizer feature columns brought in by the merge. Dropping by
# name, as the earlier validation-frame cell does, keeps this cell safe to
# re-run: a positional `iloc[:, :-3]` removes three more columns each time.
df = df.drop(
    columns=["input_ids", "token_type_ids", "attention_mask"],
    errors="ignore",
)

In [None]:
# Replace class ids with label names for both predictions and gold labels.
# NOTE(review): re-running this cell maps the already-converted names again;
# values missing from `id2label` presumably become NaN — run it once only.
df = df.assign(
    pred=lambda df_: df_["pred"].map(datamodule.id2label),
    labels=lambda df_: df_["labels"].map(datamodule.id2label),
)

In [None]:
# Fraction of rows whose prediction equals the gold label (accuracy).
(df["pred"] == df["labels"]).mean()

In [None]:
# Predictive entropy per example from the softmax of the logit columns.
# The logits are the leading columns of `df`, one per class, so the slice
# width follows the label set rather than a hard-coded 4.
num_classes = len(datamodule.labels)
logits = df.iloc[:, :num_classes].to_numpy()
df["entropy"] = entr(softmax(logits, axis=-1)).sum(axis=-1)

In [None]:
# Show the examples ordered from highest to lowest predictive entropy.
df.sort_values("entropy", ascending=False)