In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local `src` package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import time
from pathlib import Path

import pandas as pd
import srsly
from datasets import Dataset, load_dataset, load_from_disk
from torch.utils.data import DataLoader
from tqdm.auto import tqdm, trange
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
)

from src.data.active_datamodule import ActiveClassificationDataModule
from src.data.datamodule import ClassificationDataModule
from src.enums import SpecialColumns
from src.estimator import Estimator
from src.transformers import EstimatorForSequenceClassification, ActiveEstimatorForSequenceClassification

In [3]:
# Directory of the prepared AG News dataset; it also holds a `metadata.yaml`.
data_path = Path("../data/prepared/ag_news")
dataset_dict = load_from_disk(data_path)
# `metadata["name_or_path"]` is used below as the checkpoint for both the
# tokenizer and the model.
metadata = srsly.read_yaml(data_path / "metadata.yaml")



In [4]:
# Tokenizer for the checkpoint recorded in the dataset metadata.
tokenizer = AutoTokenizer.from_pretrained(metadata["name_or_path"])

In [5]:
# Wrap the loaded splits and the tokenizer in the project's datamodule, which
# supplies the loaders and the label mappings used in the cells below.
datamodule = ClassificationDataModule.from_dataset_dict(dataset_dict, tokenizer=tokenizer)

In [6]:
# Sequence-classification model loaded from the same checkpoint as the tokenizer.
# The number of labels and the id<->label mappings are taken from the datamodule.
model = AutoModelForSequenceClassification.from_pretrained(
    metadata["name_or_path"], 
    num_labels=len(datamodule.labels), 
    id2label=datamodule.id2label, 
    label2id=datamodule.label2id,
)

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

In [7]:
# Wrap the model in the project's sequence-classification estimator.
estimator = EstimatorForSequenceClassification(model)

In [8]:
# Smoke run: a single epoch, capped at 10 training and 10 validation batches.
out = estimator.fit(
    train_loader=datamodule.train_loader(),
    validation_loader=datamodule.validation_loader(),
    limit_train_batches=10,
    limit_validation_batches=10,
    num_epochs=1
)

Completed epochs:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 0:   0%|          | 0/3275 [00:00<?, ?it/s]

Validation:   0%|          | 0/475 [00:00<?, ?it/s]

In [11]:
# Inspect the record of the first epoch (the only one, since num_epochs=1 above).
out.output[0]

FitEpochOutput(epoch=0, train=EpochOutput(metrics={'accuracy': 0.46875, 'f1_macro': 0.4418}, output= ..10 batches.. ), validation=EpochOutput(metrics={'accuracy': 0.75, 'f1_macro': 0.723064}, output= ..10 batches.. ))

In [None]:
# NOTE(review): this wraps the same `model` object that was already passed to
# `estimator.fit` above, so it presumably starts from updated weights — confirm
# that this is intended.
active_estimator = ActiveEstimatorForSequenceClassification(model)

In [None]:
# Plain (non-active) fit through the active estimator, capped at 10 batches per
# loader. The loaders come from `train_loader()` / `validation_loader()`, the
# datamodule accessors that the executed `estimator.fit` cell uses on this same
# `datamodule` object, so both fit cells agree on one API.
out = active_estimator.fit(
    train_loader=datamodule.train_loader(),
    validation_loader=datamodule.validation_loader(),
    limit_train_batches=10,
    limit_validation_batches=10,
)

In [None]:
# Active-learning datamodule built from the same splits and tokenizer as `datamodule`.
active_datamodule = ActiveClassificationDataModule.from_dataset_dict(
    dataset_dict, tokenizer=tokenizer,
)

In [None]:
# Active-learning run: 3 rounds, querying 50 instances per round.
# `fit_kwargs` asks for 3 epochs and `test_kwargs` caps testing at 3 batches;
# presumably these are forwarded to the per-round fit/test calls — confirm.
# NOTE(review): `val_perc=0.3` presumably holds out 30% of the labelled data
# for validation — confirm against `active_fit`.
active_estimator.active_fit(
    active_datamodule=active_datamodule,
    num_rounds=3,
    query_size=50,
    val_perc=0.3,
    fit_kwargs={"num_epochs": 3},
    test_kwargs={"limit_batches": 3},
)

In [None]:
from torchmetrics.classification import Accuracy, ConfusionMatrix, F1Score, AUROC, PrecisionRecallCurve
from torchmetrics import MetricCollection

In [None]:
class HuggingFaceEstimator(Estimator):
    """Estimator whose steps run a Hugging Face model directly on the batch.

    Every step unpacks the batch as keyword arguments to the model. Only the
    training step evaluates the metrics, reading `batch["labels"]` as targets.
    """

    def training_step(self, model, batch, batch_idx, metrics):
        """Forward the batch; return its loss, logits and batch-level metrics."""
        outputs = model(**batch)
        batch_metrics = metrics(outputs.logits, batch["labels"])
        result = {}
        result["loss"] = outputs.loss
        result["logits"] = outputs.logits
        result["metrics"] = batch_metrics
        return result

    def validation_step(self, model, batch, batch_idx, metrics):
        """Return the raw model output for a validation batch (metrics unused)."""
        outputs = model(**batch)
        return outputs

    def test_step(self, model, batch, batch_idx, metrics):
        """Return the raw model output for a test batch (metrics unused)."""
        outputs = model(**batch)
        return outputs

    def configure_metrics(self, stage=None):
        """Build the metric collection; the same one is returned for every `stage`.

        The number of classes is fixed at 4.
        """
        n_classes = 4
        collection = {}
        collection["accuracy"] = Accuracy("multiclass", num_classes=n_classes)
        collection["confmat"] = ConfusionMatrix("multiclass", num_classes=n_classes)
        collection["f1_macro"] = F1Score("multiclass", num_classes=n_classes, average="macro")
        return MetricCollection(collection)

In [None]:
from lightning.fabric.loggers import CSVLogger

In [None]:
# CSV logger rooted at "logs", configured to flush after every logged step.
# NOTE(review): `logger` is not passed to any estimator in the visible cells.
logger = CSVLogger(root_dir="logs", flush_logs_every_n_steps=1)

In [None]:
# Rebinds `estimator` to the custom subclass defined above; it wraps the same
# `model` object as the earlier estimators.
estimator = HuggingFaceEstimator(model=model)

In [None]:
# Short fit with SGD and a constant schedule, capped at 5 batches per loader.
# NOTE(review): the second positional loader is the *test* loader; given the
# `limit_validation_batches` argument it is presumably consumed as the
# validation loader — confirm this is intended.
# NOTE(review): the executed fit cell near the top calls
# `datamodule.train_loader()` / `validation_loader()`; confirm that
# `train_dataloader()` / `test_dataloader()` exist on the datamodule.
outputs = estimator.fit(
    datamodule.train_dataloader(),
    datamodule.test_dataloader(),
    limit_train_batches=5,
    limit_validation_batches=5,
    optimizer="sgd",
    scheduler="constant_schedule",
    # dry_run=True,
)

In [None]:
# Rebuild the active-learning datamodule from the original splits. This rebinds
# `active_datamodule`, so whatever state the previous instance held is dropped.
active_datamodule = ActiveClassificationDataModule.from_dataset_dict(dataset_dict, tokenizer=tokenizer)

In [None]:
from src.enums import SpecialColumns

In [None]:
# NOTE(review): `ActiveEstimator` is not imported anywhere in the visible
# notebook; it must be brought into scope before this cell can run.
class ActiveHuggingFaceEstimator(ActiveEstimator):
    """Active-learning estimator that runs a Hugging Face model on each batch.

    Train/validation/test steps unpack the batch as keyword arguments to the
    model; the pool step additionally scores every pool instance.
    """

    def training_step(self, model, batch, batch_idx, metrics):
        """Forward the batch; return its loss, logits and batch-level metrics."""
        outputs = model(**batch)
        batch_metrics = metrics(outputs.logits, batch["labels"])
        result = {}
        result["loss"] = outputs.loss
        result["logits"] = outputs.logits
        result["metrics"] = batch_metrics
        return result

    def validation_step(self, model, batch, batch_idx, metrics):
        """Return the raw model output for a validation batch (metrics unused)."""
        outputs = model(**batch)
        return outputs

    def test_step(self, model, batch, batch_idx, metrics):
        """Return the raw model output for a test batch (metrics unused)."""
        outputs = model(**batch)
        return outputs

    def configure_metrics(self, stage=None):
        """Build the metric collection; the same one is returned for every `stage`.

        The number of classes is fixed at 4.
        """
        n_classes = 4
        collection = {}
        collection["accuracy"] = Accuracy("multiclass", num_classes=n_classes)
        collection["confmat"] = ConfusionMatrix("multiclass", num_classes=n_classes)
        collection["f1_macro"] = F1Score("multiclass", num_classes=n_classes, average="macro")
        return MetricCollection(collection)

    def pool_step(self, model, batch, batch_idx, metrics):
        """Score a pool batch.

        The "on_cpu" entry is removed from `batch` in place before the forward
        pass, so only the remaining entries reach the model. The score of each
        instance is the mean of its logits over the last dimension; the
        instance ids are read from the removed "on_cpu" entry.
        """
        cpu_payload = batch.pop("on_cpu")
        outputs = model(**batch)
        pool_logits = outputs.logits
        instance_ids = cpu_payload[SpecialColumns.ID]
        scores = pool_logits.mean(-1)
        return {"scores": scores, SpecialColumns.ID: instance_ids}

In [None]:
# Rebinds `active_estimator` to the custom subclass defined above, wrapping the
# same `model` object.
active_estimator = ActiveHuggingFaceEstimator(model=model)

In [None]:
# Two active-learning rounds querying 2 instances each, with no validation
# split (`val_perc=None`); `fit_kwargs` asks for a single training epoch.
# NOTE(review): `dry_run=True` presumably short-circuits the validate / test /
# pool loops — confirm against the estimator implementation.
output = active_estimator.active_fit(
    active_datamodule,
    num_rounds=2,
    query_size=2,
    val_perc=None,
    fit_kwargs={"num_epochs": 1},
    validate_kwargs={"dry_run": True},
    test_kwargs={"dry_run": True},
    pool_kwargs={"dry_run": True},
)

In [None]:
# Display the value returned by `active_fit`.
output

In [None]:
# Manually label one instance in round 0.
# NOTE(review): `[0]` is presumably a list of pool indices or instance ids —
# confirm which one `label` expects. The call changes the datamodule's state,
# so re-running this cell is not idempotent.
active_datamodule.label([0], round_id=0)

In [None]:
# Rows that are labelled and not reserved for validation, read straight from
# the datamodule's private `_df` frame.
train_df = active_datamodule._df.loc[
    active_datamodule._df[SpecialColumns.IS_LABELLED].eq(True)
    & active_datamodule._df[SpecialColumns.IS_VALIDATION].eq(False)
]

In [None]:
# Convert the filtered pandas frame into a `Dataset`; `preserve_index=False`
# asks `from_pandas` not to keep the pandas index.
ds = Dataset.from_pandas(train_df, preserve_index=False)

In [None]:
# Slice the whole dataset and list the keys of the result (the available columns).
ds[:].keys()

In [None]:
# Sampler the datamodule provides for the "validation" stage.
sampler = active_datamodule.get_sampler("validation")

In [None]:
# Collate function the datamodule uses to assemble batches.
collate_fn = active_datamodule.get_collate_fn()

In [None]:
# Override the sampler's batch size in place.
# NOTE(review): this mutates the object returned by `get_sampler`; whether the
# datamodule shares that object is not visible here — confirm.
sampler.batch_size = 2000

In [None]:
# Ad-hoc loader over `ds` using the datamodule's sampler and collate function.
# NOTE(review): the sampler was requested for the "validation" stage, while
# `ds` holds the labelled non-validation rows — confirm the sampler's indices
# are valid for this dataset.
# NOTE(review): since the sampler exposes `batch_size`, it may be a batch
# sampler that belongs in `batch_sampler=` rather than `sampler=` — confirm.
dl = DataLoader(ds, sampler=sampler, collate_fn=collate_fn)

In [None]:
# Pull the first batch from the ad-hoc loader.
batch = next(iter(dl))

In [None]:
# Display the collated batch.
batch

In [None]:
# Shape of `input_ids` in the first batch of the pool loader.
next(iter(active_datamodule.pool_dataloader()))["input_ids"].shape

In [None]:
from dataclasses import dataclass, asdict


@dataclass
class Ciao(dict):
    """Dataclass whose fields are also exposed through the `dict` interface.

    Subclassing `dict` alone leaves the mapping empty: the generated `__init__`
    only sets attributes, so `{**obj}`, `dict(obj)` and `len(obj)` would see no
    items. Every assignment to a declared field is therefore mirrored into the
    mapping, which keeps attribute access and mapping access in agreement.

    Note: the mirroring is one-way. Writing `obj["a"] = ...` does not update
    the attribute `obj.a`.
    """

    a: str = "a"
    b: str = "b"

    def __setattr__(self, name, value):
        """Set the attribute and, for declared fields, store it as a dict item too."""
        super().__setattr__(name, value)
        # Only dataclass fields are mirrored; other attributes stay out of the mapping.
        if name in self.__dataclass_fields__:
            self[name] = value

    def __call__(self):
        """Return the fields as a plain `dict`."""
        return asdict(self)

In [None]:
# Instance with the default field values.
c = Ciao()

In [None]:
# Display the instance's repr.
c

In [None]:
# Unpack `c` through the mapping protocol into a plain dict.
{**c}

In [None]:
# Hyper-parameters for the hand-built data pipeline and fit call below.
BATCH_SIZE = 32  # batch size of the training loader
EVAL_BATCH_SIZE = 512  # batch size of the validation and test loaders
LEARNING_RATE = 0.0001  # passed to `estimator.fit` as `learning_rate`

In [None]:

# Pads each batch to its longest sequence and returns PyTorch tensors;
# the collator also renames "label" to "labels"
collator = DataCollatorWithPadding(
    tokenizer=tokenizer, padding=True, return_tensors="pt"
)

# load dataset
dataset = load_dataset("pietrolesci/ag_news", "concat")

# tokenize
dataset = dataset.map(lambda ex: tokenizer(ex["text"]), batched=True)
columns_to_keep = ["label", "input_ids", "token_type_ids", "attention_mask"]

# train-val split and record datasets
# 30% of the original training data is held out for validation and training
# uses only the remaining 70%. The two sets must be disjoint, otherwise the
# validation metrics are computed on examples the model was trained on.
# The fixed seed makes the split reproducible across kernel restarts.
test_set = dataset["test"]
_split = dataset["train"].train_test_split(0.3, seed=42)
train_set, val_set = _split["train"], _split["test"]

labels = train_set.features["label"].names
num_classes = len(labels)

# create dataloaders
batch_size = BATCH_SIZE
eval_batch_size = EVAL_BATCH_SIZE  # this is used when evaluating on the pool too
train_dl = DataLoader(
    train_set.with_format(columns=columns_to_keep),
    batch_size=batch_size,
    collate_fn=collator,
    num_workers=2,
)
val_dl = DataLoader(
    val_set.with_format(columns=columns_to_keep),
    batch_size=eval_batch_size,
    collate_fn=collator,
    num_workers=2,
)
test_dl = DataLoader(
    test_set.with_format(columns=columns_to_keep),
    batch_size=eval_batch_size,
    collate_fn=collator,
    num_workers=2,
)

In [None]:
# Fresh model for the hand-built loaders above. It is loaded from the same
# checkpoint as `tokenizer` (which tokenized this data) and its head is sized
# from the dataset's label names instead of a hard-coded 4.
# NOTE: this rebinds `model`, replacing the instance used by the earlier estimators.
model = AutoModelForSequenceClassification.from_pretrained(metadata["name_or_path"], num_labels=num_classes)

In [None]:
class HuggingFaceEstimator(Estimator):
    """Minimal estimator: every step is a plain forward pass over the batch.

    NOTE: this rebinds the name `HuggingFaceEstimator` defined earlier in the
    notebook. Unlike that version, the step methods take no `metrics` argument.
    """

    def training_step(self, model, batch, batch_idx):
        """Return the model output for a training batch."""
        outputs = model(**batch)
        return outputs

    def validation_step(self, model, batch, batch_idx):
        """Return the model output for a validation batch."""
        outputs = model(**batch)
        return outputs

    def test_step(self, model, batch, batch_idx):
        """Return the model output for a test batch."""
        outputs = model(**batch)
        return outputs

In [None]:
# Rebinds `estimator` to an instance of the estimator class defined just above.
estimator = HuggingFaceEstimator(model=model)

In [None]:
# Short AdamW fit on the hand-built loaders, capped at 3 batches per loader.
estimator.fit(
    train_loader=train_dl,
    validation_loader=val_dl,
    learning_rate=LEARNING_RATE,
    optimizer="adamw",
    limit_train_batches=3,
    limit_validation_batches=3,
)

In [None]:
# NOTE(review): `LightningModule` is not imported anywhere in the visible
# notebook; it must be brought into scope before this cell can run.
class TransformerModel(LightningModule):
    """Thin module wrapper around a Hugging Face model."""

    def __init__(self, model) -> None:
        """Store the wrapped model on the module."""
        super().__init__()
        self.model = model

    def training_step(self, batch_idx, batch):
        """Run the wrapped model on `batch` and return its output.

        The call goes through `self.model`, so the step uses the model this
        instance was built with rather than whatever the notebook-level name
        `model` is bound to when the step runs.

        NOTE(review): the parameter order `(batch_idx, batch)` is the reverse
        of Lightning's usual `(batch, batch_idx)`. It is kept unchanged so that
        existing callers are unaffected — confirm how the trainer calls this.
        """
        return self.model(**batch)

In [None]:
# NOTE(review): `Hparams` is neither defined nor imported in the visible
# notebook, so this cell fails on a fresh kernel.
hparams = Hparams()

In [None]:
# NOTE(review): `Trainer` is neither defined nor imported in the visible
# notebook. The `train_batch_loop` / `train_epoch_loop` calls below suggest a
# project-specific trainer — confirm where it comes from.
trainer = Trainer()

In [None]:
# Pull the first batch from the training loader (rebinds `batch`).
batch = next(iter(train_dl))

In [None]:
# Index passed along with `batch` to the single-batch loop below.
batch_idx = 0

In [None]:
# Wrap the current `model` in the module class defined above.
lm = TransformerModel(model)

In [None]:
# Run the trainer's batch-level loop on the single batch fetched above, without a scheduler.
# NOTE(review): `optimizer` is not defined anywhere in the visible notebook.
trainer.train_batch_loop(
    batch_idx=batch_idx,
    batch=batch,
    model=lm,
    optimizer=optimizer,
    scheduler=None,
)

In [None]:
# Run the trainer's epoch-level loop over the training loader for epoch 0, without a scheduler.
# NOTE(review): `optimizer` is not defined anywhere in the visible notebook.
trainer.train_epoch_loop(
    train_loader=train_dl,
    model=lm,
    optimizer=optimizer,
    scheduler=None,
    hparams=hparams,
    epoch=0
)