In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from energizer.datastores.classification import PandasDataStoreForSequenceClassification
from energizer.estimator import Estimator
from energizer.utilities import move_to_cpu
from energizer.enums import InputKeys, OutputKeys, RunningStage
from energizer import seed_everything
from energizer.callbacks import GradNorm
from energizer.active_learning.datastores.classification import ActivePandasDataStoreForSequenceClassification
from typing import List, Dict
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torchmetrics import MetricCollection
from torchmetrics.classification import Accuracy, F1Score, Precision, Recall
from datasets import load_dataset
from energizer.active_learning.strategies.random import RandomStrategy
from energizer.active_learning.strategies.uncertainty import UncertaintyBasedStrategy

In [3]:
dataset_dict = load_dataset("pietrolesci/agnews")
dataset_dict["train"] = dataset_dict["train"].select(range(1000))

model_name = "google/bert_uncased_L-2_H-128_A-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset_dict = dataset_dict.map(lambda ex: tokenizer(ex["text"]), batched=True)

In [4]:
ds = PandasDataStoreForSequenceClassification.from_dataset_dict(
    dataset_dict=dataset_dict,
    input_names=["input_ids", "attention_mask"],
    target_name="labels",
    tokenizer=tokenizer,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [5]:
class EstimatorForSequenceClassification(Estimator):

    def step(
        self,
        stage: RunningStage,
        model,
        batch: Dict,
        batch_idx: int,
        loss_fn,
        metrics: MetricCollection,
    ) -> torch.Tensor:
        
        _ = batch.pop(InputKeys.ON_CPU, None)

        out = model(**batch)
        if stage == RunningStage.POOL:
            return self.score_fn(out.logits)

        out_metrics = metrics(out.logits, batch[InputKeys.TARGET])

        if stage == RunningStage.TRAIN:
            logs = {OutputKeys.LOSS: out.loss, **out_metrics}
            self.log_dict({f"{stage}/{k}": v for k, v in logs.items()}, step=self.tracker.global_batch)

        return out.loss
    
    def epoch_end(self, stage: RunningStage, output: List[np.ndarray], metrics: MetricCollection) -> float:
        aggregated_metrics = move_to_cpu(metrics.compute())  # NOTE: metrics are still on device
        aggregated_loss = round(np.mean(output).item(), 6)
        
        logs = {OutputKeys.LOSS: aggregated_loss, **aggregated_metrics}
        self.log_dict({f"{stage}_end/{k}": v for k, v in logs.items()}, step=self.tracker.safe_global_epoch)

        return aggregated_loss

    def configure_metrics(self, *_) -> MetricCollection:
        num_classes = self.model.num_labels
        task = "multiclass"
        # NOTE: you are in charge of moving it to the correct device
        return MetricCollection(
            {
                "accuracy": Accuracy(task, num_classes=num_classes),
                "f1_macro": F1Score(task, num_classes=num_classes, average="macro"),
                "precision_macro": Precision(task, num_classes=num_classes, average="macro"),
                "recall_macro": Recall(task, num_classes=num_classes, average="macro"),
                "f1_micro": F1Score(task, num_classes=num_classes, average="micro"),
                "precision_micro": Precision(task, num_classes=num_classes, average="micro"),
                "recall_micro": Recall(task, num_classes=num_classes, average="micro"),
            }
        ).to(self.device)
    


In [6]:
seed_everything(42)
model = AutoModelForSequenceClassification.from_pretrained(
    ds.tokenizer.name_or_path,
    id2label=ds.id2label,
    label2id=ds.label2id,
    num_labels=len(ds.labels),
)


Global seed set to 42
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
ds.prepare_for_loading()

estimator = EstimatorForSequenceClassification(
    model, 
    accelerator="gpu",
    tf32_mode="high",
    # loggers=[TensorBoardLogger("./")],
    # callbacks=[GradNorm(2), PytorchTensorboardProfiler("./profiler_logs")],
)

estimator.fit(
    train_loader=ds.train_loader(),
    validation_loader=ds.test_loader(),
    validation_freq="1:step",
    limit_train_batches=5,
    limit_validation_batches=1,
    max_epochs=2,
    learning_rate=0.001,
    optimizer="adamw",
    gradient_accumulation_steps=2,
    scheduler="cosine_schedule_with_warmup",
    scheduler_kwargs={"num_warmup_steps": .1},
)

estimator.test(loader=ds.test_loader())

Optimisation steps:   0%|          | 0/5 [00:00<?, ?it/s]

Completed epochs:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 0:   0%|          | 0/5 [00:00<?, ?it/s]

Validation:   0%|          | 0/1 [00:00<?, ?it/s]

Test:   0%|          | 0/238 [00:00<?, ?it/s]

1.370891

In [8]:
class RandomStrategyForSequenceClassification(EstimatorForSequenceClassification, RandomStrategy):
    ...

random = RandomStrategyForSequenceClassification(
    model=model, accelerator="gpu", tf32_mode="high",
)

ads = ActivePandasDataStoreForSequenceClassification.from_dataset_dict(
    dataset_dict=dataset_dict,
    input_names=["input_ids", "attention_mask"],
    target_name="labels",
    tokenizer=tokenizer,
)

ads.prepare_for_loading()

random.active_fit(
    datastore=ads,
    validation_freq="1:step",
    limit_train_batches=5,
    limit_validation_batches=1,
    max_epochs=2,
    max_rounds=5,
    learning_rate=0.001,
    optimizer="adamw",
    gradient_accumulation_steps=2,
    scheduler="cosine_schedule_with_warmup",
    scheduler_kwargs={"num_warmup_steps": .1},
    query_size=15,
    limit_test_batches=2
)


Completed rounds:   0%|          | 0/6 [00:00<?, ?it/s]

Labelled:   0%|          | 0/75 [00:00<?, ?it/s]

Optimisation steps: 0it [00:00, ?it/s]

Completed epochs: 0it [00:00, ?it/s]

Epoch 0: 0it [00:00, ?it/s]

Test: 0it [00:00, ?it/s]

[{<RunningStage.TEST: 'test'>: 1.253059},
 {<RunningStage.TRAIN: 'train'>: [(1.218048, []), (1.256889, [])],
  <RunningStage.TEST: 'test'>: 1.253059},
 {<RunningStage.TRAIN: 'train'>: [(1.312426, []), (1.313902, [])],
  <RunningStage.TEST: 'test'>: 1.253059},
 {<RunningStage.TRAIN: 'train'>: [(1.267999, []), (1.27012, [])],
  <RunningStage.TEST: 'test'>: 1.115026},
 {<RunningStage.TRAIN: 'train'>: [(1.292171, []), (1.29874, [])],
  <RunningStage.TEST: 'test'>: 1.17038},
 {<RunningStage.TRAIN: 'train'>: [(1.246048, []), (1.197541, [])],
  <RunningStage.TEST: 'test'>: 1.086082}]

In [9]:

class UncertaintyStrategy(EstimatorForSequenceClassification, UncertaintyBasedStrategy):
    def pool_step(self, model, batch, batch_idx: int, metrics):
        return super().step(RunningStage.POOL, model, batch, batch_idx, None, metrics)


least_conf = UncertaintyStrategy(
    model=model, accelerator="gpu", tf32_mode="high", score_fn="least_confidence",
)
ads = ActivePandasDataStoreForSequenceClassification.from_dataset_dict(
    dataset_dict=dataset_dict,
    input_names=["input_ids", "attention_mask"],
    target_name="labels",
    tokenizer=tokenizer,
)

ads.prepare_for_loading()
least_conf.active_fit(
    datastore=ads,
    validation_freq="1:step",
    limit_train_batches=5,
    limit_validation_batches=1,
    max_epochs=2,
    max_rounds=5,
    learning_rate=0.001,
    optimizer="adamw",
    gradient_accumulation_steps=2,
    scheduler="cosine_schedule_with_warmup",
    scheduler_kwargs={"num_warmup_steps": .1},
    query_size=15,
    limit_test_batches=2
)


Completed rounds:   0%|          | 0/6 [00:00<?, ?it/s]

Labelled:   0%|          | 0/75 [00:00<?, ?it/s]

Optimisation steps: 0it [00:00, ?it/s]

Completed epochs: 0it [00:00, ?it/s]

Epoch 0: 0it [00:00, ?it/s]

Test: 0it [00:00, ?it/s]

Pool: 0it [00:00, ?it/s]

TypeError: pool_step() takes 5 positional arguments but 6 were given

In [None]:
least_conf.tracker.global_budget, least_conf.tracker.budget_tracker

In [None]:
least_conf.tracker.budget_tracker

In [None]:
least_conf.tracker.step_tracker.total