In [43]:
from typing import Optional, Dict, Any, Callable

## PyTorch recap

In [1]:
import torch

In [2]:
class MyModel(torch.nn.Module):
    """Minimal PyTorch module: one linear layer mapping 10 features to 2 outputs."""

    def __init__(self) -> None:
        super().__init__()
        # The attribute name is part of the state_dict keys ("linear.weight", "linear.bias").
        self.linear = torch.nn.Linear(in_features=10, out_features=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer to ``x`` and return the result."""
        projected = self.linear(x)
        return projected

## Data loading

In [8]:
from datasets import load_dataset

In [7]:
# Load the "concat" configuration of the AG News dataset from the Hugging Face hub
# (the log below shows it being served from the local cache on this run).
dataset = load_dataset("pietrolesci/ag_news", "concat")
# Bare last expression: display the DatasetDict summary (splits, features, row counts).
dataset

Reusing dataset ag_news (/home/pl487/.cache/huggingface/datasets/pietrolesci___ag_news/concat/1.0.0/5ee6e111adc7a901ca734b79fbebff09d9dba91722387a794efff8d9c178a6a3)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

## Data Preprocessing

In [9]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

In [10]:
# Load the pretrained tokenizer that matches the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [86]:
def _tokenize_batch(examples):
    """Tokenize the "text" field of a batch of examples, truncating over-long inputs."""
    return tokenizer(examples["text"], truncation=True)


# batched=True hands the helper whole batches of examples instead of single rows.
dataset = dataset.map(_tokenize_batch, batched=True)

  0%|          | 0/120 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [87]:
# Pull the two predefined splits out of the tokenized DatasetDict.
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [88]:
# Hold out 30% of the original training split for validation.
# Splitting from dataset["train"] (rather than the reassigned `train_dataset`)
# keeps this cell idempotent: re-running it no longer shrinks the training set.
# The fixed seed makes the partition reproducible across kernel restarts.
_split = dataset["train"].train_test_split(test_size=0.3, seed=42)
train_dataset, val_dataset = _split["train"], _split["test"]

# Expose only the columns the collator and model consume; without this the
# collator would also receive the raw "text" strings.
columns_to_keep = ['label', 'input_ids', 'token_type_ids', 'attention_mask']
train_dataset = train_dataset.with_format(columns=columns_to_keep)
val_dataset = val_dataset.with_format(columns=columns_to_keep)
# The test split goes through the same collator, so it needs the same restriction.
test_dataset = dataset["test"].with_format(columns=columns_to_keep)

In [99]:
# Imported in this cell because the collator is first used here; without it a
# fresh-kernel "Run All" fails with a NameError at the next statement.
from transformers import DataCollatorWithPadding

# Pads every batch to the length of its longest sequence and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt",
)

# One dataloader per split, all sharing the same padding collator.
train_dataloader = DataLoader(train_dataset, batch_size=1, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=1, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=data_collator)

In [94]:

# Peek at one padded batch of 32 examples. Uses the notebook's own
# `train_dataset` and `data_collator` instead of names that are never defined.
next(iter(DataLoader(train_dataset, batch_size=32, collate_fn=data_collator)))

{'input_ids': tensor([[  101, 13329,  5576,  ...,     0,     0,     0],
        [  101,  3009,  2873,  ...,     0,     0,     0],
        [  101,  6184,  5926,  ...,     0,     0,     0],
        ...,
        [  101, 23745,  1010,  ...,     0,     0,     0],
        [  101,  2088,  2924,  ...,     0,     0,     0],
        [  101,  5087,  2142,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 1, 1, 0, 0, 0, 0, 2, 2, 1, 2, 0, 3, 1, 0, 2, 1, 1, 0, 0, 1, 1, 0,
        2, 3, 3, 0, 1, 0, 0, 1])}

4

## All in one place: DataModule

In [54]:
from pytorch_lightning import LightningDataModule
from transformers import DataCollatorWithPadding

In [106]:
class AGNewsDataModule(LightningDataModule):
    """Data module that loads, tokenizes and splits the AG News dataset.

    Args:
        batch_size: Number of examples per batch for every dataloader.
        val_perc: Fraction of the training split held out for validation.
            A falsy value (``0`` or ``None``) disables the validation split.
        split_seed: Seed for the train/validation split, so that repeated
            ``setup`` calls produce the same partition. ``None`` leaves the
            split unseeded.
    """

    def __init__(self, batch_size: int = 32, val_perc: float = 0.3, split_seed: Optional[int] = 42) -> None:
        super().__init__()
        self.batch_size = batch_size
        self.val_perc = val_perc
        self.split_seed = split_seed
        # Defined up front so ``val_dataloader`` can detect a missing validation split.
        self.val_dataset = None

    def setup(self, stage: Optional[str] = None) -> None:
        """Load the tokenizer and dataset, tokenize, and record the split datasets.

        Args:
            stage: Accepted for the Lightning hook signature; not used here.
        """
        # load tokenizer
        # Kept in a local as well so the mapping function below uses this
        # tokenizer, not a notebook-level global, and does not close over ``self``.
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.tokenizer = tokenizer

        # load dataset
        dataset = load_dataset("pietrolesci/ag_news", "concat")

        # tokenize (truncating over-long texts, as the standalone cell above does)
        dataset = dataset.map(lambda ex: tokenizer(ex["text"], truncation=True), batched=True)
        columns_to_keep = ['label', 'input_ids', 'token_type_ids', 'attention_mask']

        # train-val split and record datasets
        train_dataset, test_dataset = dataset["train"], dataset["test"]
        self.test_dataset = test_dataset.with_format(columns=columns_to_keep)

        # Reset so a repeated ``setup`` with the split disabled leaves no stale data.
        self.val_dataset = None
        if self.val_perc:
            # Honour the configured fraction instead of a hard-coded 0.3.
            _split = train_dataset.train_test_split(test_size=self.val_perc, seed=self.split_seed)
            train_dataset, val_dataset = _split["train"], _split["test"]
            self.val_dataset = val_dataset.with_format(columns=columns_to_keep)

        self.train_dataset = train_dataset.with_format(columns=columns_to_keep)

    def train_dataloader(self) -> DataLoader:
        """Return the dataloader over the training dataset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn)

    def val_dataloader(self) -> Optional[DataLoader]:
        """Return the validation dataloader, or ``None`` when no validation split exists."""
        if self.val_dataset is None:
            return None
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn)

    def test_dataloader(self) -> DataLoader:
        """Return the dataloader over the test dataset."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn)

    @property
    def collate_fn(self) -> Callable:
        """Padding collator built from the tokenizer loaded in ``setup``."""
        return DataCollatorWithPadding(
            tokenizer=self.tokenizer,
            padding=True,
            return_tensors="pt",
        )

    @property
    def num_classes(self) -> Optional[int]:
        """Number of classes of the "label" feature, or ``None`` before ``setup`` has run."""
        if hasattr(self, "train_dataset"):
            return self.train_dataset.features["label"].num_classes
        return None
    

In [107]:
# Build the data module with its default batch size and validation fraction.
datamodule = AGNewsDataModule()
# Call the data hooks by hand so the datasets exist before any Trainer is involved;
# setup() is what loads, tokenizes and splits the data.
datamodule.prepare_data()
datamodule.setup()

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/pl487/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/reso

  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/pl487/.cache/huggingface/datasets/pietrolesci___ag_news/concat/1.0.0/5ee6e111adc7a901ca734b79fbebff09d9dba91722387a794efff8d9c178a6a3/cache-781789b6431d3668.arrow
Loading cached processed dataset at /home/pl487/.cache/huggingface/datasets/pietrolesci___ag_news/concat/1.0.0/5ee6e111adc7a901ca734b79fbebff09d9dba91722387a794efff8d9c178a6a3/cache-98d09ef7ac3d157e.arrow


In [108]:
# Fetch a single batch from the data module's training dataloader and display it.
_train_batches = iter(datamodule.train_dataloader())
next(_train_batches)

{'input_ids': tensor([[  101,  3725,  2000,  ...,     0,     0,     0],
        [  101,  7842,  2226,  ...,     0,     0,     0],
        [  101,  5747,  1010,  ...,     0,     0,     0],
        ...,
        [  101,  8840,  7941,  ...,     0,     0,     0],
        [  101,  1044,  1054,  ..., 10528,  1012,   102],
        [  101,  4021,  2000,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 0, 2, 1, 1, 1, 3, 1, 3, 3, 0, 0, 0, 2, 1, 0, 2, 2, 3, 3, 0, 3, 1,
        0, 0, 3, 1, 3, 3, 2, 2])}

## Model

In [109]:
from pytorch_lightning import LightningModule
from transformers import AutoModelForSequenceClassification, AdamW, get_constant_schedule_with_warmup

In [115]:
class TransformerModel(LightningModule):
    """Sequence classifier wrapping a pretrained Hugging Face transformer.

    Args:
        model_name: Checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_classes: Number of labels for the classification head.
        learning_rate: Learning rate handed to the optimizer.
        num_warmup_steps: Warmup steps for the constant-with-warmup schedule.
    """

    def __init__(self, model_name: str, num_classes: int, learning_rate: float = 0.00001, num_warmup_steps: int = 2_000) -> None:
        super().__init__()
        self.model_name = model_name
        self.num_classes = num_classes
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.num_classes)
        self.learning_rate = learning_rate
        self.num_warmup_steps = num_warmup_steps

    def common_step(self, batch: Any, stage: str) -> torch.Tensor:
        """Run a forward pass, log the loss as ``{stage}_loss`` and return it.

        The loss is read from the model output, so ``batch`` is expected to
        carry the labels alongside the inputs.
        """
        loss = self(batch).loss
        self.log(f"{stage}_loss", loss)
        return loss

    def forward(self, batch) -> Any:
        """Pass ``batch`` to the wrapped model as keyword arguments and return its output."""
        return self.model(**batch)

    def training_step(self, batch: Any, batch_idx: int = 0, optimizer_idx: int = 0) -> torch.Tensor:
        """Return the training loss for ``batch`` (logged as ``train_loss``)."""
        return self.common_step(batch, "train")

    def validation_step(self, batch: Any, batch_idx: int = 0) -> torch.Tensor:
        """Return the validation loss for ``batch`` (logged as ``val_loss``)."""
        return self.common_step(batch, "val")

    def test_step(self, batch: Any, batch_idx: int = 0) -> torch.Tensor:
        """Return the test loss for ``batch`` (logged as ``test_loss``)."""
        return self.common_step(batch, "test")

    def configure_optimizers(self) -> Dict[str, Any]:
        """Build AdamW over the trainable parameters plus a per-step warmup schedule."""
        trainable_params = [p for p in self.parameters() if p.requires_grad]
        # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
        # weight_decay are pinned to the defaults of the transformers implementation.
        optimizer = torch.optim.AdamW(trainable_params, lr=self.learning_rate, eps=1e-6, weight_decay=0.0)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=self.num_warmup_steps),
                # Must name a metric that is actually logged; common_step logs "train_loss".
                "monitor": "train_loss",
                "frequency": 1,
                "interval": "step",
            }
        }


In [116]:
# Size the classification head from the label count reported by the data module.
model = TransformerModel(
    model_name="bert-base-uncased",
    num_classes=datamodule.num_classes,
)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/pl487/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_e

## Train!

In [117]:
from pytorch_lightning import Trainer

In [118]:
# fast_dev_run=True is a smoke-test mode: per the log below, each loop runs on a single batch.
trainer = Trainer(fast_dev_run=True)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Running in fast_dev_run mode: will run a full train, val, test and prediction loop using 1 batch(es).


In [119]:
# Fit the model, with the data module supplying the training and validation dataloaders.
trainer.fit(model, datamodule)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/pl487/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/reso

  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/pl487/.cache/huggingface/datasets/pietrolesci___ag_news/concat/1.0.0/5ee6e111adc7a901ca734b79fbebff09d9dba91722387a794efff8d9c178a6a3/cache-781789b6431d3668.arrow
Loading cached processed dataset at /home/pl487/.cache/huggingface/datasets/pietrolesci___ag_news/concat/1.0.0/5ee6e111adc7a901ca734b79fbebff09d9dba91722387a794efff8d9c178a6a3/cache-98d09ef7ac3d157e.arrow

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 109 M 
--------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.941   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]