<a href="https://colab.research.google.com/github/maxmatical/ml-cheatsheet/blob/master/Pytorch_Lightning_BERT_Huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://curiousily.com/posts/multi-label-text-classification-with-bert-and-pytorch-lightning/

In [3]:
%%capture
!pip install transformers
!pip install pytorch_lightning

In [4]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import OneCycleLR, ReduceLROnPlateau
from torch.optim import AdamW




from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
  

import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:

??LearningRateMonitor

In [134]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
model = AutoModel.from_pretrained("distilroberta-base")

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Data

In [4]:
!gdown --id 1VuQ-U7TtggShMeuRSA_hzC8qGDl2LRkr

Downloading...
From: https://drive.google.com/uc?id=1VuQ-U7TtggShMeuRSA_hzC8qGDl2LRkr
To: /content/toxic_comments.csv
100% 68.8M/68.8M [00:00<00:00, 166MB/s]


In [5]:
df = pd.read_csv("toxic_comments.csv")

df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
train_df, val_df = train_test_split(df, test_size=0.15)

In [8]:
# subsample clean comments
LABEL_COLUMNS = df.columns.tolist()[2:]

train_toxic = train_df[train_df[LABEL_COLUMNS].sum(axis=1) > 0]
train_clean = train_df[train_df[LABEL_COLUMNS].sum(axis=1) == 0]

train_df = pd.concat([
  train_toxic,
  train_clean.sample(15_000)
])

train_df.shape, val_df.shape

((28825, 8), (23936, 8))

In [12]:
class ToxicCommentsDataset(Dataset):

  def __init__(
    self,
    data: pd.DataFrame,
    tokenizer: AutoTokenizer,
    max_token_len: int = 128

  ):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]
    comment_text = data_row.comment_text
    labels = data_row[LABEL_COLUMNS]

    encoding = self.tokenizer.encode_plus(
      comment_text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',

    )

    return dict(
      comment_text=comment_text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),
      labels=torch.FloatTensor(labels)
    )

In [16]:
# test
train_dataset = ToxicCommentsDataset(
  train_df,
  tokenizer,
  max_token_len=512
)

sample_item = train_dataset[0]
sample_item.keys()

dict_keys(['comment_text', 'input_ids', 'attention_mask', 'labels'])

In [18]:
print(sample_item["comment_text"], sample_item["labels"])
print(sample_item["input_ids"].shape)

Self-appointed, self-aggrandising and self-important guardian of Wikipedia. My rampant self-righteousness really knows no bounds and I truly am a complete wanker. tensor([1., 0., 0., 0., 0., 0.])
torch.Size([512])


In [169]:
class ToxicCommentsDataModule(pl.LightningDataModule):
  def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=512):
    super().__init__()
    self.train_df, self.test_df = train_df, test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    self.train_dataset = ToxicCommentsDataset(
        self.train_df,
        self.tokenizer,
        self.max_token_len
    )

    self.test_dataset = ToxicCommentsDataset(
        self.test_df,
        self.tokenizer,
        self.max_token_len
    )

  def train_dataloader(self):
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = True,
        num_workers=1
    )

  def val_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = self.batch_size,
        shuffle = False,
        num_workers=1
    )

  def test_dataloader(self):
    return DataLoader(
        self.test_dataset,
        batch_size = self.batch_size,
        shuffle = False,
        num_workers=1
    )

In [177]:
data_module = ToxicCommentsDataModule(
    train_df,
    val_df,
    tokenizer,
    batch_size = 12,
    max_token_len = 512
)


In [179]:
data_module.setup() # call this before getting len of dataloader

In [180]:
len(data_module.train_dataloader())

2403

# Model

In [128]:
sample_batch = next(iter(DataLoader(train_dataset, batch_size=8, num_workers=1)))
sample_batch["input_ids"].shape, sample_batch["attention_mask"].shape

(torch.Size([8, 512]), torch.Size([8, 512]))

In [135]:
output = model(sample_batch["input_ids"], sample_batch["attention_mask"])

In [138]:
output.pooler_output.shape

torch.Size([8, 768])

In [164]:
class ToxicCommentClassifier(pl.LightningModule):
  def __init__(self, n_classes: int):
    super().__init__()
    self.model = model
    self.classifier = nn.Linear(self.model.config.hidden_size, n_classes)
    self.criterion = nn.BCEWithLogitsLoss() # bce for multi-label
    # self.criterion = nn.BCELoss()

    # manually define opt step
    self.automatic_optimization = False

  def forward(self, input_ids, attention_mask, labels=None):
    out = self.model(input_ids, attention_mask=attention_mask)
    out = self.classifier(out.pooler_output)
    # out = torch.sigmoid(out)

    loss = 0
    if labels is not None:
      loss = self.criterion(out, labels)
    return loss, out

  def training_step(self, batch, batch_idx):
    # batch comes from dataset
    # maybe do this https://pytorch-lightning.readthedocs.io/en/latest/common/optimizers.html#learning-rate-scheduling-manual
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    loss, out = self(input_ids, attention_mask, labels)
    
    self.log("train_loss", loss, on_epoch=True, sync_dist=True, prog_bar=True, logger=True)
    return {"loss": loss, "predictions": out, "labels": labels}

    # manual optimization
    opt = self.optimizers()
    opt.zero_grad()
    self.manual_backwards(loss)
    opt.step()

    # lr schedule
    one_cylce_sch, reduce_lr_on_plateau_sch = self.lr_schedulers()
    one_cylce_sch.step()

  def validation_step(self, batch, batch_idx):
    # batch comes from dataset
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    loss, out = self(input_ids, attention_mask, labels)
    
    self.log("val_loss", loss, on_epoch=True, sync_dist=True, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):
    one_cylce_sch, reduce_lr_on_plateau_sch = self.lr_schedulers()
    sch.step(self.trainer.callback_metrics["val_loss"])

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=2e-5)
    scheduler1 = OneCycleLR(
      optimizer,
      max_lr=2e-5,
      pct_start=0.3,
      steps_per_epoch=2403,
      epochs=10

    )

    scheduler2 = ReduceLROnPlateau(
        optimizer,
        patience=3
    )
    return [optimizer], [scheduler1, scheduler2]

In [165]:
toxic_comment_model = ToxicCommentClassifier(len(LABEL_COLUMNS))

In [166]:
# callbacks
lr_monitor_cb = LearningRateMonitor()

checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=6)

logger = TensorBoardLogger("lightning_logs", name="toxic-comments")

In [167]:
trainer = pl.Trainer(
    logger=logger,
    callbacks = [checkpoint_callback, early_stopping_callback],
    stochastic_weight_avg=False,
    max_epochs = 10,
    gpus=1,
    progress_bar_refresh_rate=30
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [168]:
trainer.fit(toxic_comment_model, data_module)

  f"DataModule.{name} has already been called, so it will not be called again. "
  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | model      | RobertaModel      | 82.1 M
1 | classifier | Linear            | 4.6 K 
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
82.1 M    Trainable params
0         Non-trainable params
82.1 M    Total params
328.492   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
