# Multi-label Text Classification with BERT and PyTorch Lightning, finetuned on GoEmotions [Script B]

*This notebook contains the script used to build our first multi-label classification model, which recognizes emotions from reviews collected in Script A. In it, we build a BERTBase model using the PyTorch Lightning library. Note that this notebook's code was written following a tutorial on multi-label text classification for detection of toxic tweets published by Venelin Valkov. However, the content of the code was written to serve our own model goals.*

---
*References: https://curiousily.com/posts/multi-label-text-classification-with-bert-and-pytorch-lightning/*


## Package Installation, Imports & Setup

In [None]:
!nvidia-smi

Thu Sep  2 10:57:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch

# Select the compute device: the GPU when CUDA is available, the CPU otherwise
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if not gpu_available:
    print('GPUs are not available. CPU will be used')
else:
    # Report how many CUDA devices are visible and the name of device 0
    print('There are %d free GPUs.' % torch.cuda.device_count())
    print('GPU to use:', torch.cuda.get_device_name(0))

There are 1 free GPUs.
GPU to use: Tesla T4


In [None]:
!pip install pytorch-lightning==1.2.8 --quiet
!pip install transformers==4.7.0 --quiet

In [None]:
! pip install transformers datasets --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re

import io
from google.colab import files

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


## Data Import & Cleaning

In [None]:
#Load the "raw" configuration of the GoEmotions dataset from the HuggingFace
#dataset library and copy its 'train' split into the raw_df dataframe
from datasets import load_dataset
raw_dataset = load_dataset("go_emotions", "raw")
raw_df = pd.DataFrame(raw_dataset['train'])

In [None]:
#Keep only the text column and the emotion label columns
EMOTION_LABELS = [
  'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
  'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
  'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
  'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
  'remorse', 'sadness', 'surprise', 'neutral',
]
keep_cols = ['text', *EMOTION_LABELS]
raw_df = raw_df.loc[:, keep_cols]

In [None]:
#Clean all text

#Replace every character that is not a letter, a digit or a space with a whitespace
raw_df['text'] = raw_df['text'].replace('[^a-zA-Z0-9 ]', ' ', regex=True)
#Ensure that words are separated by single whitespace: the replacement above
#can leave runs of spaces, so collapse them before trimming both ends
raw_df['text'] = raw_df['text'].str.replace(r' +', ' ', regex=True).str.strip()

In [None]:
#Split the dataset into train (70%), validation (15%) and test (15%) sets

train_size = 0.7
val_test_split = 0.5

#Draw the training rows; everything that was not drawn is held out
train_df = raw_df.sample(frac=train_size, random_state=200)
held_out_mask = ~raw_df.index.isin(train_df.index)
test_val_df = raw_df[held_out_mask].reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

#Divide the held-out rows between the test set and the validation set
test_df = test_val_df.sample(frac=val_test_split, random_state=200)
validation_mask = ~test_val_df.index.isin(test_df.index)
val_df = test_val_df[validation_mask].reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print("Dataset Shape: {}".format(raw_df.shape))
print("Training Set Shape: {}".format(train_df.shape))
print("Validation Set Shape: {}".format(val_df.shape))
print("Test Set Shape: {}".format(test_df.shape))

In [None]:
#Preview train dataset distribution
#raw_df holds 'text' followed by the 28 emotion columns, so the labels start at
#position 1 (slicing from position 2 would silently drop 'admiration')
EMOTION_LABELS = raw_df.columns.tolist()[1:]
#LABEL_COLUMNS is the name read by the dataset, model and evaluation code below
LABEL_COLUMNS = EMOTION_LABELS
train_df[EMOTION_LABELS].sum().sort_values().plot(kind="barh");

In [None]:
#Undersample neutral text. Randomly select (at most) 5000 neutral instances and
#drop the remaining neutral rows from the training dataframe.
#The neutral rows are removed with a boolean mask: de-duplicating a concat with
#keep=False would also delete every non-neutral row that appears more than once.
is_neutral = train_df.neutral == 1
neutral_df = train_df[is_neutral]
#Seeded like the train/val/test split so that the notebook is reproducible
neutral_keep_df = neutral_df.sample(n=min(5000, len(neutral_df)), random_state=200)
train_df = pd.concat([train_df[~is_neutral], neutral_keep_df])

## Tokenization

In [None]:
#Use Cased version of BERT Base pretrained model.
#BERT_MODEL_FORMAT is also the checkpoint name TweetTagger loads its encoder from.
BERT_MODEL_FORMAT = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_FORMAT)

In [None]:
#Set the max token count to 512 to allow our model to process sentences with up
#to 512 tokens. TweetDataset truncates longer inputs and pads shorter ones to
#this length.
MAX_TOKEN_COUNT = 512

In [None]:
#Wrap the tokenization process in a Dataset and convert labels to tensors
#This code was taken from the reference mentioned above with only few
#tweaks made to it

class TweetDataset(Dataset):
  """Dataset that tokenizes one dataframe row at a time.

  Each row must expose a `text` attribute and the LABEL_COLUMNS label columns.
  """

  def __init__(
    self,
    data: pd.DataFrame,
    tokenizer: BertTokenizer,
    max_token_len: int = 512
  ):
    self.max_token_len = max_token_len
    self.data = data
    self.tokenizer = tokenizer

  def __len__(self):
    #One sample per dataframe row
    return self.data.shape[0]

  def __getitem__(self, index: int):
    """Return the text, token ids, attention mask and labels of row `index`."""
    row = self.data.iloc[index]
    Text = row.text

    #Pad or truncate every sample to max_token_len and return PyTorch tensors
    tokenizer_options = {
      "add_special_tokens": True,
      "max_length": self.max_token_len,
      "return_token_type_ids": False,
      "padding": "max_length",
      "truncation": True,
      "return_attention_mask": True,
      "return_tensors": 'pt',
    }
    encoded = self.tokenizer.encode_plus(Text, **tokenizer_options)

    #The tokenizer returns tensors with a leading batch dimension of 1;
    #flatten() removes it
    return {
      "Text": Text,
      "input_ids": encoded["input_ids"].flatten(),
      "attention_mask": encoded["attention_mask"].flatten(),
      "labels": torch.FloatTensor(row[LABEL_COLUMNS]),
    }

In [None]:
#Wrap the training set in a TweetDataset. Nothing is tokenized yet: each
#sample is tokenized when it is read through __getitem__
train_dataset = TweetDataset(
  train_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

In [None]:
#Wrap the dataset with the tokenization process into a LightningDataModule
#This code was taken from the reference mentioned above with only few
#tweaks made to it

class TweetDataModule(pl.LightningDataModule):
  """Lightning data module serving a training and a held-out TweetDataset.

  The held-out dataframe (`test_df`) backs both the validation and the test
  dataloaders.
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128):
    super().__init__()
    self.train_df, self.test_df = train_df, test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.max_token_len = max_token_len

  def _build_dataset(self, frame):
    #Wrap a dataframe with the shared tokenizer and token limit
    return TweetDataset(frame, self.tokenizer, self.max_token_len)

  def _build_loader(self, dataset, shuffle=False):
    #All loaders share the batch size and the number of workers
    return DataLoader(
      dataset,
      batch_size=self.batch_size,
      shuffle=shuffle,
      num_workers=4
    )

  def setup(self, stage=None):
    self.train_dataset = self._build_dataset(self.train_df)
    self.test_dataset = self._build_dataset(self.test_df)

  def train_dataloader(self):
    #Only the training data is shuffled
    return self._build_loader(self.train_dataset, shuffle=True)

  def val_dataloader(self):
    return self._build_loader(self.test_dataset)

  def test_dataloader(self):
    return self._build_loader(self.test_dataset)

In [None]:
#Set our hyperparameters
N_EPOCHS = 3
BATCH_SIZE = 12

#val_df is passed as the data module's `test_df` argument, so it feeds both the
#validation and the test dataloaders of the module
data_module = TweetDataModule(
  train_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

## Model Building

*As described in our report, we use a pre-trained BERT module, specifically BERTBase cased, to build our model on top of. To give it multi-label text classification capacities, we add one linear layer as the classifier and a sigmoid activation function. We wrap it all up in a Lightning Module.*

In [None]:
#This code was taken from the reference mentioned above with only few
#tweaks made to it to adapt it to our multi-label text classification task

class TweetTagger(pl.LightningModule):
  """Pretrained BERT encoder with a linear + sigmoid multi-label head.

  forward() returns `(loss, probabilities)`; the loss is binary cross-entropy
  when labels are given and the integer 0 otherwise.
  """

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_FORMAT, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCELoss()

  def forward(self, input_ids, attention_mask, labels=None):
    encoded = self.bert(input_ids, attention_mask=attention_mask)
    #One independent probability per class, computed from the pooled output
    probabilities = torch.sigmoid(self.classifier(encoded.pooler_output))
    if labels is None:
      return 0, probabilities
    return self.criterion(probabilities, labels), probabilities

  def _run_batch(self, batch):
    #Shared by the train/validation/test steps: forward pass with labels
    labels = batch["labels"]
    loss, outputs = self(batch["input_ids"], batch["attention_mask"], labels)
    return loss, outputs, labels

  def training_step(self, batch, batch_idx):
    loss, outputs, labels = self._run_batch(batch)
    self.log("train_loss", loss, prog_bar=True, logger=True)
    #Predictions and labels are returned for use in training_epoch_end
    return {"loss": loss, "predictions": outputs, "labels": labels}

  def validation_step(self, batch, batch_idx):
    loss, _, _ = self._run_batch(batch)
    self.log("val_loss", loss, prog_bar=True, logger=True)
    return loss

  def test_step(self, batch, batch_idx):
    loss, _, _ = self._run_batch(batch)
    self.log("test_loss", loss, prog_bar=True, logger=True)
    return loss

  def training_epoch_end(self, outputs):
    #Concatenate the per-batch tensors into one row per training sample
    labels = torch.cat(
      [step_output["labels"].detach().cpu() for step_output in outputs]
    ).int()
    predictions = torch.cat(
      [step_output["predictions"].detach().cpu() for step_output in outputs]
    )

    #Log one training ROC AUC value per label for the finished epoch
    for column, name in enumerate(LABEL_COLUMNS):
      class_roc_auc = auroc(predictions[:, column], labels[:, column])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

  def configure_optimizers(self):
    optimizer = AdamW(self.parameters(), lr=2e-5)

    #Linear schedule with warmup, advanced after every optimizer step
    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=self.n_warmup_steps,
      num_training_steps=self.n_training_steps
    )

    return {
      "optimizer": optimizer,
      "lr_scheduler": {"scheduler": scheduler, "interval": 'step'},
    }

In [None]:
#We define training steps based on our hyperparameters: full batches per epoch
#times the number of epochs
steps_per_epoch=len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
# We use the first quarter of our training steps to warm up the learning rate
warmup_steps = total_training_steps // 4

In [None]:
#Model creation: one output per label column, with the step counts that
#parameterize the learning rate schedule in configure_optimizers
model = TweetTagger(
  n_classes=len(LABEL_COLUMNS),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps 
)

## Model Training

In [None]:
#Remove saved lighting logs and checkpoints if any
!rm -rf lightning_logs/
!rm -rf checkpoints/

In [None]:
#Save top model checkpoint for recovery: only the single best checkpoint
#(lowest val_loss) is kept, as checkpoints/best-checkpoint
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [None]:
#Use TensorBoard to log the model's training (logs go to lightning_logs/Text)
logger = TensorBoardLogger("lightning_logs", name="Text")

In [None]:
#If validation loss hasn't improved in 2 epochs, stop epochs.
#A patience of 3 could never be reached within the 3 epochs configured in
#N_EPOCHS, so the value is aligned with the 2 epochs stated above
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [None]:
#Define trainer with checkpoint and callback parameters
#- the checkpoint callback goes in `callbacks`: handing a ModelCheckpoint to
#  the `checkpoint_callback` argument is deprecated in PyTorch Lightning
#- the TensorBoard logger created above is passed explicitly; otherwise the
#  trainer falls back to its default logger and the "Text" run name is unused
trainer = pl.Trainer(
  logger=logger,
  callbacks=[checkpoint_callback, early_stopping_callback],
  max_epochs=N_EPOCHS,
  gpus=1,
  progress_bar_refresh_rate=30
)

#Train model
trainer.fit(model, data_module)

In [None]:
#Load best model from trainer (the checkpoint with the lowest val_loss).
#n_classes is supplied again here; TweetTagger.__init__ does not call
#save_hyperparameters(), so presumably it is not stored in the checkpoint
trained_model = TweetTagger.load_from_checkpoint(
  trainer.checkpoint_callback.best_model_path,
  n_classes=len(LABEL_COLUMNS)
)

#Switch to inference mode: evaluation behaviour, no trainable parameters
trained_model.eval()
trained_model.freeze()

#Save best model from trainer (the whole model object, not only its weights)
#and download it from the Colab runtime
torch.save(trained_model, 'goemotions_model.pth')
files.download('goemotions_model.pth')

## Model Evaluation

In [None]:
#Score every review of the test set with the best model, one review at a time
trained_model = trained_model.to(device)

#Note: despite its name, val_dataset wraps test_df
val_dataset = TweetDataset(test_df, tokenizer, max_token_len=MAX_TOKEN_COUNT)

predictions, labels = [], []

for item in tqdm(val_dataset):
  #Add a leading batch dimension and move the inputs to the model's device
  batch_input_ids = item["input_ids"].unsqueeze(dim=0).to(device)
  batch_attention_mask = item["attention_mask"].unsqueeze(dim=0).to(device)
  _, prediction = trained_model(batch_input_ids, batch_attention_mask)
  predictions.append(prediction.flatten())
  labels.append(item["labels"].int())

#Stack into tensors with one row per review and bring them back to the CPU
predictions = torch.stack(predictions).detach().cpu()
labels = torch.stack(labels).detach().cpu()

In [None]:
#Check model accuracy. We use a threshold of 0.2 to consider the presence
#or absence of an emotion
THRESHOLD = 0.2
accuracy(predictions, labels, threshold=THRESHOLD)

In [None]:
#Get AUROC for each emotion: column `column` of the predictions and labels is
#scored against the emotion name at the same position in EMOTION_LABELS
print("Area Under The ROC Curve per Emotion")
for column, emotion in enumerate(EMOTION_LABELS):
  emotion_scores = predictions[:, column]
  emotion_targets = labels[:, column]
  emotion_auroc = auroc(emotion_scores, emotion_targets, pos_label=1)
  print(f"{emotion}: {emotion_auroc}")

In [None]:
#Get classification report for each emotion
upper, lower = 1, 0

y_true = labels.numpy()
#Binarize the predicted probabilities: above THRESHOLD -> 1, otherwise 0
y_pred = np.where(predictions.numpy() > THRESHOLD, upper, lower)

report = classification_report(
  y_true,
  y_pred,
  target_names=LABEL_COLUMNS,
  zero_division=0
)
print(report)

## Predictions

In [None]:
def review_encoding(sent_model, review, threshold=0.2, max_token_len=512):
  """Return the emotions the model detects in one review sentence.

  Args:
    sent_model: model called as `sent_model(input_ids, attention_mask)` and
      returning `(loss, probabilities)`.
    review: text to classify.
    threshold: minimum probability for an emotion to be reported.
    max_token_len: length the tokenized text is padded or truncated to.

  Returns:
    A list of `{"sentiment": label, "prediction": probability}` dicts, one per
    label in LABEL_COLUMNS whose probability reaches `threshold`.
  """
  sentiments = []

  encoding = tokenizer.encode_plus(
    review,
    add_special_tokens=True,
    truncation=True,
    max_length=max_token_len,
    return_token_type_ids=False,
    padding="max_length",
    return_attention_mask=True,
    return_tensors='pt',
  )

  #The tokenizer builds CPU tensors; move them to wherever the model lives so
  #the function works for a model on the GPU as well as on the CPU
  model_device = next(sent_model.parameters()).device

  #Inference only: no_grad makes the numpy conversion safe even when the model
  #has not been frozen
  with torch.no_grad():
    _, sent_prediction = sent_model(
      encoding["input_ids"].to(model_device),
      encoding["attention_mask"].to(model_device)
    )
  sent_prediction = sent_prediction.flatten().cpu().numpy()

  for label, prediction in zip(LABEL_COLUMNS, sent_prediction):

    if prediction < threshold:
      continue

    sentiments.append({
      "sentiment": label,
      "prediction": prediction
    })

  return sentiments

In [None]:
#Reload model if necessary (e.g. in a fresh session). torch.load unpickles the
#file, so only load model files you created yourself
#reconstructed_model = torch.load("saved_models/goemotions_model.pth")

#Otherwise reuse the model trained above, so that reconstructed_model is always
#defined for the prediction cell below. It is moved to the CPU, which is where
#the tokenizer creates its output tensors
reconstructed_model = trained_model.to("cpu")

In [None]:
#Get reviews saved from Script A
reviews = pd.read_csv('/datasets/reviews.csv')
#Keep relevant columns and merge them into a single review text
columns = ['headline', 'pros', 'cons']
reviews['review'] = reviews[columns].apply(lambda row: '. '.join(row.values.astype(str)), axis=1)

#Split review in sentences in order to get all emotions present: every
#sentence gets its own row and keeps the index and columns of its review
reviews['review'] = reviews['review'].str.split('.')
reviews = reviews.explode('review')

#Drop empty lines. A boolean filter is used so that sentences which
#legitimately occur more than once are kept
reviews['review'] = reviews['review'].str.strip()
reviews = reviews[reviews['review'] != '']

In [None]:
#Run the model on every review sentence (review_encoding tokenizes the text
#itself) and store the detected emotions in a new 'sentiments' column
reviews['sentiments'] = reviews['review'].apply(lambda x: review_encoding(reconstructed_model, str(x)))

In [None]:
#Save review emotions for use in upcoming scripts and download the file from
#the Colab runtime
reviews.to_csv('review_sentiments.csv') 
files.download('review_sentiments.csv')