## NLP CW - Michelle Lo, Hetty Symes, Evelyn Nutton

RoBERTa base model

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import transformers
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, RobertaModel, AutoTokenizer, AutoModelForSequenceClassification, AdamW, DataCollatorWithPadding, get_scheduler
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE, RandomOverSampler
import nlpaug.augmenter.word as naw
import sacremoses
import nltk
import math
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /homes/en120/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /homes/en120/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:

train_df = pd.read_csv("train_dev_data/train_set.csv")
test_df = pd.read_csv("train_dev_data/dev_set.csv")


### Data Augmentation via Back Translation

In [4]:
back_aug = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-de',
    to_model_name='Helsinki-NLP/opus-mt-de-en',
    device=device,
    max_length=len(max(train_df["text"].to_list(),key=len))
)
underrep = train_df[train_df['label'] == 1]
underrep_augment = underrep.copy().dropna()
underrep_augment_data_text = []
batch_size_aug = 32
for i in range(0,len(underrep_augment),batch_size_aug):

    underrep_augment_data_text.extend(back_aug.augment(underrep_augment["text"].to_list()[i:i+batch_size_aug]))


print(underrep_augment_data_text)
underrep_augment_data = pd.DataFrame(underrep_augment_data_text)
underrep_augment["text"] = underrep_augment_data


train_df_augment = pd.concat([train_df,underrep_augment.dropna()])

print(train_df['label'].value_counts())
print(train_df_augment['label'].value_counts())

train_df = train_df_augment
train_df.head()

['Arshad said that besides learning many new aspects of sports leadership he learned how fast developing nations were using sports as a tool of development and in this effort the disabled and the underprivileged were not left behind in any phase.', 'Fast food staff that is fed disabled man Internet sensation', "Vanessa had feelings of hopelessness in her last days, which increased when her call for help revealed no response from the resident judge she placed in the custody of the state. Her letter was sent to these court officials by an investigator from the Children's Advocate's Office.", 'In September Major Nottle walked from Melbourne to Canberra to plead for a national solution to the homeless problem.', "The demography of Pakistan and India are very similar. Poverty is a widespread topic. According to the FAO, 40 percent of children in Pakistan are malnourished and underweight due to lack of access to adequate food. And that's not because it's not enough; Pakistan is the 8th large

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0


### Over Sampling

In [5]:
print(underrep.head())
print(underrep_augment.tail())
train_df.to_csv("train_dev_data/train_set_aug.csv")

print(train_df['label'].value_counts())

X_train = train_df[['text']]  # Feature columns
y_train = train_df['label']  # Target column

# Initialize the random oversampler
ros = RandomOverSampler(random_state=42)

# Apply oversampling
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)


    par_id      art_id   keyword country  \
32      33   @@8301323  disabled      pk   
33      34  @@24150149  disabled      ng   
41      42   @@4591694  hopeless      jm   
76      77  @@22454828  homeless      nz   
82      83   @@4672144  homeless      pk   

                                                 text  label  orig_label  
32  Arshad said that besides learning many new asp...      1           2  
33  Fast food employee who fed disabled man become...      1           3  
41  Vanessa had feelings of hopelessness in her fi...      1           3  
76  In September , Major Nottle set off on foot fr...      1           3  
82  The demographics of Pakistan and India are ver...      1           3  
      par_id      art_id     keyword country text  label  orig_label
8370   10424   @@4665292       women      jm  NaN      1           3
8371   10445   @@3923193     refugee      gb  NaN      1           3
8372   10454  @@22338535  vulnerable      ie  NaN      1           4
8373   10

### Combining Resampled and Augmented Data into New Data Frame

In [6]:
# Update the dataset with the resampled values
train_df = pd.DataFrame(X_resampled, columns=X_train.columns)
train_df['label'] = y_resampled


# Verify the oversampling result
print(train_df['label'].value_counts())
train_df.head()

label
0    7581
1    7581
Name: count, dtype: int64


Unnamed: 0,text,label
0,"We 're living in times of absolute insanity , ...",0
1,"In Libya today , there are countless number of...",0
2,"""White House press secretary Sean Spicer said ...",0
3,Council customers only signs would be displaye...,0
4,""""""" Just like we received migrants fleeing El ...",0


### Loading the Roberta Base Model

In [7]:
# Load the pre-trained model
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation=True, do_lower_case=True)
pretrained_model = RobertaModel.from_pretrained(checkpoint, num_labels=2)
pretrained_model = pretrained_model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### PCLData class

In [8]:
# Class for the data
class PCLData(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = data
        self.text = self.data.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
MAX_LEN = 256
# LEARNING_RATE = 1e-05

In [10]:
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))
train_dataset = PCLData(train_df, tokenizer, MAX_LEN)
test_dataset = PCLData(test_df, tokenizer, MAX_LEN)

test_params = {'batch_size': 4, 'shuffle': True, 'num_workers': 0}
testing_loader = DataLoader(test_dataset, **test_params)

# train_params = {'batch_size': 16,
#                 'shuffle': True,
#                 'num_workers': 0
#                 }

# test_params = {'batch_size': 4,
#                 'shuffle': True,
#                 'num_workers': 0
#                 }

# training_loader = DataLoader(train_dataset, **train_params)
# testing_loader = DataLoader(test_dataset, **test_params)

TRAIN Dataset: (15162, 2)
TEST Dataset: (2094, 7)


### Fine Tuning Model

In [11]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = pretrained_model
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [12]:
model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

### Alternative Loss - Self Adjusting Dice Loss
Taken from the unofficial Pytorch implementation of https://arxiv.org/abs/1911.02855, which can be founds here https://github.com/fursovia/self-adj-dice.


In [None]:
# Loss

# Taken from the SelfAdjDiceLoss python module source code which cannot be imported regularly due to pytorch compatibility issues
class SelfAdjDiceLoss(torch.nn.Module):
    r"""
    Creates a criterion that optimizes a multi-class Self-adjusting Dice Loss
    ("Dice Loss for Data-imbalanced NLP Tasks" paper)

    Args:
        alpha (float): a factor to push down the weight of easy examples
        gamma (float): a factor added to both the nominator and the denominator for smoothing purposes
        reduction (string): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed.

    Shape:
        - logits: `(N, C)` where `N` is the batch size and `C` is the number of classes.
        - targets: `(N)` where each value is in [0, C - 1]
    """

    def __init__(self, alpha: float = 1.0, gamma: float = 1.0, reduction: str = "mean") -> None:
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        probs = torch.softmax(logits, dim=1)
        print(logits.shape)
        probs = torch.gather(probs, dim=1, index=targets.unsqueeze(1))

        probs_with_factor = ((1 - probs) ** self.alpha) * probs
        loss = 1 - (2 * probs_with_factor + self.gamma) / (probs_with_factor + 1 + self.gamma)

        if self.reduction == "mean":
            return loss.mean()
        elif self.reduction == "sum":
            return loss.sum()
        elif self.reduction == "none" or self.reduction is None:
            return loss
        else:
            raise NotImplementedError(f"Reduction `{self.reduction}` is not supported.")

In [17]:

# Creating the loss function and optimizer
# criterion = SelfAdjDiceLoss()
loss_function = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [13]:

def train(model, epoch, optimizer, training_loader, scheduler=None):
    tr_loss = 0; n_correct = 0; steps = 0; seen = 0
    model.train()
    for i,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        preds = model(ids, mask, token_type_ids)
        loss = loss_function(preds, targets)
        tr_loss += loss.item()
        _, pred_labels = torch.max(preds.data, dim=1)
        n_correct += calcuate_accuracy(pred_labels, targets)

        steps += 1
        seen+=targets.size(0)
        
        if i%5000==0:
            curr_loss = tr_loss/steps
            curr_acc = (n_correct*100)/seen 
            print(f"Training Loss per 5000 steps: {curr_loss}")
            print(f"Training Accuracy per 5000 steps: {curr_acc}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

    print(f'Total Accuracy for Epoch {epoch}: {(n_correct*100)/seen}')
    epoch_loss = tr_loss/steps
    epoch_accu = (n_correct*100)/seen
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [14]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; steps=0; seen=0
    preds_model = torch.tensor([]).to(device); targets_model = torch.tensor([]).to(device)

    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            preds = model(ids, mask, token_type_ids).squeeze()
            
            _, pred_labels = torch.max(preds.data, dim=1)
            n_correct += calcuate_accuracy(pred_labels, targets)

            steps += 1
            seen+=targets.size(0)

            preds_model = torch.cat((preds_model, pred_labels))
            targets_model = torch.cat((targets_model, targets))
            
    epoch_accu = (n_correct*100)/seen

    
    
    return epoch_accu, preds_model, targets_model

# acc, preds, targets = valid(model, testing_loader)
# print("Accuracy on test data = %0.2f%%" % acc)

# Hyperparameter tuning

In [18]:
def train_with_hyperparameters(learning_rate, batch_size, epochs, use_scheduler=True, gamma=0.9):
    train_params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 0}
    training_loader = DataLoader(train_dataset, **train_params)
    model = RobertaClass().to(device)
    optimizer = torch.optim.AdamW(params =  model.parameters(), lr=learning_rate)

    scheduler = None
    if use_scheduler:
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)
    

    for epoch in range(epochs):
        train(model, epoch, optimizer, training_loader, scheduler)

    acc, preds, targets = valid(model, testing_loader)
    print("Accuracy on test data = %0.2f%%" % acc)
    print(classification_report(targets.cpu().numpy(), preds.cpu().numpy()))
    

In [None]:
# Hyperparameters to tune: learning rate and batch size

batch_sizes = [4, 16, 32]
learning_rates = [1e-5, 1e-3, 1e-2]
gamma_rates = [0.3, 0.5, 0.9]


for batch_size in batch_sizes:
    for lr in learning_rates:
        print(f"Batch size {batch_size}, LR {lr}, no scheduler")
        train_with_hyperparameters(lr, batch_size, 5, use_scheduler=False)
        for gamma in gamma_rates:
            print(f"Batch size {batch_size}, LR {lr}, scheduler with gamma {gamma}")
            train_with_hyperparameters(lr, batch_size, 5, use_scheduler=True, gamma=gamma)

Batch size 4, LR 1e-05, no scheduler


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.6926743388175964
Training Accuracy per 5000 steps: 75.0


3791it [19:21,  3.26it/s]


Total Accuracy for Epoch 0: 89.53963857010949
Training Loss Epoch: 0.25571005186333196
Training Accuracy Epoch: 89.53963857010949


1it [00:00,  4.53it/s]

Training Loss per 5000 steps: 0.005704062525182962
Training Accuracy per 5000 steps: 100.0


3791it [19:20,  3.27it/s]


Total Accuracy for Epoch 1: 97.75755177417227
Training Loss Epoch: 0.07045120535857613
Training Accuracy Epoch: 97.75755177417227


1it [00:00,  4.40it/s]

Training Loss per 5000 steps: 1.4391340017318726
Training Accuracy per 5000 steps: 75.0


1915it [09:46,  3.26it/s]

In [None]:
EPOCHS = 5
FINAL_LR = 1e-4
FINAL_TRAIN_BATCH_SIZE = 16
FINAL_GAMMA = 0.9

train_with_hyperparameters(FINAL_LR, FINAL_TRAIN_BATCH_SIZE, EPOCHS)

# Evaluation

In [None]:
print("Classification Report:")
print(classification_report(targets.cpu().numpy(), preds.cpu().numpy()))

# Confusion matrix
cm = confusion_matrix(targets.cpu().numpy(), preds.cpu().numpy())
print("Confusion Matrix:")
print(cm)

# Plot confusion matrix
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()