## NLP CW - Michelle Lo, Hetty Symes, Evelyn Nutton

RoBERTa base model

In [None]:

import pandas as pd
import torch
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, AutoTokenizer
import nltk
from dataset.dont_patronize_me import DontPatronizeMe

# Fetch the NLTK resources this notebook relies on, one download call each.
for _resource in ('wordnet', 'averaged_perceptron_tagger_eng'):
    nltk.download(_resource)

In [None]:
from torch import cuda
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:

# Load the train and dev splits from CSV. The dev split is what the rest of
# this file calls the "test" data (test_df / testing_loader).
# NOTE: train_df is reassigned further down from train_set_aug_resampled.csv.
train_df = pd.read_csv("train_dev_data/train_set.csv")
test_df = pd.read_csv("train_dev_data/dev_set.csv")
# Load the official task-4 test file through the project's DontPatronizeMe
# helper and keep the resulting DataFrame.
dontpatroniseme = DontPatronizeMe(None, "test_data/task4_test.tsv")
dontpatroniseme.load_test()
official_test_df =  dontpatroniseme.test_set_df


### Combining Resampled and Augmented Data into New Data Frame

In [None]:
# Load pre-sampled, pre-augmented dataset.
# This replaces the train_df that was read from train_set.csv earlier.
train_df = pd.read_csv("train_dev_data/train_set_aug_resampled.csv")

# Verify the oversampling result by printing the row count per label
print(train_df['label'].value_counts())
# Notebook preview of the first rows (no visible effect in a plain script)
train_df.head()

### Loading the RoBERTa Base Model

In [None]:
# Load the pre-trained model
checkpoint = "roberta-base"
# NOTE(review): `truncation` and `do_lower_case` are passed at load time here;
# confirm they have any effect for this tokenizer — truncation is normally
# requested on each encode call.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation=True, do_lower_case=True)
# NOTE(review): `num_labels=2` is given to the bare encoder, while the 2-way
# classification head is defined separately in RobertaClass — confirm this
# argument is needed.
pretrained_model = RobertaModel.from_pretrained(checkpoint, num_labels=2)
# Move the encoder to the device selected earlier in the file.
pretrained_model = pretrained_model.to(device)

### PCLData class

In [None]:
# Class for the data
class PCLData(Dataset):
    """Dataset that tokenizes the ``text`` column of a DataFrame on access.

    Args:
        data: DataFrame with a ``text`` column and, unless ``test`` is true,
            a ``label`` column.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_len: Length every encoded sequence is padded or truncated to.
        test: When true, labels are not read and each item carries an empty
            ``targets`` tensor.

    Each item is a dict with the keys ``ids``, ``mask``, ``token_type_ids``
    (long tensors) and ``targets`` (float scalar tensor, or empty for test).
    """

    def __init__(self, data, tokenizer, max_len, test=False):
        self.tokenizer = tokenizer
        self.data = data
        self.text = self.data.text
        self.test = test
        self.targets = None if test else self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # DataLoader samplers hand out positions 0..len-1, so look rows up by
        # position. Label-based lookup breaks as soon as the DataFrame index
        # is not the default RangeIndex (e.g. after filtering or resampling).
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length` flag and guarantees fixed-length outputs.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )

        if self.test:
            targets = torch.tensor([])
        else:
            targets = torch.tensor(self.targets.iloc[index], dtype=torch.float)

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': targets,
        }

In [None]:
# Maximum token length handed to PCLData as `max_len`.
MAX_LEN = 256

In [None]:
# Report the split sizes before wrapping them in datasets.
print(f"TRAIN Dataset: {train_df.shape}")
print(f"TEST Dataset: {test_df.shape}")
train_dataset = PCLData(train_df, tokenizer, MAX_LEN)
test_dataset = PCLData(test_df, tokenizer, MAX_LEN)

# Evaluation metrics do not depend on batch order, so keep the order fixed:
# with shuffling off, runs are reproducible and predictions line up with the
# rows of test_df.
test_params = {'batch_size': 4, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(test_dataset, **test_params)

### Fine Tuning Model

In [None]:
class RobertaClass(torch.nn.Module):
    """Encoder followed by a two-layer, 2-class classification head.

    Args:
        encoder: Optional encoder module to wrap as-is. When omitted, a deep
            copy of the module-level ``pretrained_model`` is used, so every
            instance starts from the same pretrained weights and training one
            instance never alters another instance or the global encoder.
    """

    def __init__(self, encoder=None):
        super().__init__()
        if encoder is None:
            # Local import keeps this block self-contained.
            import copy

            # Copy rather than share: sharing the global module would let
            # successive training runs keep fine-tuning the same weights.
            encoder = copy.deepcopy(pretrained_model)
        self.l1 = encoder
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return class logits computed from the first-token hidden state."""
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_state = encoder_out[0]
        # Use the hidden state at sequence position 0 as the sentence vector.
        pooled = hidden_state[:, 0]
        pooled = self.pre_classifier(pooled)
        pooled = torch.relu(pooled)
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [None]:
# Instantiate the classifier and move it to the selected device.
# NOTE: train_with_hyperparameters builds its own RobertaClass instances, so
# this module-level `model` is not the one trained there.
model = RobertaClass()
model.to(device)

### Alternative Loss - Self Adjusting Dice Loss
Taken from the unofficial PyTorch implementation of https://arxiv.org/abs/1911.02855, which can be found here: https://github.com/fursovia/self-adj-dice.


In [None]:
# Loss

# Taken from the SelfAdjDiceLoss python module source code which cannot be imported regularly due to pytorch compatibility issues
class SelfAdjDiceLoss(torch.nn.Module):
    r"""
    Creates a criterion that optimizes a multi-class Self-adjusting Dice Loss
    ("Dice Loss for Data-imbalanced NLP Tasks" paper)

    Args:
        alpha (float): a factor to push down the weight of easy examples
        gamma (float): a factor added to both the numerator and the denominator for smoothing purposes
        reduction (string): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed.

    Shape:
        - logits: `(N, C)` where `N` is the batch size and `C` is the number of classes.
        - targets: `(N)` where each value is in [0, C - 1]
        - output: scalar for ``'mean'`` and ``'sum'``; `(N, 1)` for ``'none'``.

    Raises:
        NotImplementedError: from ``forward`` when ``reduction`` is not one of
            the supported values.
    """

    def __init__(self, alpha: float = 1.0, gamma: float = 1.0, reduction: str = "mean") -> None:
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        probs = torch.softmax(logits, dim=1)
        # Keep only the probability assigned to each sample's true class: (N, 1).
        probs = torch.gather(probs, dim=1, index=targets.unsqueeze(1))

        # (1 - p)^alpha down-weights samples the model already gets right.
        probs_with_factor = ((1 - probs) ** self.alpha) * probs
        loss = 1 - (2 * probs_with_factor + self.gamma) / (probs_with_factor + 1 + self.gamma)

        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        if self.reduction == "none" or self.reduction is None:
            return loss
        raise NotImplementedError(f"Reduction `{self.reduction}` is not supported.")

In [None]:

# Creating the loss function and optimizer
# Alternative criterion (SelfAdjDiceLoss, defined earlier), currently disabled:
# criterion = SelfAdjDiceLoss()
# Cross-entropy over the class logits; train() reads this module-level name.
loss_function = torch.nn.CrossEntropyLoss()
# Disabled: the optimizer is created per run in train_with_hyperparameters.
# optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

def calcuate_accuracy(preds, targets):
    """Return how many predictions equal their targets (a count, not a ratio)."""
    matches = preds == targets
    return matches.sum().item()

In [None]:

def train(model, epoch, optimizer, training_loader, scheduler=None, scheduler_step_per_batch=False):
    """Run one training epoch and print running and final loss/accuracy.

    Reads the module-level ``device``, ``loss_function`` and
    ``calcuate_accuracy``.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)`` that
            returns one row of class logits per sample.
        epoch: Epoch number; only used in the printed summary.
        optimizer: Optimizer over the model's parameters.
        training_loader: Iterable of batches, each a dict with the keys
            ``ids``, ``mask``, ``token_type_ids`` and ``targets``.
        scheduler: Optional learning-rate scheduler.
        scheduler_step_per_batch: When false (default) the scheduler is
            stepped once, after the whole epoch. Epoch-based schedulers such
            as ExponentialLR multiply the learning rate by ``gamma`` on every
            ``step()``, so stepping them after each batch shrinks the rate to
            almost zero within a few dozen batches. Pass true only for
            schedulers designed to be stepped per batch.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    steps = 0
    seen = 0
    model.train()
    # Wrap the loader itself (not enumerate) so tqdm can show the total.
    for i, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        preds = model(ids, mask, token_type_ids)
        loss = loss_function(preds, targets)
        tr_loss += loss.item()
        pred_labels = preds.detach().argmax(dim=1)
        n_correct += calcuate_accuracy(pred_labels, targets)

        steps += 1
        seen += targets.size(0)

        if i % 5000 == 0:
            curr_loss = tr_loss / steps
            curr_acc = (n_correct * 100) / seen
            print(f"Training Loss per 5000 steps: {curr_loss}")
            print(f"Training Accuracy per 5000 steps: {curr_acc}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if scheduler is not None and scheduler_step_per_batch:
            scheduler.step()

    if steps == 0 or seen == 0:
        # Nothing was trained on; avoid dividing by zero in the summary and
        # do not advance the scheduler without an optimizer step.
        print(f"Epoch {epoch}: training loader produced no samples")
        return

    if scheduler is not None and not scheduler_step_per_batch:
        scheduler.step()

    epoch_loss = tr_loss / steps
    epoch_accu = (n_correct * 100) / seen
    print(f'Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` with gradients disabled.

    Reads the module-level ``device`` and ``calcuate_accuracy``.

    Args:
        model: Module called as ``model(ids, mask, token_type_ids)`` that
            returns one row of class logits per sample.
        testing_loader: Iterable of batches, each a dict with the keys
            ``ids``, ``mask``, ``token_type_ids`` and ``targets``.

    Returns:
        Tuple ``(accuracy_percent, predictions, targets)``; the two tensors
        hold the predicted and true labels of every sample in loader order.
        For an empty loader the accuracy is ``0.0`` and both tensors are empty.
    """
    model.eval()
    n_correct = 0
    seen = 0
    preds_model = torch.tensor([]).to(device)
    targets_model = torch.tensor([]).to(device)

    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            # No squeeze() here: a final batch holding a single sample must
            # keep its batch dimension, otherwise the class axis is no longer
            # dim 1 and the arg-max below fails.
            preds = model(ids, mask, token_type_ids)

            pred_labels = preds.argmax(dim=1)
            n_correct += calcuate_accuracy(pred_labels, targets)
            seen += targets.size(0)

            preds_model = torch.cat((preds_model, pred_labels))
            targets_model = torch.cat((targets_model, targets))

    # Guard against an empty loader instead of dividing by zero.
    epoch_accu = (n_correct * 100) / seen if seen else 0.0

    return epoch_accu, preds_model, targets_model

# acc, preds, targets = valid(model, testing_loader)
# print("Accuracy on test data = %0.2f%%" % acc)

# Hyperparameter tuning

In [None]:
def train_with_hyperparameters(save_model_name, learning_rate, batch_size, epochs, use_scheduler=False, gamma=0.9):
    """Train a new RobertaClass, save its weights and print held-out metrics.

    Reads the module-level ``train_dataset``, ``testing_loader`` and
    ``device``.

    Args:
        save_model_name: Stem of the checkpoint file; weights are written to
            ``models/<save_model_name>.pt``.
        learning_rate: Learning rate for the AdamW optimizer.
        batch_size: Batch size of the (shuffled) training loader.
        epochs: Number of passes over the training data.
        use_scheduler: When true, an ExponentialLR scheduler is attached.
        gamma: Decay factor of that scheduler; ignored otherwise.

    Returns:
        None. Accuracy and a classification report are printed.
    """
    # Local import keeps this block self-contained.
    import os

    training_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    model = RobertaClass().to(device)
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

    scheduler = None
    if use_scheduler:
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)

    for epoch in range(epochs):
        train(model, epoch, optimizer, training_loader, scheduler)

    # torch.save does not create missing directories; without this a fresh
    # checkout would fail only after the whole training run has finished.
    os.makedirs("models", exist_ok=True)
    torch.save(model.state_dict(), f"models/{save_model_name}.pt")

    acc, preds, targets = valid(model, testing_loader)
    print("Accuracy on test data = %0.2f%%" % acc)
    print(classification_report(targets.cpu().numpy(), preds.cpu().numpy()))
    

In [None]:
# Hyperparameters to tune: learning rate and batch size
# (each pair is also run with an ExponentialLR scheduler for several gammas)

batch_sizes = [4, 16, 32]
learning_rates = [1e-5, 1e-3, 1e-2]
gamma_rates = [0.3, 0.5, 0.9]

# Running counter; it is also passed as the name each model is saved under.
model_id = 0

for batch_size in batch_sizes:
    for lr in learning_rates:
        # `None` marks the run without a scheduler, which always goes first;
        # the remaining entries are the scheduler decay factors.
        for gamma in (None, *gamma_rates):
            if gamma is None:
                print(f"Model {model_id}: Batch size {batch_size}, LR {lr}, no scheduler")
                train_with_hyperparameters(model_id, lr, batch_size, 5, use_scheduler=False)
            else:
                print(f"Model {model_id}: Batch size {batch_size}, LR {lr}, scheduler with gamma {gamma}")
                train_with_hyperparameters(model_id, lr, batch_size, 5, use_scheduler=True, gamma=gamma)
            model_id += 1