# Data Sampling Approaches

In this notebook we experiment with various data sampling approaches to improve the model's performance.

Our analysis of the data showed that the dataset had imbalanced classes. The number of examples without patronising and condescending language (PCL) is much higher than the number of examples with PCL. Models trained on imbalanced datasets may learn biased prior probabilities.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import transformers
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline, RobertaModel, AutoTokenizer, AutoModelForSequenceClassification, AdamW, DataCollatorWithPadding, get_scheduler
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler

In [None]:
from torch import cuda

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Hyperparameters shared by the dataset wrapper and the optimizer.
MAX_LEN = 256
LEARNING_RATE = 1e-05

# Load the tokenizer and the pre-trained encoder for the chosen checkpoint,
# then place the encoder on the selected device.
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    truncation=True,
    do_lower_case=True,
)
pretrained_model = RobertaModel.from_pretrained(checkpoint, num_labels=2).to(device)

In [None]:
# Class for the data
class PCLData(Dataset):
    """Torch dataset that tokenizes the `text` column and pairs it with `label`.

    Args:
        data: Table exposing `text` and `label` columns as pandas Series
            (rows are accessed by position via `.iloc`).
        tokenizer: Object providing `encode_plus`.
        max_len: Fixed sequence length; every example is padded or truncated
            to exactly this many tokens so batches can be stacked.
    """

    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = data
        self.text = self.data.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the example at position `index`.

        Returns:
            dict with `ids`, `mask`, `token_type_ids` (long tensors of length
            `max_len`) and `targets` (float scalar tensor).
        """
        # Positional lookup: DataLoader samplers yield 0..len-1, which need not
        # match the frame's index labels (e.g. after filtering or resampling).
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace into single spaces.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding` takes a bool or a strategy name, not a length.
            padding='max_length',
            # Without truncation, texts longer than max_len yield ragged tensors.
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Keyword arguments forwarded to the training DataLoader.
train_params = dict(batch_size=8, shuffle=True, num_workers=0)

# Keyword arguments forwarded to the evaluation DataLoader.
test_params = dict(batch_size=4, shuffle=True, num_workers=0)

In [None]:
class RobertaClass(torch.nn.Module):
    """Encoder followed by a small feed-forward classification head.

    Args:
        num_labels: Number of output logits. Defaults to 2 because the task is
            binary (PCL vs. no PCL); the previous hard-coded 5 left three
            logits that no label ever used.
        encoder: Optional encoder module. When omitted, the module-level
            `pretrained_model` is used, as before.
    """

    def __init__(self, num_labels=2, encoder=None):
        super(RobertaClass, self).__init__()
        self.l1 = pretrained_model if encoder is None else encoder
        # Use the encoder's configured width when it exposes one; else 768.
        hidden_size = getattr(getattr(self.l1, "config", None), "hidden_size", 768)
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one row of `num_labels` logits per input sequence."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Representation of the first token of every sequence.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
    

# Build the classifier and move its parameters to the selected device.
# The trailing expression is also what the cell displays.
model = RobertaClass()
model.to(device)

In [None]:
# Loss for integer class targets and the optimizer over every model parameter.
loss_function = torch.nn.CrossEntropyLoss()
# `torch.optim.AdamWs` does not exist; the optimizer class is `AdamW`.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

def calcuate_accuracy(preds, targets):
    """Return how many positions of `preds` equal the matching entry of `targets`."""
    matches = preds == targets
    return matches.sum().item()


def train(epoch, training_loader):
    """Run one optimisation pass over `training_loader` and print running metrics.

    Uses the module-level `model`, `loss_function`, `optimizer` and `device`.

    Args:
        epoch: Epoch number, used only in the printed summary.
        training_loader: Iterable of batches with `ids`, `mask`,
            `token_type_ids` and `targets` tensors.

    Returns:
        None. Loss and accuracy are printed.
    """
    tr_loss = 0
    n_correct = 0
    steps = 0
    seen = 0
    model.train()
    # A named step counter: the previous `_` was overwritten by the result of
    # torch.max, so the `% 5000` check was evaluated on a tensor and raised.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        preds = model(ids, mask, token_type_ids)
        loss = loss_function(preds, targets)
        tr_loss += loss.item()
        pred_labels = torch.argmax(preds.detach(), dim=1)
        n_correct += calcuate_accuracy(pred_labels, targets)

        steps += 1
        seen += targets.size(0)

        if step % 5000 == 0:
            curr_loss = tr_loss / steps
            curr_acc = (n_correct * 100) / seen
            print(f"Training Loss per 5000 steps: {curr_loss}")
            print(f"Training Accuracy per 5000 steps: {curr_acc}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if seen == 0:
        # Empty loader: there is nothing to average, so skip the summary
        # instead of dividing by zero.
        print(f"No training batches were processed in epoch {epoch}")
        return

    print(f'Total Accuracy for Epoch {epoch}: {(n_correct*100)/seen}')
    epoch_loss = tr_loss / steps
    epoch_accu = (n_correct * 100) / seen
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return


def valid(model, testing_loader):
    """Evaluate `model` on `testing_loader` without tracking gradients.

    Uses the module-level `device`.

    Args:
        model: Module called as `model(ids, mask, token_type_ids)` that returns
            one row of logits per example.
        testing_loader: Iterable of batches with `ids`, `mask`,
            `token_type_ids` and `targets` tensors.

    Returns:
        Tuple `(accuracy_percent, predicted_labels, target_labels)`. The two
        label tensors are 1-D long tensors in the order the loader produced
        them. An empty loader yields `0.0` and two empty tensors.
    """
    model.eval()
    n_correct = 0
    seen = 0
    pred_batches = []
    target_batches = []

    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            # No squeeze: a final batch of size 1 must keep its batch
            # dimension, otherwise the argmax over dim=1 fails.
            preds = model(ids, mask, token_type_ids)

            pred_labels = torch.argmax(preds, dim=1)
            n_correct += calcuate_accuracy(pred_labels, targets)
            seen += targets.size(0)

            pred_batches.append(pred_labels)
            target_batches.append(targets)

    if seen == 0:
        empty = torch.empty(0, dtype=torch.long, device=device)
        return 0.0, empty, empty.clone()

    epoch_accu = (n_correct * 100) / seen
    # Concatenate once at the end so labels stay integer-typed.
    preds_model = torch.cat(pred_batches)
    targets_model = torch.cat(target_batches)

    return epoch_accu, preds_model, targets_model

## Approach 1: oversampling

Random oversampling: randomly chosen minority-class instances are duplicated until the classes are balanced.

In [None]:
# Read the train and dev splits and show the class counts before resampling.
train_dataset = pd.read_csv('train_dev_data/train_set.csv')
test_dataset = pd.read_csv('train_dev_data/dev_set.csv')
print(train_dataset['label'].value_counts())

# Feature frame (text only) and target column handed to the sampler.
X_train = train_dataset[['text']]
y_train = train_dataset['label']

# Randomly resample with a fixed seed.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Rebuild the training frame from the resampled features and labels.
train_dataset = pd.DataFrame(X_resampled, columns=X_train.columns).assign(label=y_resampled)

# Show the class counts after resampling.
print(train_dataset['label'].value_counts())

In [None]:
from datasets import Dataset

test_dataset = test_dataset[['text', 'label']]

# Keep `train_dataset` / `test_dataset` bound to the DataFrames: later cells
# slice them by column, which a PCLData wrapper does not support.
train_data = PCLData(train_dataset, tokenizer, MAX_LEN)
test_data = PCLData(test_dataset, tokenizer, MAX_LEN)

training_loader = DataLoader(train_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)

In [None]:
# Number of full passes over the training loader.
EPOCHS = 5
for epoch in range(0, EPOCHS):
    train(epoch, training_loader)

In [None]:
# Evaluate on the dev loader; keep the labels for the report in the next cell.
acc, preds, targets = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % (acc,))

In [None]:
# Move the label tensors to host memory once and reuse the arrays below.
y_true = targets.cpu().numpy()
y_pred = preds.cpu().numpy()

print("Classification Report:")
print(classification_report(y_true, y_pred))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# Plot confusion matrix
class_names = ["Class 0", "Class 1"]
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Destination for the serialized model and directory for the vocabulary files.
output_model_file, output_vocab_file = 'roberta_oversampling.bin', './'

# Serialize the whole model object, then write the tokenizer vocabulary.
model_to_save = model
torch.save(obj=model_to_save, f=output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

## Approach 2: undersampling

Random undersampling: randomly chosen majority-class instances are removed from the dataset until the classes are balanced.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Start from the original training split. By this point `train_dataset` may
# hold resampled data or a dataset wrapper from an earlier cell, neither of
# which is a valid input for undersampling.
train_dataset = pd.read_csv('train_dev_data/train_set.csv')

X_train = train_dataset[['text']]  # Feature columns
y_train = train_dataset['label']  # Target column

# Initialize the random undersampler
rus = RandomUnderSampler(random_state=42)

# Apply undersampling
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Update the dataset with the resampled values
train_dataset = pd.DataFrame(X_resampled, columns=X_train.columns)
train_dataset['label'] = y_resampled

# Verify the undersampling result
print(train_dataset['label'].value_counts())



In [None]:
from datasets import Dataset

# Re-read the dev split so this cell does not depend on what `test_dataset`
# is currently bound to (a dataset wrapper cannot be sliced by column).
test_dataset = pd.read_csv('train_dev_data/dev_set.csv')[['text', 'label']]

# Wrap the frames under separate names so the DataFrames stay available.
train_data = PCLData(train_dataset, tokenizer, MAX_LEN)
test_data = PCLData(test_dataset, tokenizer, MAX_LEN)

training_loader = DataLoader(train_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)

In [None]:
# Start this approach from fresh pre-trained weights. Reusing the model that
# was already fine-tuned in the previous approach would make the two results
# incomparable. `train` reads the module-level `model` and `optimizer`, so
# both are rebound here.
pretrained_model = RobertaModel.from_pretrained(checkpoint, num_labels=2).to(device)
model = RobertaClass().to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

EPOCHS = 5
for epoch in range(EPOCHS):
    train(epoch, training_loader)

In [None]:
# Score the current model on the dev loader and report accuracy as a percentage.
acc, preds, targets = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % (acc,))

In [None]:
# Convert both label tensors to NumPy arrays a single time.
y_true = targets.cpu().numpy()
y_pred = preds.cpu().numpy()

print("Classification Report:")
print(classification_report(y_true, y_pred))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# Plot confusion matrix
class_names = ["Class 0", "Class 1"]
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Use a filename specific to this approach: reusing 'roberta_oversampling.bin'
# would overwrite the model saved for the oversampling experiment.
output_model_file = 'roberta_undersampling.bin'
output_vocab_file = './'

# Serialize the whole model object, then write the tokenizer vocabulary.
model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

## Approach 3: added synthetically generated data


In [None]:
from imblearn.over_sampling import SMOTE
