In [2]:
!pip install transformers datasets imblearn evaluate

Collecting datasets
  Using cached datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Using cached datasets-3.3.0-py3-none-any.whl (484 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Installing collected packages: xxhash, dill, multiprocess, datasets, imblearn
Successfully installed datasets-3.3.0 dill-0.3.8 imblearn-0.0 multipr

# Data Sampling Approaches

In this notebook we experiment with vaious data sampling approaches to improve the model's performance.

Our analysis of the data showed that the dataset had imbalanced classes. The number of examples without patronising and condescending language (PCL) is much higher than the number of examples with PCL. Models trained on imbalanced datasets may learn biased prior probabilities.

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

In [4]:
# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', padding=True, truncation=True, do_lower_case=True)

def tokenize_sample(sample):
  # make sure types are as expected
  sample['text'] = [str(i) for i in sample['text']]
  sample['label'] = [int(i) for i in sample['label']]
  return tokenizer(sample['text'], padding=True, truncation=True, max_length=256)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [68]:
# Load model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
train_dataset = pd.read_csv('train_dev_data/train_set.csv')
test_dataset = pd.read_csv('train_dev_data/dev_set.csv')

print(train_dataset['label'].value_counts())

label
0    7581
1     794
Name: count, dtype: int64


## Approach 1: oversampling

Random oversampling: a random choice of minority instances are duplicated.

In [7]:
X_train = train_dataset[['text']]  # Feature columns
y_train = train_dataset['label']  # Target column

# Initialize the random oversampler
ros = RandomOverSampler(random_state=42)

# Apply oversampling
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Update the dataset with the resampled values
train_dataset = pd.DataFrame(X_resampled, columns=X_train.columns)
train_dataset['label'] = y_resampled

# Verify the oversampling result
print(train_dataset['label'].value_counts())


label
0    7581
1    7581
Name: count, dtype: int64


In [9]:
from datasets import Dataset

test_dataset = test_dataset[['text', 'label']]

ds_train = Dataset.from_pandas(train_dataset)
ds_test = Dataset.from_pandas(test_dataset)

In [10]:
# Tokenize dataset
tokenized_train_dataset = ds_train.map(tokenize_sample, batched=True)
tokenized_test_dataset = ds_test.map(tokenize_sample, batched=True)

Map:   0%|          | 0/15162 [00:00<?, ? examples/s]

Map:   0%|          | 0/2094 [00:00<?, ? examples/s]

In [11]:
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [29]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [30]:
# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_oversampled_roberta')
tokenizer.save_pretrained('./fine_tuned_oversampled_roberta')

Epoch,Training Loss,Validation Loss
1,0.0149,0.785616
2,0.0112,0.730022


('./fine_tuned_oversampled_roberta/tokenizer_config.json',
 './fine_tuned_oversampled_roberta/special_tokens_map.json',
 './fine_tuned_oversampled_roberta/vocab.json',
 './fine_tuned_oversampled_roberta/merges.txt',
 './fine_tuned_oversampled_roberta/added_tokens.json')

In [31]:
trainer.evaluate()

{'eval_loss': 0.730021595954895,
 'eval_runtime': 26.5794,
 'eval_samples_per_second': 78.783,
 'eval_steps_per_second': 19.715,
 'epoch': 2.0}

In [45]:
import numpy as np

model.eval()
eval_pred = trainer.predict(tokenized_test_dataset)
eval_labels = eval_pred.label_ids
predictions = np.argmax(eval_pred.predictions, axis=1)

4014

In [52]:
metric = evaluate.combine(["precision", "recall", "accuracy"])
metric.compute(predictions=predictions, references=eval_labels)

{'precision': 0.6335403726708074,
 'recall': 0.5125628140703518,
 'accuracy': 0.9255014326647565}

## Approach 2: undersampling

Random undersampling: a random choice of majority instances are removed from the dataset.

In [56]:
from imblearn.under_sampling import RandomUnderSampler

X_train = train_dataset[['text']]  # Feature columns
y_train = train_dataset['label']  # Target column

# Initialize the random oversampler
rus = RandomUnderSampler(random_state=42)

# Apply oversampling
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Update the dataset with the resampled values
train_dataset = pd.DataFrame(X_resampled, columns=X_train.columns)
train_dataset['label'] = y_resampled

# Verify the oversampling result
print(train_dataset['label'].value_counts())


label
0    794
1    794
Name: count, dtype: int64


In [57]:
from datasets import Dataset

test_dataset = test_dataset[['text', 'label']]

ds_train = Dataset.from_pandas(train_dataset)
ds_test = Dataset.from_pandas(test_dataset)

In [58]:
# Tokenize dataset
tokenized_train_dataset = ds_train.map(tokenize_sample, batched=True)
tokenized_test_dataset = ds_test.map(tokenize_sample, batched=True)

Map:   0%|          | 0/1588 [00:00<?, ? examples/s]

Map:   0%|          | 0/2094 [00:00<?, ? examples/s]

In [59]:
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [60]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [61]:
# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_undersampled_roberta')
tokenizer.save_pretrained('./fine_tuned_undersampled_roberta')

Epoch,Training Loss,Validation Loss
1,No log,0.741363
2,No log,0.813857


('./fine_tuned_undersampled_roberta/tokenizer_config.json',
 './fine_tuned_undersampled_roberta/special_tokens_map.json',
 './fine_tuned_undersampled_roberta/vocab.json',
 './fine_tuned_undersampled_roberta/merges.txt',
 './fine_tuned_undersampled_roberta/added_tokens.json')

In [62]:
trainer.evaluate()

{'eval_loss': 0.7413634061813354,
 'eval_runtime': 26.3552,
 'eval_samples_per_second': 79.453,
 'eval_steps_per_second': 19.882,
 'epoch': 2.0}

In [63]:
import numpy as np
import evaluate

model.eval()
eval_pred = trainer.predict(tokenized_test_dataset)
eval_labels = eval_pred.label_ids
predictions = np.argmax(eval_pred.predictions, axis=1)

In [64]:
metric = evaluate.combine(["precision", "recall", "accuracy"])
metric.compute(predictions=predictions, references=eval_labels)

{'precision': 0.6352941176470588,
 'recall': 0.542713567839196,
 'accuracy': 0.9269340974212035}

## Approach 3: added synthetically generated data
