# TRAINING WEIGHTED BINARY BERT CLASSIFIER

In [1]:
# Initialise relevant packages

# Basics
import pandas as pd
import numpy as np
import pickle

# Preprocessing
import torch
from sklearn.model_selection import train_test_split

# Modelling
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from ray import tune

# Evaluation
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight

## Load Datasets

In [2]:
# Load the pickled collection of cleaned, binary-labelled training datasets.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
training_data = pd.read_pickle('./Data/Clean Training Data/training_data_binary.pkl')

# Keep an independent copy of every dataset, keyed by dataset name, so later steps
# never touch the objects that were loaded from disk.
# (For quick debugging runs a copy can be subsampled, e.g. .sample(n=2000, random_state=123).)
df_raw = {name: training_data[name].copy() for name in training_data}

In [3]:
# Stratified split of each dataset into training (80%), validation (10%) and test (10%) sets
df_train, df_valtest, df_val, df_test = {}, {}, {}, {}

for name, frame in df_raw.items():
    # Step 1: hold out 20% of the rows as a combined validation+test pool
    train_part, holdout_part = train_test_split(
        frame, test_size=0.2, stratify=frame.label, random_state=123
    )
    # Step 2: cut the held-out pool in half to obtain the validation and test sets
    val_part, test_part = train_test_split(
        holdout_part, test_size=0.5, stratify=holdout_part.label, random_state=123
    )

    df_train[name], df_valtest[name] = train_part, holdout_part
    df_val[name], df_test[name] = val_part, test_part

In [4]:
# For every dataset and split, pull the text and label columns out of the
# dataframes as plain Python lists (the tokenizer and the Dataset class consume lists)
train_texts = {name: df_train[name].text.astype("string").tolist() for name in df_raw}
val_texts = {name: df_val[name].text.astype("string").tolist() for name in df_raw}
test_texts = {name: df_test[name].text.astype("string").tolist() for name in df_raw}

train_labels = {name: df_train[name].label.tolist() for name in df_raw}
val_labels = {name: df_val[name].label.tolist() for name in df_raw}
test_labels = {name: df_test[name].label.tolist() for name in df_raw}

In [5]:
# Per-dataset 'balanced' class weights, derived from the training labels only
# so that no information from the validation/test splits leaks into the loss
class_weights = {
    name: compute_class_weight(
        'balanced',
        classes=np.unique(train_labels[name]),
        y=train_labels[name],
    )
    for name in df_raw
}

class_weights

{'davidson2017': array([0.53061771, 8.66520979]),
 'founta2018': array([ 0.52612333, 10.06998993])}

## Tokenize Texts

In [6]:
# Load the pre-trained (uncased) BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Register the placeholder tokens for mentions, emojis and URLs (--> see pre-processing)
# as additional special tokens
special_tokens_dict = dict(additional_special_tokens=['[USER]', '[EMOJI]', '[URL]'])
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [7]:
# Tokenize the train / validation / test texts of every dataset
train_encodings, val_encodings, test_encodings = {}, {}, {}

for name in df_raw:
    # same tokenizer settings for all three splits: truncation and padding enabled
    for texts, encodings in (
        (train_texts, train_encodings),
        (val_texts, val_encodings),
        (test_texts, test_encodings),
    ):
        encodings[name] = tokenizer(texts[name], truncation=True, padding=True)

## Create PyTorch Datasets 

In [8]:
class HateDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset pairing tokenizer encodings with labels.

    encodings: mapping from field name to an indexable collection with one
               entry per example.
    labels:    indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # the number of examples is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # convert every encoding field of example `idx` into a tensor ...
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        # ... and attach the label under the key 'labels'
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the encodings and labels of every split in a HateDataset, keyed by dataset name
train_dataset = {name: HateDataset(train_encodings[name], train_labels[name]) for name in df_raw}
val_dataset = {name: HateDataset(val_encodings[name], val_labels[name]) for name in df_raw}
test_dataset = {name: HateDataset(test_encodings[name], test_labels[name]) for name in df_raw}

## Train Weighted Models

In [9]:
# check CUDA availability
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # get_device_name() raises when no CUDA device is usable, so only query it
    # when CUDA is present; otherwise the CPU fallback below would never be reached
    print(torch.cuda.get_device_name())
print(torch.cuda.device_count(), 'GPUs')

# fall back to the CPU when no GPU is available
device = torch.device("cuda" if cuda_available else "cpu")
device

True
Tesla K80
4 GPUs


device(type='cuda')

In [10]:
# Define training arguments

# settings shared by all datasets; only the output directory differs per dataset
shared_training_kwargs = dict(
    save_steps=2500,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    evaluation_strategy='epoch',
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=5e-5,
    seed=123,
)

training_args = {
    name: TrainingArguments(
        output_dir='./Models/BERT_{}_weighted/Checkpoints'.format(name),  # output directory
        **shared_training_kwargs,
    )
    for name in df_raw
}

In [11]:
# define Trainers with weighted loss for each dataset

class _ClassWeightedTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    Subclasses set ``weights_key`` to the key of the module-level ``class_weights``
    dict whose weights should be applied. The weights are looked up when the loss
    is computed, so ``class_weights`` only has to exist by the time training starts.
    """

    weights_key = None  # key into class_weights; overridden by each subclass

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained/evaluated.
            inputs: batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: when True, return ``(loss, outputs)`` instead of the loss
                alone. Later transformers releases pass this during
                evaluation/prediction; the default keeps the original
                two-argument call working.
            **kwargs: extra keyword arguments some transformers releases pass
                (e.g. ``num_items_in_batch``); accepted and ignored.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        # place the weights on the device the logits live on instead of relying on a global device
        weights = torch.as_tensor(
            class_weights[self.weights_key], dtype=torch.float, device=logits.device
        )
        loss = torch.nn.functional.cross_entropy(logits, labels, weight=weights)
        return (loss, outputs) if return_outputs else loss


class WeightedTrainerD17(_ClassWeightedTrainer):
    """Weighted-loss Trainer using the 'davidson2017' class weights."""

    weights_key = 'davidson2017'


class WeightedTrainerF18(_ClassWeightedTrainer):
    """Weighted-loss Trainer using the 'founta2018' class weights."""

    weights_key = 'founta2018'

In [12]:
# define explicit model initialisation to allow for hyperparameter search
def model_init():
    """Build a fresh pre-trained BERT sequence classifier.

    The token embedding matrix is resized to the length of the module-level
    ``tokenizer`` so that the special tokens added to it have embeddings.
    """
    bert = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    bert.resize_token_embeddings(len(tokenizer))
    return bert

In [None]:
# Instantiate weighted trainer objects for each dataset

# which weighted Trainer subclass belongs to which dataset
trainer_classes = {
    'davidson2017': WeightedTrainerD17,
    'founta2018': WeightedTrainerF18,
}

trainer = {}

for name in df_raw:
    trainer_cls = trainer_classes.get(name)
    if trainer_cls is None:
        # datasets without a dedicated weighted trainer are skipped
        continue
    trainer[name] = trainer_cls(
        args=training_args[name],
        train_dataset=train_dataset[name],
        eval_dataset=val_dataset[name],
        model_init=model_init,
    )

In [None]:
# perform hyperparameter tuning using grid search
def custom_hp_space(trial):
    """Return the Ray Tune grid-search space over learning rate, epochs and batch size.

    ``trial`` is accepted because the Trainer passes it to the hp_space callable;
    it is not used here.
    """
    grid_values = {
        "learning_rate": [2e-5, 3e-5, 5e-5],
        "num_train_epochs": [2, 3, 4],
        "per_device_train_batch_size": [16, 32],
    }
    return {param: tune.grid_search(values) for param, values in grid_values.items()}

best_run = {}

for name, dataset_trainer in trainer.items():
    # NOTE(review): no compute_objective is passed, so the Trainer's default
    # objective is what gets minimized -- confirm this is the evaluation loss
    notebook_reporter = tune.JupyterNotebookReporter(overwrite=True)
    best_run[name] = dataset_trainer.hyperparameter_search(
        backend='ray',
        hp_space=custom_hp_space,
        direction='minimize',
        n_trials=1,
        progress_reporter=notebook_reporter,
    )

In [None]:
# report best runs: dataset name in upper case, the best run, then a blank line
for name in trainer:
    print(name.upper(), best_run[name], '', sep='\n')

In [None]:
# set trainer attributes to optimal configuration from hyperparameter tuning
for name, dataset_trainer in trainer.items():
    best_hyperparameters = best_run[name].hyperparameters
    for param, value in best_hyperparameters.items():
        # overwrite the corresponding field on the trainer's TrainingArguments
        setattr(dataset_trainer.args, param, value)

In [None]:
# Train "best" models for each dataset
for name, dataset_trainer in trainer.items():
    print('Training weighted {} BERT model'.format(name))
    dataset_trainer.train()

## Save Model and Tokenizer

In [15]:
# Store each fine-tuned model together with the tokenizer in the same directory
for name, dataset_trainer in trainer.items():
    final_dir = './Models/BERT_{}_weighted/Final'.format(name)
    dataset_trainer.save_model(final_dir)
    tokenizer.save_pretrained(final_dir)

## Reload Models
Reload the saved models so that they can be evaluated on the test set even after a kernel reset.

In [12]:
# load fine-tuned models from the directories written in the save step
models = {
    name: BertForSequenceClassification.from_pretrained('./Models/BERT_{}_weighted/Final'.format(name))
    for name in ['davidson2017', 'founta2018']
}

In [13]:
# Instantiate trainer objects for each model (already fine-tuned so no longer necessary to specify training and eval data)
# output directory is redundant because there is no further training but needs to be specified anyway
# NOTE: this rebinds `trainer`, replacing any trainers created earlier in the session

trainer = {}

for name, fine_tuned_model in models.items():
    eval_args = TrainingArguments(
        output_dir='./Models/BERT_{}_weighted/Test'.format(name),
        per_device_eval_batch_size=64,
    )
    trainer[name] = Trainer(model=fine_tuned_model, args=eval_args)

## Evaluate Models on Test Data

In [None]:
# Evaluate each model on its corresponding test set

results = {}

for name, dataset_trainer in trainer.items():
    print('Evaluating weighted {} BERT model on test data'.format(name))
    results[name] = dataset_trainer.predict(test_dataset[name])
    # print every metric reported for this prediction run, one per line
    for metric, value in results[name].metrics.items():
        print(metric, value)
    print()

In [15]:
# write predictions to series: the predicted label is the index of the largest
# value in each row of the model's predictions
pred_labels = {}

for name in trainer:
    prediction_rows = results[name].predictions
    pred_labels[name] = pd.Series([int(np.argmax(row)) for row in prediction_rows])

# print classification reports for each model

for name in trainer:
    print(name.upper())
    print(classification_report(test_labels[name], pred_labels[name]))
    print()

DAVIDSON2017
              precision    recall  f1-score   support

           0       0.98      0.93      0.95      2336
           1       0.36      0.64      0.46       143

    accuracy                           0.91      2479
   macro avg       0.67      0.78      0.71      2479
weighted avg       0.94      0.91      0.93      2479


FOUNTA2018
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      9504
           1       0.36      0.57      0.44       496

    accuracy                           0.93     10000
   macro avg       0.67      0.76      0.70     10000
weighted avg       0.95      0.93      0.94     10000




In [16]:
# f1 scores under each averaging scheme, formatted as percentages
averaging_schemes = ['micro', 'macro', 'weighted']

for name in trainer:
    print(name.upper())
    for average in averaging_schemes:
        score = f1_score(test_labels[name], pred_labels[name], average=average)
        print('{} F1 score: {:.2%}'.format(average, score))
    print()

DAVIDSON2017
micro F1 score: 91.45%
macro F1 score: 70.77%
weighted F1 score: 92.52%

FOUNTA2018
micro F1 score: 92.93%
macro F1 score: 70.30%
weighted F1 score: 93.65%



In [17]:
# distribution of predictions: how often each label was predicted per dataset
for name in trainer:
    label_counts = pred_labels[name].value_counts()
    print(name.upper())
    print(label_counts)
    print()

DAVIDSON2017
0    2228
1     251
dtype: int64

FOUNTA2018
0    9225
1     775
dtype: int64

