# TRAINING / FINE-TUNING WEIGHTED BINARY BERT CLASSIFIER

In [1]:
# Initialise relevant packages

# Basics
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
# Preprocessing
import torch
from sklearn.model_selection import train_test_split

# Modelling
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

# Evaluation
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight

## Load Datasets

In [2]:
## Load datasets
#training_data = pd.read_pickle('./Data/Clean Training Data/training_data_binary.pkl')
#
#df_raw = {}
#
## write to dict
#for dataset in training_data:
#    df_raw[dataset] = training_data[dataset].copy() #.sample(n=100, random_state=123)
#
#
#FOR NOW: Train just on "combined" dataset 
#
## delete all but "combined" dict entries
#for model in ['davidson2017', 'founta2018', 'dynabench2021']:
#    del df_raw[model]
#
## Split each dataset into training and validation set
#df_train, df_valtest, df_val, df_test = {}, {}, {}, {}
#
#for dataset in df_raw:
#    df_train[dataset], df_valtest[dataset] = train_test_split(df_raw[dataset], test_size=0.2, stratify=df_raw[dataset].label, random_state=123)
#    df_val[dataset], df_test[dataset] = train_test_split(df_valtest[dataset], test_size=0.5, stratify=df_valtest[dataset].label, random_state=123)
#
## Split up text and label columns in dataframes into series for each dataset
#train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = {}, {}, {}, {}, {}, {}
#
#for dataset in df_raw:
#    train_texts[dataset] = df_train[dataset].text.astype("string").tolist()
#    val_texts[dataset] = df_val[dataset].text.astype("string").tolist()
#    test_texts[dataset] = df_test[dataset].text.astype("string").tolist()
#    
#    train_labels[dataset] = df_train[dataset].label.tolist()
#    val_labels[dataset] = df_val[dataset].label.tolist()
#    test_labels[dataset] = df_test[dataset].label.tolist()

# User input (variable declarations only)

In [None]:
# Names of the datasets processed by this notebook
dataset_list = [
    'davidson2017',
    'founta2018',
    'dynabench2021',
    'combined',
]

In [3]:
# Root folder containing one sub-folder per dataset with the files
# <dataset>_train.csv, <dataset>_valid.csv and <dataset>_test.csv.
# Kept in a single constant so the data location can be changed in one place.
DATA_DIR = "/home/ec2-user/Projects/v0.1/Data"

# initialise train, val, test dictionaries (all keyed by dataset name)
train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = {}, {}, {}, {}, {}, {}

# read the pre-split CSV files of every dataset and separate texts from labels
for dataset in dataset_list:
    # NOTE(review): lineterminator="\n" presumably keeps stray carriage returns inside
    # the texts from being treated as row breaks -- confirm against the raw files
    data_train = pd.read_csv(f"{DATA_DIR}/{dataset}/{dataset}_train.csv", lineterminator="\n")
    data_test = pd.read_csv(f"{DATA_DIR}/{dataset}/{dataset}_test.csv", lineterminator="\n")
    data_val = pd.read_csv(f"{DATA_DIR}/{dataset}/{dataset}_valid.csv", lineterminator="\n")

    # texts are cast to pandas' string dtype and stored as plain Python lists
    train_texts[dataset] = data_train.text.astype("string").tolist()
    val_texts[dataset] = data_val.text.astype("string").tolist()
    test_texts[dataset] = data_test.text.astype("string").tolist()

    train_labels[dataset] = data_train.label.tolist()
    val_labels[dataset] = data_val.label.tolist()
    test_labels[dataset] = data_test.label.tolist()

    

In [4]:
# "balanced" class weights per dataset, derived from the label distribution
# of the corresponding training split
class_weights = {
    name: compute_class_weight(
        'balanced',
        classes=np.unique(train_labels[name]),
        y=train_labels[name],
    )
    for name in dataset_list
}

class_weights

{'davidson2017': array([0.53061771, 8.66520979]),
 'founta2018': array([ 0.52612333, 10.06998993]),
 'dynabench2021': array([1.00262325, 0.99739044]),
 'combined': array([0.58043118, 3.60824742])}

## Tokenize Texts

In [9]:
# load the pretrained tokenizer of the uncased BERT base model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# register the placeholders for mentions, emojis and URLs (--> see pre-processing)
# as additional special tokens; the return value is the number of tokens added
special_tokens_dict = dict(
    additional_special_tokens=['[USER]', '[EMOJI]', '[URL]'],
)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [10]:
# Upper bound on the number of tokens kept per text (longer texts are truncated)
MAX_SEQ_LENGTH = 64

# Tokenize the train/val/test texts of every dataset with identical settings
train_encodings, val_encodings, test_encodings = {}, {}, {}

for dataset in dataset_list:
    # each pair is (source texts by dataset, target encodings by dataset)
    for split_texts, split_encodings in (
        (train_texts, train_encodings),
        (val_texts, val_encodings),
        (test_texts, test_encodings),
    ):
        split_encodings[dataset] = tokenizer(
            split_texts[dataset],
            truncation=True,
            padding=True,
            max_length=MAX_SEQ_LENGTH,
        )

# sanity check for length of encoding vectors: first training example of the last
# dataset, addressed explicitly instead of through the leaked loop variable
len(train_encodings[dataset_list[-1]]["input_ids"][0])

64

## Create PyTorch Datasets 

In [11]:
class HateDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    `encodings` must expose `.items()` yielding (feature name, per-example
    sequence) pairs; `labels` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # the number of examples is taken from the labels
        return len(self.labels)

    def __getitem__(self, idx):
        # convert every encoded feature of example `idx` into a tensor ...
        item = {}
        for feature_name, feature_values in self.encodings.items():
            item[feature_name] = torch.tensor(feature_values[idx])
        # ... and attach the example's label under the 'labels' key
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap the encodings and labels of every split in a HateDataset, keyed by dataset name
train_dataset, val_dataset, test_dataset = {}, {}, {}

# Iterate over dataset_list: df_raw is only defined in the commented-out loading
# cell, so looping over it raises a NameError on a fresh kernel.
for dataset in dataset_list:
    train_dataset[dataset] = HateDataset(train_encodings[dataset], train_labels[dataset])
    val_dataset[dataset] = HateDataset(val_encodings[dataset], val_labels[dataset])
    test_dataset[dataset] = HateDataset(test_encodings[dataset], test_labels[dataset])

## Train Weighted Models

In [12]:
# report whether CUDA can be used, then select the matching torch device

if not torch.cuda.is_available():
    print('CUDA unavailable')
else:
    print('CUDA available')
    print(torch.cuda.get_device_name())
    print(torch.cuda.device_count(), 'GPUs')

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

CUDA unavailable


device(type='cpu')

In [13]:
# Define training arguments
# settings shared by all datasets; only the checkpoint folder differs per dataset
shared_training_kwargs = dict(
    save_steps=10000,
    num_train_epochs=3,               # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=64,    # batch size for evaluation
    evaluation_strategy='epoch',
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    learning_rate=5e-5,
    seed=123,
)

training_args = {}

for dataset in dataset_list:
    training_args[dataset] = TrainingArguments(
        output_dir=f'./Models/BERT_{dataset}_weighted/Checkpoints',  # output directory
        **shared_training_kwargs,
    )

In [14]:
# define Trainer with weighted loss for combined dataset

class WeightedTrainerCombined(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The class weights are read from the notebook-level ``class_weights['combined']``.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: model called as ``model(**inputs)`` after the labels are removed.
            inputs: dict of model inputs; must contain a "labels" entry, which is
                popped (the dict is modified in place).
            return_outputs: when True, return ``(loss, outputs)`` instead of the
                loss alone. Defaults to False, matching the original behaviour.
            **kwargs: extra keyword arguments are accepted and ignored so that
                callers passing additional options do not fail.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # without labels the model computes no loss, so the logits come first
        logits = outputs[0]
        # build the weights on the logits' device so both are always co-located
        weights = torch.tensor(class_weights['combined'], dtype=torch.float, device=logits.device)
        weighted_loss = torch.nn.CrossEntropyLoss(weight=weights)
        loss = weighted_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [15]:
# explicit model factory, handed to the Trainer to allow for hyperparameter search
def model_init():
    """Return a freshly loaded 'bert-base-uncased' sequence classifier.

    The token embeddings are resized to ``len(tokenizer)`` so that they match
    the notebook-level tokenizer, including its added special tokens.
    """
    classifier = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    vocab_size = len(tokenizer)
    classifier.resize_token_embeddings(vocab_size)
    return classifier

In [16]:
# Instantiate weighted trainer objects; only the 'combined' dataset gets one
trainer = {}

for dataset in dataset_list:
    # every dataset other than 'combined' is skipped
    if dataset != 'combined':
        continue
    trainer[dataset] = WeightedTrainerCombined(
        model_init=model_init,
        args=training_args[dataset],
        train_dataset=train_dataset[dataset],
        eval_dataset=val_dataset[dataset],
    )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Fine-tune the model of every instantiated trainer
for dataset, dataset_trainer in trainer.items():
    print(f'Training weighted {dataset} BERT model')
    dataset_trainer.train()

Training weighted combined BERT model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss


## Save Model and Tokenizer

In [None]:
# store each fine-tuned model and the tokenizer side by side in the same folder
for dataset in trainer:
    final_dir = f'./Models/BERT_{dataset}_weighted/Final'
    trainer[dataset].save_model(final_dir)
    tokenizer.save_pretrained(final_dir)

## Reload Models
So that the models can be evaluated on the test set even after a kernel restart.

In [19]:
# load fine-tuned models

def load_model(dataset, model_dir=None):
    """Load a fine-tuned classifier from disk and wrap it in a trainer for evaluation.

    Args:
        dataset: dataset name, used to build the default model folder and the
            trainer's output directory.
        model_dir: optional folder to load the model from. When omitted, the
            folder './Models/BERT_{dataset}_weighted_120221/Final' is used.

    Returns:
        A WeightedTrainerCombined wrapping the loaded model, configured with an
        evaluation batch size of 64 per device.
    """
    # NOTE(review): the default points at a dated '_120221' folder, while the save
    # cell writes to './Models/BERT_{dataset}_weighted/Final' -- pass model_dir
    # explicitly to evaluate a freshly saved model.
    if model_dir is None:
        model_dir = f'./Models/BERT_{dataset}_weighted_120221/Final'

    model = BertForSequenceClassification.from_pretrained(model_dir)
    eval_args = TrainingArguments(
        output_dir=f'./Models/BERT_{dataset}_weighted/Test',
        per_device_eval_batch_size=64,
    )
    # returned directly so that no local name shadows the notebook-level `trainer` dict
    return WeightedTrainerCombined(model=model, args=eval_args)

# rebuild the trainer dict from disk; only the 'combined' model is reloaded
trainer = {dataset: load_model(dataset) for dataset in ['combined']}

## Evaluate Models on Test Data

In [20]:
# Run each model on its own test set, keep the prediction output and print its metrics

results = {}

for dataset, dataset_trainer in trainer.items():
    print(f'Evaluating weighted {dataset} BERT model on test data')
    results[dataset] = dataset_trainer.predict(test_dataset[dataset])
    for metric, value in results[dataset].metrics.items():
        print(metric, value)
    print()

Evaluating weighted combined BERT model on test data


eval_loss 0.30845358967781067
eval_runtime 833.389
eval_samples_per_second 19.923



In [21]:
# turn the prediction output of each model into a Series of predicted labels
pred_labels = {}

for dataset in trainer:
    # each row of results[dataset][0] is reduced to the index of its largest entry
    pred_labels[dataset] = pd.Series(
        [int(np.argmax(row)) for row in results[dataset][0]]
    )

# print one classification report per model (true test labels vs. predicted labels)

for dataset in trainer:
    print(dataset.upper())
    report = classification_report(test_labels[dataset], pred_labels[dataset])
    print(report)
    print()

COMBINED
              precision    recall  f1-score   support

           0       0.94      0.93      0.94     13738
           1       0.69      0.72      0.70      2866

    accuracy                           0.90     16604
   macro avg       0.81      0.83      0.82     16604
weighted avg       0.90      0.90      0.90     16604




In [22]:
# micro-, macro- and weighted-average F1 scores for each model
for dataset in trainer:
    print(dataset.upper())
    for average in ('micro', 'macro', 'weighted'):
        score = f1_score(test_labels[dataset], pred_labels[dataset], average=average)
        print(f'{average} F1 score: {score:.2%}')
    print()

COMBINED
micro F1 score: 89.56%
macro F1 score: 82.07%
weighted F1 score: 89.66%



In [23]:
# how often each label was predicted, per model
for dataset in trainer:
    print(dataset.upper())
    label_counts = pred_labels[dataset].value_counts()
    print(label_counts)
    print()

COMBINED
0    13594
1     3010
dtype: int64

