# TRAINING UNWEIGHTED MULTICLASS BERT CLASSIFIER

In [1]:
# Initialise relevant packages

# Basics
import pandas as pd
import numpy as np
import pickle

# Preprocessing
import torch
from sklearn.model_selection import train_test_split

# Modelling
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

# Evaluation
from sklearn.metrics import classification_report, f1_score

  from .autonotebook import tqdm as notebook_tqdm


## Load Datasets

In [2]:
# Load datasets
training_data = pd.read_pickle('./Data/Clean Training Data/training_data_multiclass.pkl')

df_raw = {}

# write to dict
for dataset in training_data:
    df_raw[dataset] = training_data[dataset].copy() #.sample(n=10000, random_state=123)

In [3]:
# for multiclass: convert string label names into integer IDs

# df_raw['davidson2017'].label.replace({"hateful": 0, "offensive": 1, "neither": 2}, inplace = True)
# df_raw['founta2018'].label.replace({"hateful": 0, "abusive": 1, "normal": 2, "spam": 3}, inplace = True)

# Ensure the column is of type 'string'
df_raw['davidson2017']['label'] = df_raw['davidson2017']['label'].astype(str)
df_raw['founta2018']['label'] = df_raw['founta2018']['label'].astype(str)

# Replace integer values in a pandas Series
df_raw['davidson2017']['label'] = df_raw['davidson2017']['label'].replace({"hateful": "0", "offensive": "1", "neither": "2"})
df_raw['founta2018']['label'] = df_raw['founta2018']['label'].replace({"hateful": "0", "abusive": "1", "normal": "2", "spam": "3"})

# Now, you should have the desired integer replacements in the StringArray column.

for dataset in df_raw:
    print(dataset)
    print(df_raw[dataset].groupby('label').text.count())
    print()

davidson2017
label
0     1430
1    19190
2     4163
Name: text, dtype: int64

founta2018
label
0     4965
1    27150
2    53851
3    14030
Name: text, dtype: int64



In [4]:
# Split each dataset into training and validation set
df_train, df_valtest, df_val, df_test = {}, {}, {}, {}

for dataset in df_raw:
    df_train[dataset], df_valtest[dataset] = train_test_split(df_raw[dataset], test_size=0.2, stratify=df_raw[dataset].label, random_state=123)
    df_val[dataset], df_test[dataset] = train_test_split(df_valtest[dataset], test_size=0.5, stratify=df_valtest[dataset].label, random_state=123)

In [5]:
# Split up text and label columns in dataframes into series for each dataset
train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = {}, {}, {}, {}, {}, {}

for dataset in df_raw:
    train_texts[dataset] = df_train[dataset].text.astype("string").tolist()
    val_texts[dataset] = df_val[dataset].text.astype("string").tolist()
    test_texts[dataset] = df_test[dataset].text.astype("string").tolist()
    
    train_labels[dataset] = df_train[dataset].label.tolist()
    val_labels[dataset] = df_val[dataset].label.tolist()
    test_labels[dataset] = df_test[dataset].label.tolist()

## Tokenize Texts

In [6]:
# import tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# add special tokens for URLs, emojis and mentions (--> see pre-processing)
special_tokens_dict = {'additional_special_tokens': ['[USER]','[EMOJI]','[URL]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [7]:
# Tokenize text series for each dataset
train_encodings, val_encodings, test_encodings = {}, {}, {}

for dataset in df_raw:
    train_encodings[dataset] = tokenizer(train_texts[dataset], truncation=True, padding=True)
    val_encodings[dataset] = tokenizer(val_texts[dataset], truncation=True, padding=True)
    test_encodings[dataset] = tokenizer(test_texts[dataset], truncation=True, padding=True)

## Create PyTorch Datasets 

In [8]:
class HateDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset, val_dataset, test_dataset = {}, {}, {}
    
for dataset in df_raw:
    train_dataset[dataset] = HateDataset(train_encodings[dataset], train_labels[dataset])
    val_dataset[dataset] = HateDataset(val_encodings[dataset], val_labels[dataset])
    test_dataset[dataset] = HateDataset(test_encodings[dataset], test_labels[dataset])

## Train Unweighted Multiclass Models

In [18]:
# check CUDA availability
# print(torch.cuda.is_available())
# print(torch.cuda.get_device_name())
# print(torch.cuda.device_count(), 'GPUs')
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
device

device(type='cpu')

In [19]:
# Define training arguments, matching weighted binary model (for which we did hyperparameter tuning)
training_args = {}

for dataset in df_raw:
    training_args[dataset] = TrainingArguments(
        save_steps = 2500,
        output_dir='./Models/BERT_{}_multiclass/Checkpoints'.format(dataset), # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        evaluation_strategy = 'epoch',
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        learning_rate = 5e-5,
        seed = 123
    )

In [20]:
# define explicit model initialisation. Different functions for each dataset to have correct number of labels (could be more elegant)
def model_init_D17(dataset):
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
    # resize to match tokenizer length with special tokens added above
    model.resize_token_embeddings(len(tokenizer))
    return model

def model_init_F18(dataset):
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
    # resize to match tokenizer length with special tokens added above
    model.resize_token_embeddings(len(tokenizer))
    return model

In [21]:
# Instantiate trainer objects for each dataset
trainer = {}

for dataset in df_raw:
    if dataset == 'davidson2017':
        trainer[dataset] = Trainer(
            args=training_args[dataset],                  
            train_dataset=train_dataset[dataset],         
            eval_dataset=val_dataset[dataset],            
            model_init = model_init_D17
        )
    if dataset == 'founta2018':
        trainer[dataset] = Trainer(
            args=training_args[dataset],                  
            train_dataset=train_dataset[dataset],         
            eval_dataset=val_dataset[dataset],            
            model_init = model_init_F18
        )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacty of 2.00 GiB of which 14.51 MiB is free. Of the allocated memory 1.16 GiB is allocated by PyTorch, and 101.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [13]:
# Train models for each dataset
for dataset in trainer:
    print('Training multiclass {} BERT model'.format(dataset))
    trainer[dataset].train()

Training multiclass davidson2017 BERT model


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 2.00 GiB of which 14.51 MiB is free. Of the allocated memory 1.16 GiB is allocated by PyTorch, and 101.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Save Model and Tokenizer

In [14]:
for dataset in trainer:    
    trainer[dataset].save_model('./Models/BERT_{}_multiclass/Final'.format(dataset))
    tokenizer.save_pretrained('./Models/BERT_{}_multiclass/Final'.format(dataset))

## Reload Models
So that models can be evaluated on test set even after kernel resets

In [9]:
# load fine-tuned models
models = {}

for dataset in ['davidson2017','founta2018']:
        models[dataset] = BertForSequenceClassification.from_pretrained("./Models/BERT_{}_multiclass/Final".format(dataset))

In [10]:
# Instantiate trainer objects for each model (already fine-tuned so no longer necessary to specify training and eval data)
# output directory is redundant because there is no further training but needs to be specified anyway

trainer = {}

for model in models:
    trainer[model] = Trainer(
        model=models[model],         
        args=TrainingArguments(
            output_dir='./Models/BERT_{}_multiclass/Test'.format(model),
            per_device_eval_batch_size = 64)
)

## Evaluate Models on Test Data

In [None]:
# Evaluate each model on its corresponding test set

results = {}

for dataset in trainer:
    print('Evaluating multiclass {} BERT model on test data'.format(dataset))
    results[dataset] = trainer[dataset].predict(test_dataset[dataset])
    for metric in results[dataset].metrics:
        print(metric, results[dataset].metrics['{}'.format(metric)])
    print()

In [12]:
# write predictions to series
pred_labels={}

for dataset in trainer:

    preds=[]
    
    for row in results[dataset][0]:
        preds.append(int(np.argmax(row)))
    
    pred_labels[dataset] = pd.Series(preds)

# print classification reports for each model

for dataset in trainer:
        print(dataset.upper(), 'multiclass')
        print(classification_report(test_labels[dataset],pred_labels[dataset]))
        print()

DAVIDSON2017
              precision    recall  f1-score   support

           0       0.49      0.37      0.42       143
           1       0.93      0.96      0.95      1919
           2       0.91      0.87      0.89       417

    accuracy                           0.91      2479
   macro avg       0.78      0.73      0.75      2479
weighted avg       0.90      0.91      0.91      2479


FOUNTA2018
              precision    recall  f1-score   support

           0       0.53      0.40      0.45       497
           1       0.85      0.93      0.89      2715
           2       0.86      0.86      0.86      5385
           3       0.62      0.56      0.59      1403

    accuracy                           0.82     10000
   macro avg       0.72      0.69      0.70     10000
weighted avg       0.81      0.82      0.81     10000




In [13]:
# f1 scores
for dataset in trainer:
        print(dataset.upper())
        for average in ['micro', 'macro', 'weighted']:
            print('{} F1 score: {:.2%}'.format(average, f1_score(test_labels[dataset],pred_labels[dataset], average=average)))
        print()

DAVIDSON2017
micro F1 score: 91.09%
macro F1 score: 75.25%
weighted F1 score: 90.70%

FOUNTA2018
micro F1 score: 81.67%
macro F1 score: 69.92%
weighted F1 score: 81.21%



In [14]:
for dataset in trainer:
        print(dataset.upper())
        print(pred_labels[dataset].value_counts())
        print()

DAVIDSON2017
1    1971
2     399
0     109
dtype: int64

FOUNTA2018
2    5369
1    2977
3    1280
0     374
dtype: int64

