# APPLYING BERT CLASSIFIERS TO HATECHECK

In [1]:
# Initialise relevant packages

# Basics
import pandas as pd
import numpy as np
import pickle

# Preprocessing
import torch

# Modelling
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

# Evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

## Load Test Suite

In [2]:
# read the HateCheck test suite from CSV, using the file's first column as the row index
hatecheck_df = pd.read_csv(
    './Data/Test Suite/hatecheck_final_ACL.csv',
    index_col=0,
)
hatecheck_df

Unnamed: 0,functionality,case_id,test_case,label_gold,target_ident,direction,focus_words,focus_lemma,ref_case_id,ref_templ_id,templ_id,case_templ
0,derog_neg_emote_h,1,I hate women.,hateful,women,general,hate,hate,,,1,I hate [IDENTITY_P].
1,derog_neg_emote_h,2,I hate trans people.,hateful,trans people,general,hate,hate,,,1,I hate [IDENTITY_P].
2,derog_neg_emote_h,3,I hate gay people.,hateful,gay people,general,hate,hate,,,1,I hate [IDENTITY_P].
3,derog_neg_emote_h,4,I hate black people.,hateful,black people,general,hate,hate,,,1,I hate [IDENTITY_P].
4,derog_neg_emote_h,5,I hate disabled people.,hateful,disabled people,general,hate,hate,,,1,I hate [IDENTITY_P].
...,...,...,...,...,...,...,...,...,...,...,...,...
3723,spell_leet_h,3897,Gay people are all complete 4ssholes.,hateful,gay people,general,4ssholes,asshole,1232.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3724,spell_leet_h,3898,Black people are all complete 4ssholes.,hateful,black people,general,4ssholes,asshole,1233.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3725,spell_leet_h,3899,Disabled people are all complete 4ssholes.,hateful,disabled people,general,4ssholes,asshole,1234.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3726,spell_leet_h,3900,Muslims are all complete 4ssholes.,hateful,Muslims,general,4ssholes,asshole,1235.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.


## Convert Test Suite to PyTorch Dataset

In [3]:
# separate test suite into a list of texts and a list of integer labels
hatecheck_texts = hatecheck_df.test_case.astype("string").tolist()

# encode gold labels as class ids; .map() turns any label missing from the mapping into NaN,
# so an unexpected label is caught here instead of being passed downstream as a raw string
label_to_id = {'hateful': 1, 'non-hateful': 0}
hatecheck_label_ids = hatecheck_df.label_gold.map(label_to_id)
if hatecheck_label_ids.isna().any():
    unknown_labels = hatecheck_df.label_gold[hatecheck_label_ids.isna()].unique().tolist()
    raise ValueError("Unexpected values in label_gold: {}".format(unknown_labels))
hatecheck_labels = hatecheck_label_ids.astype(int).tolist()

In [4]:
# load the tokenizer stored in the Davidson model directory
# (it includes custom special tokens for URLs, mentions and emojis and is the same across training datasets)
tokenizer_dir = "./Models/BERT_davidson2017_weighted/Final"
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_dir)

In [5]:
# tokenize all test case texts in a single call, with truncation and padding enabled
hatecheck_encodings = tokenizer(
    hatecheck_texts,
    padding=True,
    truncation=True,
)

In [6]:
# wrap tokenized texts and labels as a PyTorch dataset
class HateDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    encodings: mapping (anything exposing .items()) from field name to a
        per-example sequence of values.
    labels: per-example labels, indexable by position.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with its label under 'labels'."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# build the dataset object that the trainers will run predictions on
hatecheck_dataset = HateDataset(
    encodings=hatecheck_encodings,
    labels=hatecheck_labels,
)

## Load Fine-Tuned BERT Models
Load the weighted binary models fine-tuned on the Davidson (2017) and Founta (2018) datasets.

In [7]:
# load the fine-tuned weighted model for each training dataset, keyed as '<dataset>_weighted'
models = {}

for dataset in ('davidson2017', 'founta2018'):
    model_key = '{}_weighted'.format(dataset)
    model_dir = "./Models/BERT_{}_weighted/Final".format(dataset)
    models[model_key] = BertForSequenceClassification.from_pretrained(model_dir)

In [None]:
# Wrap each fine-tuned model in a Trainer so it can be used for prediction
# (models are already fine-tuned, so no training or eval data is passed).
# output_dir is not used for further training here but still has to be specified.

trainer = {}

for model, fine_tuned_model in models.items():
    eval_args = TrainingArguments(
        output_dir='./Models/BERT_{}/Test'.format(model),
        per_device_eval_batch_size=64,
    )
    trainer[model] = Trainer(model=fine_tuned_model, args=eval_args)

## Apply BERT Models to Test Suite

In [None]:
# run every trainer's model over the test suite and keep the prediction output per model
results = {}

for model, model_trainer in trainer.items():
    results[model] = model_trainer.predict(hatecheck_dataset)

## Merge BERT Results with Test Suite

In [10]:
# write predictions to series (one per model, holding 'hateful' / 'non-hateful' strings)
label_pred = {}

for model in trainer:

    # results[model][0] is iterated row by row; the position of the largest value in a row is the predicted class id
    pred_ids = [int(np.argmax(row)) for row in results[model][0]]

    # Build the series on the test suite's own index so that the later column-wise
    # concat with hatecheck_df['case_id'] lines up row by row even if that index is
    # not 0..N-1; this also raises immediately if the number of predictions differs.
    label_pred[model] = pd.Series(
        pred_ids,
        index=hatecheck_df.index,
        name='pred_BERT_{}'.format(model),
    ).replace({1: 'hateful', 0: 'non-hateful'})

In [None]:
# concatenate with test suite case ID column for export
# (one concat over all prediction series instead of growing the frame one column per loop pass)

export_df = pd.concat(
    [hatecheck_df['case_id']] + [label_pred[model] for model in trainer],
    axis=1,
)

export_df

## Export Results

In [12]:
# persist the case IDs and per-model predictions as a pickle file
export_df.to_pickle(path='./Data/Test Suite/results_BERT_weighted.pkl')