# EVALUATE MODELS

In [12]:
# Initialise relevant packages

# Basics
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# Preprocessing
import torch
from sklearn.model_selection import train_test_split

# Modelling
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizerFast

# Evaluation
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight

## USER INPUT: Pick Models and Test Sets for Evaluation

In [13]:
# PR: this should be the only place that we specify which models we want to test and which test sets we want to test them on
model_list = ['BERT_combined_unweighted', 'BERT_combined_weighted']
last_model = "120221"
what_pct = 0.01 # what pct of the datasets to use for eval
test_data_list = ['davidson2017', 'dynabench2021','founta2018','combined'] # example

# PR: from here on out, there should be no user input neccessary

In [14]:
class HateDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

## Load Models

In [15]:
# PR: DG, you should load the tokenizer as part of the model here already so you can use it further down

def loadModels(model_list):
    
    """
    Function to load models and return them in the correct format
    
    model_list : list 
            list of model names
    """
    
    models = {}
    for model_name in model_list:
        models[model_name] = BertForSequenceClassification.from_pretrained(f'./Models/{model_name}_{last_model}/Final')
        
       
    trainer = {}

    for model_name in models:
        print(model_name)
        trainer[model_name] = Trainer(
            model=models[model_name],         
            args=TrainingArguments(
                output_dir= f'./Models/{model_name}/Test',
                per_device_eval_batch_size = 64)
    )
        
    return trainer

trainer = loadModels(model_list)

BERT_combined_unweighted
BERT_combined_weighted


***
# 1.) Evaluate on Held-Out Test Sets
## Load Test Sets

In [16]:
# PR: DG, please adapt function to load correct data based on model list (might need simple regex) and then return it in the correct format
# using the full model names and regexing the dataset name will give us a lot more flexibility down the line

def LoadTestSet(test_data_list = test_data_list):
    
    """
    Function to load held-out test sets and return them in the correct format
    
    model_list : list 
            list of model names
    """
    
    test_texts, test_labels, test_encodings, test_dataset = {}, {}, {},{}

    # load Tokenizer
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    
    for dataset in test_data_list:
        data_test = pd.read_csv(f"/home/ec2-user/Projects/v0.1/Data/{dataset}/{dataset}_test.csv",lineterminator="\n",)#training_data[dataset].copy()#.sample(n=100, random_state=123)#df_train[dataset].text.astype("string").tolist()
        data_test = data_test.copy().sample(frac=what_pct, random_state=123)
        test_texts[dataset] = data_test.text.astype("string").tolist()
        test_labels[dataset] = data_test.label.tolist()
        test_encodings[dataset] = tokenizer(test_texts[dataset], truncation=True, padding=True)
        test_dataset[dataset] = HateDataset(test_encodings[dataset], test_labels[dataset])

    
    
    
   
    #print(df_raw[dataset])
    #for dset in dataset:
    #    test_labels[dset] = df_test[dset].label.tolist()
    #    test_texts[dset] = df_test[dset].text.astype("string").tolist()
    #    #print(test_labels[dset])
    #    #print(test_texts[dset])
    #    test_encodings[dset] = tokenizer(test_texts[dset], truncation=True, padding=True)
    #    #print(test_encodings[dset])
    #    test_dataset[dset] = HateDataset(test_encodings[dset], test_labels[dset])

    return test_dataset,test_labels



## Get Model Predictions on Test Sets

In [17]:
def getModelPrediction(trainer, test_dataset,index = None):
    """
    Helper function to get the model's prediction as a dictionnary of results indexed by dataset 
    
    
    trainer : dict
            dict of all the models, each corresponding to one dataset
    test_dataset : dict
            dict of all the test subsets relative to each data set
    
    """
    results = {}
    result_dfs = {}
    print(trainer.keys())

    for dataset in trainer:
        print('Evaluating weighted {} BERT model on test data'.format(dataset))
        results[dataset] = trainer[dataset].predict(test_dataset[dataset])
        
        new_df = pd.DataFrame(np.argmax(results[dataset].predictions,1),columns=[f"pred_BERT_{dataset}_weighted"], index = index )
        result_dfs[dataset]=new_df
        
    return result_dfs

## Analyse Model Performance on Test Sets

In [18]:
def evaluateOnTest(trainer, test_dataset,test_labels):
    """
    Helper function to evaluate on a held out test data 
    
    trainer : dict
            dict of all the models, each corresponding to one dataset
    test_dataset : dict
            dict of all the test subsets relative to each data set
    
    """
    results = {}
    print(test_dataset.keys())
    print(test_dataset)
    
    print(trainer.keys())
    print(trainer)

    for model in trainer:
        for dataset in test_dataset.keys():
            print('Evaluating weighted {} BERT model on test data'.format(dataset))
            results[f"{model}_{dataset}"] = trainer[model].predict(test_dataset[dataset])
            for metric in results[f"{model}_{dataset}"].metrics:
                print(metric, results[f"{model}_{dataset}"].metrics['{}'.format(metric)])
    pred_labels={}

    for model in trainer:
        for dataset in test_dataset.keys():

            preds=[]

            for row in results[f"{model}_{dataset}"][0]:
                preds.append(int(np.argmax(row)))

            pred_labels[f"{model}_{dataset}"] = pd.Series(preds)

    # print classification reports for each model

    for model in trainer:
        for dataset in test_dataset.keys():
            print(f"{model}_{dataset}".upper())
            print(test_labels[dataset],pred_labels[f"{model}_{dataset}"])
            print(classification_report(test_labels[dataset],pred_labels[f"{model}_{dataset}"]))
            print()
    # f1 scores
    for model in trainer:
        for dataset in test_dataset.keys():
            print(f"{model}_{dataset}".upper())
            for average in ['micro', 'macro', 'weighted']:
                print('{} F1 score: {:.2%}'.format(average, f1_score(test_labels[dataset],pred_labels[f"{model}_{dataset}"], average=average)))
            print()
    # distribution of predictions
    for model in trainer:
        for dataset in test_dataset.keys():
            print(f"{model}_{dataset}".upper())
            print(pred_labels[f"{model}_{dataset}"].value_counts())
            print()

# PR: I think the "test set" section could end here and the below code up to "2.) Evaluate on HateCheck" could be moved into the functions

The three lines below actually run the evaluation


In [19]:
test_dataset,test_labels = LoadTestSet()
trainer = loadModels(model_list=model_list)
evaluateOnTest(trainer, test_dataset,test_labels)

BERT_combined_unweighted
BERT_combined_weighted
dict_keys(['davidson2017', 'dynabench2021', 'founta2018', 'combined'])
{'davidson2017': <__main__.HateDataset object at 0x7f9c7fe25c70>, 'dynabench2021': <__main__.HateDataset object at 0x7f9c7fe25f10>, 'founta2018': <__main__.HateDataset object at 0x7f9c7fe25df0>, 'combined': <__main__.HateDataset object at 0x7f9d20014160>}
dict_keys(['BERT_combined_unweighted', 'BERT_combined_weighted'])
{'BERT_combined_unweighted': <transformers.trainer.Trainer object at 0x7f9d200525e0>, 'BERT_combined_weighted': <transformers.trainer.Trainer object at 0x7f9ccd47bfd0>}
Evaluating weighted davidson2017 BERT model on test data


eval_loss 0.021819284185767174
eval_runtime 1.6145
eval_samples_per_second 15.485
Evaluating weighted dynabench2021 BERT model on test data
eval_loss 0.2695775628089905
eval_runtime 3.5612
eval_samples_per_second 8.424
Evaluating weighted founta2018 BERT model on test data
eval_loss 0.08650258928537369
eval_runtime 3.6949
eval_samples_per_second 27.065
Evaluating weighted combined BERT model on test data
eval_loss 0.15805402398109436
eval_runtime 10.4585
eval_samples_per_second 14.821
Evaluating weighted davidson2017 BERT model on test data


eval_loss 0.03562268614768982
eval_runtime 1.5261
eval_samples_per_second 16.382
Evaluating weighted dynabench2021 BERT model on test data
eval_loss 0.43717288970947266
eval_runtime 3.5073
eval_samples_per_second 8.554
Evaluating weighted founta2018 BERT model on test data
eval_loss 0.08916343748569489
eval_runtime 3.7046
eval_samples_per_second 26.993
Evaluating weighted combined BERT model on test data
eval_loss 0.34621480107307434
eval_runtime 10.3815
eval_samples_per_second 14.93
BERT_COMBINED_UNWEIGHTED_DAVIDSON2017
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] 0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    1
21    0
22    0
23    0
24    0
dtype: int64
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        24
           1       1.00      1.00      1.00         1

    accuracy        