In [None]:
%load_ext autoreload
%autoreload 2


import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, f1_score
import pickle

from data_utils import HateDataset, get_results
from utils import initialize_seeds

In [None]:
# Seed the random number generators through the project helper in utils.py
# (presumably for reproducibility — confirm which libraries it seeds).
initialize_seeds()

In [None]:
# Location of the serialized dataset that is loaded with torch.load below.
filename = "./data/targetData.pt"

In [None]:
# torch.load unpickles the file, so only load files from a trusted source.
# Later cells index the result as data[1][<dataset name>].
# NOTE(review): recent torch releases default to weights_only=True, which may
# reject non-tensor objects — confirm against the installed version.
data = torch.load(filename)

## Get predictions

In [None]:
# A single tokenizer, taken from the davidson2017 checkpoint, is reused for
# every model and dataset in this notebook.
# NOTE(review): assumes the founta2018 checkpoints share the same vocabulary — confirm.
tokenizer = BertTokenizerFast.from_pretrained('hatecheck-experiments/Models/BERT_davidson2017_weighted/Final')

In [None]:
# Dataset names; each one selects both a checkpoint directory and the
# matching entry of data[1] in the evaluation loops below.
model_names = ["davidson2017", "founta2018"]

In [None]:
# Evaluate each source-dataset model on its own dataset and keep the first
# element returned by get_results under the dataset name.
# NOTE(review): data[1] is presumably the validation split (see the message
# printed below) — confirm against the code that wrote targetData.pt.
results = {}
for name in model_names:
    checkpoint_dir = f"./hatecheck-experiments/Models/BERT_{name}_weighted/Final"
    scratch_dir = f"./hatecheck-experiments/Models/BERT_{name}_weighted/test"

    model = BertForSequenceClassification.from_pretrained(checkpoint_dir)
    eval_args = TrainingArguments(
        output_dir=scratch_dir,
        per_device_eval_batch_size=64,
    )
    trainer = Trainer(model=model, args=eval_args)

    print(f"Evaluating model pretrained on {name} on {name} validation set")
    predictions, _ = get_results(trainer, data[1][name])
    results[name] = predictions

In [None]:
# Repeat the evaluation with the "*_hatecheck_weighted" checkpoints and store
# the outputs under "BERT-R_<dataset>" next to the earlier results.
# NOTE(review): both models share one output_dir here, unlike the per-model
# directory used in the previous loop — confirm this is intentional.
for name in model_names:
    checkpoint_dir = f"./hatecheck-experiments/Models/BERT_{name}_hatecheck_weighted/Final"

    model = BertForSequenceClassification.from_pretrained(checkpoint_dir)
    eval_args = TrainingArguments(
        output_dir="./models/BERT_hateCheck_weighted/test",
        per_device_eval_batch_size=64,
    )
    trainer = Trainer(model=model, args=eval_args)

    print(f"Evaluating model finetuned on random split hatecheck on {name} validation set")
    predictions, _ = get_results(trainer, data[1][name])
    results[f"BERT-R_{name}"] = predictions

## Get samples with the worst degradation

In [None]:
# Cache the prediction outputs on disk so the analysis cells below can be
# re-run without evaluating the models again.
with open("./results/iid_preds.pkl", mode="wb") as out_file:
    pickle.dump(results, out_file)

In [None]:
# Reload the cached prediction outputs. The context manager closes the file
# handle, which the bare open(...) call left open.
# pickle can execute arbitrary code while loading: only read files that were
# written by this notebook.
with open("./results/iid_preds.pkl", "rb") as file:
    results = pickle.load(file)

In [None]:
from scipy.special import softmax

def get_non_hate_probs(result):
    """Return the softmax probability of class 0 for every row.

    Args:
        result: indexable whose first element holds the per-class scores,
            one row per sample (softmax is taken along axis 1).

    Returns:
        The first column of the row-wise softmax of ``result[0]``.
    """
    scores = result[0]
    class_probs = softmax(scores, axis=1)
    return class_probs[:, 0]

In [None]:
def get_deltas_df(results, data_name):
    """Tabulate the per-sample change in non-hate probability.

    Args:
        results: mapping that holds the outputs stored under ``data_name``
            (before fine-tuning) and ``f"BERT-R_{data_name}"`` (after
            fine-tuning). Each entry is indexed as ``[0]`` for the scores
            and ``[1]`` for the labels.
        data_name: dataset key, e.g. ``"davidson2017"``.

    Returns:
        DataFrame with a ``delta`` column (non-hate probability after minus
        before) and a ``label`` column taken from ``results[data_name][1]``.
    """
    p_non_hate_before = get_non_hate_probs(results[data_name])
    p_non_hate_after = get_non_hate_probs(results[f"BERT-R_{data_name}"])
    # squeeze() collapses a single-sample result to a 0-d array, which the
    # DataFrame constructor rejects; atleast_1d keeps one row per sample.
    delta = np.atleast_1d((p_non_hate_after - p_non_hate_before).squeeze())
    deltas = pd.DataFrame(delta, columns=["delta"])
    deltas["label"] = results[data_name][1]
    return deltas

In [None]:
# Change in non-hate probability (after minus before fine-tuning) and label for davidson2017.
deltas_davidson = get_deltas_df(results, "davidson2017")

In [None]:
# Change in non-hate probability (after minus before fine-tuning) and label for founta2018.
deltas_founta = get_deltas_df(results, "founta2018")

## Largest degradations for hateful samples

In [None]:
def get_samples(results, data, data_name, tokenizer, idxs):
    """Print text, gold label and gold-label probabilities for chosen samples.

    Args:
        results: mapping with entries ``data_name`` (before fine-tuning) and
            ``f"BERT-R_{data_name}"`` (after); each entry is indexed as
            ``[0]`` for per-sample scores and ``[1]`` for labels.
        data: mapping from dataset name to an indexable of samples, each
            exposing ``"input_ids"``.
        data_name: dataset key used for both ``results`` and ``data``.
        tokenizer: object whose ``decode`` turns input ids back into text.
        idxs: positions of the samples to print.

    Returns:
        None; everything is written to stdout.
    """
    after_key = f"BERT-R_{data_name}"
    # One record per requested sample: (text, gold label, scores before, scores after).
    records = [
        (
            tokenizer.decode(data[data_name][idx]["input_ids"]),
            results[data_name][1][idx],
            results[data_name][0][idx],
            results[after_key][0][idx],
        )
        for idx in idxs
    ]

    for text, gold, scores_before, scores_after in records:
        verdict = "hateful" if gold == 1 else "non-hateful"
        print("Sample:")
        # Keep only the text that precedes the first padding token.
        print(text.split("[PAD")[0])
        print(f"Gold label: {verdict}")
        print(f"Before fine-tuning prob: {softmax(scores_before)[gold]}")
        print(f"After fine-tuning prob: {softmax(scores_after)[gold]}")
        print()

In [None]:
# Hateful (label 1) davidson2017 samples whose non-hate probability rose the
# most after fine-tuning.
idxs = (
    deltas_davidson.loc[deltas_davidson["label"] == 1]
    .nlargest(5, "delta")
    .index
    .to_list()
)

In [None]:
# Print the selected davidson2017 samples with their gold-label probability
# before and after fine-tuning.
get_samples(results, data[1], "davidson2017", tokenizer, idxs)

In [None]:
# Hateful (label 1) founta2018 samples whose non-hate probability rose the
# most after fine-tuning.
idxs = (
    deltas_founta.loc[deltas_founta["label"] == 1]
    .nlargest(5, "delta")
    .index
    .to_list()
)

In [None]:
# Print the selected founta2018 samples with their gold-label probability
# before and after fine-tuning.
get_samples(results, data[1], "founta2018", tokenizer, idxs)

## Largest degradations for non-hateful samples

In [None]:
# Non-hateful (label 0) davidson2017 samples whose non-hate probability
# dropped the most after fine-tuning.
idxs = (
    deltas_davidson.loc[deltas_davidson["label"] == 0]
    .nsmallest(5, "delta")
    .index
    .to_list()
)

In [None]:
# Print the selected davidson2017 samples with their gold-label probability
# before and after fine-tuning.
get_samples(results, data[1], "davidson2017", tokenizer, idxs)

In [None]:
# Non-hateful (label 0) founta2018 samples whose non-hate probability
# dropped the most after fine-tuning.
idxs = (
    deltas_founta.loc[deltas_founta["label"] == 0]
    .nsmallest(5, "delta")
    .index
    .to_list()
)

In [None]:
# Print the selected founta2018 samples with their gold-label probability
# before and after fine-tuning.
get_samples(results, data[1], "founta2018", tokenizer, idxs)

## Largest improvements for hateful samples

In [None]:
# Hateful (label 1) davidson2017 samples whose non-hate probability dropped
# the most after fine-tuning.
idxs = (
    deltas_davidson.loc[deltas_davidson["label"] == 1]
    .nsmallest(5, "delta")
    .index
    .to_list()
)

In [None]:
# Print the selected davidson2017 samples with their gold-label probability
# before and after fine-tuning.
get_samples(results, data[1], "davidson2017", tokenizer, idxs)

In [None]:
# Hateful (label 1) founta2018 samples whose non-hate probability dropped
# the most after fine-tuning.
idxs = (
    deltas_founta.loc[deltas_founta["label"] == 1]
    .nsmallest(5, "delta")
    .index
    .to_list()
)

In [None]:
# Print the selected founta2018 samples with their gold-label probability
# before and after fine-tuning.
get_samples(results, data[1], "founta2018", tokenizer, idxs)

## Largest improvements for non-hateful samples

In [None]:
# Non-hateful (label 0) davidson2017 samples whose non-hate probability rose
# the most after fine-tuning.
idxs = (
    deltas_davidson.loc[deltas_davidson["label"] == 0]
    .nlargest(5, "delta")
    .index
    .to_list()
)

In [None]:
# Print the selected davidson2017 samples with their gold-label probability
# before and after fine-tuning.
get_samples(results, data[1], "davidson2017", tokenizer, idxs)

In [None]:
# Non-hateful (label 0) founta2018 samples whose non-hate probability rose
# the most after fine-tuning.
idxs = (
    deltas_founta.loc[deltas_founta["label"] == 0]
    .nlargest(5, "delta")
    .index
    .to_list()
)

In [None]:
# Print the selected founta2018 samples with their gold-label probability
# before and after fine-tuning.
get_samples(results, data[1], "founta2018", tokenizer, idxs)

## Manual inspection

In [None]:
# Load the four checkpoints compared below and move each onto the GPU:
# D / F are the davidson2017 / founta2018 models, and the "_R" variants are
# the corresponding "*_hatecheck_weighted" checkpoints.
_load_checkpoint = BertForSequenceClassification.from_pretrained

BERT_D = _load_checkpoint("./hatecheck-experiments/Models/BERT_davidson2017_weighted/Final").cuda()
BERT_F = _load_checkpoint("./hatecheck-experiments/Models/BERT_founta2018_weighted/Final").cuda()
BERT_D_R = _load_checkpoint("./hatecheck-experiments/Models/BERT_davidson2017_hatecheck_weighted/Final").cuda()
BERT_F_R = _load_checkpoint("./hatecheck-experiments/Models/BERT_founta2018_hatecheck_weighted/Final").cuda()

In [None]:
from transformers import TextClassificationPipeline

def inspect_model(model, sentence):
    """Print the scores ``model`` assigns to every class for ``sentence``.

    A TextClassificationPipeline is built on each call from ``model`` and the
    notebook-level ``tokenizer``, with ``device=0`` and
    ``return_all_scores=True``.

    Args:
        model: sequence-classification model to query.
        sentence: raw text to classify.

    Returns:
        None; the pipeline output is printed.
    """
    classifier = TextClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True,
        device=0,
    )
    # Example output shape:
    # [[{'label': 'NEGATIVE', 'score': 0.0001}, {'label': 'POSITIVE', 'score': 0.9999}]]
    scores = classifier(sentence)
    print(scores)

In [None]:
# Models to compare, in the order their scores are printed by the next cell.
models = [BERT_D, BERT_F, BERT_D_R, BERT_F_R]

In [None]:
# Score one hand-picked sentence with every model. A plain loop is used
# because inspect_model only prints and returns None; the earlier list
# comprehension built and displayed a throwaway [None, None, None, None].
for m in models:
    inspect_model(m, "or a southern bitch like me who will blow a persons heart out of their back w / one shot!!!! ")