# Welcome to ML Blog Tutorial 2 (Textual Data)

This tutorial will present the application of four interpretability techniques in a machine learning task.

## Setup
First, let's install the few libraries we need!

In [None]:
!pip install lime
!pip install eli5
!pip install mlxtend==0.18.0
!pip install transformers
!pip install pip install scikit-multilearn
!pip install transformers-interpret
!pip install bertviz
!pip install anchor-exp

## Initialise

Then, we will fine-tune a transformer model (BERT) to detect hate speech in short texts! We will use the [ETHOS dataset](https://link.springer.com/article/10.1007/s40747-021-00608-2).

In [None]:
import lime.lime_text
import numpy as np
import numpy as np
import pandas as pd
import urllib
import re
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import maxabs_scale

First, we will load our data

In [None]:
# Download the binary ETHOS dataset (semicolon-separated CSV).
url = "https://raw.githubusercontent.com/intelligence-csd-auth-gr/Ethos-Hate-Speech-Dataset/master/ethos/ethos_data/Ethos_Dataset_Binary.csv"
ethos = pd.read_csv(url, sep=';')

# Inputs are the raw comments; targets are the `isHate` scores binarised at 0.5.
x = ethos['comment'].to_numpy()
y = (ethos['isHate'].to_numpy() >= 0.5).astype(int).tolist()

We split our data into train/test/validation sets

In [None]:
from sklearn.model_selection import train_test_split
# Step 1: hold out 10% of the data as the test set, stratified on the label.
train_texts_o, test_texts, train_labels_o, test_labels = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# Step 2: carve the validation set out of the remaining pool. The fraction is
# chosen so that the validation set amounts to 5% of the *full* dataset.
size = 0.05 * len(y) / len(train_labels_o)
train_texts, validation_texts, train_labels, validation_labels = train_test_split(
    list(train_texts_o),
    train_labels_o,
    test_size=size,
    stratify=train_labels_o,
    random_state=42,
)

We fine-tune our BERT (base, cased) model

In [None]:
from transformers import Trainer, TrainingArguments, utils, BertTokenizerFast
from transformers.models.bert import BertForSequenceClassification
from torch.utils.data import Dataset as TDataset
from torch import tensor

# Trainer configuration: evaluate, checkpoint and log once per epoch, and
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
	output_dir='./results',
	num_train_epochs=3,
	warmup_steps=200,
	evaluation_strategy='epoch',
	save_strategy='epoch',
	logging_strategy='epoch',
	log_level='warning',
	load_best_model_at_end=True,
)

from transformers import BertTokenizerFast
# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

class myDataset(TDataset):
	"""Torch dataset that tokenizes raw texts once and serves tensors per item.

	Args:
		encodings: iterable of raw texts (despite the name, these are not yet encoded).
		labels: sequence of labels, indexable by position.
		tokenizer: callable invoked as ``tokenizer(texts, truncation=True, padding=True)``;
			its result must expose ``.items()`` yielding per-field sequences.
	"""

	def __init__(self, encodings, labels, tokenizer):
		texts = list(encodings)
		# Tokenize the whole collection up front, with truncation and padding enabled.
		self.encodings = tokenizer(texts, truncation=True, padding=True)
		self.labels = labels

	def __len__(self):
		"""Number of examples, as given by the label sequence."""
		return len(self.labels)

	def __getitem__(self, idx):
		"""Return a dict of tensors for example ``idx``: every tokenizer field plus ``labels``."""
		sample = {}
		for field, values in self.encodings.items():
			sample[field] = tensor(values[idx])
		sample['labels'] = tensor(self.labels[idx])
		return sample

# Wrap each split in a torch dataset (tokenization happens inside myDataset).
train_dataset, validation_dataset, test_dataset = (
	myDataset(split_texts, split_labels, tokenizer)
	for split_texts, split_labels in (
		(train_texts, train_labels),
		(validation_texts, validation_labels),
		(test_texts, test_labels),
	)
)

# Attention weights are requested because the BertViz section visualises them;
# hidden states are switched off.
model = BertForSequenceClassification.from_pretrained(
	"bert-base-cased",
	output_attentions=True,
	output_hidden_states=False,
)
trainer = Trainer(
	model=model,
	args=training_args,
	train_dataset=train_dataset,
	eval_dataset=validation_dataset,
)
trainer.train()

Let's see the performance of the model on the train and test sets

In [None]:
# Predict every training-pool text one at a time. Each prediction also returns
# the attention maps (the model was built with output_attentions=True).
predictions = []
# NOTE: despite its name, this list stores the *shape* of an attention map
# (last returned layer, first batch item, first head), not hidden states.
hidden_states = []
for instance in train_texts_o:
    temp_dataset = myDataset([instance], [0], tokenizer)  # dummy label, ignored here
    output = trainer.predict(temp_dataset)
    logits, attentions = output[0]
    predictions.append(list(logits[0]))
    hidden_states.append(attentions[-1][0][0].shape)

# The stored values are raw logits, so the predicted class is the argmax over
# both of them. The previous rule (`logit[1] >= 0`) ignored logit[0] and could
# disagree with the argmax/softmax decision used by the explainers' predictors.
train_predictions = [int(np.argmax(instance_logits)) for instance_logits in predictions]

In [None]:
from sklearn.metrics import balanced_accuracy_score, f1_score
# Score the predictions on the training pool (train + validation texts).
train_balanced_accuracy = balanced_accuracy_score(train_labels_o, train_predictions)
train_f1_weighted = f1_score(train_labels_o, train_predictions, average='weighted')
print('Balanced Accuracy: ', train_balanced_accuracy)
print('F1 weighted:       ', train_f1_weighted)

In [None]:
# Predict every test text one at a time. Each prediction also returns the
# attention maps (the model was built with output_attentions=True).
test_predictions = []
# NOTE: despite its name, this list stores the *shape* of an attention map
# (last returned layer, first batch item, first head), not hidden states.
test_hidden_states = []
for instance in test_texts:
    temp_dataset = myDataset([instance], [0], tokenizer)  # dummy label, ignored here
    output = trainer.predict(temp_dataset)
    logits, attentions = output[0]
    test_predictions.append(list(logits[0]))
    test_hidden_states.append(attentions[-1][0][0].shape)

# The stored values are raw logits, so the predicted class is the argmax over
# both of them. The previous rule (`logit[1] >= 0`) ignored logit[0] and could
# disagree with the argmax/softmax decision used by the explainers' predictors.
test_predictions2 = [int(np.argmax(instance_logits)) for instance_logits in test_predictions]

In [None]:
from sklearn.metrics import balanced_accuracy_score, f1_score
# Score the predictions on the held-out test set.
test_balanced_accuracy = balanced_accuracy_score(test_labels, test_predictions2)
test_f1_weighted = f1_score(test_labels, test_predictions2, average='weighted')
print('Balanced Accuracy: ', test_balanced_accuracy)
print('F1 weighted:       ', test_f1_weighted)

## Explain an example

Finally, we will use a few techniques to explain a single test instance (`test_texts[46]`)

In [None]:
from lime.lime_text import LimeTextExplainer
from transformers_interpret import SequenceClassificationExplainer
from anchor.anchor_text import AnchorText
from bertviz import model_view, head_view

We will start with LIME! LIME provides weights (feature importance) as explanations.

In [None]:
# Select the test comment that every explainer below will work on.
# NOTE(review): `+ ''` presumably coerces the NumPy string element into a plain
# Python str — confirm before simplifying.
instance = test_texts[46] + ''
print(instance)

In [None]:
from scipy.special import softmax

def predictor(texts):
  """Return class probabilities for a collection of texts (LIME classifier function).

  The texts are scored in at most 100 chunks. Each chunk is tokenized, passed
  through ``trainer.predict``, and its logits are turned into probabilities
  with a row-wise softmax.

  Args:
    texts: sequence of raw text strings.

  Returns:
    NumPy array with one row of class probabilities per input text, in input
    order. An empty input yields an empty array.
  """
  texts = list(texts)
  all_probabilities = []
  # Never ask for more chunks than there are texts: np.array_split would
  # otherwise produce empty chunks, which cannot be tokenized/predicted.
  n_chunks = max(1, min(100, len(texts)))
  splits = np.array_split(texts, n_chunks)
  for split in splits:
    if len(split) == 0:
      continue
    split_labels = [0] * len(split)  # dummy labels; only the logits are used
    dataset = myDataset(split, split_labels, tokenizer)
    # predictions come back as (logits, attentions); attentions are not needed here
    logits, _ = trainer.predict(dataset)[0]
    probabilities = softmax(logits, axis=1)
    all_probabilities.extend(probabilities)
  return np.array(all_probabilities)

# Tokens are split on whitespace; a raw string is used so that `\s` reaches the
# regex engine without triggering Python's invalid-escape-sequence warning.
# bow=False makes LIME treat repeated words at different positions separately.
lime_explainer = LimeTextExplainer(
    class_names=['No Hate Speech', 'Hate Speech'],
    split_expression=r'\s+',
    bow=False,
)
exp = lime_explainer.explain_instance(instance, predictor, num_samples=1000)
exp.show_in_notebook()

Then, we will use Anchors, which provides an if-then rule (the anchor) as the interpretation! (This one is very slow.)

In [None]:
from scipy.special import softmax
def predictor_anchors(texts):
  """Return the predicted class index for each text (Anchors classifier function).

  Unlike a probability-returning predictor, this yields hard labels: the
  position of the largest softmax probability in each row.

  Args:
    texts: sequence of raw text strings.

  Returns:
    NumPy array of integer class indices, one per input text.
  """
  predicted_classes = []
  # A single chunk: all texts are scored in one pass.
  for chunk in np.array_split(texts, 1):
    dummy_labels = [0] * len(chunk)
    chunk_dataset = myDataset(chunk, dummy_labels, tokenizer)
    # predictions come back as (logits, attentions); attentions are not needed here
    logits, _ = trainer.predict(chunk_dataset)[0]
    class_probabilities = softmax(logits, axis=1)
    for row in class_probabilities:
      predicted_classes.append(np.argmax(row))
  return np.array(predicted_classes)

In [None]:
import spacy
# spaCy pipeline handed to AnchorText.
nlp = spacy.load('en_core_web_sm')
explainer = AnchorText(nlp, ['No Hate Speech','Hate Speech'], use_unk_distribution=True, mask_string='love')
text = instance

# Run the model once and derive both labels from the same prediction
# (previously the model was invoked twice on the same text).
predicted_index = int(predictor_anchors([text])[0])
pred = explainer.class_names[predicted_index]
# With two classes, the other label is simply the complement index.
alternative = explainer.class_names[1 - predicted_index]
print('Prediction: %s' % pred)
exp = explainer.explain_instance(text, predictor_anchors, threshold=0.90)

In [None]:
# Textual report of the anchor: the rule, its precision, and example texts
# where the rule holds, grouped by whether the model keeps or flips its prediction.
anchor_rule = ' AND '.join(exp.names())
print(
    'Anchor: %s' % anchor_rule,
    'Precision: %.2f' % exp.precision(),
    '',
    'Examples where anchor applies and model predicts %s:' % pred,
    '',
    '\n'.join([example[0] for example in exp.examples(only_same_prediction=True)]),
    '',
    'Examples where anchor applies and model predicts %s:' % alternative,
    '',
    '\n'.join([example[0] for example in exp.examples(partial_index=0, only_different_prediction=True)]),
    sep='\n',
)

In [None]:
# Display the anchor explanation inline in the notebook.
exp.show_in_notebook()

Then, we will use Integrated Gradients (IG)! Like LIME, IG provides weights (feature importance) as explanations.



In [None]:
ig_explainer = SequenceClassificationExplainer(trainer.model, tokenizer, custom_labels=['No Hate Speech','Hate Speech'])
# Attribute with respect to class index 1 ('Hate Speech' in custom_labels).
# The result is now kept instead of being discarded; the first and last entries
# are dropped (presumably the special start/end tokens — TODO confirm).
ig_attributions = ig_explainer(instance, index=1, n_steps=100)[1:-1]
ig_explainer.visualize()

Finally, we will use BertViz to visualise the Attention Information! We start with the head view.

In [None]:
# Silence the standard transformers warnings for this section.
utils.logging.set_verbosity_error()

# Run the model on the single instance to obtain its attention maps.
instance_dataset = myDataset([instance], [0], tokenizer)
outputs = trainer.predict(instance_dataset)
layer_attentions = outputs[0][1]  # predictions are (logits, attentions)
attention = tensor(np.array(list(layer_attentions)))

# Token labels for the visualisation, with markers for the two special tokens.
# NOTE(review): assumes the instance is short enough that the dataset's
# truncation did not drop tokens; otherwise labels and attention sizes diverge.
tokens = ['CLS', *tokenizer.tokenize(instance), 'SEP']
head_view(attention, tokens, prettify_tokens=True)

And this is how BertViz visualizes attention through model_view

In [None]:
# Drop the first and last positions (the 'CLS' / 'SEP' entries of `tokens`)
# from both attention axes and from the token list before plotting.
inner_attention = attention[:, :, :, 1:-1, 1:-1]
inner_tokens = tokens[1:-1]
model_view(inner_attention, inner_tokens, display_mode='light')