In [1]:
import torch
import datasets
import transformers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import Dict
from torch.utils.data import Dataset
from captum.attr import visualization as viz

from captum.attr import (IntegratedGradients, LayerIntegratedGradients,
                         configure_interpretable_embedding_layer,
                         remove_interpretable_embedding_layer)
from transformers import (ElectraForSequenceClassification,
                          ElectraTokenizerFast, EvalPrediction, InputFeatures,
                          Trainer, TrainingArguments)

In [30]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [2]:
# Model and Tokenizer

model = ElectraForSequenceClassification.from_pretrained(
    "google/electra-small-discriminator", num_labels = 2)

tokenizer = ElectraTokenizerFast.from_pretrained(
    "google/electra-small-discriminator", do_lower_case=True)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [17]:
# Load the SST2 dataset from the datasets library
dataset = datasets.load_dataset("glue", "sst2")

# Load the SST2 metric from the datasets library
glue_metric = datasets.load_metric('glue', 'sst2')

# Look at the labels
print("Training set labels: {}".format(set(dataset["train"]["label"])))
print("Validation set labels: {}".format(set(dataset["validation"]["label"])))
print("Test set labels: {}".format(set(dataset["test"]["label"])))

# Explore the dataset
df = pd.DataFrame({"senence": dataset["train"]["sentence"],
                   "label": dataset["train"]["label"]})
pd.options.display.max_colwidth = 0
df.head()

Reusing dataset glue (/home/mirac13/.cache/huggingface/datasets/glue/sst2/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


Training set labels: {0, 1}
Validation set labels: {0, 1}
Test set labels: {-1}


Unnamed: 0,senence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates something rather beautiful about human nature,1
3,remains utterly satisfied to remain the same throughout,0
4,on the worst revenge-of-the-nerds clichés the filmmakers could dredge up,0


In [4]:
# Create Dataset class

class TrainerDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer

        self.tokenized_inputs = tokenizer(inputs, padding=True)   

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return InputFeatures(
            input_ids=self.tokenized_inputs['input_ids'][idx],
            token_type_ids=self.tokenized_inputs['token_type_ids'][idx],
            attention_mask=self.tokenized_inputs['attention_mask'][idx],
            label=self.targets[idx])

In [5]:
train_dataset = TrainerDataset(dataset["train"]["sentence"],
                               dataset["train"]["label"], tokenizer)
eval_dataset = TrainerDataset(dataset["validation"]["sentence"],
                              dataset["validation"]["label"], tokenizer)

### Fine Tune

In [18]:
np.random.seed(42)
torch.manual_seed(42)

training_args = TrainingArguments(
    output_dir="electra_sst2",
    num_train_epochs=3,  # (1 epoch gives slightly lower accuracy)
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=64,    
    dataloader_drop_last=True,  # Make sure all batches are of equal size
)


def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    # The choice of a dataset (task_name) implies metric
    return glue_metric.compute(
        task_name="sst-2",
        preds=preds,
        labels=p.label_ids)


# Instantiate the Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset)

In [7]:
trainer.train()

Step,Training Loss
500,0.355
1000,0.2279
1500,0.172
2000,0.1625
2500,0.1308
3000,0.123


TrainOutput(global_step=3156, training_loss=0.19133564909148126, metrics={'train_runtime': 182.5762, 'train_samples_per_second': 17.286, 'total_flos': 1083750877034496, 'epoch': 3.0})

In [47]:
# Evaluate
for obj in eval_dataset:
    preds = model(input_ids = torch.LongTensor([obj.input_ids]).to(device), \
                  token_type_ids = torch.LongTensor([obj.token_type_ids]).to(device), \
                  attention_mask = torch.LongTensor([obj.attention_mask]).to(device))
    preds = torch.LongTensor([np.argmax(preds.logits.to('cpu').detach().numpy())]).to(device)
    glue_metric.add_batch(predictions = preds, references = torch.LongTensor([obj.label]).to(device))
    
print("Accuracy: {}".format(glue_metric.compute()))

Accuracy: {'accuracy': 0.911348872755063}


### Interpretability


The examples below use two attribution methods from the Captum library:

- **Integrated Gradients** - the method requires configuring interpretation hooks to perform attribution for all three embedding layers in one step, and
- **Layer Integrated Gradients**, computed separately with respect to each of the three layers:
     - model.electra.embeddings.word_embeddings,
     - model.electra.embeddings.token_type_embeddings,
     - model.electra.embeddings.position_embeddings.
     
We will try to find out to what extent, according to these methods, each token has contributed to the model's prediction, or, more precisely, to its shift from the baseline output. Each method requires setting a target class index: 0 for negative or 1 for a positive sentiment. Attribution is performed for each target class separately. Scores will be assigned with regard to the model's output for the selected class.

The shape of attributions is the same as the shape of the inputs parameter of the attribute method.

In [None]:
text = "visually imaginative , thematically instructive and thoroughly \
delightful , it takes us on a roller-coaster ride from innocence to experience \
without even a hint of that typical kiddie-flick sentimentality . "
true_label = 1

[x for x in dataset["validation"] if x["sentence"] == text]

### Helper functions

The functions below construct input tensors for our sample and for a sequence of [PAD] tokens serving as a baseline. We also need to define a forward function running inference on the model. The function will be passed on to objects handling attribution.

Computation with IntegratedGradients requires altering the model by configuring additional layers. For this purpose, the Captum library provides the configure_interpretable_embedding_layer and remove_interpretable_embedding_layer methods. Configuring an interpretable embedding layer modifies the model. A model with interpretable layers requires input of a different shape.

In [None]:
def configure_interpretable_embeddings():
    """Configure interpretable embedding layer"""
    interpretable_embedding1 = configure_interpretable_embedding_layer(
        model, "electra.embeddings.word_embeddings")
    interpretable_embedding2 = configure_interpretable_embedding_layer(
        model, "electra.embeddings.token_type_embeddings")
    interpretable_embedding3 = configure_interpretable_embedding_layer(
        model,"electra.embeddings.position_embeddings")
    return (interpretable_embedding1,
            interpretable_embedding2,
            interpretable_embedding3)


def remove_interpretable_embeddings(interpretable_embedding1, 
                                    interpretable_embedding2, 
                                    interpretable_embedding3):
    """Remove interpretable layer to restore the original model structure"""
    if not \
    type(model.get_input_embeddings()).__name__ == "InterpretableEmbeddingBase":
        return
    remove_interpretable_embedding_layer(model, interpretable_embedding1)
    remove_interpretable_embedding_layer(model, interpretable_embedding2)
    remove_interpretable_embedding_layer(model, interpretable_embedding3)  


def predict_forward_func(input_ids, token_type_ids=None, 
                         position_ids=None, attention_mask=None):
    """Function passed to ig constructors"""
    return model(input_ids=input_ids, 
                 token_type_ids=token_type_ids, 
                 position_ids=position_ids, 
                 attention_mask=attention_mask)[0]  


def prepare_input(text):
    """Prepare ig attribution input: tokenize sample and baseline text."""
    tokenized_text = tokenizer(text, return_tensors="pt", 
                               return_attention_mask=True)
    seq_len = tokenized_text["input_ids"].shape[1]
    position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0)

    # Construct the baseline (a reference sample).
    # A sequence of [PAD] tokens of length equal to that of the processed sample
    ref_text = tokenizer.pad_token * (seq_len - 2) # special tokens
    tokenized_ref_text = tokenizer(ref_text, return_tensors="pt") 
    ref_position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0)

    return (tokenized_text["input_ids"],
            tokenized_text["token_type_ids"], 
            position_ids,
            tokenized_ref_text["input_ids"],
            tokenized_ref_text["token_type_ids"], 
            ref_position_ids,
            tokenized_text["attention_mask"])   


def prepare_input_embed(input_ids, token_type_ids, position_ids,
                        ref_input_ids, ref_token_type_ids, ref_position_ids,
                        attention_mask):
    """Construct input for the modified model"""
    input_ids_embed = interpretable_embedding1.indices_to_embeddings(input_ids)
    ref_input_ids_embed = interpretable_embedding1.indices_to_embeddings(
        ref_input_ids)
    token_type_ids_embed = interpretable_embedding2.indices_to_embeddings(
        token_type_ids)
    ref_token_type_ids_embed = interpretable_embedding2.indices_to_embeddings(
        ref_token_type_ids)
    position_ids_embed = interpretable_embedding3.indices_to_embeddings(
        position_ids)
    ref_position_ids_embed = interpretable_embedding3.indices_to_embeddings(
        ref_position_ids)
    
    return (input_ids_embed, token_type_ids_embed, position_ids_embed, 
            ref_input_ids_embed, ref_token_type_ids_embed, 
            ref_position_ids_embed, attention_mask)


def get_input_data(text):
    input_data = place_on_device(*prepare_input(text))
    input_data_embed = prepare_input_embed(*input_data)   
    return input_data, input_data_embed 


def place_on_device(*tensors):
    tensors_device = []
    for t in tensors:
        tensors_device.append(t.to(device))
    return tuple(tensors_device)  


def ig_attribute(ig, class_index, input_data_embed):
    return ig.attribute(inputs=input_data_embed[0:3],
                        baselines=input_data_embed[3:6],
                        additional_forward_args=(input_data_embed[6]),
                        target = class_index,
                        return_convergence_delta=True,
                        n_steps=200)
    

def lig_attribute(lig, class_index, input_data):
    return lig.attribute(
        inputs=input_data[0], baselines=input_data[3],
        additional_forward_args=(input_data[1], input_data[2], input_data[6]),
        return_convergence_delta=True, target=class_index, n_steps=200)

### Integrated Gradients
To compute attributions with Integrated Gradients we will:

- instantiate the IntegratedGradients class passing the predict_forward_func function as parameter,
- configure interpretable embedding layer,
- prepare input tensors,
- compute attributions,
- remove interpratable embedding layer.
- Calling the get_input_embeddings method of the model helps to find out whether extra layers have been configured.

#### Compute attributions

In [None]:
# Instantiate the IntegratedGradients class
ig = IntegratedGradients(predict_forward_func)

In [None]:
# Configure interpretable embedding layer 
print("Original model's input embeddings:\n {}\n".
      format(model.get_input_embeddings()))
if not \
type(model.get_input_embeddings()).__name__ == "InterpretableEmbeddingBase":
    interpretable_embedding1, interpretable_embedding2,\
    interpretable_embedding3 = configure_interpretable_embeddings()
print("Input embeddings with interpretable layer:\n {}\n".
      format(model.get_input_embeddings()))

# Prepare input
input_data, input_data_embed = get_input_data(text)  

# Compute attributions for both target classes
# class 0 (negative)
attributions_0, approximation_error_0 = ig_attribute(ig, 0, input_data_embed)
# class 1 (positive)
attributions_1, approximation_error_1 = ig_attribute(ig, 1, input_data_embed)

# Remove interpratable embedding layer used by ig attribution
remove_interpretable_embeddings(interpretable_embedding1, 
                                interpretable_embedding2, 
                                interpretable_embedding3)
print("\nInput embeddings with interpretable layer removed:\n {}\n"
.format(model.get_input_embeddings()))

print("\nThe reference sample:\n{}".format(tokenizer.convert_ids_to_tokens(
    input_data[3].clone().detach().to('cpu').numpy().squeeze())))

### Completeness
The Integrated Gradients method satisfies the completeness property. The sum of attributions should be equal, with certain accuracy, to the difference between the model's output for the sample and its output for the selected baseline (in this case a sequence of [PAD] tokens). Increase the n_steps parameter of the attribute method to obtain better accuracy.

In [None]:
def check_completeness(attributions_0, attributions_1):
    input_ids, token_type_ids, position_ids, ref_input_ids,\
    ref_token_type_ids, ref_position_ids, attention_mask = input_data

    # Prediction for the sample
    scores = predict_forward_func(input_ids, token_type_ids,
                                position_ids, attention_mask) 
    # Prediction for the baseline
    ref_scores = predict_forward_func(ref_input_ids, ref_token_type_ids,
                                    ref_position_ids, attention_mask)

    # Put on cpu
    if torch.is_tensor(attributions_0[0]):
        attributions_0 = [x.clone().detach().to('cpu').numpy() 
        for x in attributions_0]
    if torch.is_tensor(attributions_1[0]):
        attributions_1 = [x.clone().detach().to('cpu').numpy() 
        for x in attributions_1]  
    scores = scores.clone().detach().to('cpu').numpy().squeeze()
    ref_scores = ref_scores.clone().detach().to('cpu').numpy().squeeze()    

    # How prediction for the sample differs from baseline prediction  
    diff_from_baseline = scores - ref_scores

    # Sum of attributions
    attributions_sum0 = [x.sum() for x in attributions_0]
    attributions_sum1 = [x.sum() for x in attributions_1]
    attributions_sum = [sum(attributions_sum0), sum(attributions_sum1)]

    # Difference from the baseline output for both classes
    diff = diff_from_baseline - attributions_sum

    # Find out which layers contribute to the score 
    print("Class 0: input tokens attr. sum: {}".format(attributions_sum0[0]))
    print("Classs 0: token type attr. sum: {}".format(attributions_0[1].sum()))
    print("Class 0: position ids attr. sum: {}".format(attributions_0[2].sum()))
    print("Class 1: input tokens attr. sum: {}".format(attributions_1[0].sum()))
    print("Classs 1: token type attr. sum: {}".format(attributions_1[1].sum()))
    print("Class 1: position ids attr. sum: {}".format(attributions_1[2].sum()))

    # Compare sum of attributions with the difference from baseline prediction
    print("\nPrediction for sample: {}".format(scores))
    print("Prediction for baseline: {}".format(ref_scores))
    print("Difference from baseline: {}".format(diff_from_baseline))
    print("Sum of attributions: {}".format(attributions_sum))
    print("\nClass 0:\n score: {}\n reference score: {}\
    \n difference from ref.: {}\n sum of attributions:  {}\
    \n difference from reference - attributions: {}".\
    format(scores[0], ref_scores[0], diff_from_baseline[0], 
            attributions_sum[0], diff[0]))
    print("\nClass 1:\n score: {}\n reference score: {}\
    \n difference from ref.: {}\n sum of attributions:  {}\
    \n difference from reference - attributions: {}".\
    format(scores[1], ref_scores[1], diff_from_baseline[1], 
            attributions_sum[1], diff[1]))
    
    return attributions_0, attributions_1
    
    
attributions_0, attributions_1 = check_completeness(attributions_0,
                                                    attributions_1)

### Layer Integrated Gradients
With Layer Integrated Gradients, attributions are computed with respect to a certain layer. We'll run the algorithm for three layers separately:

- model.electra.embeddings.word_embeddings
- model.electra.embeddings.token_type_embeddings
- model.electra.embeddings.position_embeddings

#### Compute attributions

In [None]:
# Input for lig attributions (model with no special layers configured)
input_data = place_on_device(*prepare_input(text))

# 1. Layer: model.electra.embeddings.word_embeddings
lig_we = LayerIntegratedGradients(
    predict_forward_func, 
    model.electra.embeddings.word_embeddings)
layer_attributions_we_0, _ = lig_attribute(lig_we, 0, input_data)
layer_attributions_we_1, _ = lig_attribute(lig_we, 1, input_data)

# 2. Layer: model.electra.embeddings.token_type_embeddings
lig_tte = LayerIntegratedGradients(
    predict_forward_func,
    model.electra.embeddings.token_type_embeddings)
layer_attributions_tte_0, _ = lig_attribute(lig_tte, 0, input_data)
layer_attributions_tte_1, _ = lig_attribute(lig_tte, 1, input_data)

# 3. Layer: model.electra.embeddings.position_embeddings
lig_pe = LayerIntegratedGradients(
    predict_forward_func, 
    model.electra.embeddings.position_embeddings)
layer_attributions_pe_0, _ = lig_attribute(lig_pe, 0, input_data)
layer_attributions_pe_1, _ = lig_attribute(lig_pe, 1, input_data)

print("Shape of attributions:")
print(layer_attributions_we_0.shape, layer_attributions_we_1.shape)
print(layer_attributions_tte_0.shape, layer_attributions_tte_1.shape)
print(layer_attributions_pe_0.shape, layer_attributions_pe_1.shape)

#### Completeness
Completeness for attributions found for each layer separately

In [None]:
layer_attributions_0, layer_attributions_1 = check_completeness(
    (layer_attributions_we_0, layer_attributions_tte_0, layer_attributions_pe_0),
    (layer_attributions_we_1, layer_attributions_tte_1, layer_attributions_pe_1))

In [None]:
# Compare with IG

# Attributions for input_ids computed with IG and LIG
# word_embeddings: index 0
ig_1 = attributions_1[0].squeeze().sum(1)
lig_1 = layer_attributions_1[0].squeeze().sum(1)

tokens = tokenizer.convert_ids_to_tokens(tokenizer(text)["input_ids"])

range_ig = [x + 0.5 for x in np.arange(len(ig_1))]
range_lig = [x + 0.5 for x in range_ig]
 
plt.rcParams["figure.figsize"] = [12, 6] 
plt.bar(range_ig, ig_1, width=0.5, label='ig')
plt.bar(range_lig, lig_1, width=0.5, label='lig')
plt.xlabel('Token', fontweight='bold')
plt.xticks(list(range(len(lig_1))), tokens, rotation='vertical')
plt.legend()
plt.title("Attributions with IG and LIG for the positive target class.")
plt.show()

In [None]:
# Plot for both target classes
# Attributions assigned to tokens may take opposite values when computed with regard to class 0 and class 1.

# Attributions for word_embeddings: index 0
lig_0 = layer_attributions_0[0].squeeze().sum(1)
lig_1 = layer_attributions_1[0].squeeze().sum(1)

tokens = tokenizer.convert_ids_to_tokens(tokenizer(text)["input_ids"])

plt.rcParams["figure.figsize"] = [12, 6]
plt.bar(list(range(len(lig_0))), lig_0, color='r', alpha=0.5)
plt.bar(list(range(len(lig_1))), lig_1, color='g', alpha=0.5)
plt.xticks(list(range(len(lig_0))), tokens, rotation='vertical')
plt.legend(labels=["Target: negative", "Target: positive"])
plt.xlabel('Token', fontweight='bold')
plt.title("Token attributions for positive and negative target class")
plt.show()

### Visualization

In [None]:
# Helper Functions

def summarize_attributions(attributions):
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    return attributions


def compute_attributions_ig(ig, input_data_embed):
    # Create interpretable layer
    if not type(
        model.get_input_embeddings()).__name__ == "InterpretableEmbeddingBase":    
        interpretable_embedding1, interpretable_embedding2,\
        interpretable_embedding3 = configure_interpretable_embeddings()
    # Compute attributions for positive and nagative samples (class 1 and 0)
    attr_0, delta_0 = ig_attribute(ig, 0, input_data_embed)
    attr_1, delta_1 = ig_attribute(ig, 1, input_data_embed)
    # Remove interprateble layer used by ig attribution
    remove_interpretable_embeddings(interpretable_embedding1, 
                                    interpretable_embedding2, 
                                    interpretable_embedding3)
    # Return sum over all three layers
    attr_0 = torch.stack(attr_0, axis=0).sum(0)
    attr_1 = torch.stack(attr_1, axis=0).sum(0)    
    return (attr_0, delta_0), (attr_1, delta_1)    


def compute_attributions_lig(lig, input_data):  
    # Compute attributions for positive and nagative samples (class 1 and 0)
    return lig_attribute(lig, 0, input_data), lig_attribute(lig, 1, input_data)


def get_visualization_record(text, attributions, scores, true_label,
                             all_tokens, approximation_error):
    attributions_sum = summarize_attributions(attributions)
    return viz.VisualizationDataRecord(
        attributions_sum,
        torch.max(torch.softmax(scores[0], dim=0)),
        torch.argmax(scores),
        true_label,
        text,
        attributions_sum.sum(),
        all_tokens,
        approximation_error)
    

def visualize_attributions(text, true_label, ig_object, 
                           method, layer_name=None):
    # Prepare input
    input_data, input_data_embed = get_input_data(text)

    # Compute attributions
    attr_0, attr_1, delta_0, delta_1 = None, None, None, None
    if method == "ig":
        (attr_0, delta_0), (attr_1, delta_1) = \
        compute_attributions_ig(ig_object, input_data_embed)
    elif method == "lig":    
        (attr_0, delta_0), (attr_1, delta_1) = \
        compute_attributions_lig(ig_object, input_data)
    else:
        return "method: ig or lig"    
    # Run inference
    scores = predict_forward_func(*input_data[0:3], input_data[-1])
    # Prepare visualization 
    indices = input_data[0][0].detach().tolist()
    all_tokens = tokenizer.convert_ids_to_tokens(indices)
    data_vis_0 = get_visualization_record(text, attr_0, scores, 
                                          true_label, all_tokens, delta_0)  
    data_vis_1 = get_visualization_record(text, attr_1, scores, 
                                          true_label, all_tokens, delta_1) 
    # Visualize
    print("\nAttribution method: {},".
          format(method), "class index: 0 (negative)")
    if not layer_name is None:
        print("Layer: {}".format(layer_name))
    viz.visualize_text([data_vis_0])
    print("Attribution method: {},".
          format(method), "Class index: 1 (positive)")
    if not layer_name is None:
        print("Layer: {}".format(layer_name))    
    viz.visualize_text([data_vis_1])
     
    return attr_0, attr_1

#### Examples
Captum visualization library shows in green tokens that push the prediction towards the target class. Those driving the score towards the reference value are marked in red. As a result, words perceived as positive will appear in green if attribution is performed against class 1 (positive) but will be highlighted in red with an attribution targeting class 0 (negative).

Because importance scores ar assigned to tokens, not words, some examples may show, that attribution is highly dependent on tokenization. Classification results may vary between runs.

In [None]:
# Run predictions
eval_pred_result = trainer.predict(eval_dataset)
predictions = np.argmax(eval_pred_result.predictions, axis=1)

# Find correctly classified and misclassifed samples
eval_samples = [tokenizer.decode(x.input_ids, skip_special_tokens=True) \
                for x in eval_dataset]
eval_preds = list(zip(eval_pred_result.label_ids, predictions))
positive_pred_as_positive = [sample for sample, (real_label, pred_label) \
                             in zip(eval_samples, eval_preds) \
                             if real_label == pred_label and real_label == 1]  
negative_pred_as_negative = [sample for sample, (real_label, pred_label) \
                             in zip(eval_samples, eval_preds) \
                             if real_label == pred_label and real_label == 0] 
positive_pred_as_negative = [sample for sample, (real_label, pred_label) \
                             in zip(eval_samples, eval_preds) \
                             if real_label != pred_label and real_label == 1]                                                           
negative_pred_as_positive = [sample for sample, (real_label, pred_label) \
                             in zip(eval_samples, eval_preds) \
                             if real_label != pred_label and real_label == 0]

# Browse
print('\n'.join(positive_pred_as_positive))   
print('\n'.join(negative_pred_as_negative))    
print('\n'.join(positive_pred_as_negative))  
print('\n'.join(negative_pred_as_positive))

In [None]:
# Positive
# A correctly classified positive sample

text_vis = text
true_label_vis = true_label

ig_0, ig_1 = visualize_attributions(text_vis, true_label_vis, ig, "ig")
lig_0, lig_1 = \
visualize_attributions(text_vis, true_label_vis, lig_we, "lig",
                       layer_name="electra.embeddings.word_embeddings")

In [None]:
# Negative
# A correctly classified negative sample

text_vis = 'the film makes a fatal mistake : it asks us to care about a young \
man whose only apparent virtue is that he is not quite as unpleasant as some \
of the people in his life.'
true_label_vis = 0

ig_0, ig_1 = visualize_attributions(text_vis, true_label_vis, ig, "ig")
lig_0, lig_1 = \
visualize_attributions(text_vis, true_label_vis, lig_we, "lig",
                       layer_name="electra.embeddings.word_embeddings")

In [None]:
# Misclassified
# Pick an example. Results may vary between runs.

print('-------------------------------Negative examples misclassified as positive------------------------------')
print('\n\n'.join(negative_pred_as_positive))

print('-------------------------------Positive examples misclassified as negative------------------------------')
print('\n\n'.join(positive_pred_as_negative))

In [None]:
text_vis = "a tv style murder mystery with a few big screen moments ( including one that seems to be made for a different film altogether )."
true_label_vis = 0

ig_0, ig_1 = visualize_attributions(text_vis, true_label_vis, ig, "ig")
lig_0, lig_1 = \
visualize_attributions(text_vis, true_label_vis, lig_we, "lig",
                       layer_name="electra.embeddings.word_embeddings")