# MIT Movie Dataset - Explainability



In [None]:
import pandas as pd
import numpy as np

# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

os.chdir('/content/drive/My Drive/Colab Notebooks/DAAN888/data')
#os.chdir('/content/drive/My Drive/DAAN888/data')
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/DAAN888/data'

In [None]:
model_dir = '/content/drive/My Drive/Colab Notebooks/DAAN888/models/'

In [None]:
data_dir = '/content/drive/My Drive/Colab Notebooks/DAAN888/data/'

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Mon Nov  9 02:08:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces



---



## Load Dataset

In [None]:
import pickle 

# Load the dataset dictionary from the current working directory (set via os.chdir above).
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# open pickle files that come from a trusted source.
with open('mitmovie.pickle', 'rb') as handle:
    dataset = pickle.load(handle)

In [None]:
# first row in train set
list(zip(dataset['train_tokens'][0], dataset['train_labels'][0]))

[('what', 'O'),
 ('movies', 'O'),
 ('star', 'O'),
 ('bruce', 'B-ACTOR'),
 ('willis', 'I-ACTOR')]

## Bert Model

Documentation for this model can be found at:
https://huggingface.co/transformers/model_doc/bert.html#overview

Following HuggingFace distilbert tutorial: https://huggingface.co/transformers/custom_datasets.html#tok-ner


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 19.7MB/s eta 0:00:01[K     |▌                               | 20kB 6.2MB/s eta 0:00:01[K     |▉                               | 30kB 7.6MB/s eta 0:00:01[K     |█                               | 40kB 8.1MB/s eta 0:00:01[K     |█▎                              | 51kB 6.6MB/s eta 0:00:01[K     |█▋                              | 61kB 7.3MB/s eta 0:00:01[K     |█▉                              | 71kB 8.3MB/s eta 0:00:01[K     |██                              | 81kB 8.5MB/s eta 0:00:01[K     |██▍                             | 92kB 8.3MB/s eta 0:00:01[K     |██▋                             | 102kB 8.8MB/s eta 0:00:01[K     |██▉                             | 112kB 8.8MB/s eta 0:00:01[K     |███▏                            | 122kB 8.8M

### Encode Labels

In [None]:
# get the set of unique labels in the movie dataset
uniq_labels = set([label for doc in dataset['train_labels'] for label in doc])

In [None]:
# assign a number to each label
# Sort first: iteration order of a set of strings is not stable across kernel
# sessions, so unsorted ids would differ from run to run and a model saved in
# one session could not be decoded correctly in another.
label_encoding = {label: idx for idx, label in enumerate(sorted(uniq_labels))}
label_encoding

{'B-ACTOR': 21,
 'B-CHARACTER': 0,
 'B-DIRECTOR': 23,
 'B-GENRE': 20,
 'B-PLOT': 10,
 'B-RATING': 12,
 'B-RATINGS_AVERAGE': 19,
 'B-REVIEW': 11,
 'B-SONG': 24,
 'B-TITLE': 1,
 'B-TRAILER': 14,
 'B-YEAR': 5,
 'I-ACTOR': 7,
 'I-CHARACTER': 22,
 'I-DIRECTOR': 8,
 'I-GENRE': 9,
 'I-PLOT': 15,
 'I-RATING': 13,
 'I-RATINGS_AVERAGE': 18,
 'I-REVIEW': 3,
 'I-SONG': 6,
 'I-TITLE': 17,
 'I-TRAILER': 2,
 'I-YEAR': 16,
 'O': 4}

### Encode Texts

To encode the texts, we have to use the same tokenizer that the pretrained BERT model (`bert-large-uncased`) was trained with.

In [None]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
print('There are %s words in this tokenizer object' % tokenizer.vocab_size)

There are 30522 words in this tokenizer object


In [None]:
# use the tokenizer to encode the texts 
train_encodings = tokenizer(dataset['train_tokens'], 
                            is_split_into_words=True, 
                            return_offsets_mapping=True, 
                            padding=True, 
                            truncation=True)

test_encodings = tokenizer(dataset['test_tokens'], 
                           is_split_into_words=True, 
                           return_offsets_mapping=True, 
                           padding=True, 
                           truncation=True)

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [None]:
# make sure same number of docs
len(train_encodings['input_ids']), len(dataset['train_tokens'])

(9775, 9775)

In [None]:
dataset['train_tokens'][0]

['what', 'movies', 'star', 'bruce', 'willis']

In [None]:
# preview what the encoded result looks like
list(zip(train_encodings['input_ids'][0][0:12], train_encodings['attention_mask'][0][0:12], train_encodings['offset_mapping'][0][0:12]))

[(101, 1, (0, 0)),
 (2054, 1, (0, 4)),
 (5691, 1, (0, 6)),
 (2732, 1, (0, 4)),
 (5503, 1, (0, 5)),
 (12688, 1, (0, 6)),
 (102, 1, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0)),
 (0, 0, (0, 0))]

In [None]:
# the model expects all docs to be same length (51) 
# the attention mask will tell the model to ignore the padding with zeros
print('Length of sequences is %s ' % len(train_encodings['input_ids'][1]))

Length of sequences is 51 


In [None]:
# first document in dataset
print( dataset['train_tokens'][0] )

# check out new tokenization result as words
print( tokenizer.convert_ids_to_tokens( train_encodings['input_ids'][0][0:9]) )

# check out new tokenization result as ids
print( [val for val in train_encodings['input_ids'][0] if val != 0] )

['what', 'movies', 'star', 'bruce', 'willis']
['[CLS]', 'what', 'movies', 'star', 'bruce', 'willis', '[SEP]', '[PAD]', '[PAD]']
[101, 2054, 5691, 2732, 5503, 12688, 102]


In [None]:
# Collect the index of a document once for every token in it whose offset does
# not start at 0 (in the previews, such tokens are '##' continuation pieces).
# A document with several such tokens therefore appears several times.
find_subwords = [
    doc_idx
    for doc_idx, doc_offsets in enumerate(train_encodings['offset_mapping'])
    for token_offset in doc_offsets
    if token_offset[0] != 0
]

In [None]:
np.unique(find_subwords)

array([   1,    2,    3, ..., 9752, 9767, 9772])

In [None]:
dataset['train_tokens'][1]

['show', 'me', 'films', 'with', 'drew', 'barrymore', 'from', 'the', '1980s']

In [None]:
list(zip(train_encodings['input_ids'][1][0:12], train_encodings['attention_mask'][1][0:12], train_encodings['offset_mapping'][1][0:12]))

[(101, 1, (0, 0)),
 (2265, 1, (0, 4)),
 (2033, 1, (0, 2)),
 (3152, 1, (0, 5)),
 (2007, 1, (0, 4)),
 (3881, 1, (0, 4)),
 (6287, 1, (0, 5)),
 (5974, 1, (5, 9)),
 (2013, 1, (0, 4)),
 (1996, 1, (0, 3)),
 (3865, 1, (0, 5)),
 (102, 1, (0, 0))]

In [None]:
[tokenizer.convert_ids_to_tokens(val) for val in train_encodings['input_ids'][1][0:12]]

['[CLS]',
 'show',
 'me',
 'films',
 'with',
 'drew',
 'barry',
 '##more',
 'from',
 'the',
 '1980s',
 '[SEP]']

### Adjust Labels for Vocab Offset


In the printout above you can see that "barrymore" gets split into "barry" and "##more". This is because the full name is not in the model's vocabulary, whereas the smaller subwords "barry" and "##more" are. The offset mapping records this split: a token whose offset does not start at 0 is a continuation of the preceding word. Because the dataset has one label per original word, we now have to adjust the labels to account for the extra subword tokens.

Based on https://datascience.stackexchange.com/questions/69640/what-should-be-the-labels-for-subword-tokens-in-bert-for-ner-task, we will not propagate a word's label to its continuation subwords. Doing so would introduce more instances of that class and change the number of support instances, making the models difficult to compare.

Also some comments to consider for an alternative strategy: https://github.com/google-research/bert/issues/646


In [None]:
import numpy as np

def adjust_labels_for_offset(original_labels, label_dictionary, encodings):
    """Align word-level labels with the token positions of the encodings.

    Every token position starts out as -100. Positions whose offset pair is
    ``(0, end)`` with ``end != 0`` are treated as the first piece of a word
    and receive that word's label id, in order. All other positions
    (offset ``(0, 0)`` or a non-zero start) keep -100.

    Parameters
    ----------
    original_labels : list of list
        One list of label names per document, one label per original word.
    label_dictionary : dict
        Maps each label name to its integer id.
    encodings : object
        Must expose ``offset_mapping``: per document, a sequence of
        ``(start, end)`` pairs, one per token position.

    Returns
    -------
    list of list of int
        One list per document, the same length as that document's offsets.

    Raises
    ------
    KeyError
        If a label name is missing from ``label_dictionary``.
    ValueError
        If a document's number of labels differs from its number of
        word-start positions (for example after truncation).
    """
    # convert to the numeric encoding of the label
    labels = [[label_dictionary[label] for label in doc] for doc in original_labels]

    encoded_labels = []
    for doc_idx, (doc_labels, doc_offset) in enumerate(zip(labels, encodings.offset_mapping)):

        # reshape keeps the (n_tokens, 2) layout even when a document has no tokens
        arr_offset = np.array(doc_offset, dtype=int).reshape(-1, 2)

        # first piece of a word: offset starts at 0 and has a non-zero end
        is_word_start = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        n_word_starts = int(is_word_start.sum())

        # Check explicitly: numpy would otherwise broadcast a single label over
        # every slot and silently mislabel the document.
        if n_word_starts != len(doc_labels):
            raise ValueError(
                'document %d has %d labels but %d word-start token positions'
                % (doc_idx, len(doc_labels), n_word_starts)
            )

        # start from an array of -100 and fill in the word-start positions
        doc_enc_labels = np.full(len(arr_offset), -100, dtype=int)
        doc_enc_labels[is_word_start] = np.asarray(doc_labels, dtype=int)
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [None]:
train_labels = adjust_labels_for_offset(dataset['train_labels'], 
                                        label_encoding, 
                                        train_encodings)

test_labels = adjust_labels_for_offset(dataset['test_labels'],
                                      label_encoding, 
                                      test_encodings)

In [None]:
id_to_label = {id: label for (label,id) in label_encoding.items()}
id_to_label[-100] = 'X'

#print( tokenizer.convert_ids_to_tokens(encoding_example) )
#print([id_to_label[id] for id in train_labels[0][0:9]])
list(zip( tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][0][0:15]), [id_to_label[id] for id in train_labels[0][0:15]], train_labels[0][0:15] )) 

[('[CLS]', 'X', -100),
 ('what', 'O', 4),
 ('movies', 'O', 4),
 ('star', 'O', 4),
 ('bruce', 'B-ACTOR', 21),
 ('willis', 'I-ACTOR', 7),
 ('[SEP]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100),
 ('[PAD]', 'X', -100)]

In [None]:
np.unique(train_labels)

array([-100,    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
         10,   11,   12,   13,   14,   15,   16,   17,   18,   19,   20,
         21,   22,   23,   24])

### Prepare Pytorch Datasets

https://huggingface.co/transformers/custom_datasets.html#ft-trainer


In [None]:
import torch

# pytorch is expecting a certain type of dataset 

class pt_dataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with per-token label ids.

    ``encodings`` is a mapping from field name to a per-document sequence;
    ``labels`` is a per-document sequence of label ids. Item ``idx`` is a
    dict of tensors holding every encoding field plus a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # one item per labelled document
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# remove the offset_mapping before building the datasets
# The None default keeps this cell re-runnable: without it, a second run
# raises KeyError because the key has already been removed.
train_encodings.pop("offset_mapping", None)
test_encodings.pop("offset_mapping", None)

train_dataset = pt_dataset(train_encodings, train_labels)
test_dataset = pt_dataset(test_encodings, test_labels)

### Train Model

In [None]:
from transformers import BertForTokenClassification

# load the pretrained model from huggingface
model = BertForTokenClassification.from_pretrained('bert-large-uncased', num_labels=len(uniq_labels))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=434.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large

In [None]:
from transformers import Trainer, TrainingArguments

# boiler plate code from huggingface to launch a trainer instance
# sets directories and baseline configuration for batch sizes and weight decay

training_args = TrainingArguments(
    output_dir = model_dir +  'explainability/bert',          # output directory
    overwrite_output_dir = True,
    num_train_epochs=3,              # total number of training epochs
    evaluation_strategy = 'epoch',
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir = model_dir +  'explainability/bert/logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset = train_dataset,         # training dataset
    eval_dataset = test_dataset             # evaluation dataset
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises on a CPU-only runtime, so only query it when CUDA is present.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'

'Tesla P100-PCIE-16GB'

In [None]:
model.to(device);

In [None]:
from datetime import datetime

start = datetime.now()

trainer.train()

print('Time to train:', datetime.now() - start)

Epoch,Training Loss,Validation Loss
1,0.257758,0.285916
2,0.220679,0.252612
3,0.127478,0.241965


Time to train: 0:15:44.144982


In [None]:
#trainer.save_model(model_dir + 'mitmovie_pt_bert_uncased/model')
#model.save_pretrained(model_dir + 'mitmovie_pt_bert_uncased/model')

### Interpret Predictions

Guidance for how to do interpretability with NER was taken from: 
https://www.depends-on-the-definition.com/interpretable-named-entity-recognition/


In [None]:
# from transformers import BertForTokenClassification
# import torch

# # retrieve the saved model 
# model = BertForTokenClassification.from_pretrained(model_dir + 'mitmovie_pt_bert_uncased/model', num_labels=len(uniq_labels))

In [None]:
# from transformers import BertTokenizerFast

# tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased')

In [None]:
!pip install eli5
!pip install lime
from eli5.lime import TextExplainer
from eli5.lime.samplers import MaskingTextSampler

In [None]:
model.to('cpu'); # push model to cpu

In [None]:
id_to_label = {id: label for (label,id) in label_encoding.items()}

The function below will yield probabilities for a specific word in the sentence you want to interpret.

In [None]:
def make_prediction(texts):
  """Return class probabilities for one token position of each input text.

  Each text is split on whitespace, encoded as one padded batch with the
  notebook-level ``tokenizer`` and run through the notebook-level ``model``.
  The logits are turned into probabilities with a softmax over the last axis.

  Parameters
  ----------
  texts : iterable of str
      Sentences to score.

  Returns
  -------
  numpy.ndarray
      One row of class probabilities per input text, taken at token
      position ``word_to_check``.

  Notes
  -----
  ``word_to_check`` is read from the notebook namespace and must be set
  before this function is called. It indexes token positions (including
  the leading [CLS]), not original words.
  NOTE(review): if a perturbed sentence tokenizes into a different number of
  word pieces, this fixed index may land on a different word - confirm.
  """
  texts = [text.split() for text in texts]

  encodings = tokenizer(texts,
            is_split_into_words=True,
            return_offsets_mapping=False,
            padding=True,
            truncation=True)

  input_ids = torch.tensor(encodings['input_ids'])
  # padding=True pads shorter sentences in the batch; pass the mask so the
  # model ignores those positions instead of attending to them.
  attention_mask = torch.tensor(encodings['attention_mask'])

  with torch.no_grad():
      output = model(input_ids, attention_mask=attention_mask)

  preds = output[0]
  preds = torch.softmax(preds, dim = -1) # get probabilities

  return preds[:,word_to_check,:].numpy() # return probs only for the word you want

Assign the sentence and the word to inspect


In [None]:
sentence =  ' '.join( dataset['train_tokens'][6] ) # assign by index value [#]
sentence

'show me science fiction films directed by steven spielberg'

Find the index of the token you want to explain. Some words get split into subwords, so there can be more tokens than original words. The tokenizer also adds a [CLS] token at the beginning and a [SEP] token at the end of every sentence.

In [None]:
# encode the sentence
encodings = tokenizer([sentence.split()],  # [text.split()]
            is_split_into_words=True, 
            return_offsets_mapping=False, 
            padding=True, 
            truncation=True)

# get tokenized version
tokes = tokenizer.convert_ids_to_tokens(encodings['input_ids'][0])

# get predictions
with torch.no_grad():
    output = model(torch.tensor(encodings['input_ids']))

# convert predictions to classes
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
predicted = [id_to_label[label] for label in label_indices.tolist()[0]]

# print the index, token, class                                        
for i,t in enumerate(zip(tokes, predicted)) :
  print(i, t[0], t[1])

0 [CLS] O
1 show O
2 me O
3 science B-GENRE
4 fiction I-GENRE
5 films O
6 directed O
7 by O
8 steven B-DIRECTOR
9 spielberg I-DIRECTOR
10 [SEP] I-DIRECTOR


In [None]:
# assign the index of the token you want to inspect
word_to_check = 4 

In [None]:
# checking output of instance to make sure we get back a prediction
id_to_label[np.argmax(make_prediction([sentence]), -1)[0]]

'I-GENRE'

In [None]:
# Sampler that creates perturbed copies of the sentence by swapping tokens for
# the replacement string (see the sample printout in the next cell).
sampler = MaskingTextSampler(
    replacement = "UNK",
    max_replace = 0.7,  # NOTE(review): presumably the maximum fraction of tokens to replace - confirm in the eli5 docs
    bow = False # NOTE(review): per the eli5 docs, bow=False masks individual token positions, while bow=True masks every occurrence of a token together - confirm
)

In [None]:
# illustrations of token removal for testing
samples, similarity = sampler.sample_near(sentence, n_samples=4)
for sample in samples:
  print(sample)
  print()
print('Similarities:', similarity)

show UNK UNK UNK UNK UNK UNK UNK spielberg

UNK me UNK UNK films UNK by UNK UNK

show me UNK fiction UNK UNK by UNK UNK

show me UNK UNK films directed UNK steven spielberg

Similarities: [0.47140438 0.57735014 0.66666655 0.81649647]


In [None]:
text_explainer = TextExplainer(
    sampler=sampler,
    position_dependent = True,
    random_state = 35
)

text_explainer.fit(sentence, make_prediction)



TextExplainer(char_based=False,
              clf=SGDClassifier(alpha=0.001, average=False, class_weight=None,
                                early_stopping=False, epsilon=0.1, eta0=0.0,
                                fit_intercept=True, l1_ratio=0.15,
                                learning_rate='optimal', loss='log',
                                max_iter=1000, n_iter_no_change=5, n_jobs=None,
                                penalty='elasticnet', power_t=0.5,
                                random_state=RandomState(MT19937) at 0x7F7FE87DC678,
                                shuffle=True, tol=0.001,
                                validation_fraction=0.1, verbose=0,
                                warm_start=False),
              expand_factor=10, n_samples=5000, position_dependent=True,
              random_state=35, rbf_sigma=None,
              sampler=MaskingTextSampler(bow=False, group_size=1,
                                         max_replace=0.7, min_replace=1,
         

In [None]:
text_explainer.explain_prediction(
    target_names=list(label_encoding.keys()),
    top_targets=3 # will give you result for the 3 most probable
)

Contribution?,Feature
16.317,Highlighted in text (sum)
-14.514,<BIAS>

Contribution?,Feature
5.767,Highlighted in text (sum)
-6.168,<BIAS>

Contribution?,Feature
-0.893,Highlighted in text (sum)
-1.083,<BIAS>


In [None]:
#!pip install git+https://github.com/sofiaherrero/lime-ner