# BertViz Interactive Demo
## **Scroll down for pre-loaded visualizations** 👇👇👇👇👇👇


In [1]:
# Uncomment the lines below to install the notebook's dependencies when running
# it for the first time.
# NOTE(review): consider `%pip install` with pinned versions instead, so the
# packages are installed into the environment the kernel is actually using.
# !pip3 install bertviz
# !pip3 install ipywidgets

In [2]:
%%javascript
require.config({
  paths: {
      d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min',
      jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
  }
});

<IPython.core.display.Javascript object>

In [3]:
# Load the fine-tuned multiple-choice model and its tokenizer.
import torch
from bertviz import head_view, model_view
from transformers import BertTokenizer, BertForMultipleChoice

model_version = 'hfl/chinese-macbert-large'
ckpt_path = 'ckpt/mt/best_model_qa_c3_1.pt'
do_lower_case = True

# output_attentions=True makes the forward pass return the per-layer attention
# weights in addition to the logits.
model = BertForMultipleChoice.from_pretrained(model_version, output_attentions=True)

# map_location='cpu' lets a checkpoint that was saved on a GPU be loaded on a
# machine without CUDA; the model itself lives on the CPU in this notebook.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load(ckpt_path, map_location='cpu'))

# Make inference mode explicit so dropout cannot perturb the attention weights.
model.eval()

tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)


Some weights of the model checkpoint at hfl/chinese-macbert-large were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model check

In [4]:
import json

# Path to the processed training examples (a JSON list of QA records).
train_data_dir = 'data/qa/processed_train_150_r2_pg0.json'

# The records contain Chinese text, so read the file as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open(train_data_dir, 'r', encoding='utf-8') as f1:
    train_data = json.load(f1)

In [5]:
# Index into train_data of the example to visualize. The bare expression on the
# last line displays that record (passage text, question, choices, answer).
i = 675
train_data[i]

{'id': 676,
 'article_id': 337,
 'text': '醫師:就抽血那幾天這樣。民眾:抽血前。醫師:抽血前這樣子。民眾:對。',
 'question': {'stem': '民眾病毒量變高是因為下列何者原因？',
  'choices': [{'text': '抽血前發燒', 'label': 'A'},
   {'text': '抽血前拉肚子', 'label': 'B'},
   {'text': '抽血前咳嗽', 'label': 'C'}]},
 'answer': 'A'}

In [6]:
# Build one (passage, question + option) pair per answer choice of example `i`.
# The number of pairs follows the example's own choice list rather than a
# hard-coded 3, so examples with a different number of choices also work.
choices = train_data[i]['question']['choices']
q = train_data[i]['question']['stem']
q_opts = [q + choice['text'] for choice in choices]
pg = [train_data[i]['text']] * len(q_opts)  # same passage repeated for every choice

# Encode each pair as "[CLS] passage [SEP] question + option [SEP]", padded or
# truncated to exactly 512 tokens and returned as PyTorch tensors.
inputs = tokenizer(
    pg,
    q_opts,
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)


In [7]:
# Pull the encoded tensors out of the tokenizer output by key so later cells
# can index them per answer choice.
input_ids = inputs["input_ids"]
token_type_ids = inputs["token_type_ids"]
attention_mask = inputs["attention_mask"]

In [8]:
# Decode the first encoded row back to text as a sanity check of the
# "[CLS] passage [SEP] question + option [SEP] [PAD] ..." layout.
tokenizer.decode(input_ids[0])

'[CLS] 醫 師 : 就 抽 血 那 幾 天 這 樣 。 民 眾 : 抽 血 前 。 醫 師 : 抽 血 前 這 樣 子 。 民 眾 : 對 。 [SEP] 民 眾 病 毒 量 變 高 是 因 為 下 列 何 者 原 因 ？ 抽 血 前 發 燒 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

### Calculate attention

In [9]:
# Run one forward pass and keep the last element of the model output, which is
# the tuple of per-layer attention tensors (the model was created with
# output_attentions=True). unsqueeze(0) adds a leading batch dimension of 1 in
# front of the per-choice rows.
# This is inference only, so disable autograd to avoid building a gradient
# graph and holding the associated memory.
with torch.no_grad():
    attention = model(**{k: v.unsqueeze(0) for k, v in inputs.items()})[-1]

In [10]:
# Index of the answer choice to visualize (row of the encoded inputs); change
# this value to inspect a different choice.
opt_i = 2
# Position of the first token whose token_type_id is 1, i.e. where the second
# text segment (question + option) starts in the selected row.
sentence_b_start = token_type_ids[opt_i].tolist().index(1)
input_id_list = input_ids[opt_i].tolist()  # token ids of row opt_i (the selected choice)
tokens = tokenizer.convert_ids_to_tokens(input_id_list)

# Head View
The attention-head view visualizes attention in one or more heads in a particular layer in the model.

## Usage
* **Hover** over any **token** on the left/right side of the visualization to filter attention from/to that token. The colors correspond to different attention heads.
* **Double-click** on any of the **colored tiles** at the top to filter to the corresponding attention head.
* **Single-click** on any of the **colored tiles** to toggle selection of the corresponding attention head. 
* **Click** on the **Layer** drop-down to change the model layer (zero-indexed).
* The lines show the attention from each token (left) to every other token (right). Darker lines indicate higher attention weights. When multiple heads are selected, the attention weights are overlaid on one another. 

## Select visualized layer and head

In [11]:
import numpy as np

In [12]:
# Attention head and layer to display; both are zero-indexed.
head_i = 0  # valid values: 0-15
layer = 23  # valid values: 0-23

In [13]:
# Index of the first [PAD] token, i.e. the number of real tokens to keep.
# It does not depend on the layer, so compute it once; fall back to the full
# length when the sequence contains no padding at all.
pad_positions = np.where(np.array(tokens) == '[PAD]')[0]
pad_pos = int(pad_positions[0]) if pad_positions.size else len(tokens)

# For every layer keep only the selected head and the non-padding positions.
# Row `opt_i` is used so the attention belongs to the same answer choice that
# `tokens` and `sentence_b_start` were taken from (row 0 would silently show a
# different choice's attention against these tokens). unsqueeze(0) restores
# the leading batch dimension after the row has been selected.
att_i = tuple(
    att[opt_i, head_i:head_i + 1, :pad_pos, :pad_pos].unsqueeze(0)
    for att in attention
)

In [14]:
# Render the head view for the single selected layer, restricted to the
# non-padding tokens; sentence_b_start marks where the second segment begins.
selected_layer_attention = att_i[layer:layer + 1]
visible_tokens = tokens[:pad_pos]
head_view(selected_layer_attention, visible_tokens, sentence_b_start)

<IPython.core.display.Javascript object>