# BERT Tutorial


## Install the libraries
First you need to install the following libraries:

    pip install transformers
    pip install ipywidgets
    pip install bertviz

Once everything is installed you can download 

In [1]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']

FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo


## Imports and definitions

In [2]:
from bertviz import model_view, head_view
from transformers import *

import numpy as np
import pprint

# Get the interactive Tools for Matplotlib
%matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from transformers import BertTokenizer, BertModel
import torch

In [3]:
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

In [4]:
#model_path = 'nboost/pt-bert-base-uncased-msmarco'
model_path = 'bert-base-uncased'

CLS_token = "[CLS]"
SEP_token = "[SEP]"


# Load the required tokenizer, configuration and model

In [5]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
config = AutoConfig.from_pretrained('bert-base-uncased',  output_hidden_states=True, output_attentions=True)  
model = AutoModel.from_pretrained('bert-base-uncased', config=config)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Tokenization

See here for details: https://huggingface.co/docs/transformers/tokenizer_summary

In [37]:
sentence_a = "Is throat cancer treatable?"
sentence_b = "Tell me about lung cancer."
inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True, max_length = 512, truncation = True)
pprint.pprint(inputs)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[  101,  2003,  3759,  4456,  7438,  3085,  1029,   102,  2425,  2033,
          2055, 11192,  4456,  1012,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])}


In [38]:
print(tokenizer.decode(inputs["input_ids"][0].tolist()))

[CLS] is throat cancer treatable? [SEP] tell me about lung cancer. [SEP]


In [39]:
input_ids = inputs['input_ids']
pprint.pprint(input_ids[0].tolist())

[101,
 2003,
 3759,
 4456,
 7438,
 3085,
 1029,
 102,
 2425,
 2033,
 2055,
 11192,
 4456,
 1012,
 102]


In [40]:
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
pprint.pprint(tokens)

['[CLS]',
 'is',
 'throat',
 'cancer',
 'treat',
 '##able',
 '?',
 '[SEP]',
 'tell',
 'me',
 'about',
 'lung',
 'cancer',
 '.',
 '[SEP]']


# Model inference output

In [41]:
with torch.no_grad():
    outputs = model(**inputs)

In [42]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states', 'attentions'])

## Layer embeddings

In [92]:
# the last layer is the output embedding layer
output_embeddings = outputs['last_hidden_state']

In [93]:
token_throat = 2
token_lung = 11
# out[0][token]
output_embeddings[0][token_throat]

tensor([ 1.3546e-01,  5.9505e-01,  4.2941e-01, -8.0278e-01,  1.1841e+00,
         8.1318e-01,  2.5967e-01,  6.3340e-01, -9.9528e-03, -7.0135e-01,
        -4.9668e-01,  1.0843e-01,  1.0491e-01,  2.7708e-01,  5.9081e-02,
         8.2602e-01,  9.6558e-01,  8.7914e-01, -3.4101e-01,  6.7063e-01,
         1.3637e-01, -4.8719e-01,  1.5602e-01,  5.3420e-01,  5.7751e-01,
        -2.0827e-01,  1.8220e-01, -1.9312e-02, -9.2467e-01, -3.3004e-01,
         4.5463e-01,  3.5004e-01,  3.4277e-01, -3.7747e-01, -4.8721e-01,
        -7.1949e-01, -1.1295e+00, -2.8828e-01, -7.7856e-01, -1.8062e-01,
        -1.0366e+00, -1.8247e-02,  1.7411e-01,  1.8165e-01,  2.5319e-01,
        -1.2004e-01,  1.8728e-01, -1.6055e-01,  1.2180e-01, -1.1573e+00,
        -9.9230e-01, -6.7146e-01, -2.0892e-01, -4.6718e-01, -4.9289e-01,
         5.9843e-01, -2.3778e-02, -2.0047e-01,  6.4973e-01, -7.4589e-01,
        -6.9295e-02, -2.3436e-01,  5.1695e-02, -1.2295e+00, -8.2671e-01,
         6.6215e-01,  2.3159e-01,  4.4064e-01, -7.3

In [45]:
hidden_states = outputs['hidden_states']

In [46]:
# hidden_states[layer][0][token])
hidden_states[0][0][token_throat]

tensor([-0.1592,  0.2358,  0.5437, -0.4591,  0.4130,  0.5577, -0.8181, -0.6066,
         0.3859, -0.4519, -0.5240,  0.3563,  0.8813, -0.5152, -0.3479, -0.8447,
         0.2498, -0.5831, -0.5223, -0.7000, -0.5182, -0.9982,  0.3124, -0.5684,
         0.2507, -0.8659, -0.1018, -0.7466, -0.4140,  0.0205, -0.3228, -0.3108,
        -1.2027,  0.0488, -0.0759, -1.3291,  0.4393,  0.6587, -1.2414, -0.8355,
        -0.7831, -0.1931,  0.5124, -0.7864, -0.4347,  0.1025, -0.0937,  0.3378,
        -0.8771, -1.0559, -1.2896, -0.3827, -0.2764,  0.9259, -0.3566, -0.5777,
         0.6622, -0.8857,  0.3801, -1.0057, -0.2319, -0.5292,  0.0404, -1.1151,
        -0.3623,  0.9480,  0.5445,  0.2032,  0.0760,  0.3717,  0.2971,  0.1316,
         0.3474,  0.8049,  0.3292, -0.2390,  0.0157,  0.4088, -0.7019, -1.4708,
        -0.5477, -0.5134,  1.0274,  0.8588, -0.1060, -0.9407,  0.4311,  0.8782,
        -0.4329,  0.4699,  0.8978, -0.7934,  0.1529, -0.4782,  0.2570,  0.1144,
        -0.0521,  0.1673,  0.2998,  0.91

## Self-attention matrices

In [47]:
attention = outputs['attentions']

In [48]:
# There's a softmax, so, the sum should be 1 
attention[3][0][3][token_throat].sum()

tensor(1.)

In [49]:
# attention[layer][0][head][token1][token2]
attention[3][0][3][token_throat][token_lung]

tensor(3.6900e-05)

In [50]:
attention[3][0][3][token_throat].sum()

tensor(1.)

# Extract Token embeddings

Note that this code computes the average embedding of a word occurrences

In [89]:
import torch
from transformers import AutoTokenizer, AutoModel

def get_word_idx(sent: str, word: str):
    return sent.split(" ").index(word)

def get_word_vector(inputs, outputs, idx, layer):
    """Get a word vector by averaging the embeddings of 
       all word occurrences of that word in the input"""

    # get all token idxs that belong to the word of interest
    token_ids_word = np.where(np.array(inputs.word_ids()) == idx)
    word_tokens_output = outputs.hidden_states[layer][0][token_ids_word]

    return word_tokens_output.mean(dim=0)

idx = get_word_idx(sentence_a, "throat")

word_embedding = get_word_vector(inputs, outputs, idx, 4)

In [91]:
word_embedding

tensor([ 6.2784e-01, -9.1235e-02,  3.4819e-03,  4.6117e-02,  1.6443e-01,
         4.8151e-01, -3.3493e-01,  6.8531e-01,  6.3966e-01, -2.5792e-01,
        -8.2985e-01,  7.9850e-01, -4.8082e-02, -5.0432e-02, -6.7123e-01,
         6.7558e-01,  6.7259e-01, -1.4663e-01,  1.5597e-01, -1.3089e-01,
        -1.7256e-01, -4.0354e-01, -6.4943e-02,  3.9349e-01,  2.7099e-01,
         7.6182e-01,  4.1979e-01, -6.7708e-01, -4.0113e-02,  1.6656e-01,
        -1.8783e-01, -8.7409e-01, -5.2706e-01, -3.3006e-01, -9.5537e-01,
        -9.0674e-01, -9.8048e-01, -2.9497e-01, -7.5755e-01, -2.6050e-01,
        -1.4730e+00, -1.3521e+00, -3.6398e-01,  6.1162e-01,  3.4968e-01,
         8.4314e-01,  2.0161e-01, -9.0983e-01,  3.0212e-02, -1.8938e+00,
        -8.9150e-01,  1.2806e-01, -1.8896e-01, -6.9148e-01, -7.8760e-02,
         5.0060e-01,  5.0080e-01, -3.3230e-01,  6.3802e-01, -2.8080e-01,
        -8.7494e-01, -3.7270e-01,  1.6104e-01, -8.1353e-01, -2.3886e-01,
         3.2847e-01, -4.6971e-01,  1.6774e-01,  4.4

# Attention visualization

More details are available here: https://github.com/jessevig/bertviz

In [15]:
call_html()
head_view(attention, tokens)
model_view(attention, tokens)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Other pre-trained BERT models

There are many other models available for download (https://huggingface.co/models).

BioBERT is a popular BERT model trained on biomedical literature (https://academic.oup.com/bioinformatics/article/36/4/1234/5566506):

    tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
    config = AutoConfig.from_pretrained('dmis-lab/biobert-v1.1',  output_hidden_states=True, output_attentions=True)  
    model = AutoModel.from_pretrained('dmis-lab/biobert-v1.1', config=config)

Another popular BERT is the SciBERT trained on scientific literature (https://arxiv.org/abs/1903.10676):

    tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    config = AutoConfig.from_pretrained('allenai/scibert_scivocab_uncased',  output_hidden_states=True, output_attentions=True)  
    model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased', config=config)
