## Introduction to BERT

In [2]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Load a vanilla BERT-base model from the pretrained 'bert-base-uncased' checkpoint.
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Collect every (name, tensor) parameter pair so the model can be inspected by position.
named_params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(named_params)))

# Each section pairs its header with the slice of parameters it covers:
# the first 5 entries, the following 16, and the final 2.
row_template = "{:<55} {:>12}"
sections = (
    ('==== Embedding Layer ====\n', named_params[0:5]),
    ('\n==== First Encoder ====\n', named_params[5:21]),
    ('\n==== Output Layer ====\n', named_params[-2:]),
)
for header, group in sections:
    print(header)
    for name, weights in group:
        # Left-align the parameter name, right-align its shape tuple.
        print(row_template.format(name, str(tuple(weights.size()))))


In [None]:
# The pooler is a separate linear layer followed by a tanh activation that acts on the [CLS] token's representation.
# Its output (pooler_output) is often used as a representation of the entire sentence.

In [None]:
# Load the tokenizer published under the same 'bert-base-uncased' name as the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Encode a simple sentence into BERT token ids; the result is shown as the cell output.
tokenizer.encode('Sinan loves a beautiful day')  # tokenize a simple sequence

In [None]:
# Run the tokens through the model.

# 1. Encode the sentence into token ids and turn them into a tensor (will be size (8,)).
# 2. Unsqueeze a first dimension to simulate a batch. Resulting shape is (1, 8).
sentence_ids = tokenizer.encode('Sinan loves a beautiful day')
batched_ids = torch.tensor(sentence_ids).unsqueeze(0)
response = model(batched_ids)

In [None]:
# Show the full output object returned by the forward pass stored in `response`.
response

In [None]:
# One embedding per input token, taken from the model's last hidden state;
# the first one belongs to the [CLS] token.
response.last_hidden_state

In [None]:
# pooler_output comes from a layer trained on top of the [CLS] token's embedding.
# Display its shape.

response.pooler_output.shape

In [None]:
# Show the pooler sub-module of the model.
model.pooler

In [None]:
# Take index 0 along the token axis of the final encoder output (the [CLS] slot),
# then add a leading dimension of size 1 in front of it.
cls_hidden = response.last_hidden_state[:, 0, :]
CLS_embedding = torch.unsqueeze(cls_hidden, 0)

CLS_embedding.size()

In [None]:
# Shape of the pooler's output when it is applied by hand to the [CLS] embedding.
model.pooler(CLS_embedding).shape

In [None]:
# Running the embedding for CLS through the pooler gives the same output as the "pooler_output".
# The two float tensors come from separate computations, so compare them with a
# tolerance: exact `==` can report a spurious mismatch caused by rounding alone.
torch.allclose(model.pooler(CLS_embedding), response.pooler_output)

In [None]:
# Count every learnable scalar in the model. numel() works for tensors of any rank,
# so 1-D parameters (biases, LayerNorm weights) are included alongside the 2-D
# weight matrices instead of being silently skipped.
total_params = sum(p.numel() for p in model.parameters())

print(f'Total Parameters: {total_params:,}')  # This is where the 110M parameter comes from

## Wordpiece tokenization

## Let's start by taking a look at the BERT tokenizer.

Let's use the `from_pretrained` method to grab the uncased bert-base tokenizer

A list of all available models can be found on their site: https://huggingface.co/transformers/pretrained_models.html

In [None]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the bert-base uncased tokenizer. Quick check: what does "uncased" mean?
# (Presumably the text is lower-cased before tokenizing -- confirm on the model card.)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The number of entries in tokenizer.vocab is reported as the vocabulary size.
print(f'Length of BERT base vocabulary: {len(tokenizer.vocab)}')

In [None]:
# A short example sentence to encode.
text = "A simple sentence!"

tokens = tokenizer.encode(text)  # get token ids per BERT-base's vocabulary
print(tokens)

In [None]:
# decode turns the ids back into text; the [CLS] and [SEP] tokens that were added
# during encoding show up in the reconstructed sentence.
tokenizer.decode(tokens)

In [None]:
# A longer, two-sentence example.
text = "My friend told me about this class and I love it so far! She was right."

tokens = tokenizer.encode(text)  # replaces the ids of the previous example
print(tokens)

In [None]:
# Pretty-print the sentence, then each token id next to the sub-word it decodes to.

print(f'Text: {text}. Num tokens: {len(tokens)}')
for token_id in tokens:
    # Decode one id at a time so each sub-word is shown on its own line.
    subword = tokenizer.decode([token_id])
    print(f'Token: {token_id}, subword: {subword}')

In [None]:
# Sinan is not in our vocab :'(
# Membership test against the tokenizer's vocabulary, using the lower-case spelling.

'sinan' in tokenizer.vocab

In [None]:
text_with_unknown_words = 'Sinan loves a beautiful day'
tokens_with_unknown_words = tokenizer.encode(text_with_unknown_words)

# We see our sub words in action: print each id with the piece it decodes to.
for token_id in tokens_with_unknown_words:
    piece = tokenizer.decode([token_id])
    print(f'Token: {token_id}, subword: {piece}')

In [None]:
# Encode the out-of-vocabulary word on its own to see which ids it maps to.
tokenizer.encode('sinan')

In [None]:
# Encode the short string 'an' for comparison with the ids produced for 'sinan'.
tokenizer.encode('an')

In [None]:
# A second sentence containing words the original notes call unknown.
text_with_unknown_words = 'Sinan is our instructor for this awesomesauce class'
tokens_with_unknown_words = tokenizer.encode(text_with_unknown_words)

# Print each token id next to the sub-word it decodes to.
for token_id in tokens_with_unknown_words:
    piece = tokenizer.decode([token_id])
    print(f'Token: {token_id}, subword: {piece}')

In [None]:
text = "My friend told me about this class and I love it so far! She was right."

# encode_plus gives us token ids, an attention mask, and segment ids (sentence A vs. B),
# which is useful at training time.
tokens = tokenizer.encode_plus(text)
print(tokens)

In [None]:
# Call the tokenizer object itself on the same text.
tokenizer(text)  # calling the tokenizer directly does the same thing as encode_plus

In [None]:
# 'python' is the 6th token, i.e. index 5, once the leading [CLS] token is counted.
python_pet = tokenizer.encode('I love my pet python')

# Same position here: 'python' is again the 6th token (index 5) after [CLS].
python_language = tokenizer.encode('I love coding in python')

In [None]:
def _token_embedding(token_ids, position):
    """Return the model's embedding of one token as a NumPy array.

    Args:
        token_ids: Token ids of a single sequence, as a flat list of ints.
        position: Index of the token whose embedding is wanted.

    Returns:
        The slice of the model's first output at ``position``, keeping the
        leading batch dimension of size 1.
    """
    batch = torch.tensor(token_ids).unsqueeze(0)  # add a batch dimension
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        hidden_states = model(batch)[0]
    return hidden_states[:, position, :].numpy()


# contextful embedding of 'python' in 'I love my pet python'
python_pet_embedding = _token_embedding(python_pet, 5)

# contextful embedding of 'python' in 'I love coding in python'
python_language_embedding = _token_embedding(python_language, 5)

# contextful embedding of 'snake' in 'snake' (index 1 skips the [CLS] token)
snake_alone_embedding = _token_embedding(tokenizer.encode('snake'), 1)

# contextful embedding of 'programming' in 'programming' (index 1 skips the [CLS] token)
programming_alone_embedding = _token_embedding(tokenizer.encode('programming'), 1)

In [None]:
# Shape of the embedding extracted for 'python' in the pet sentence.
python_pet_embedding.shape

In [None]:
# Shape of the embedding extracted for 'python' in the coding sentence.
python_language_embedding.shape

In [None]:
# Cosine similarity between 'python' as embedded in the coding sentence
# and the word 'snake' embedded on its own.
cosine_similarity(python_language_embedding, snake_alone_embedding)

In [None]:
# Cosine similarity between 'python' as embedded in the pet sentence
# and the word 'snake' embedded on its own. More similar!
cosine_similarity(python_pet_embedding, snake_alone_embedding)

In [None]:
# Cosine similarity between 'python' as embedded in the pet sentence
# and the word 'programming' embedded on its own.
cosine_similarity(python_pet_embedding, programming_alone_embedding)

In [None]:
# Cosine similarity between 'python' as embedded in the coding sentence
# and the word 'programming' embedded on its own. More similar!
cosine_similarity(python_language_embedding, programming_alone_embedding)

## The many embeddings of BERT

In [None]:
from transformers import BertModel, BertTokenizer

In [None]:
# Load the pretrained bert-base-uncased model and its tokenizer for this section.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
"""
word_embeddings == context-free word embeddings
position_embeddings == encodes word position
token_type_embeddings == 0 or 1. Used to lookup the segment embedding
"""

# Show the embeddings sub-module, whose three lookup tables are named in the note above.
model.embeddings

In [None]:
# The phrase used for the embedding walkthrough in the following cells.
example_phrase = 'I am Sinan'

# return_tensors='pt' makes encode return a PyTorch tensor automatically.
tokenizer.encode(example_phrase, return_tensors='pt')

In [None]:
# Context-less embedding of each token in our sentence: the word_embeddings lookup
# alone, before position or segment embeddings are added.
model.embeddings.word_embeddings(tokenizer.encode(example_phrase, return_tensors='pt'))

In [None]:
# Note that the first and last rows match the first and last rows of the previous output:
#  they belong to the reserved [CLS] and [SEP] tokens, whose context-free embeddings
#  are the same for every input.
model.embeddings.word_embeddings(tokenizer.encode('I am Matt', return_tensors='pt'))

In [None]:
# Show the position lookup table.
model.embeddings.position_embeddings  # 512 embeddings, one for each position in a max 512 input sequence

In [None]:
# Position ids 0 through 5, used below as the positions of the tokens in example_phrase.
torch.LongTensor(range(6))

In [None]:
# Look up one position embedding for each of the six position ids.
model.embeddings.position_embeddings(torch.LongTensor(range(6)))  # positional embeddings for our example_phrase

In [None]:
# Show the segment (token type) lookup table.
model.embeddings.token_type_embeddings  # 2 embeddings. One for A and one for B

In [None]:
# Six segment ids, all 0: every token is assigned to the same segment.
torch.LongTensor([0]*6)

In [None]:
# Every id is 0, so the same segment embedding is looked up six times.
model.embeddings.token_type_embeddings(torch.LongTensor([0]*6))  # All tokens have the same embedding

In [None]:
# Rebuild the embedding layer's output by hand: add the word, position and
# token-type embeddings for each token, then apply layer normalization
# (LayerNorm normalizes the summed embeddings; it is not a feed-forward layer).
# The sequence length is read from the encoded phrase rather than hard-coded,
# so the cell keeps working if example_phrase changes.
input_ids = tokenizer.encode(example_phrase, return_tensors='pt')
seq_len = input_ids.shape[-1]

model.embeddings.LayerNorm(
    model.embeddings.word_embeddings(input_ids)
    + model.embeddings.position_embeddings(torch.LongTensor(range(seq_len)))
    + model.embeddings.token_type_embeddings(torch.LongTensor([0] * seq_len))
)

In [None]:
# Et Voilà! The many embeddings of BERT become one embedding per token:
# calling the embeddings module on the token ids returns the combined embedding.
model.embeddings(tokenizer.encode(example_phrase, return_tensors='pt'))

In [None]:
# Shape of the combined embedding tensor for example_phrase.
model.embeddings(tokenizer.encode(example_phrase, return_tensors='pt')).shape