In [None]:
# Demonstrate creating word embeddings using BERT

In [None]:
# need to pip install pytorch pre-trained bert library
!pip install pytorch-pretrained-bert

In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

In [None]:
# Load the tokenizer of the online pre-trained `bert-base-uncased` model
# (12 layers / 768 hidden units / ~110M parameters).

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Check the size of the entire vocabulary (number of entries in `tokenizer.vocab`).
len(tokenizer.vocab)

In [None]:
# Peek at a 20-entry slice of the vocabulary (entries 5000 through 5019).
# Iterating a mapping yields its keys, so no explicit `.keys()` call is needed.
list(tokenizer.vocab)[5000:5020]

In [None]:
# BERT expects special marker tokens around its input:
#   [CLS] marks the start of the input for classification
#   [SEP] is the separator between sentences in a classification
text = "Here is the sentence I want embeddings for."
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Display the tokens.
tokenized_text

# Note the "##" prefixes in the output: a token starting with "##" is a sub-word
# (or single character) piece of a larger word, continuing the token before it.
# Words are split into such sub-word tokens instead of being mapped to an
# unknown token; the piece vectors can then be averaged to approximate a vector
# for the whole word.

In [None]:
# A new example sentence that uses the word "bank" three times, with more than
# one meaning.
# A static embedding such as Word2Vec gives "bank" a single vector regardless of
# meaning; BERT produces a separate, context-dependent vector for each of the
# 3 occurrences.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

# Wrap the sentence in the special marker tokens and split it into tokens.
tokenized_text = tokenizer.tokenize("[CLS] " + text + " [SEP]")
tokenized_text

In [None]:
# Look up the vocabulary index of every token string.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Show each token next to its index: the token left-aligned in 12 columns,
# the index right-aligned in 6 columns with a thousands separator.
for tok, tok_id in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tok, tok_id))

In [None]:
# Every token must be mapped to the sentence (segment) it belongs to.
# The input is a single sentence, so all of its tokens (22 for the example
# sentence) receive the same segment id, "1".
# NOTE(review): single-sentence BERT inputs are conventionally given segment
# id 0 (sentence A) -- confirm that 1 is intended here.

segments_ids = [1 for _token in tokenized_text]
segments_ids

In [None]:
# Convert the inputs to PyTorch tensors, adding a leading dimension of size 1
# so that each tensor holds one row per input sentence.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensors = torch.tensor(segments_ids).unsqueeze(0)

In [None]:
# Inspect the tensor of token ids.
tokens_tensor

In [None]:
# Inspect the tensor of segment ids.
segments_tensors

In [None]:
# Load the pre-trained model (weights) and switch it to "evaluation" mode.
# `eval()` returns the module itself, so the two steps can be chained.
# Evaluation mode turns off training-only behaviour such as dropout, so the
# feed-forward pass is deterministic.
model = BertModel.from_pretrained('bert-base-uncased').eval()

# Display the model architecture.
model

In [None]:
# Run a forward pass to obtain the hidden-state features of each layer.
# `torch.no_grad()` disables gradient tracking, which is not needed here.
# The second return value is not used in this notebook.
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, token_type_ids=segments_tensors)

In [None]:
# Check the number of layers, batches, tokens and hidden units.
# `encoded_layers` has one entry per layer; the three dimensions of the first
# entry are (batches, tokens, hidden units).
n_batches, n_tokens, n_hidden = encoded_layers[0].shape
print("Number of layers:", len(encoded_layers))
print("Number of batches:", n_batches)
print("Number of tokens:", n_tokens)
print("Number of hidden units:", n_hidden)

In [None]:
# The container returned by the model is a plain Python list ...
print('    Type of encoded_layers: ', type(encoded_layers))

# ... and each layer in that list is a torch tensor; show the first one's shape.
first_layer = encoded_layers[0]
print('Tensor shape for each layer: ', first_layer.shape)

In [None]:
# Combine the tensors of all layers into one tensor. `stack` creates a new
# dimension for the layers; it is inserted at position 0 by default.
token_embeddings = torch.stack(encoded_layers)
token_embeddings.shape

In [None]:
# Drop dimension 1, the "batches" (i.e. sentences) axis, since only one
# sentence was fed in.
token_embeddings = token_embeddings.squeeze(1)
token_embeddings.shape

In [None]:
# Swap dimensions 0 and 1, going from [layers x tokens x hidden] to
# [tokens x layers x hidden].
#
# The tensor is rebuilt from `encoded_layers` instead of permuting the current
# `token_embeddings`: `token_embeddings = token_embeddings.permute(1, 0, 2)`
# would swap the axes back again if this cell were ever run a second time,
# whereas this form gives the same result however often the cell is executed.
token_embeddings = (
    torch.stack(encoded_layers, dim=0)  # [layers x batches x tokens x hidden]
    .squeeze(dim=1)                     # [layers x tokens x hidden]
    .permute(1, 0, 2)                   # [tokens x layers x hidden]
)
token_embeddings.size()

# For the example sentence, token_embeddings is a [22 x 12 x 768] tensor.

In [None]:
# Build one vector per token by summing that token's vectors from the last
# four layers. The result, `token_vecs_sum`, is a list of token vectors with
# overall shape [22 x 768] for the example sentence.
#
# token_embeddings is a [22 x 12 x 768] tensor, so every item iterated over
# below is the [12 x 768] tensor holding one token's per-layer vectors.
token_vecs_sum = [
    torch.sum(layer_vecs[-4:], dim=0)  # last four layers: [4 x 768] -> [768]
    for layer_vecs in token_embeddings
]

print('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

In [None]:
# Reminder of the tokens, each printed next to its position in the sentence.
for position, piece in enumerate(tokenized_text):
    print(position, piece)

In [None]:
# Peek at the first 5 values of the embedding of each "bank" token.
# Each pair is (printed label, position of that "bank" token in the sentence).
print('First 5 vector values for each instance of "bank".')
for label, position in (
    ("bank vault   ", 6),
    ("bank robber  ", 10),
    ("river bank   ", 19),
):
    print(label, str(token_vecs_sum[position][:5]))

In [None]:
# Examine the encoding of one of the "bank" tokens in its entirety (768 values).
token_vecs_sum[6] # position 6: the "bank" in "bank vault"

In [None]:
from scipy.spatial.distance import cosine

# Compare the "bank" of "bank robber" (position 10) with the "bank" of
# "bank vault" (position 6). `cosine` gives a distance, so the similarity
# is 1 minus that distance.
robber_bank_vec, vault_bank_vec = token_vecs_sum[10], token_vecs_sum[6]
same_bank = 1 - cosine(robber_bank_vec, vault_bank_vec)
same_bank

In [None]:
# Compare the "bank" of "bank robber" (position 10) with the "bank" of
# "river bank" (position 19). `cosine` gives a distance, so the similarity
# is 1 minus that distance.

robber_bank_vec, river_bank_vec = token_vecs_sum[10], token_vecs_sum[19]
diff_bank = 1 - cosine(robber_bank_vec, river_bank_vec)
diff_bank