In [None]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the kernel that is running this notebook.
%pip install transformers

In [None]:
import torch
from transformers import BertTokenizer, BertModel

In [None]:
text = "The river was full of people sitting in the bank ."

# Fetch the pre-trained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Surround the sentence with the [CLS] / [SEP] markers BERT expects,
# then split the marked-up string into tokens and show them.
text_tokens = f"[CLS] {text} [SEP]"
tokenized_text = tokenizer.tokenize(text_tokens)
print(tokenized_text)

Re-run the code above with the text below. What do you see?

In [None]:
# Same sentence as before, except that the last word is "banky" instead of "bank".
# Assigning `text` here does not refresh `tokenized_text`: re-run the tokenization
# steps from the previous cell to see how this sentence is split.
text = "The river was full of people sitting in the banky ."

Besides tokenization, we need to:
1. convert the tokens to integer ids
2. build the segment ids (for a single sentence every token gets the same id; here we use 1)
3. convert both lists to tensors

In [None]:
# Vocabulary id for every token produced by the tokenizer.
indexed_text = tokenizer.convert_tokens_to_ids(tokenized_text)

# One segment id per token; with a single sentence every token gets the same id (1).
segments_ids = [1 for _ in tokenized_text]

# The extra pair of brackets gives each tensor a leading dimension of size 1.
tokens_tensor, segments_tensors = (
    torch.tensor([ids]) for ids in (indexed_text, segments_ids)
)

Now we pass the encoded text to the BERT model to obtain an N×768 matrix of embeddings: one 768-dimensional vector for each of our N input tokens. We take them from BERT's last layer (there are 12 layers in total).

In [None]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

# Arguments are passed by keyword on purpose. BertModel.forward takes
# (input_ids, attention_mask, token_type_ids, ...), so a positional second argument
# is consumed as the attention mask, not as segment ids. The all-ones tensor built
# earlier is exactly the mask for a single unpadded sentence, so naming it keeps the
# output unchanged while making the binding explicit. token_type_ids is left at its
# default (all zeros), which is the id the tokenizer assigns to a single sentence.
with torch.no_grad():
  outputs = model(input_ids=tokens_tensor, attention_mask=segments_tensors)

# Hidden states of the final layer, one vector per input token.
last_layer = outputs.last_hidden_state

To study this tensor more easily, we first drop its batch dimension and then convert it to a plain Python list with one vector per token.

In [None]:
# `last_layer` is the name assigned in the previous cell (the old reference to
# `last_hidden_states` was never defined and raised a NameError).
# Only one sentence was fed in, so the leading batch dimension has size 1 and can be dropped.
token_embeddings = torch.squeeze(last_layer, dim=0)

# Nested Python list: one inner list of floats per token.
list_token_embeddings = token_embeddings.tolist()
print('The list contains {}  words each of them has a vector of "{}!"'.format(len(list_token_embeddings),len(list_token_embeddings[0])))

OK, before we proceed, let's put everything in a function. The input will be a text and the output will be its `list_token_embeddings`.

In [None]:
# Cache of loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated
# calls to bert_embeddings reuse the same objects instead of reloading them.
_BERT_CACHE = {}


def _load_bert(model_name):
  """Return the (tokenizer, model) pair for `model_name`, loading it on first use only."""
  if model_name not in _BERT_CACHE:
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name, output_hidden_states = True)
    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    _BERT_CACHE[model_name] = (tokenizer, model)
  return _BERT_CACHE[model_name]


def bert_embeddings(text, model_name='bert-base-uncased'):
  """Return the last-layer BERT vector of every token in `text`.

  The text is wrapped in "[CLS] ... [SEP]", tokenized, and run through the model
  once. The tokens and the size of the result are printed along the way.

  Args:
    text: A single sentence (plain string, without the special tokens).
    model_name: Checkpoint to load; defaults to 'bert-base-uncased'. The
      tokenizer and model are loaded once per checkpoint and then reused.

  Returns:
    A list with one entry per token (including [CLS] and [SEP]); each entry
    is a list of floats holding that token's last-layer hidden state.
  """
  tokenizer, model = _load_bert(model_name)

  # BERT requires the specific tokens before and after the text
  text_tokens = "[CLS] " + text + " [SEP]"
  tokenized_text = tokenizer.tokenize(text_tokens)
  print(tokenized_text)
  indexed_text = tokenizer.convert_tokens_to_ids(tokenized_text)
  n_tokens = len(tokenized_text)

  # Single unpadded sentence: every position is a real token (mask 1) and belongs
  # to the first segment (type id 0). The outer brackets add the batch dimension.
  tokens_tensor = torch.tensor([indexed_text])
  attention_mask = torch.tensor([[1] * n_tokens])
  segments_tensors = torch.tensor([[0] * n_tokens])

  # Keyword arguments are required here: BertModel.forward's second positional
  # parameter is attention_mask, so passing segment ids positionally would
  # silently use them as the mask.
  with torch.no_grad():
    outputs = model(
        input_ids=tokens_tensor,
        attention_mask=attention_mask,
        token_type_ids=segments_tensors,
    )
  last_layer = outputs.last_hidden_state

  # Drop the batch dimension and convert to nested Python lists.
  token_embeddings = torch.squeeze(last_layer, dim=0)
  list_token_embeddings = token_embeddings.tolist()
  print('The list contains {}  words each of them has a vector of "{}!"'.format(len(list_token_embeddings),len(list_token_embeddings[0])))

  return list_token_embeddings


In [None]:
# Embed three short phrases that all contain the word "bank".
bank_river, bank_shore, bank_thief = (
    bert_embeddings(phrase)
    for phrase in ("the bank river", "the bank shore", "the bank thief")
)

In [None]:
from scipy.spatial.distance import cosine

# Position 2 is the slot after "[CLS]" and "the", i.e. "bank" in both phrases
# (assuming the tokenizer keeps each of these words as a single token).
river_vec, shore_vec = bank_river[2], bank_shore[2]

# scipy's `cosine` is a distance, so 1 minus it is the cosine similarity of the two vectors.
cos_dist = 1 - cosine(river_vec, shore_vec)
print(cos_dist)

In [None]:
# Same comparison at position 2, this time "the bank river" against "the bank thief".
river_vec, thief_vec = bank_river[2], bank_thief[2]
cos_dist = 1 - cosine(river_vec, thief_vec)
print(cos_dist)