In [1]:
# Input sentence for the fill-mask demo. It contains the literal "<mask>"
# placeholder that later cells tokenize and substitute candidate words into.
text = "I am so <mask>"

In [2]:
import torch
import torch.nn.functional as F
import numpy as np
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaModel

# Load the pretrained masked-language model and its matching tokenizer.
model_checkpoint = 'roberta-base'
RobertaLM_model = RobertaForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

# Tokenize the sentence and list every token next to its vocabulary id, so the
# positions used below can be checked against the printed listing.
inputs = tokenizer(text, return_tensors="pt")
for code in inputs['input_ids'][0]:
    print(f"{ tokenizer.decode(code)} : {code}")

# Inference only: no gradients are needed, and only the logits are used later.
with torch.no_grad():
    model_output = RobertaLM_model(**inputs)
    logits = model_output.logits

# Position of the " am" token in the listing printed above (0 is "<s>").
# NOTE: still tied to the current `text`; update it if the sentence changes.
am_loc = 2

# Locate the mask token from the tokenizer instead of hard-coding its index,
# so the cell keeps working when `text` is edited. Fail loudly if the sentence
# has no mask at all rather than silently reading an unrelated position.
mask_positions = (inputs['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
if mask_positions.numel() == 0:
    raise ValueError(f"text must contain the mask token {tokenizer.mask_token!r}: {text!r}")
mask_loc = mask_positions[0].item()

def k_most_similar(logits, index, k=5, sentence=None, tok=None):
    """Print the k most probable vocabulary tokens at one sequence position.

    Args:
        logits: Model logits, indexed as ``logits[0, index, :]``; only the
            first batch element is used.
        index: Token position whose predicted distribution is inspected.
        k: Number of candidates to print (default 5). Clamped to the
            vocabulary size so small vocabularies do not raise.
        sentence: Sentence containing the mask token. Defaults to the
            notebook-level ``text``.
        tok: Tokenizer providing ``mask_token`` and ``decode``. Defaults to
            the notebook-level ``tokenizer``.

    Note:
        Each candidate is always substituted for the mask token in
        ``sentence``, even when ``index`` points at a different position, so
        for non-mask positions the printed sentence is only a display aid.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    sentence = text if sentence is None else sentence
    tok = tokenizer if tok is None else tok

    position_logits = logits[0, index, :]
    # Rank candidates by softmax probability over the vocabulary.
    probabilities = F.softmax(position_logits, dim=0)

    # torch.topk avoids converting the tensor to NumPy and sorting the whole vocabulary.
    k = min(k, probabilities.numel())
    top = torch.topk(probabilities, k)

    for prob, token in zip(top.values.tolist(), top.indices.tolist()):
        print(f">>> {sentence.replace(tok.mask_token, tok.decode([token]))},    (probability:{prob})")

# Print the top candidates for the " am" position and then for the "<mask>"
# position. The separator line differs slightly between the two reports
# (the first one ends with an extra newline), so it is stored per entry.
for separator, heading, position in (
    ('\n------------------------\n', "5 most similar to 'am':", am_loc),
    ('\n------------------------', "5 most similar to '<mask>':", mask_loc),
):
    print(separator)
    print(heading)
    k_most_similar(logits, position)

  from .autonotebook import tqdm as notebook_tqdm


<s> : 0
I : 100
 am : 524
 so : 98
<mask> : 50264
</s> : 2

------------------------

5 most similar to 'am':
>>> I am so  am,    (probability:0.9998922348022461)
>>> I am so  is,    (probability:3.9378628571284935e-05)
>>> I am so 'm,    (probability:2.9937518775113858e-05)
>>> I am so  was,    (probability:8.688964953762479e-06)
>>> I am so  feel,    (probability:8.550764505343977e-06)

------------------------
5 most similar to '<mask>':
>>> I am so  sorry,    (probability:0.3083705008029938)
>>> I am so  proud,    (probability:0.0649036392569542)
>>> I am so  grateful,    (probability:0.05806168541312218)
>>> I am so  happy,    (probability:0.04478686675429344)
>>> I am so  blessed,    (probability:0.032352522015571594)


In [3]:
# Static word embeddings
#all_embeddings = RobertaLM_model.roberta.embeddings.word_embeddings.weight
#am_embeddings = all_embeddings[tokenizer(["I am so <mask>"])['input_ids'][0][am_loc]]
#mask_embeddings = all_embeddings[tokenizer(["I am so <mask>"])['input_ids'][0][mask_loc]]

# Contextualized word embeddings: run the same tokenized sentence through the
# bare RoBERTa encoder and keep only its final hidden states.
Roberta_model = RobertaModel.from_pretrained(model_checkpoint)
with torch.no_grad():
    model_output = Roberta_model(**inputs).last_hidden_state

# Hidden-state vectors of the first sequence at positions am_loc and mask_loc.
am_embeddings, mask_embeddings = model_output[0, am_loc], model_output[0, mask_loc]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
# Sentence pairs that share one target word, labelled as in the original notes
# ("Low" / "High" -- presumably the cosine similarity expected between the two
# contextual embeddings of that word). Each entry: (target word, sentence1, sentence2).
word_sense_pairs = {
    "low": ("dope",
            "The experience was dope, I enjoyed it so much",
            "The drugs addict bought an dope"),
    "high": ("bank",
             "I took a loan from the bank",
             "I went to the bank to get money"),
}

# Previously the "low" pair was assigned and then immediately overwritten by
# the "high" pair; select the active pair explicitly instead.
target_word, sentence1, sentence2 = word_sense_pairs["high"]


def first_token_index(token_ids, word):
    """Return the position of the first token whose decoded text equals `word`.

    Args:
        token_ids: Sequence of integer token ids for one sentence.
        word: Target word, compared against each decoded token with
            surrounding whitespace stripped.

    Raises:
        ValueError: If no single token decodes to `word` (for example when
            the tokenizer splits the word into several pieces).
    """
    for position, token_id in enumerate(token_ids):
        if tokenizer.decode(token_id).strip() == word:
            return position
    raise ValueError(f"{word!r} does not appear as a single token in {token_ids}")


input1 = tokenizer(sentence1, return_tensors="pt")
input2 = tokenizer(sentence2, return_tensors="pt")

Roberta_model.eval()

# Inference only; keep the hidden states of the single sequence in each batch.
with torch.no_grad():
    model_output1 = Roberta_model(**input1)['last_hidden_state'][0]
    model_output2 = Roberta_model(**input2)['last_hidden_state'][0]

# Look the target word up in each tokenized sentence instead of hard-coding
# token indices that are only valid for one specific pair of sentences.
embs1 = model_output1[first_token_index(input1['input_ids'][0].tolist(), target_word)]
embs2 = model_output2[first_token_index(input2['input_ids'][0].tolist(), target_word)]

# Cosine similarity between the two embedding vectors (dim=0: they are 1-D).
cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(embs1, embs2)

tensor(0.9720)

In [40]:
# 4) Find a sentence with n words that the tokenizer splits into m > n tokens.
# Each token of the example sentence is printed next to its vocabulary id.
token_ids = tokenizer("I just love Dilophosaurus")['input_ids']
for token_id in token_ids:
    piece = tokenizer.decode(token_id)
    print(f"{piece} : {token_id}")
    

<s> : 0
I : 100
 just : 95
 love : 657
 Dil : 14205
oph : 6673
osaurus : 44422
</s> : 2
