In [1]:
text = "I am so <mask>"

In [2]:
import torch
import torch.nn.functional as F
import numpy as np
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaModel

model_checkpoint = 'roberta-base'
RobertaLM_model = RobertaForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

inputs = tokenizer(text, return_tensors="pt")
for code in inputs['input_ids'][0]:
    print(f"{ tokenizer.decode(code)} : {code}")
    
with torch.no_grad():
    model_output = RobertaLM_model(**inputs)
    logits = model_output.logits
    
am_loc = 2
mask_loc = 4

def k_most_similar(logits, index):
    mask_token_logits = logits[0, index, :]
    # Pick the [MASK] candidates with the highest logits
    probabilities = F.softmax(mask_token_logits,dim=0)
    top_5_tokens = np.argsort(-probabilities)[:5].tolist()
    
    for token in top_5_tokens:
        print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))},    (probability:{probabilities[token]})")

print('\n------------------------\n')
print("5 most similar to 'am':") 
k_most_similar(logits, am_loc)
print('\n------------------------')
print("5 most similar to '<mask>':") 
k_most_similar(logits, mask_loc)

  from .autonotebook import tqdm as notebook_tqdm


<s> : 0
I : 100
 am : 524
 so : 98
<mask> : 50264
</s> : 2

------------------------

5 most similar to 'am':
>>> I am so  am,    (probability:0.9998922348022461)
>>> I am so  is,    (probability:3.9378628571284935e-05)
>>> I am so 'm,    (probability:2.9937518775113858e-05)
>>> I am so  was,    (probability:8.688964953762479e-06)
>>> I am so  feel,    (probability:8.550764505343977e-06)

------------------------
5 most similar to '<mask>':
>>> I am so  sorry,    (probability:0.3083705008029938)
>>> I am so  proud,    (probability:0.0649036392569542)
>>> I am so  grateful,    (probability:0.05806168541312218)
>>> I am so  happy,    (probability:0.04478686675429344)
>>> I am so  blessed,    (probability:0.032352522015571594)


In [78]:
# Static word embeddings
#all_embeddings = RobertaLM_model.roberta.embeddings.word_embeddings.weight
#am_embeddings = all_embeddings[tokenizer(["I am so <mask>"])['input_ids'][0][am_loc]]
#mask_embeddings = all_embeddings[tokenizer(["I am so <mask>"])['input_ids'][0][mask_loc]]

# Contextualize word embeddings
Roberta_model = RobertaModel.from_pretrained(model_checkpoint)
with torch.no_grad():
    model_output = Roberta_model(**inputs)['last_hidden_state']
    
am_embeddings = model_output[0][am_loc]
mask_embeddings = model_output[0][mask_loc]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [156]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-distilroberta-v1')
model = AutoModel.from_pretrained('sentence-transformers/all-distilroberta-v1')

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

sentence1 = "The robber was stealing money from the bank vault"
sentence2 = "I was fishing on the Mississippi river bank."

input1 = tokenizer(sentence1, return_tensors="pt")
input2 = tokenizer(sentence2, return_tensors="pt")

model.eval()

with torch.no_grad():
    model_output1 = Roberta_model(**input1)#['last_hidden_state'][0]
    
# with torch.no_grad():
#     model = Roberta_model(**input2)['last_hidden_state'][0]

Downloading: 100%|██████████████████████████████| 333/333 [00:00<00:00, 111kB/s]
Downloading: 100%|████████████████████████████| 798k/798k [00:03<00:00, 210kB/s]
Downloading: 100%|████████████████████████████| 456k/456k [00:02<00:00, 155kB/s]
Downloading: 100%|██████████████████████████| 1.36M/1.36M [00:05<00:00, 244kB/s]
Downloading: 100%|█████████████████████████████| 239/239 [00:00<00:00, 58.2kB/s]
Downloading: 100%|██████████████████████████████| 653/653 [00:00<00:00, 213kB/s]
Downloading: 100%|███████████████████████████| 329M/329M [01:32<00:00, 3.55MB/s]


NameError: name 'sentences' is not defined

In [None]:
model_output1

In [None]:
inputs = tokenizer(sentence2, return_tensors="pt")
for i,code in enumerate(inputs['input_ids'][1]):
    print(f"{i} --> { tokenizer.decode(code)} : {code}")

In [85]:
model_checkpoint = 'roberta-base'
Roberta_model = RobertaModel.from_pretrained(model_checkpoint,output_hidden_states = True)
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [102]:
Roberta_model(**input1)['last_hidden_state'].shape

torch.Size([1, 11, 768])

In [152]:
from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
Roberta_model = RobertaModel.from_pretrained('roberta-base')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = Roberta_model(**encoded_input)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [154]:
# sentence1 = "I took a loan from the bank"
# sentence2 = "went to the bank to get money"

sentence1 = "The robber was stealing money from the bank vault"
sentence2 = "I was fishing on the Mississippi river bank."

input1 = tokenizer(sentence1, return_tensors="pt")
input2 = tokenizer(sentence2, return_tensors="pt")

Roberta_model.eval()

with torch.no_grad():
    model_output1 = Roberta_model(**input1)['last_hidden_state'][0]
    
with torch.no_grad():
    model_output2 = Roberta_model(**input2)['last_hidden_state'][0]
    
bank1_embs = model_output1[8]
bank2_embs = model_output2[8]

cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(bank1_embs, bank2_embs)

tensor(0.9016)

In [96]:
 1 - cosine(bank1_embs, bank2_embs)

0.901610255241394

In [126]:
inputs = tokenizer(sentence2, return_tensors="pt")
for i,code in enumerate(inputs['input_ids'][0]):
    print(f"{i} --> { tokenizer.decode(code)} : {code}")

0 --> <s> : 0
1 --> I : 100
2 -->  was : 21
3 -->  fishing : 5651
4 -->  on : 15
5 -->  the : 5
6 -->  Mississippi : 5750
7 -->  river : 4908
8 -->  bank : 827
9 --> . : 4
10 --> </s> : 2


In [80]:
model_output1[0][:5]

tensor([-0.0192,  0.0770, -0.0123, -0.0947,  0.0821])

In [71]:
input1

{'input_ids': tensor([[    0,   133, 29364,    21,  9460,   418,    31,     5,   827, 19362,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [75]:
input2

{'input_ids': tensor([[   0,  100,   21, 5651,   15,    5, 5750, 4908,  827,    4,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [70]:
inputs = tokenizer(sentence2, return_tensors="pt")
for i,code in enumerate(inputs['input_ids'][1]):
    print(f"{i} --> { tokenizer.decode(code)} : {code}")

0 --> <s> : 0
1 --> I : 100
2 -->  was : 21
3 -->  fishing : 5651
4 -->  on : 15
5 -->  the : 5
6 -->  Mississippi : 5750
7 -->  river : 4908
8 -->  bank : 827
9 --> . : 4
10 --> </s> : 2


In [129]:
#sentence1 = "we need to book an order for monday" 
#sentence2 = "I read an article in the book and it was bad, oax, dog , cat"


from transformers import BertTokenizer, BertModel
import torch

tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased",output_hidden_states = True)
model.eval()

sentence1 = "click on the mouse to open windowin the computer"
sentence2 = "the mouse is an animal that love cheese"
sentence1 = "I took a loan from the bank"
sentence2 = "went to the bank to get money"
sentence1 = "horrible"
sentence2 = "fine"
input1 = tokenizer_bert(sentence1, return_tensors="pt")
input2 = tokenizer_bert(sentence2, return_tensors="pt")
input1['output_hidden_states'] = True
with torch.no_grad():
    model_output1 = model(**input1)['hidden_states'][-1]#['last_hidden_state']

input2['output_hidden_states'] = True

with torch.no_grad():
    model_output2 = model(**input2)['hidden_states'][-1]#['last_hidden_state']
    
#embs1 = model_output1[0][7]#[4]
#embs2 = model_output2[0][4]#[2]
embs1 = model_output1[0][1]#[4]
embs2 = model_output2[0][1]#[2]

cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(embs1, embs2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor(0.6570)

In [73]:
tokenizer_bert

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [7]:
inputs['input_ids']

tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,
          2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,
          1012,   102]])

In [134]:
inputs

{'input_ids': tensor([[   0,  100,   21, 5651,   15,    5, 5750, 4908,  827,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [140]:
text1 = "The robber was stealing money from the bank vault"
text2 =        "I was fishing on the Mississippi river bank"
inputs = tokenizer_bert(text1, return_tensors="pt")
# for code in inputs['input_ids'][0]:
#     print(f"{ tokenizer.decode(code)} : {code}")
with torch.no_grad():
    outputs1 = model(**inputs)
inputs = tokenizer_bert(text2, return_tensors="pt")

with torch.no_grad():
    outputs2 = model(**inputs)

from scipy.spatial.distance import cosine
# Calculate the cosine similarity between the word bank 
# in "bank robber" vs "river bank" (different meanings).
diff_bank = 1 - cosine(outputs1['hidden_states'][-1][0][8], outputs2['hidden_states'][-1][0][8])

In [141]:
diff_bank

0.31634584069252014

In [43]:
cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(outputs1['hidden_states'][-1][0][8],outputs2['hidden_states'][-1][0][8])

tensor(0.3163)

In [149]:
inputs = tokenizer_bert(text2, return_tensors="pt")
for i,code in enumerate(inputs['input_ids'][0]):
    print(f"{i}: { tokenizer_bert.decode(code)} : {code}")

0: [ C L S ] : 101
1: i : 1045
2: w a s : 2001
3: f i s h i n g : 5645
4: o n : 2006
5: t h e : 1996
6: m i s s i s s i p p i : 5900
7: r i v e r : 2314
8: b a n k : 2924
9: [ S E P ] : 102


In [28]:
diff_bank

0.6978819370269775

In [26]:
outputs['hidden_states'][-1].shape

torch.Size([1, 22, 768])

In [21]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [4]:
model(**input1).j

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1404,  0.4852, -0.2654,  ..., -0.2033,  0.0936,  0.0939],
         [ 0.4549,  0.4488, -0.3614,  ...,  0.2909,  0.2794, -0.5092],
         [ 0.8003,  0.1645, -0.3999,  ...,  0.1224, -0.7517, -0.2463]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.2017e-01, -1.7302e-01,  3.6870e-01,  6.5230e-01, -2.6350e-01,
         -5.0332e-02,  8.8083e-01,  2.0498e-01, -2.1210e-02, -9.9967e-01,
          1.1388e-01,  3.0782e-01,  9.5688e-01, -1.2867e-01,  8.9957e-01,
         -5.3771e-01, -1.7128e-01, -5.0038e-01,  3.2841e-01, -7.6873e-01,
          4.8446e-01,  8.6860e-01,  4.9746e-01,  1.8733e-01,  3.6306e-01,
          3.8558e-01, -5.0272e-01,  8.8007e-01,  9.2353e-01,  6.0863e-01,
         -6.4101e-01,  1.3125e-01, -9.6348e-01, -1.6425e-01,  3.6321e-01,
         -9.6277e-01,  1.0158e-01, -6.9240e-01,  3.2903e-02,  2.7733e-02,
         -8.0736e-01,  2.0906e-01,  9.9478e-01, -1.4673e-01, -4.3981e-02

In [None]:

input1['output_hidden_states'] = True
with torch.no_grad():
    model_output2 = model(**input1)
model_output1

In [None]:
model_output1

In [86]:
sentence2 = "went to the bank to get money"
sentence1 = "good"
sentence2 = "fine"
input1 = tokenizer(sentence1, return_tensors="pt")
input2 = tokenizer(sentence2, return_tensors="pt")
input1['output_hidden_states'] = True
with torch.no_grad():
    model_output1 = model(**input1)#['hidden_states'][-1]#['last_hidden_state']

input2['output_hidden_states'] = True

with torch.no_grad():
    model_output2 = model(**input2)#['hidden_states'][-1]#['last_hidden_state']
    
#embs1 = model_output1[0][7]#[4]
#embs2 = model_output2[0][4]#[2]
embs1 = model_output1[0][1]#[4]
embs2 = model_output2[0][1]#[2]

cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(embs1, embs2)

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [88]:

embs1 = model_output1['hidden_states'][-2][0][1]#[4]
embs2 = model_output2['hidden_states'][-2][0][1]#[2]

cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(embs1, embs2)

tensor(0.7396)

In [85]:
inputs = tokenizer(sentence1, return_tensors="pt")
for code in inputs['input_ids'][1]:
    print(f"{ tokenizer.decode(code)} : {code}")

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [42]:
from sklearn.metrics.pairwise import cosine_similarity

ModuleNotFoundError: No module named 'sklearn'

In [41]:
cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(embs1, embs2)

tensor(0.5099)

In [40]:
cos_similarity = torch.nn.CosineSimilarity()
cos_similarity(torch.unsqueeze(embs1,0), torch.unsqueeze(embs2,0))

tensor([0.5099])

In [12]:
model_output1.shape

torch.Size([1, 12, 768])

In [250]:
model_output1.shape

torch.Size([1, 6, 768])

In [28]:
inputs = tokenizer(sentence1, return_tensors="pt")
for code in inputs['input_ids'][1]:
    print(f"{ tokenizer.decode(code)} : {code}")

[ C L S ] : 101
i : 1045
t o o k : 2165
a : 1037
l o a n : 5414
f r o m : 2013
t h e : 1996
b a n k : 2924
[ S E P ] : 102


In [214]:
inputs['output_hidden_states'] = True
for code in inputs['input_ids'][0]:
    print(f"{ tokenizer.decode(code)} : {code}")

<s> : 0
I : 100
 sit : 2662
 on : 15
 the : 5
 river : 4908
 bank : 827
</s> : 2


In [None]:
sentence1, sentence2)

In [212]:
inputs = tokenizer("went to the bank to get money", return_tensors="pt")
inputs['output_hidden_states'] = True
for code in inputs['input_ids'][0]:
    print(f"{ tokenizer.decode(code)} : {code}")

<s> : 0
went : 31135
 to : 7
 the : 5
 bank : 827
 to : 7
 get : 120
 money : 418
</s> : 2


In [210]:
embeddings1 = torch.mean(all_embeddings[tokenizer(["good"])['input_ids'][0][1:-1]], dim=0)
embeddings2 = torch.mean(all_embeddings[[tokenizer(["bad"])['input_ids'][0][1:-1]]], dim=0)
cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(embeddings1, embeddings2)

torch.Size([6, 768])

In [211]:
mask_embeddings

tensor([ 3.0015e-01, -1.1161e-01,  5.1395e-02, -6.7252e-02,  8.4492e-01,
         2.4305e-01,  6.9192e-02, -2.4385e-02, -3.3705e-01, -3.3250e-02,
         1.8574e-01,  2.4432e-01,  2.5454e-01,  2.9118e-01,  4.7106e-02,
        -2.9313e-01, -1.8963e-01, -2.5199e-01,  4.2967e-02,  2.9540e-01,
        -1.5799e-01,  4.6764e-02, -8.0097e-02,  2.0638e-01,  9.6993e-02,
         1.5285e-01,  1.2662e-01, -2.8163e-01, -1.0327e-01,  1.9596e-02,
        -2.8306e-01,  2.1835e-01,  9.7796e-02, -3.9643e-01, -5.4720e-02,
        -2.6545e-02,  7.1371e-02, -4.7886e-02,  3.1641e-01, -1.2958e-01,
        -2.4607e-01, -9.1115e-02,  1.0304e-01, -7.7099e-02, -1.2601e-01,
        -5.3634e-02, -1.3854e-01,  6.8302e-02, -3.0741e-02, -6.4592e-02,
         3.6420e-02, -1.6332e-01,  2.4784e-01,  8.3879e-02,  7.4927e-02,
         1.2919e-02,  6.1526e-02,  4.5856e-01, -1.2458e-01,  3.3233e-01,
         6.8998e-02,  5.4167e-01, -6.8654e-02,  1.5643e-01,  2.9762e-01,
        -1.4412e-01, -1.8248e-01,  9.3979e-02,  1.3

In [199]:
model_output['last_hidden_layer']

KeyError: 'last_hidden_layer'

In [193]:
inputs = tokenizer("<mask> am so <mask>", return_tensors="pt")
inputs['output_hidden_states'] = True
for code in inputs['input_ids'][0]:
    print(f"{ tokenizer.decode(code)} : {code}")
    
with torch.no_grad():
    model_output = RobertaLM_model(**inputs)
    logits = model_output.logits

<s> : 0
<mask> : 50264
 am : 524
 so : 98
<mask> : 50264
</s> : 2


In [196]:
model_output.logits.shape

torch.Size([1, 6, 50265])

In [156]:
tokenizer(["sad"])['input_ids'][0][1:3]

[29, 625]

In [164]:
tokenizer(["sad"])['input_ids'][

IndexError: list index out of range

In [181]:
tokenizer(["sad"])['input_ids'][0][1:-1]

[29, 625]

In [191]:
embeddings1.shape

torch.Size([1, 768])

In [192]:

all_embeddings = RobertaLM_model.roberta.embeddings.word_embeddings.weight

# I am very "happy"
embeddings1 = torch.mean(all_embeddings[tokenizer(["good"])['input_ids'][0][1:-1]], dim=0)
embeddings2 = torch.mean(all_embeddings[[tokenizer(["bad"])['input_ids'][0][1:-1]]], dim=0)
cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(embeddings1, embeddings2)

tensor(0.3564, grad_fn=<SumBackward1>)

In [176]:
embeddings1.shape

torch.Size([768])

In [177]:
cos_similarity = torch.nn.CosineSimilarity(dim=0)
cos_similarity(embeddings1, embeddings2)

tensor(0.1211, grad_fn=<SumBackward1>)

In [170]:
torch.mean(embeddings2,dim=0)

tensor([-3.9688e-02,  6.4377e-02,  3.9093e-02, -8.1921e-02, -1.5125e-01,
         4.9866e-02,  1.8823e-01, -8.3008e-02, -1.1902e-01,  2.3880e-03,
        -1.8250e-02,  1.2848e-02, -9.2865e-02,  8.4457e-02, -2.8625e-02,
         1.0071e-03, -5.4413e-02, -6.9824e-02, -9.2026e-02, -5.6358e-02,
         1.3062e-02, -1.6934e-01, -8.8196e-03, -6.3446e-02,  3.0323e-02,
        -2.1118e-01, -5.0087e-02, -1.9440e-01, -1.9006e-01, -1.8185e-01,
         2.6913e-02, -8.2920e-02,  3.0212e-03, -2.0068e-01, -2.3346e-03,
        -2.9348e-02, -7.5556e-02, -5.6000e-02, -1.0611e-01, -6.6086e-02,
         2.6611e-02, -2.3730e-01, -9.0897e-02, -1.3884e-01,  7.2021e-03,
        -1.0330e-02, -3.4409e-02,  8.4778e-02,  7.9041e-02,  4.4128e-02,
         7.7179e-02,  7.9803e-02, -1.0681e-02, -7.7698e-02,  5.9509e-03,
        -9.3523e-02, -1.8066e-02, -5.6702e-02,  2.4956e-02,  4.8828e-04,
         2.8206e-02, -4.0741e-02, -2.8290e-02, -2.0874e-02,  8.6433e-02,
        -3.7292e-02, -1.5564e-02, -9.2773e-03,  6.4

In [145]:
tokenizer(["happy", "sad"])

{'input_ids': [[0, 27333, 2], [0, 29, 625, 2]], 'attention_mask': [[1, 1, 1], [1, 1, 1, 1]]}

In [146]:
tokenizer(["happy", "sad"])['input_ids'][0]

[0, 27333, 2]

In [149]:
for code in tokenizer(["happy", "miserable"])['input_ids'][1]:
    print(f"{ tokenizer.decode(code)} : {code}")

<s> : 0
m : 119
iser : 5999
able : 868
</s> : 2


In [136]:
Roberta_model = RobertaModel.from_pretrained(model_checkpoint)

inputs = tokenizer(["I am very", return_tensors="pt")
inputs['output_hidden_states'] = True
for code in inputs['input_ids'][0]:
    print(f"{ tokenizer.decode(code)} : {code}")
    
with torch.no_grad():
    last_hidden_state = Roberta_model(**inputs)['last_hidden_state']

<s> : 0
I : 100
 am : 524
 so : 98
<mask> : 50264
</s> : 2


odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [121]:
from transformers import RobertaModel
model2 = RobertaModel.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [123]:

with torch.no_grad():
    model_output2 = model2(**inputs)


In [127]:
len(model_output2['hidden_states'])

13

In [None]:
Roberta_model = RobertaModel.from_pretrained(model_checkpoint)

inputs = tokenizer(text, return_tensors="pt")
inputs['output_hidden_states'] = True
for code in inputs['input_ids'][0]:
    print(f"{ tokenizer.decode(code)} : {code}")
    
with torch.no_grad():
    model_output = model(**inputs)
    logits = model_output.logits

In [None]:
RobertaModel()

In [119]:
len(model_output['hidden_states'])#[-1]

13

In [128]:
model2

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [129]:
model_output.keys()

odict_keys(['logits', 'hidden_states'])

In [118]:
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [110]:
# 4) Find a sentence with n words, that is tokenized into m > n tokens by the tokenizer.
for code in tokenizer("Dilophosaurus")['input_ids']:
    print(f"{ tokenizer.decode(code)} : {code}")
    

<s> : 0
D : 495
il : 718
oph : 6673
osaurus : 44422
</s> : 2
