In [4]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
input_ids = torch.tensor(tokenizer.encode("UCSB is a [MASK] place.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
maskedlm = input_ids.clone().detach()

outputs = model(input_ids) #, masked_lm_labels=maskedlm)

prediction_scores = outputs.logits

mask = [int(i) for i in maskedlm[0]].index(103)

args = prediction_scores[0][mask].topk(5)[1]
for i in args:
    print(tokenizer.decode([int(i)]))

good
big
beautiful
market
great


In [6]:
def get_mask_prob(sent):
    results = []
    with torch.no_grad():
        input_ids = torch.tensor(tokenizer.encode(sent, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        maskedlm = input_ids.clone().detach()
        scores = []
        masks = [i for i in range(len(input_ids[0])) if int(input_ids[0][i]) == tokenizer.mask_token_id]
        for mask in masks:
            outputs = model(maskedlm) 
            prediction_scores = outputs.logits
            pscores = torch.log(torch.softmax(prediction_scores[0], 1))
            probs, args = pscores[mask].topk(20)
            results += [([tokenizer.decode([int(a)]) for a in args], [round(float(p), 3) for p in probs])]
            maskedlm = input_ids.clone().detach()
    return results

In [7]:
def get_word_prob(sent, word_list):
    results = []
    word_list_ids = tokenizer.encode(word_list)
    with torch.no_grad():
        input_ids = torch.tensor(tokenizer.encode(sent, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        maskedlm = input_ids.clone().detach()
        scores = []
        masks = [i for i in range(len(input_ids[0])) if int(input_ids[0][i]) == tokenizer.mask_token_id]
        for mask in masks:
            outputs = model(maskedlm) #, masked_lm_labels=maskedlm)
            prediction_scores = outputs.logits
            pscores = torch.log(torch.softmax(prediction_scores[0], 1))
            for word in word_list_ids:
                results += [(tokenizer.decode([word]), pscores[mask][word])]
            maskedlm = input_ids.clone().detach()
    print(sent)
    return results

In [8]:
# try parasitic gaps
print(get_mask_prob("What book did you review without actually [MASK]?"))
print(get_mask_prob("You reviewed that book without actually [MASK]."))


[(['reading', 'knowing', 'writing', 'answering', 'asking', 'listening', 'trying', 'looking', 'publishing', 'learning', 'starting', 'seeing', 'finishing', 'checking', 'talking', 'speaking', 'studying', 'participating', 'stopping', 'going'], [-0.768, -2.47, -3.133, -3.242, -3.811, -3.958, -3.979, -4.034, -4.097, -4.584, -4.585, -4.689, -4.874, -4.948, -5.018, -5.036, -5.045, -5.079, -5.095, -5.157])]
[(['reading', 'knowing', 'speaking', 'listening', 'looking', 'asking', 'answering', 'talking', 'participating', 'commenting', 'seeing', 'trying', 'voting', 'going', 'visiting', 'writing', 'thinking', 'checking', 'attending', 'saying'], [-2.041, -2.16, -2.643, -2.751, -2.889, -3.015, -3.214, -3.473, -3.783, -4.18, -4.204, -4.23, -4.264, -4.397, -4.416, -4.416, -4.444, -4.706, -4.708, -4.831])]


In [9]:
print(get_mask_prob("The host spent a special 77 [MASK] on the show."))
print(get_mask_prob("The host spent a special two [MASK] on the show."))

[(['days', 'hours', 'minutes', 'weeks', 'seconds', 'years', 'episodes', 'nights', '##th', 'months', 'hour', '##k', 'min', 'day', 'minute', 'episode', 'birthday', 'night', 'week', 'moments'], [-0.58, -1.436, -2.139, -4.202, -4.239, -4.632, -5.167, -5.246, -5.366, -5.89, -5.978, -6.118, -6.119, -6.321, -6.441, -6.804, -7.401, -7.723, -7.776, -7.836])]
[(['weeks', 'days', 'hours', 'years', 'months', 'nights', 'minutes', 'seasons', 'week', 'episodes', 'weekends', 'evenings', 'day', 'hour', 'night', 'summers', 'sundays', 'saturdays', 'decades', 'month'], [-0.964, -1.541, -1.968, -2.539, -2.594, -3.429, -4.288, -4.508, -4.835, -4.927, -5.275, -6.041, -6.113, -6.169, -6.179, -6.278, -6.912, -6.941, -7.007, -7.143])]


In [10]:
print(get_mask_prob("The student spent [MASK] beautiful five days in Canberra."))

[(['a', 'two', 'three', 'four', 'five', 'some', 'the', 'six', 'several', 'his', 'seven', 'her', 'eight', 'many', 'very', 'these', 'nine', '4', 'one', '5'], [-1.694, -2.0, -2.345, -2.437, -2.747, -2.972, -3.071, -3.28, -3.503, -3.558, -3.667, -3.948, -4.28, -4.359, -4.366, -4.4, -5.001, -5.003, -5.119, -5.199])]


In [11]:
print(get_word_prob("We will spend a beautiful five [MASK] in Canberra.", ["day", "days"]))
print(get_word_prob("We will spend a beautiful 2818 [MASK] in Canberra.", ["day", "days"]))
print(get_word_prob("The prize went to a lucky three [MASK].", ["player", "players"]))
print(get_word_prob("The prize went to a lucky seventy-four [MASK].", ["player", "players"]))
print(get_word_prob("The prize went to an embarrassed three [MASK].", ["player", "players"]))
print(get_word_prob("The prize went to an embarrassed seventy-four [MASK].", ["player", "players"]))
print(get_word_prob("All credit goes to a special three [MASK].", ["player", "players"]))
print(get_word_prob("All credit goes to a special seventy-four [MASK].", ["player", "players"]))
print(get_word_prob("I have had a busy three [MASK].", ["week", "weeks"]))
print(get_word_prob("I have had a busy seventy-seven [MASK].", ["week", "weeks"]))
print(get_word_prob("We walked a wintry seventy-two [MASK].", ["block", "blocks"]))
print(get_word_prob("We walked a wintry three [MASK].", ["block", "blocks"]))
print(get_word_prob("We walked a wintry seventy-two [MASK].", ["block", "blocks"]))
print(get_word_prob("We walked a wintry three [MASK].", ["block", "blocks"]))
print(get_word_prob("The host spent a special ten [MASK] on the show.", ["night", "nights"]))
print(get_word_prob("The host spent a special two [MASK] on the show.", ["night", "nights", "segment", "segments"]))
print(get_word_prob("The host spent a special 1212 [MASK] on the show.", ["night", "nights", "segment", "segments"]))

We will spend a beautiful five [MASK] in Canberra.
[('[CLS]', tensor(-21.6975)), ('day', tensor(-8.7000)), ('days', tensor(-0.9306)), ('[SEP]', tensor(-21.0437))]
We will spend a beautiful 2818 [MASK] in Canberra.
[('[CLS]', tensor(-17.2315)), ('day', tensor(-3.1602)), ('days', tensor(-1.3575)), ('[SEP]', tensor(-15.6093))]
The prize went to a lucky three [MASK].
[('[CLS]', tensor(-19.2531)), ('player', tensor(-3.5122)), ('players', tensor(-1.9555)), ('[SEP]', tensor(-16.6073))]
The prize went to a lucky seventy-four [MASK].
[('[CLS]', tensor(-17.1052)), ('player', tensor(-4.6879)), ('players', tensor(-6.3033)), ('[SEP]', tensor(-15.0824))]
The prize went to an embarrassed three [MASK].
[('[CLS]', tensor(-16.5591)), ('player', tensor(-8.5510)), ('players', tensor(-5.8898)), ('[SEP]', tensor(-15.6126))]
The prize went to an embarrassed seventy-four [MASK].
[('[CLS]', tensor(-17.3437)), ('player', tensor(-8.1305)), ('players', tensor(-5.5902)), ('[SEP]', tensor(-15.5876))]
All credit goe

In [12]:
print(get_word_prob("I reviewed a remarkable four [MASK].", ["paper", "papers"]))
print(get_word_prob("I reviewed a remarkable seventy-four [MASK].", ["paper", "papers"]))
print(get_word_prob("We spent a beautiful eighty [MASK] in Canberra.", ["day", "days"]))


I reviewed a remarkable four [MASK].
[('[CLS]', tensor(-14.2232)), ('paper', tensor(-8.7153)), ('papers', tensor(-3.7857)), ('[SEP]', tensor(-15.4789))]
I reviewed a remarkable seventy-four [MASK].
[('[CLS]', tensor(-14.7811)), ('paper', tensor(-7.9636)), ('papers', tensor(-3.1446)), ('[SEP]', tensor(-16.2712))]
We spent a beautiful eighty [MASK] in Canberra.
[('[CLS]', tensor(-19.1133)), ('day', tensor(-6.8273)), ('days', tensor(-0.4375)), ('[SEP]', tensor(-18.3835))]


In [13]:
print(get_word_prob("I met a [MASK] five people yesterday.", ["special", "lucky", "handsome", "tall"]))
print(get_word_prob("I met a [MASK] seventy-seven people yesterday.", ["special", "lucky", "handsome", "tall"]))

I met a [MASK] five people yesterday.
[('[CLS]', tensor(-16.6099)), ('special', tensor(-9.9151)), ('lucky', tensor(-5.9449)), ('handsome', tensor(-8.0705)), ('tall', tensor(-8.4602)), ('[SEP]', tensor(-17.2869))]
I met a [MASK] seventy-seven people yesterday.
[('[CLS]', tensor(-15.2442)), ('special', tensor(-11.0371)), ('lucky', tensor(-8.5766)), ('handsome', tensor(-8.7123)), ('tall', tensor(-10.9221)), ('[SEP]', tensor(-16.8270))]


In [14]:
print(get_mask_prob("I met a lucky [MASK] people yesterday."))
print(get_mask_prob("I met lucky [MASK] people yesterday."))
print(get_mask_prob("The prize went to a lucky three [MASK]."))
print(get_mask_prob("The day was carried by a special three [MASK]."))
print(get_mask_prob("We spent a beautiful [MASK] days in Canberra."))
print(get_mask_prob("We spent beautiful [MASK] days in Canberra."))
print(get_mask_prob("We spent a [MASK] four days in Canberra."))
print(get_mask_prob("I met a [MASK] five people yesterday."))
print(get_mask_prob("The prize went to a [MASK] three players."))


[(['few', 'two', 'three', 'five', 'four', 'many', 'six', 'seven', 'dozen', 'couple', 'ten', 'eight', 'hundred', '10', 'nine', 'thousand', 'twenty', 'twelve', 'eleven', 'fifty'], [-0.289, -3.087, -3.206, -3.948, -4.047, -4.18, -4.3, -4.307, -4.503, -4.561, -4.57, -5.074, -5.196, -5.839, -5.984, -6.29, -6.293, -6.474, -6.627, -6.638])]
[(['two', 'three', 'few', 'poor', 'young', 'rich', 'four', 'white', 'five', 'other', 'seven', 'six', 'little', 'black', 'good', 'eight', 'day', 'nice', 'ten', 'new'], [-1.974, -2.625, -2.665, -2.714, -3.312, -3.313, -3.421, -3.499, -3.656, -3.687, -3.783, -3.814, -3.835, -3.913, -3.939, -4.335, -4.483, -4.533, -4.676, -4.824])]
[(['players', 'contestants', 'teams', 'winners', 'losers', 'contestant', 'player', 'people', 'team', 'men', 'competitors', 'women', 'horses', 'couples', 'participants', 'winner', 'finalists', 'votes', 'members', 'entries'], [-1.956, -2.105, -2.387, -3.059, -3.438, -3.44, -3.512, -3.554, -3.933, -3.971, -4.291, -4.374, -4.465, -4.527

In [15]:
print(get_mask_prob("The prize went to a lucky ten [MASK]."))
print(get_mask_prob("The prize went to a lucky happy [MASK]."))

[(['player', 'winner', 'contestant', 'team', 'person', '##er', 'loser', 'participant', 'ticket', 'players', 'contestants', 'winners', 'horse', 'driver', 'prize', 'losers', 'candidate', 'member', 'survivor', 'entry'], [-1.363, -1.938, -2.015, -2.89, -3.538, -3.838, -3.93, -4.252, -4.313, -4.878, -4.9, -4.927, -5.115, -5.148, -5.154, -5.158, -5.329, -5.329, -5.371, -5.419])]
[(['couple', 'family', 'ending', 'loser', 'person', 'man', 'birthday', 'child', 'woman', 'girl', 'wife', 'widow', 'husband', 'one', 'boy', 'friend', 'marriage', 'heart', 'survivor', 'killer'], [-1.33, -1.882, -3.269, -3.382, -3.554, -4.122, -4.458, -4.465, -4.698, -4.79, -4.869, -4.898, -4.926, -4.992, -5.266, -5.332, -5.406, -5.425, -5.432, -5.482])]


In [16]:
print(get_mask_prob("The prize went to a whopping three [MASK]."))
print(get_mask_prob("The prize went to a whopping seventy-two [MASK]."))
print(get_mask_prob("The prize went to a whopping big [MASK]."))

[(['guineas', 'dollars', 'pounds', 'points', 'hundred', 'thousand', 'people', 'million', 'men', 'cents', 'shillings', 'players', 'bucks', 'winners', 'horses', 'times', 'tickets', 'stars', 'votes', 'women'], [-1.934, -2.048, -2.362, -2.366, -3.102, -3.185, -3.584, -3.699, -3.957, -4.094, -4.825, -4.95, -4.965, -5.033, -5.142, -5.226, -5.423, -5.424, -5.426, -5.431])]
[(['pounds', 'dollars', 'guineas', 'men', 'points', 'people', 'cents', 'thousand', 'bucks', 'souls', 'hundred', 'votes', 'dragons', 'heads', 'spectators', 'lives', 'horses', 'percent', 'million', 'warriors'], [-1.1, -1.101, -3.392, -3.829, -4.029, -4.047, -4.05, -4.441, -4.473, -4.567, -5.708, -5.739, -5.825, -5.854, -5.938, -5.951, -6.02, -6.075, -6.076, -6.133])]
[(['guy', 'man', 'boy', 'girl', 'dog', 'brother', 'one', 'cat', 'bear', 'boss', 'wolf', '##foot', 'thing', 'bull', 'kid', 'bitch', 'dick', 'woman', 'sister', 'rat'], [-1.879, -2.184, -2.938, -3.461, -3.477, -3.559, -3.674, -3.856, -4.035, -4.286, -4.582, -4.643, 

In [18]:
outputs

MaskedLMOutput(loss=None, logits=tensor([[[ -6.3999,  -6.3635,  -6.3764,  ...,  -5.8375,  -5.5929,  -3.8700],
         [ -7.0323,  -7.3702,  -6.8279,  ...,  -7.1544,  -5.1179,  -3.5200],
         [ -8.4059,  -8.1868,  -7.9537,  ...,  -6.6552,  -3.6863,  -8.2279],
         ...,
         [ -9.8070,  -9.9536,  -9.6722,  ...,  -8.9193,  -8.3495,  -9.8221],
         [-12.6997, -12.4316, -12.9278,  ..., -11.2620, -10.5817,  -6.6558],
         [-10.2625, -10.2069, -10.2232,  ...,  -9.7631,  -9.0680,  -6.1355]]],
       grad_fn=<AddBackward0>), hidden_states=None, attentions=None)

In [23]:
# get model internals
input_ids = torch.tensor(tokenizer.encode("The dog chased the cat.", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
maskedlm = input_ids.clone().detach()

outputs = model(input_ids,output_hidden_states=True) #, masked_lm_labels=maskedlm)


In [26]:
# 13 layers
len(outputs.hidden_states)

13

In [28]:
# 8 word vectors, each of length 768
outputs.hidden_states[0].shape

torch.Size([1, 8, 768])