# Bidirectional Encoder Representations from Transformers (BERT), Masked Word Completion

In [1]:
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

## Set Device

In [2]:
# Pick the compute device once; every tensor and the model are moved to it later.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Load Pre-trained BERT Model Tokenizer (Vocabulary)

In [3]:
# Load the tokenizer (vocabulary) published under the 'bert-base-uncased' name.
# The progress bar in the recorded output shows it is downloaded on first use.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 426100.72B/s]


## Encode Text Inputs

In [4]:
# Two-sentence input with the special [CLS]/[SEP] markers written by hand:
# a question, a separator, the answer sentence, and a closing separator.
text = '[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]'
# Split into vocabulary tokens; the assertion in the next cell shows the result
# is lower-cased and that "puppeteer" becomes the pieces 'puppet' + '##eer'.
tokenized_text = tokenizer.tokenize(text)

In [5]:
# Position of the token to hide from the model (the second "henson").
masked_index = 8
tokenized_text[masked_index] = '[MASK]'

# Sanity check: the token sequence after masking must match exactly.
expected_tokens = [
    '[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]',
    'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]',
]
assert tokenized_text == expected_tokens

In [6]:
# Map every token to its integer id in the tokenizer's vocabulary.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Segment (token type) ids: 0 for the first sentence up to and including its
# closing [SEP], 1 for everything after it. Deriving them from the tokens keeps
# them in sync with `text` instead of relying on a hand-typed list.
# `list.index` raises ValueError if the input contains no '[SEP]' token.
first_sep_index = tokenized_text.index('[SEP]')
segments_ids = [
    0 if position <= first_sep_index else 1
    for position in range(len(tokenized_text))
]
assert len(segments_ids) == len(indexed_tokens), 'one segment id is required per token'

# Wrap each list in an outer list so the tensors get a leading batch axis of size 1.
tensor_tokens = torch.tensor([indexed_tokens])
tensor_segments = torch.tensor([segments_ids])

In [7]:
# Move both input tensors to the selected device in one step.
tensor_tokens, tensor_segments = tensor_tokens.to(device), tensor_segments.to(device)

## Load Pre-trained BERT Model Weights

In [8]:
# Load the pre-trained masked-language-model weights published under the
# 'bert-base-uncased' name (downloaded on first use, per the recorded progress bars).
bert = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Move the model to the same device as the input tensors. Being the last
# expression of the cell, the returned module is displayed as the cell output.
bert.to(device)

100%|██████████| 313/313 [00:00<00:00, 338267.75B/s]
100%|██████████| 440473133/440473133 [00:52<00:00, 8444686.03B/s] 


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
   

## Evaluate BERT Model

In [9]:
# Evaluation mode: turns off training-only behaviour such as the Dropout layers
# listed in the model summary, so the forward pass is deterministic.
bert.eval()

# Inference only, so skip gradient tracking for the forward pass.
with torch.no_grad():
    outputs = bert(tensor_tokens, token_type_ids=tensor_segments)

# The first element of the model output holds the prediction scores;
# presumably shaped (batch, sequence, vocabulary) — TODO confirm.
predictions = outputs[0]

In [10]:
# Scores for the masked position of the only sequence in the batch;
# the highest-scoring entry is the predicted vocabulary id.
masked_scores = predictions[0, masked_index]
predicted_index = masked_scores.argmax().item()

# Convert the id back to its token string (one id in, one token out).
(predicted_token,) = tokenizer.convert_ids_to_tokens([predicted_index])

In [11]:
# Report the token the model ranked highest for the masked position.
print('Prediction is:', predicted_token)

Prediction is: henson


---