# Build a simple BERT model
* Task1: Predict the masked word in a sentence using BertForMaskedLM
* Task2: Judge whether a second sentence plausibly follows a given first sentence using BertForNextSentencePrediction

**Warning**: The following code was executed on Google Colab.

In [None]:
# Install the third-party packages for this notebook into the runtime via shell escapes.
# NOTE(review): folium and urllib3 are not imported by any cell visible in this
# notebook -- presumably environment workarounds for the hosted runtime; confirm
# they are still needed.
!pip install folium
!pip install urllib3
# pytorch_transformers supplies BertTokenizer and the Bert* model classes imported in later cells.
!pip install pytorch_transformers

## Task1: predict the masked word in a sentence using BertForMaskedLM

In [6]:
# masked-LM model class and the tokenizer that goes with it
from pytorch_transformers import BertTokenizer, BertForMaskedLM

# load the vocabulary/tokenizer published with the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# example sentence, already wrapped in the [CLS] / [SEP] special tokens
text = "[CLS] I played football with my friends at school yesterday [SEP]"

# split the sentence into tokens and display them
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

['[CLS]', 'i', 'played', 'football', 'with', 'my', 'friends', 'at', 'school', 'yesterday', '[SEP]']


In [3]:
import torch

# position of the token to hide (index 3 is 'football' in the tokenized sentence)
masked_index = 3
# overwrite that token in place with the mask placeholder
tokenized_text[masked_index] = '[MASK]'
print(tokenized_text)

# look up the vocabulary id of every token
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# the extra outer list gives the tensor a leading dimension of size 1
tokens_tensor = torch.tensor([indexed_tokens])
print(tokens_tensor)

['[CLS]', 'i', 'played', '[MASK]', 'with', 'my', 'friends', 'at', 'school', 'yesterday', '[SEP]']
tensor([[ 101, 1045, 2209,  103, 2007, 2026, 2814, 2012, 2082, 7483,  102]])


In [5]:
# load the pretrained masked-language model and put it in evaluation mode
msk_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
msk_model.eval()

# forward pass with gradient tracking switched off; the first element of the
# model output is the score tensor that gets inspected below
with torch.no_grad():
    result = msk_model(tokens_tensor)[0]
print(result)

# torch.topk returns (values, indices); keep the indices of the 5 highest
# scores at the masked position and map them back to vocabulary tokens
predicted_indexes = torch.topk(result[0, masked_index], 5)[1]
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indexes.tolist())
print(predicted_tokens)

tensor([[[ -6.6873,  -6.6405,  -6.6409,  ...,  -6.0201,  -5.8183,  -3.9777],
         [ -9.5150,  -9.3415,  -9.3818,  ...,  -8.4236,  -8.4428,  -5.3152],
         [-10.0568, -10.1768, -10.2753,  ...,  -8.5044,  -8.6216,  -5.3011],
         ...,
         [-13.6662, -14.2769, -13.8572,  ..., -12.8681, -11.8016, -11.4662],
         [ -9.2015,  -8.9383,  -9.3056,  ...,  -7.7869,  -9.2608,  -3.0500],
         [-13.1242, -12.9604, -12.7900,  ...,  -9.9769, -10.1773, -10.8939]]])
['basketball', 'football', 'soccer', 'baseball', 'tennis']


## Task2: Judge whether a second sentence plausibly follows a given first sentence using BertForNextSentencePrediction

In [7]:
# import BertForNextSentencePrediction
from pytorch_transformers import BertForNextSentencePrediction

# define the function to determine and show a continuity between two sentences
import functools


@functools.lru_cache(maxsize=None)
def _load_nsp_model():
    """Load the pretrained next-sentence-prediction model once and reuse it on later calls."""
    nsp_model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    # switch to evaluation mode before the model is used for prediction
    nsp_model.eval()
    return nsp_model


def show_continuity(text, seg_ids):
    """Print how likely the second sentence in `text` is to follow the first.

    Args:
        text: Sentence pair as a single string that already contains the
            special tokens, e.g. '[CLS] sentence A [SEP] sentence B [SEP]'.
        seg_ids: List with one segment id per token of `text`: 0 for the
            tokens up to and including the first '[SEP]', 1 for the rest.

    Returns:
        None. Prints the softmax probabilities as a tensor, then the first
        probability (the "is a continuation" class) as a percentage.

    Raises:
        ValueError: If `seg_ids` does not hold exactly one id per token.
    """
    # convert the text to ids (uses the notebook-level `tokenizer`)
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # fail early with a readable message instead of a shape error raised
    # from inside the model's forward pass
    if len(seg_ids) != len(indexed_tokens):
        raise ValueError(
            'seg_ids has {} entries but text was split into {} tokens: {}'.format(
                len(seg_ids), len(indexed_tokens), tokenized_text))

    # convert the text and the segment ids to tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([seg_ids])

    # the model is loaded on the first call only; later calls reuse it
    nsp_model = _load_nsp_model()

    # predict the next sentence
    with torch.no_grad():
        z = nsp_model(tokens_tensor, segments_tensors)
    # z[0] holds the two class scores; softmax turns them into probabilities
    result = torch.softmax(z[0], dim=1)
    print(result)
    print('Continuity: ', result[0][0].item()*100, '%')

In [8]:
# a question followed by a sentence that answers it
text = '[CLS] What is baseball ? [SEP] Baseball is a sport played with a bat and ball [SEP]'
# 6 ids for the first segment (through the first [SEP]), 11 for the second
seg_ids = [0] * 6 + [1] * 11
show_continuity(text, seg_ids)

tensor([[1.0000e+00, 4.7963e-06]])
Continuity:  99.9995231628418 %


In [9]:
# the same question followed by an unrelated sentence
text = '[CLS] What is baseball ? [SEP] This food is made with flour and milk [SEP]'
# 6 ids for the first segment (through the first [SEP]), 9 for the second
seg_ids = [0] * 6 + [1] * 9
show_continuity(text, seg_ids)

tensor([[9.5296e-06, 9.9999e-01]])
Continuity:  0.0009529629096505232 %
