In [1]:
# Number of highest-scoring vocabulary candidates taken at the masked position
# (via `.topk(top_k)`) before decode() trims them down to `top_clean` entries.
top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    """Turn predicted vocabulary ids into a newline-separated list of clean tokens.

    Each id in ``pred_idx`` is decoded on its own with ``tokenizer.decode`` and
    all whitespace is removed from the decoded text. A token is discarded when
    it is empty, is exactly ``[PAD]``, or consists solely of ASCII punctuation
    characters. The ``##`` marker (sub-word continuation prefix of BERT-style
    tokenizers) is stripped from the tokens that are kept.

    Args:
        tokenizer: Object exposing ``decode(token_id) -> str``.
        pred_idx: Iterable of candidate token ids, in ranking order.
        top_clean: Maximum number of surviving tokens to return.

    Returns:
        str: The first ``top_clean`` surviving tokens, one per line.
    """
    punctuation = set(string.punctuation)
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        # Exact comparisons only: a substring test against the string
        # string.punctuation + '[PAD]' would also throw away genuine tokens
        # such as 'A', 'P', 'D' or 'PA'.
        # issuperset('') is True, so empty tokens are skipped here as well.
        if token == '[PAD]' or punctuation.issuperset(token):
            continue
        tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])


def encode(tokenizer, text_sentence, add_special_tokens=True):
    """Tokenize a sentence that contains a ``<mask>`` placeholder.

    Every ``<mask>`` in ``text_sentence`` is replaced with the tokenizer's own
    mask token. When the mask is the final whitespace-separated word, ``' .'``
    is appended so the model is not pushed towards predicting punctuation.

    Args:
        tokenizer: Object exposing ``mask_token``, ``mask_token_id`` and
            ``encode(text, add_special_tokens=...)`` returning a list of ids.
        text_sentence: Input text containing at least one ``<mask>``.
        add_special_tokens: Forwarded unchanged to ``tokenizer.encode``.

    Returns:
        tuple: ``(input_ids, mask_idx)`` where ``input_ids`` is a tensor holding
        a single row of token ids and ``mask_idx`` is the integer position of
        the first mask token in that row.

    Raises:
        ValueError: If the encoded sentence contains no mask token (this also
            covers empty or whitespace-only input).
    """
    text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    # If the mask is the last token, append a "." so that models don't predict
    # punctuation. Guard the lookup so empty input does not raise IndexError.
    words = text_sentence.split()
    if words and words[-1] == tokenizer.mask_token:
        text_sentence += ' .'

    input_ids = torch.tensor([tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)])
    # torch.where yields (row_indices, column_indices); the columns are the
    # positions of the mask token inside the single encoded row.
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()
    if not mask_positions:
        raise ValueError(
            "No mask token found after encoding; include '<mask>' (or the "
            "tokenizer's mask token) in text_sentence."
        )
    return input_ids, mask_positions[0]

In [2]:
import torch
import string

from transformers import BertTokenizer, BertForMaskedLM
# Load the tokenizer and the masked-language-model variant of BERT from the
# 'bert-base-uncased' checkpoint; both are used by the prediction cell below.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Prompt whose next word the model should fill in.
text_sentence="Here is a test about"
print(text_sentence)
# Append the placeholder that encode() swaps for the tokenizer's mask token.
text_sentence+= ' <mask>'
# Number of cleaned candidates to display (decode() truncates to this many).
top_clean=5
input_ids, mask_idx = encode(bert_tokenizer, text_sentence)
# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    # First element of the model output; it is indexed below as
    # [row, position, vocabulary entry] -- presumably the logits, TODO confirm.
    predict = bert_model(input_ids)[0]
# Take the top_k best-scoring ids at the mask position and turn them into text.
bert = decode(bert_tokenizer, predict[0, mask_idx, :].topk(top_k).indices.tolist(), top_clean)
print()
print(bert)

Here is a test about

you
me
love
it
this


In [4]:
import torch
import string
from transformers import AutoTokenizer

from transformers import BertTokenizer, BertForMaskedLM
# Directory holding the custom tokenizer; local_files_only=True keeps the
# lookup on disk.
tokenizer_checkpoint = "./S288C_TOKENIZER"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint,local_files_only=True)
# Use './CUBERT' (as the fill-mask pipeline cells do) rather than bare 'CUBERT',
# so the argument is unambiguously a local directory and cannot be mistaken
# for a Hub model id.
# NOTE(review): this rebinds `bert_model`, replacing the 'bert-base-uncased'
# model loaded earlier; re-running the BERT prediction cell after this one
# would pair bert_tokenizer with the CUBERT weights.
bert_model = BertForMaskedLM.from_pretrained('./CUBERT').eval()

In [5]:
# Sanity check: run the custom tokenizer on the English prompt left over from
# the BERT cell (it still contains the literal '<mask>' placeholder).
# NOTE(review): the recorded output is one id repeated between the first and
# last ids -- presumably the English words are out of vocabulary for this
# tokenizer; confirm.
tokenizer.encode(text_sentence, add_special_tokens=True)

[2, 1, 1, 1, 1, 1, 1, 1, 1, 3]

In [6]:
from transformers import pipeline
import json

# Build a fill-mask pipeline from the local CUBERT model directory and the
# local S288C tokenizer directory.
pipe = pipeline(task='fill-mask', model='./CUBERT',tokenizer="./S288C_TOKENIZER")
# Query with the tokenizer's own mask token instead of a hard-coded literal.
out = pipe(f"4 2 2 {pipe.tokenizer.mask_token} 2 1")
# Pretty-print the returned predictions as indented JSON.
print(json.dumps(out, indent=4))

[
    {
        "score": 0.47349822521209717,
        "token": 5,
        "token_str": "1",
        "sequence": "4 2 2 1 2 1"
    },
    {
        "score": 0.3058720827102661,
        "token": 6,
        "token_str": "2",
        "sequence": "4 2 2 2 2 1"
    },
    {
        "score": 0.10576262325048447,
        "token": 7,
        "token_str": "3",
        "sequence": "4 2 2 3 2 1"
    },
    {
        "score": 0.06819885969161987,
        "token": 8,
        "token_str": "4",
        "sequence": "4 2 2 4 2 1"
    },
    {
        "score": 0.02411608397960663,
        "token": 9,
        "token_str": "5",
        "sequence": "4 2 2 5 2 1"
    }
]


In [4]:
from transformers import pipeline

# Fill-mask pipeline over the local CUBERT model and S288C tokenizer; the
# result of the last expression is shown as the cell's output.
classifier = pipeline(task='fill-mask', model='./CUBERT',tokenizer="./S288C_TOKENIZER")
# Read the mask token from the pipeline's tokenizer instead of hard-coding
# "[MASK]", so the query keeps working if the tokenizer defines a different
# mask token.
classifier(f"4 2 2 {classifier.tokenizer.mask_token} 2 1")

[{'score': 0.47349822521209717,
  'token': 5,
  'token_str': '1',
  'sequence': '4 2 2 1 2 1'},
 {'score': 0.3058720827102661,
  'token': 6,
  'token_str': '2',
  'sequence': '4 2 2 2 2 1'},
 {'score': 0.10576262325048447,
  'token': 7,
  'token_str': '3',
  'sequence': '4 2 2 3 2 1'},
 {'score': 0.06819885969161987,
  'token': 8,
  'token_str': '4',
  'sequence': '4 2 2 4 2 1'},
 {'score': 0.02411608397960663,
  'token': 9,
  'token_str': '5',
  'sequence': '4 2 2 5 2 1'}]