In [1]:
%%capture
!pip install transformers

In [2]:
# Show the installed transformers version by filtering the pip package list.
!pip list | grep transformers

transformers                  4.3.3         


In [3]:
from transformers import AutoTokenizer

# Load the tokenizer associated with the "bert-base-uncased" checkpoint;
# every later cell reuses this object.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [4]:
# Three inputs of different lengths; the last one encodes to the most ids.
sentences = [
    "Test 1st sample",
    "Sample #1",
    "This is a larger sample 5 tokens longer?",
]

# Calling the tokenizer on the list encodes the whole batch in one go.
# No padding is requested, so each row keeps its own length.
tokenizer(sentences)

{'input_ids': [[101, 3231, 3083, 7099, 102], [101, 7099, 1001, 1015, 102], [101, 2023, 2003, 1037, 3469, 7099, 1019, 19204, 2015, 2936, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [5]:
for sentence in sentences:
    # max_length combined with padding="max_length" only sets the padding
    # target; without truncation=True longer inputs keep their full length.
    # Enabling truncation makes every row exactly 10 ids long.
    print(tokenizer.encode(sentence, max_length=10, padding="max_length", truncation=True))

[101, 3231, 3083, 7099, 102, 0, 0, 0, 0, 0]
[101, 7099, 1001, 1015, 102, 0, 0, 0, 0, 0]
[101, 2023, 2003, 1037, 3469, 7099, 1019, 19204, 2015, 102]


In [6]:
# encode_plus is called once per sentence; each printed result is a dict
# holding input_ids, token_type_ids and attention_mask for that sentence.
for sentence in sentences:
    print(tokenizer.encode_plus(sentence))

{'input_ids': [101, 3231, 3083, 7099, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
{'input_ids': [101, 7099, 1001, 1015, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
{'input_ids': [101, 2023, 2003, 1037, 3469, 7099, 1019, 19204, 2015, 2936, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [7]:
# Batch counterpart of encode_plus: the whole list is passed in one call.
# The displayed result matches what tokenizer(sentences) returned earlier.
tokenizer.batch_encode_plus(sentences)

{'input_ids': [[101, 3231, 3083, 7099, 102], [101, 7099, 1001, 1015, 102], [101, 2023, 2003, 1037, 3469, 7099, 1019, 19204, 2015, 2936, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [8]:
# Inspect the special tokens this tokenizer defines, keyed by their role.
tokenizer.special_tokens_map

{'cls_token': '[CLS]',
 'mask_token': '[MASK]',
 'pad_token': '[PAD]',
 'sep_token': '[SEP]',
 'unk_token': '[UNK]'}

In [9]:
# Look up the id of the mask token. Take the token string from the tokenizer
# itself ("[MASK]", brackets included): the bare string "MASK" is not a
# special token and resolves to id 100 instead of the mask id.
tokenizer.convert_tokens_to_ids([tokenizer.mask_token])

[103]

In [10]:
# Prompt for the masked-language model; [MASK] marks the word to be predicted.
text = "A bird usually has [MASK] legs."

In [11]:
from transformers import BertForMaskedLM

# Load BERT with its masked-LM head from the same checkpoint as the tokenizer.
# The "weights not used" notice printed on load lists only the
# cls.seq_relationship.* weights, which this model class does not take.
model = BertForMaskedLM.from_pretrained("bert-base-uncased")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Encode the prompt as a batch of one; return_tensors="pt" yields PyTorch
# tensors so the encoding can be fed to the model directly.
tokens = tokenizer(
    [text],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
tokens

{'input_ids': tensor([[ 101, 1037, 4743, 2788, 2038,  103, 3456, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
# Unpack the encoding so that input_ids, token_type_ids and attention_mask
# are passed to the model as keyword arguments.
# NOTE(review): this forward pass is not wrapped in torch.no_grad(); consider
# adding it if the notebook only ever does inference.
output = model(**tokens)

In [14]:
# List the fields available on the model output.
output.keys()

odict_keys(['logits'])

In [15]:
# The logits hold one row of scores per token position of the single encoded
# prompt (9 positions here). The last axis is indexed by token id
# (presumably its size equals the vocabulary size — confirm against the tokenizer).
output["logits"].shape

torch.Size([1, 9, 30522])

In [16]:
# Take the highest-scoring token id at every position of the first (only)
# sequence, convert the ids to a plain Python list, and map them back to
# tokens. The entry at the [MASK] position is the model's fill-in.
tokenizer.convert_ids_to_tokens(
    output["logits"][0].argmax(dim=-1).tolist()
)

['.', 'a', 'bird', 'usually', 'has', 'four', 'legs', '.', '.']