In [1]:
# Sample inputs, each chosen to probe a different tokenization case.
s1, s2, s3, s4, s5, s6 = (
    'albums sold 124443286539 copies',           # long run of digits
    'technically perfect, melodically correct',  # punctuation and "-ally" adverbs
    'featuring a previously unheard track',      # prefixed word ("unheard")
    'bestselling music artist',                  # compound word
    's1 d1 o1 and o2',                           # short letter+digit identifiers
    'asbofwheohwbeif',                           # random letters, not a real word
)

# 0. Instantiate the tokenizer

In [2]:
from transformers import BertTokenizer

# Checkpoint identifier handed to from_pretrained below.
model_name = 'bert-base-uncased'
# Build the tokenizer from that checkpoint's vocabulary
# (presumably fetched from the Hugging Face Hub or a local cache — confirm before running offline).
tokenizer = BertTokenizer.from_pretrained(model_name)

# 1. vocab
- tokenizer.vocab (token -> id) vs tokenizer.ids_to_tokens (id -> token)
- len(tokenizer.vocab) == 30522

In [4]:
# Number of entries in the token -> id table, and the id that the unknown-token placeholder '[UNK]' maps to.
len(tokenizer.vocab), tokenizer.vocab['[UNK]']

(30522, 100)

# 2. Test subword tokenization on sample sentences

In [20]:
# Long digit run: check how a number with no single vocab entry is broken into pieces.
inputs = tokenizer(s1)
print(
    inputs,                                                # raw encoding returned by the tokenizer
    tokenizer.decode(inputs['input_ids']),                 # ids -> text round trip
    tokenizer.convert_ids_to_tokens(inputs['input_ids']),  # ids -> individual tokens
    sep='\n',
)

{'input_ids': [101, 4042, 2853, 13412, 22932, 16703, 20842, 22275, 2683, 4809, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] albums sold 124443286539 copies [SEP]
['[CLS]', 'albums', 'sold', '124', '##44', '##32', '##86', '##53', '##9', 'copies', '[SEP]']


In [14]:
# Punctuation and "-ally" adverbs: check which words stay whole and which get a '##' suffix piece.
inputs = tokenizer(s2)
print(
    inputs,                                                # raw encoding returned by the tokenizer
    tokenizer.decode(inputs['input_ids']),                 # ids -> text round trip
    tokenizer.convert_ids_to_tokens(inputs['input_ids']),  # ids -> individual tokens
    sep='\n',
)

{'input_ids': [101, 10892, 3819, 1010, 17187, 3973, 6149, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] technically perfect, melodically correct [SEP]
['[CLS]', 'technically', 'perfect', ',', 'melodic', '##ally', 'correct', '[SEP]']


In [15]:
# Prefixed word ("unheard"): check whether the prefix is split off from the stem.
inputs = tokenizer(s3)
print(
    inputs,                                                # raw encoding returned by the tokenizer
    tokenizer.decode(inputs['input_ids']),                 # ids -> text round trip
    tokenizer.convert_ids_to_tokens(inputs['input_ids']),  # ids -> individual tokens
    sep='\n',
)

{'input_ids': [101, 3794, 1037, 3130, 4895, 26362, 2650, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] featuring a previously unheard track [SEP]
['[CLS]', 'featuring', 'a', 'previously', 'un', '##heard', 'track', '[SEP]']


In [16]:
# Compound word ("bestselling"): check how it is decomposed into vocab pieces.
inputs = tokenizer(s4)
print(
    inputs,                                                # raw encoding returned by the tokenizer
    tokenizer.decode(inputs['input_ids']),                 # ids -> text round trip
    tokenizer.convert_ids_to_tokens(inputs['input_ids']),  # ids -> individual tokens
    sep='\n',
)

{'input_ids': [101, 2190, 23836, 2075, 2189, 3063, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
[CLS] bestselling music artist [SEP]
['[CLS]', 'best', '##sell', '##ing', 'music', 'artist', '[SEP]']


In [17]:
# Letter+digit identifiers ("s1", "o2"): check whether the letter and the digit become separate tokens.
inputs = tokenizer(s5)
print(
    inputs,                                                # raw encoding returned by the tokenizer
    tokenizer.decode(inputs['input_ids']),                 # ids -> text round trip
    tokenizer.convert_ids_to_tokens(inputs['input_ids']),  # ids -> individual tokens
    sep='\n',
)

{'input_ids': [101, 1055, 2487, 1040, 2487, 1051, 2487, 1998, 1051, 2475, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] s1 d1 o1 and o2 [SEP]
['[CLS]', 's', '##1', 'd', '##1', 'o', '##1', 'and', 'o', '##2', '[SEP]']


In [18]:
# Random letters: check whether a non-word falls back to [UNK] or is split into known pieces.
inputs = tokenizer(s6)
print(
    inputs,                                                # raw encoding returned by the tokenizer
    tokenizer.decode(inputs['input_ids']),                 # ids -> text round trip
    tokenizer.convert_ids_to_tokens(inputs['input_ids']),  # ids -> individual tokens
    sep='\n',
)

{'input_ids': [101, 2004, 5092, 2546, 2860, 5369, 11631, 2860, 19205, 2546, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] asbofwheohwbeif [SEP]
['[CLS]', 'as', '##bo', '##f', '##w', '##he', '##oh', '##w', '##bei', '##f', '[SEP]']


# 3. Summary
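- The tokenizer rarely falls back to [UNK] (id 100): a word that is missing from the vocab is split into known subword pieces instead (even the gibberish string s6 produces no [UNK])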
- Everything is driven by the vocab table: the tokenizer handles tokenize, encode, and decode in one object
    - tokenize: word -> token(s); map each word onto keys in the vocab as accurately as possible
    - encode: token -> id
    - decode: id -> token -> word
    - Encoding isn't the end: decoding should also restore the IDs into words that closely match the original input.

The main purpose of a vocabulary (vocab) in an NLP model is precisely to map words (or subwords/tokens) to integer IDs.