Train a BPE tokenizer on a small sample of the datasets and add the special tokens.

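TODO (improvement): prevent special-token injection, i.e. stop special-token strings such as `<|assistant|>` that appear inside ordinary input text from being encoded as real special tokens.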

In [2]:
from tokenizer_datasets import *

# Switch between the "lite" and the "real" dataset loader.
lite_dataset = False

# Each loader yields a pair: a callable that produces a fresh iterator over
# the texts, and a length value that is passed to the trainer later on.
if lite_dataset:
    dataset_iter, dataset_length = tokenizer_lite_dataset()
else:
    dataset_iter, dataset_length = tokenizer_real_dataset()

print(dataset_length)

# Pull the first item of a fresh iterator to use as a running example.
example_message = next(dataset_iter())
print(example_message)

3798585513
Aster is a chatbot who answers questions with rhymes.


In [3]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# Untrained BPE model; "<|unk|>" is configured as its unknown-token symbol.
bpe_model = models.BPE(unk_token="<|unk|>")
tokenizer = Tokenizer(bpe_model)

In [4]:
from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence

# Normalization pipeline, applied in order: Unicode NFD decomposition,
# accent stripping, then lowercasing.
normalization_steps = [NFD(), StripAccents(), Lowercase()]
tokenizer.normalizer = Sequence(normalization_steps)

# Sanity check on an accented, mixed-case sample.
tokenizer.normalizer.normalize_str("Héllò hôw are ü?")

'hello how are u?'

In [5]:
# Byte-level pre-tokenization; add_prefix_space=False leaves the first word
# without a leading space marker.
byte_level_splitter = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.pre_tokenizer = byte_level_splitter

# Show how the example message is split, with character offsets.
tokenizer.pre_tokenizer.pre_tokenize_str(example_message)

[('Aster', (0, 5)),
 ('Ġis', (5, 8)),
 ('Ġa', (8, 10)),
 ('Ġchatbot', (10, 18)),
 ('Ġwho', (18, 22)),
 ('Ġanswers', (22, 30)),
 ('Ġquestions', (30, 40)),
 ('Ġwith', (40, 45)),
 ('Ġrhymes', (45, 52)),
 ('.', (52, 53))]

In [6]:
from special_tokens import special_tokens

# `special_tokens` is a mapping; only its values (the token strings) are needed.
special_token_list = list(special_tokens.values())

# Register the control tokens as *special* tokens rather than ordinary added
# tokens: per the tokenizers API, special tokens are never split by the model
# and can be skipped when decoding. `add_tokens` would add them as plain
# vocabulary entries without that status.
tokenizer.add_special_tokens(special_token_list)

# Display the vocabulary, which at this point holds only the special tokens.
tokenizer.get_vocab()

{'<|bos|>': 2,
 '<|pad|>': 7,
 '<|eos|>': 3,
 '<|assistant|>': 5,
 '<|endofturn|>': 1,
 '<|system|>': 6,
 '<|user|>': 4,
 '<|unk|>': 8,
 '<|endoftext|>': 0}

In [7]:
# BPE trainer configuration.
trainer = trainers.BpeTrainer(
    vocab_size=8000,
    # Only keep merges for pairs seen at least this many times.
    min_frequency=2,
    # Reserve the control tokens in the trained vocabulary.
    special_tokens=special_token_list,
    # Seed the vocabulary with every byte-level symbol so that any input byte
    # is representable after training, even one that never occurs in the
    # training data (otherwise it would fall back to the unknown token).
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    show_progress=True,
)

In [8]:
# Train on a fresh pass over the dataset.
# NOTE(review): the tokenizers docs describe `length` as the number of
# sequences in the iterator (used for progress reporting) — confirm that
# dataset_length is a sequence count and not, e.g., a character count.
training_stream = dataset_iter()
tokenizer.train_from_iterator(training_stream, trainer, length=dataset_length)

# Path the trained tokenizer is written to and later reloaded from.
file_name = "./tokenizer.json"

no robots completed
wiki completed
tiny stories completed
tiny textbooks completed





In [9]:
from tokenizers import decoders, processors

# Byte-level decoder: maps the byte-level symbols (e.g. "Ġ") back to text.
byte_level_decoder = decoders.ByteLevel()
tokenizer.decoder = byte_level_decoder

# Byte-level post-processor with offset trimming disabled.
# Note: tokenizer.post_processor is assigned again in a later cell.
byte_level_post = processors.ByteLevel(trim_offsets=False)
tokenizer.post_processor = byte_level_post

In [10]:
# Wrap every encoded sequence in <|bos|> ... <|eos|>.
#
# Assigning a bare TemplateProcessing here would silently replace any
# post-processor configured earlier (the byte-level one), so both are chained
# explicitly with processors.Sequence: byte-level handling first, then the
# BOS/EOS template.
tokenizer.post_processor = processors.Sequence([
    processors.ByteLevel(trim_offsets=False),
    processors.TemplateProcessing(
        single="<|bos|> $A <|eos|>",  # adds BOS before & EOS after each sequence
        pair="<|bos|> $A <|eos|> <|bos|> $B <|eos|>",  # for pairs (less common)
        # The template needs the ids of the special tokens it inserts.
        special_tokens=[
            ("<|bos|>", tokenizer.token_to_id("<|bos|>")),
            ("<|eos|>", tokenizer.token_to_id("<|eos|>")),
        ],
    ),
])

In [11]:
# Persist the tokenizer to the path stored in `file_name`.
tokenizer.save(file_name)

In [12]:
# Check vocabulary membership: first for a complete special token, then for
# the bare "<|" prefix on its own.
trained_vocab = tokenizer.get_vocab()
for probe in ("<|user|>", "<|"):
    print(probe in trained_vocab)

True
False


In [13]:
# Reload the tokenizer from disk to verify the saved file is usable.
tokenizer = Tokenizer.from_file(file_name)

# Round trip: encode the example message, then decode its ids back to text.
encoding = tokenizer.encode(example_message)
decoding = tokenizer.decode(encoding.ids)

print(encoding.tokens)
print(decoding)

['<|bos|>', 'aster', 'Ġis', 'Ġa', 'Ġch', 'at', 'b', 'ot', 'Ġwho', 'Ġanswers', 'Ġquestions', 'Ġwith', 'Ġrhy', 'mes', '.', '<|eos|>']
aster is a chatbot who answers questions with rhymes.
