This tokenizer-training notebook uses a smaller dataset to make local experimentation easier.

In [1]:
from datasets import load_dataset

# Stream the dataset instead of downloading it in full; only the "messages"
# column of the "test" split is used in this notebook.
no_robots = load_dataset("HuggingFaceH4/no_robots", streaming=True)
no_robots_ds = no_robots["test"].select_columns(["messages"])

# Accumulate the size reported by the dataset metadata.
# NOTE(review): `dataset_size` is presumably a byte count, not a number of
# rows or messages — confirm before treating it as a sequence count.
total_length = 0
total_length += no_robots_ds.dataset_size
print(total_length)

# Pull one row to see the record layout.
example_row = next(iter(no_robots_ds))
print(example_row)

Downloading readme: 0.00B [00:00, ?B/s]

17384327
{'messages': [{'content': 'Aster is a chatbot who answers questions with rhymes.', 'role': 'system'}, {'content': 'Where did chocolate originate?', 'role': 'user'}, {'content': 'Chocolate is 4000 years old/Mexico is where it was first sold', 'role': 'assistant'}, {'content': 'Where was milk chocolate invented?', 'role': 'user'}, {'content': 'Switzerland was the first to add milk/To make their chocolate smooth as silk', 'role': 'assistant'}, {'content': 'What are some good desserts that use chocolate?', 'role': 'user'}, {'content': 'Pie, tart, cookies, and cake/Chocolate is great to bake', 'role': 'assistant'}]}


In [2]:
# Special tokens reserved by the tokenizer. List order matters: the vocab
# printed after registering them shows ids assigned in this order (0..8).
special_tokens = [
    "<|endoftext|>",       # end of text (EOT) — very common in GPT-style models
    "<|eot_id|>",          # end of turn; appended after every message by chat_template
    "<|bos|>",             # beginning of sequence (BOS); prepended by the post-processor template
    "<|eos|>",             # end of sequence (EOS); appended by the post-processor template
    "<|user|>",            # opens a user message
    "<|assistant|>",       # opens an assistant message
    "<|system|>",          # opens a system prompt
    "<|pad|>",             # padding token (PAD)
    "<|unk|>",             # unknown-token fallback; passed as unk_token to the BPE model
]

In [3]:
def chat_template(messages:list[dict[str,str]]):
    """Render a conversation as one string of role-tagged turns.

    Every message becomes ``<role token> + content + "<|eot_id|>\\n"`` and the
    turns are concatenated in the order given.

    Args:
        messages: Dicts carrying a "role" ("user", "system" or "assistant")
            and a "content" string.

    Returns:
        The concatenated text; an empty string for an empty list.

    Raises:
        ValueError: If a message has any other role.
    """
    turns = []
    for entry in messages:
        speaker = entry["role"]
        if speaker == "user":
            tag = "<|user|>"
        elif speaker == "system":
            tag = "<|system|>"
        elif speaker == "assistant":
            tag = "<|assistant|>"
        else:
            raise ValueError(f"Invalid message role {speaker}")
        turns.append(tag + entry["content"] + "<|eot_id|>\n")
    return "".join(turns)

# Render the sample row fetched earlier to check the template output by eye.
messages_text=chat_template(example_row["messages"])
print(messages_text)

<|system|>Aster is a chatbot who answers questions with rhymes.<|eot_id|>
<|user|>Where did chocolate originate?<|eot_id|>
<|assistant|>Chocolate is 4000 years old/Mexico is where it was first sold<|eot_id|>
<|user|>Where was milk chocolate invented?<|eot_id|>
<|assistant|>Switzerland was the first to add milk/To make their chocolate smooth as silk<|eot_id|>
<|user|>What are some good desserts that use chocolate?<|eot_id|>
<|assistant|>Pie, tart, cookies, and cake/Chocolate is great to bake<|eot_id|>



In [18]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# BPE model with "<|unk|>" configured as its unknown-token fallback.
bpe_model = models.BPE(unk_token="<|unk|>")
tokenizer = Tokenizer(bpe_model)

In [19]:
from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence

# Normalization pipeline, applied in order: NFD, then StripAccents, then
# Lowercase. The sample call below shows accents removed and text lowercased.
normalization_steps = [NFD(), StripAccents(), Lowercase()]
tokenizer.normalizer = Sequence(normalization_steps)

tokenizer.normalizer.normalize_str("Héllò hôw are ü?")

'hello how are u?'

In [20]:
# Byte-level pre-tokenization without a leading space added to the input.
byte_level = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.pre_tokenizer = byte_level

# Inspect how the rendered chat sample is split before BPE is applied.
tokenizer.pre_tokenizer.pre_tokenize_str(messages_text)

[('<|', (0, 2)),
 ('system', (2, 8)),
 ('|>', (8, 10)),
 ('Aster', (10, 15)),
 ('Ġis', (15, 18)),
 ('Ġa', (18, 20)),
 ('Ġchatbot', (20, 28)),
 ('Ġwho', (28, 32)),
 ('Ġanswers', (32, 40)),
 ('Ġquestions', (40, 50)),
 ('Ġwith', (50, 55)),
 ('Ġrhymes', (55, 62)),
 ('.<|', (62, 65)),
 ('eot', (65, 68)),
 ('_', (68, 69)),
 ('id', (69, 71)),
 ('|>', (71, 73)),
 ('Ċ', (73, 74)),
 ('<|', (74, 76)),
 ('user', (76, 80)),
 ('|>', (80, 82)),
 ('Where', (82, 87)),
 ('Ġdid', (87, 91)),
 ('Ġchocolate', (91, 101)),
 ('Ġoriginate', (101, 111)),
 ('?<|', (111, 114)),
 ('eot', (114, 117)),
 ('_', (117, 118)),
 ('id', (118, 120)),
 ('|>', (120, 122)),
 ('Ċ', (122, 123)),
 ('<|', (123, 125)),
 ('assistant', (125, 134)),
 ('|>', (134, 136)),
 ('Chocolate', (136, 145)),
 ('Ġis', (145, 148)),
 ('Ġ4000', (148, 153)),
 ('Ġyears', (153, 159)),
 ('Ġold', (159, 163)),
 ('/', (163, 164)),
 ('Mexico', (164, 170)),
 ('Ġis', (170, 173)),
 ('Ġwhere', (173, 179)),
 ('Ġit', (179, 182)),
 ('Ġwas', (182, 186)),
 ('Ġfirst',

In [22]:
# Register the control tokens as *special* tokens rather than plain added
# tokens: special tokens are never split by the model and can be dropped
# from the output when decoding.
tokenizer.add_special_tokens(special_tokens)

# Show the resulting vocabulary (only the special tokens at this point).
tokenizer.get_vocab()

{'<|eos|>': 3,
 '<|unk|>': 8,
 '<|assistant|>': 5,
 '<|system|>': 6,
 '<|eot_id|>': 1,
 '<|endoftext|>': 0,
 '<|user|>': 4,
 '<|bos|>': 2,
 '<|pad|>': 7}

In [23]:
# BPE trainer configuration.
trainer = trainers.BpeTrainer(
    vocab_size=8000,
    special_tokens=special_tokens,
    show_progress=True,
    min_frequency=2,
    # Seed the vocabulary with every byte-level symbol. Without this, byte
    # symbols that never occur in the (small) training corpus are missing
    # from the vocab and get encoded as "<|unk|>".
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

In [24]:
from typing import Iterator

def plain_text_iterator() -> Iterator[str]:
    """Yield the "content" string of every message in `no_robots_ds`.

    Rows are visited in dataset order and, within a row, messages are
    visited in list order. Role information is not included.
    """
    for record in no_robots_ds:
        yield from (turn["content"] for turn in record["messages"])

# Preview the first message with a throwaway iterator. Calling next() on
# `texts` itself would consume that message, so it would be missing from
# whatever later reads `texts`.
print(next(plain_text_iterator()))

# Fresh, untouched iterator for downstream use.
texts = plain_text_iterator()

Aster is a chatbot who answers questions with rhymes.


In [25]:
# `length` is optional and, when given, must be the number of sequences the
# iterator yields (it only drives progress reporting). `total_length` holds
# the value of the dataset's `dataset_size` attribute, not a sequence count,
# so it is not passed here.
tokenizer.train_from_iterator(texts, trainer)

# Destination for the serialized tokenizer.
file_name = "./tokenizer_lite.json"






In [26]:
from tokenizers import decoders, processors

# Decode byte-level tokens back into text.
tokenizer.decoder = decoders.ByteLevel()

# Byte-level post-processor with offset trimming disabled.
# NOTE(review): a later cell assigns a TemplateProcessing post-processor,
# which replaces this one.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [27]:
# Look up the ids of the boundary tokens once, then wire them into the template.
bos_id = tokenizer.token_to_id("<|bos|>")
eos_id = tokenizer.token_to_id("<|eos|>")

tokenizer.post_processor = processors.TemplateProcessing(
    # Single sequence: wrapped in BOS ... EOS.
    single="<|bos|> $A <|eos|>",
    # Sequence pair: each half wrapped in its own BOS ... EOS.
    pair="<|bos|> $A <|eos|> <|bos|> $B <|eos|>",
    special_tokens=[("<|bos|>", bos_id), ("<|eos|>", eos_id)],
)

In [28]:
# Write the tokenizer to the path held in file_name.
tokenizer.save(file_name)

In [29]:
# Round-trip check: reload the saved tokenizer, then encode and decode a sample.
text = "To be, or not to be:\n\nThat is \nthe question."
tokenizer = Tokenizer.from_file(file_name)

encoding = tokenizer.encode(text)
print(encoding.tokens)

decoding = tokenizer.decode(encoding.ids)
print(decoding)

['<|bos|>', 'to', 'Ġbe', ',', 'Ġor', 'Ġnot', 'Ġto', 'Ġbe', ':', 'Ċ', 'Ċ', 'that', 'Ġis', 'Ġ', 'Ċ', 'the', 'Ġquestion', '.', '<|eos|>']
to be, or not to be:

that is 
the question.
