In [24]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

In [25]:
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test\npre-tokenization!")

[('Let', (0, 3)),
 ("'s", (3, 5)),
 ('Ġtest', (5, 10)),
 ('Ċ', (10, 11)),
 ('pre', (11, 14)),
 ('-', (14, 15)),
 ('tokenization', (15, 27)),
 ('!', (27, 28))]

In [26]:
special_tokens=["<|endoftext|>","[UNK]"]
trainer = trainers.BpeTrainer(
    vocab_size=8000,
    special_tokens=special_tokens,
    show_progress=True,
    min_frequency=2,
)

In [6]:
from datasets import load_dataset, interleave_datasets
from instruction_following.common.env_handler import hf_token
from huggingface_hub import login

login(hf_token)
total_length=0


# wiki
wiki = load_dataset("rahular/simple-wikipedia",streaming=True)
wiki_ds=wiki["train"].select_columns(["text"])
total_length+=wiki_ds.dataset_size


# tiny stories
tiny_stories = load_dataset("roneneldan/TinyStories", streaming=True)
tiny_stories_ds=interleave_datasets(
    [
       tiny_stories["train"].select_columns(["text"]),
        tiny_stories["validation"].select_columns(["text"])
    ],
    probabilities=[0.90, 0.1],
    seed=42
)
total_length+=tiny_stories_ds.dataset_size


# tiny textbooks
tiny_textbooks = load_dataset("nampdn-ai/tiny-textbooks", streaming=True)
tiny_textbooks_ds=interleave_datasets(
    [
       tiny_textbooks["train"].select_columns(["textbook"]),
         tiny_textbooks["test"].select_columns(["textbook"])
    ],
    probabilities=[0.90, 0.1],
    seed=42
)
total_length+=tiny_textbooks_ds.dataset_size


# no robots
no_robots = load_dataset("HuggingFaceH4/no_robots", streaming=True)
no_robots_ds=interleave_datasets(
    [
        no_robots["train"].select_columns(["messages"]),
        no_robots["test"].select_columns(["messages"])
    ],
    probabilities=[0.90, 0.1],
    seed=42
)
total_length+=no_robots_ds.dataset_size


print(total_length)
print(next(iter(wiki_ds)))
print(next(iter(tiny_stories_ds)))
print(next(iter(tiny_textbooks_ds)))
print(next(iter(no_robots_ds)))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


3842028925
{'text': 'April'}
{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}
{'textbook': "Lesson: Understanding Government Finance\n\nIntroduction:\nGovernments around the world use various methods to finance their expenditures. One such method is deficit financing, which involves borrowing money to fund expenses that exceed i

In [28]:
from typing import Iterator


def plain_text_iterator() -> Iterator[str]:
    """
    Streams all text messages from the dataset.
    """
    for row in no_robots_ds:
        for msg in row["messages"]:
            yield msg["content"]
    print("no robots completed")
    for row in wiki_ds:
        yield row["text"]
    print("wiki completed")
    for row in tiny_stories_ds:
        yield row["text"]
    print("tiny stories completed")
    for row in tiny_textbooks_ds:
        yield row["textbook"]
    print("tiny textbooks completed")

texts=plain_text_iterator()
print(next(texts))

Please summarize the goals for scientists in this text:

Within three days, the intertwined cup nest of grasses was complete, featuring a canopy of overhanging grasses to conceal it. And decades later, it served as Rinkert’s portal to the past inside the California Academy of Sciences. Information gleaned from such nests, woven long ago from species in plant communities called transitional habitat, could help restore the shoreline in the future. Transitional habitat has nearly disappeared from the San Francisco Bay, and scientists need a clearer picture of its original species composition—which was never properly documented. With that insight, conservation research groups like the San Francisco Bay Bird Observatory can help guide best practices when restoring the native habitat that has long served as critical refuge for imperiled birds and animals as adjacent marshes flood more with rising sea levels. “We can’t ask restoration ecologists to plant nonnative species or to just take thei

In [None]:
tokenizer.train_from_iterator(texts, trainer,length=total_length)
file_name= "./tokenizer.json"

'The read operation timed out' thrown while requesting GET https://huggingface.co/datasets/nampdn-ai/tiny-textbooks/resolve/1f88d0fe47109c17de0671cd8b8f2659f0a1c1ef/tiny-textbooks/test-00000-of-00001.parquet
Retrying in 1s [Retry 1/5].


In [None]:
from tokenizers import decoders, processors

tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()

In [None]:
tokenizer.save(file_name)

In [None]:
tokenizer = Tokenizer.from_file(file_name)

text = "To be, or not to be:\n\nThat is \nthe question."

encoding = tokenizer.encode(text)
print(encoding.tokens)
decoding=tokenizer.decode(encoding.ids)
print(decoding)