In [2]:
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers, Regex
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from datasets import load_dataset

# Path of the JSON file the tokenizer is saved to.
tokenizer_model = "tokenizer-01.json"

# 1. The split regex, written one alternative per line. Adjacent raw-string
#    literals are concatenated by Python, so the value is a single pattern.
GPT4_SPLIT_PATTERN = (
    r"'(?i:[sdmt]|ll|ve|re)"        # apostrophe + s/d/m/t/ll/ve/re, case-insensitive
    r"|[^\r\n\p{L}\p{N}]?+\p{L}+"   # letters, optionally led by one char that is not CR/LF/letter/number
    r"|\p{N}{1,3}"                  # numeric characters in groups of one to three
    r"| ?[^\s\p{L}\p{N}]++[\r\n]*"  # optional space, then non-space/letter/number chars, then any CR/LF
    r"|\s*[\r\n]"                   # whitespace that ends in a CR or LF
    r"|\s+(?!\S)"                   # whitespace not followed by a non-whitespace char
    r"|\s+"                         # any whitespace left over
)

In [3]:
# 2. Model: a BPE model created without any pre-built vocabulary or merges;
#    training fills them in.
tokenizer = Tokenizer(BPE())

# 3. Normalizer: apply Unicode NFKC to the input text.
tokenizer.normalizer = normalizers.NFKC()

# 4. Pre-tokenizer: two stages chained in a Sequence.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [
        # Stage 1: cut the text into chunks with the GPT-4 style regex.
        # NOTE(review): "isolated" presumably keeps each regex match as its
        # own piece; confirm against the tokenizers docs.
        pre_tokenizers.Split(
            behavior="isolated",
            pattern=Regex(GPT4_SPLIT_PATTERN),
        ),
        # Stage 2: re-express every chunk in the byte-level alphabet (this is
        # where markers such as "Ġ" in the encoded tokens come from). The
        # built-in regex is switched off and no prefix space is added.
        pre_tokenizers.ByteLevel(
            use_regex=False,
            add_prefix_space=False,
        ),
    ]
)

# 5. Decoder: the byte-level counterpart, so ids can be turned back into text.
tokenizer.decoder = decoders.ByteLevel()

# 6. Trainer: targets a 30,000-entry vocabulary, seeded with the byte-level
#    alphabet and the four chat/special tokens.
trainer = BpeTrainer(
    special_tokens=["<|end_of_text|>", "<|user|>", "<|assistant|>", "<|im_end|>"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    vocab_size=30000,
)

In [4]:
# Load OpenWebText via the datasets library (presumably served from a local
# copy saved earlier, which is why this is fast).
dataset = load_dataset("openwebtext")

# Print the DatasetDict summary: available splits, features and row counts.
print(dataset)

# Everything below works on the "train" split; the trailing expression
# displays how many rows it contains.
train_data = dataset["train"]
len(train_data)

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 8013769
    })
})


8013769

In [5]:
import os

# Limit the Rust backend to two threads by exporting RAYON_NUM_THREADS into
# this process's environment.
os.environ.update({"RAYON_NUM_THREADS": "2"})

# Read the variable back and print it as a sanity check.
print(f"RAYON_NUM_THREADS is set to: {os.environ.get('RAYON_NUM_THREADS')}")

RAYON_NUM_THREADS is set to: 2


In [6]:
def batch_iterator(data_split, batch_size=1000, column="text"):
    """Yield one column of a dataset split in successive batches.

    Args:
        data_split: Object exposing ``select_columns`` and ``iter`` (in this
            notebook, the ``train`` split of the loaded dataset).
        batch_size: Number of rows requested per batch.
        column: Name of the column to read. Defaults to ``"text"``, so
            existing calls that omit it behave as before.

    Yields:
        The value stored under ``column`` in each batch produced by
        ``iter`` (for a text column, presumably a list of strings).
    """
    # Only keep the requested column to avoid decoding the rest of the
    # columns unnecessarily.
    tok_dataset = data_split.select_columns(column)
    for batch in tok_dataset.iter(batch_size):
        yield batch[column]

# Train the tokenizer on batches streamed from the training split.
tokenizer.train_from_iterator(
    batch_iterator(train_data),
    trainer=trainer,
    # Total row count of the split; presumably used for progress reporting.
    length=len(train_data),
)






In [7]:
# Attach a template post-processor that appends <|end_of_text|> after a
# single sequence, and after each sequence of a pair (":1" marks the second
# sequence and its terminator with type id 1).
tokenizer.post_processor = TemplateProcessing(
    special_tokens=[
        # The template needs the token's id, looked up in the tokenizer vocabulary.
        ("<|end_of_text|>", tokenizer.token_to_id("<|end_of_text|>")),
    ],
    single="$A <|end_of_text|>",
    pair="$A <|end_of_text|> $B:1 <|end_of_text|>:1",
)

In [2]:
# Optional: load a previously saved tokenizer from file instead of retraining.
# tokenizer = Tokenizer.from_file(tokenizer_model)

In [3]:
#with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    #text = f.read()

In [14]:
# Encode one document from the training split (row 1679) and print the
# resulting Encoding summary.
sample_text = train_data[1679]["text"]
output = tokenizer.encode(sample_text)
print(output)

Encoding(num_tokens=251, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [15]:
# Show the token strings and, on the next line, their ids.
print(output.tokens, output.ids, sep="\n")

['Former', 'ĠRo', 'oster', 'ĠTe', 'eth', 'Ġand', 'ĠAchie', 'vement', 'ĠHunter', 'Ġcontributor', ',', 'ĠDavid', 'Ġ"', 'Kn', 'uck', 'les', 'ĠDawson', '"', 'ĠDre', 'ger', 'Ġhas', 'Ġbeen', 'Ġfound', 'Ġdead', 'Ġat', 'ĠAmb', 'les', 'ide', 'ĠPark', 'Ġin', 'ĠWest', 'ĠVancouver', '.', 'ĠHis', 'Ġbody', 'Ġwas', 'Ġnoticed', 'Ġyesterday', ',', 'Ġaccording', 'Ġto', 'Ġa', 'Ġpost', 'Ġby', 'ĠDre', 'ger', "'s", 'Ġfamily', 'Ġon', 'Ġthe', 'ĠFacebook', 'Ġpage', 'Ġthat', 'Ġwas', 'Ġsetup', 'Ġto', 'Ġhelp', 'Ġfind', 'Ġhim', '.', 'ĠDavid', "'s", 'Ġsister', 'ĠDaniel', 'le', 'Ġwrote', 'Ġon', 'Ġthe', 'Ġpage', ':ĊĊ', 'We', 'Ġare', 'Ġthankful', 'Ġthat', 'Ġthe', 'Ġprayers', 'Ġfor', 'Ġdiscovery', 'Ġmade', 'Ġby', 'Ġeveryone', 'Ġduring', 'Ġthis', 'Ġtime', 'Ġhave', 'Ġbeen', 'Ġanswered', '.', 'ĠIt', 'Ġis', 'Ġwith', 'Ġdeep', 'Ġsadness', 'Ġthat', 'Ġwe', 'Ġmust', 'Ġlet', 'Ġyou', 'Ġall', 'Ġknow', 'Ġthat', 'ĠDavid', "'s", 'Ġbody', 'Ġwas', 'Ġdiscovered', 'Ġon', 'ĠJune', 'Ġ', '20', 'Ġnear', 'ĠAmb', 'les', 'ide', 'ĠPark', '.', 'Ġ

In [16]:
# Decode the ids back into text to inspect the round trip.
decoded_text = tokenizer.decode(output.ids)
print(decoded_text)

Former Rooster Teeth and Achievement Hunter contributor, David "Knuckles Dawson" Dreger has been found dead at Ambleside Park in West Vancouver. His body was noticed yesterday, according to a post by Dreger's family on the Facebook page that was setup to help find him. David's sister Danielle wrote on the page:

We are thankful that the prayers for discovery made by everyone during this time have been answered. It is with deep sadness that we must let you all know that David's body was discovered on June 20 near Ambleside Park. We would like to take this opportunity to thank everyone for their prayers, support and compassion. David was an exceptional, bright, caring and loving young man. We consider ourselves blessed to have had him in our lives, and are comforted knowing that he is now embraced in the arms of our Lord.

Dreger, 28, was reported missing in late May after last being seen on May 26 in his hometown of Vancouver. His bike and helmet were found the day after in Stanley Park

In [11]:
# Look up the id assigned to a token (here the <|end_of_text|> special token).
tokenizer.token_to_id("<|end_of_text|>")

0

In [12]:
# Display the size of the tokenizer's vocabulary.
tokenizer.get_vocab_size()

30000

In [13]:
# Save the tokenizer to the file named by tokenizer_model.
tokenizer.save(tokenizer_model)