In [3]:
from tokenizer import SpaceTokenizer
from tokenizers import Tokenizer
from datasets import load_dataset
import json
import time
from tqdm import tqdm

In [2]:
# TODO: also handle the word endings s, es, ed, ing
# TODO remove scaffolding tokens 10%@50k 10%@25k?

In [40]:
# Smoke test: build the SpaceTokenizer from its JSON config and round-trip one sentence.
# Alternative config source, kept for reference:
# with open('tokenizers/tokenizer-space.json', 'r', encoding='utf-8') as f: tokenizer_config = json.load(f)["model"]["vocab"]
with open('tokenizers/tokenizer-space-20k-rs.json', 'r', encoding='utf-8') as f:
    tokenizer_config = json.load(f)

# NOTE(review): 257 presumably covers the 256 byte tokens plus <|endoftext|> — confirm.
vocab_size = 10000 + 257
tokenizer = SpaceTokenizer(tokenizer_config, vocab_size)

text = "The quick brown Fox jumps 1234 OVER the lazy Dog."
ids, tokens = tokenizer.encode(text, return_token_tuple=True)
decoded = tokenizer.decode(ids)

# One value per line: input, token ids, token strings, decoded text.
print(text, ids, tokens, decoded, sep="\n")

The quick brown Fox jumps 1234 OVER the lazy Dog.
[1029, 14478, 3922, 6483, 19190, 332, 32402, 80, 319, 345, 28560, 1030, 27862, 356, 8403, 56]
['The', 'Ġquick', 'Ġbrown', 'ĠFox', 'Ġjump', 's', 'Ġ123', '4', 'ĠO', 'V', 'ER', 'Ġthe', 'Ġlaz', 'y', 'ĠDog', '.']
The quick brown Fox jumps 1234 OVER the lazy Dog.


In [13]:
# Show the first 16 (id, token) pairs of the tokenizer's decode table.
list(tokenizer.vocab_decode.items())[:16]

[(1, '<|endoftext|>'),
 (3, 'Ġ<|endoftext|>'),
 (0, '<|endoftext|>'),
 (2, 'Ġ<|endoftext|>'),
 (5, '!'),
 (7, 'Ġ!'),
 (4, '!'),
 (6, 'Ġ!'),
 (9, '"'),
 (11, 'Ġ"'),
 (8, '"'),
 (10, 'Ġ"'),
 (13, '#'),
 (15, 'Ġ#'),
 (12, '#'),
 (14, 'Ġ#')]

In [15]:
# Round-trip check on a longer text: time encode(), decode the ids, compare with the input.
# Read explicitly as UTF-8 (like every other open() in this notebook) so the
# result does not depend on the platform's default text encoding.
with open('tokenizers/taylorswift.txt', 'r', encoding='utf-8') as file:
    content = file.read()

t0 = time.perf_counter()
ids, tokens = tokenizer.encode(content, return_token_tuple=True)
# The timed call is encode(), so the label says so.
print("Encode time:", time.perf_counter() - t0)

decoded = tokenizer.decode(ids)
token_count = len(ids)

print(token_count)
# True means decode(encode(content)) reproduced the input exactly.
print(content == decoded)

Decode time: 0.10392170000022816
53560
True


In [17]:
# Encode the same text with the reference tokenizer and compare token counts.
tokenizer_gpt = Tokenizer.from_file("tokenizers/tokenizer-ref-20k.json")

t0 = time.perf_counter()
output_gpt = tokenizer_gpt.encode(content)
# The timed call is encode(), so the label says so.
print("Encode time:", time.perf_counter() - t0)
gpt_token_count = len(output_gpt.ids)

# Ratio < 1 means the space tokenizer produced fewer tokens than the reference.
print("Space:", token_count, "GPT2:", gpt_token_count, "Ratio:", token_count / gpt_token_count)

Decode time: 0.11724080000021786
Space: 53560 GPT2: 60454 Ratio: 0.885962880868098


In [18]:
# Evaluation corpus: the "test" and "validation" splits combined.
# Use split="train+test+validation" to include the training split as well.
dataset = load_dataset(
    "wikitext",
    "wikitext-103-raw-v1",
    split="test+validation",
)

def batch_iterator(batch_size=1000, data=None):
    """Yield the ``"text"`` column of a dataset in consecutive slices.

    Args:
        batch_size: Number of rows per slice; must be a positive integer.
        data: Object supporting ``len()`` and slice indexing, whose slices can
            be indexed with ``"text"``. Defaults to the notebook-level
            ``dataset`` so existing ``batch_iterator()`` calls are unchanged.

    Yields:
        ``data[i : i + batch_size]["text"]`` for each consecutive window; the
        last window may hold fewer than ``batch_size`` rows.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised on first
            iteration, since this is a generator).
    """
    if batch_size <= 0:
        # A negative step would otherwise make range() empty and silently yield nothing.
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    source = dataset if data is None else data
    for start in range(0, len(source), batch_size):
        yield source[start : start + batch_size]["text"]

In [22]:
# Count the tokens the reference tokenizer produces over the whole dataset.
batch_size = 1000
# tqdm expects an integer iteration count; plain division gives a fractional
# total that is smaller than the real number of batches, so round up instead.
num_batches = (len(dataset) + batch_size - 1) // batch_size

token_count_gpt = 0
for batch in tqdm(batch_iterator(batch_size), total=num_batches):
    # Each batch is joined into one string before encoding.
    output = tokenizer_gpt.encode(" ".join(batch))
    token_count_gpt += len(output.ids)

9it [00:01,  5.97it/s]                           


In [41]:
# Count the tokens the space tokenizer produces over the same dataset and
# compare against the reference count in token_count_gpt.
batch_size = 1000
# tqdm expects an integer iteration count; plain division gives a fractional
# total that is smaller than the real number of batches, so round up instead.
num_batches = (len(dataset) + batch_size - 1) // batch_size

token_count = 0
ids = []
for batch in tqdm(batch_iterator(batch_size), total=num_batches):
    token_ids, tokens = tokenizer.encode(" ".join(batch), return_token_tuple=True)
    ids.extend(token_ids)
    token_count += len(token_ids)

# NOTE(review): the figures below look like "vocab size: space/reference ratio" from earlier runs — confirm.
# 30000: 1.003, 32000: 0.9997 35000: 0.9949, 37500: 0.991
# 25k: 0.9999
# After scaffold-removal-5%: 23.5k: 0.9982, 23k: 0.9993, 22.5k: 1.00075
print("Token count", token_count, token_count_gpt, token_count / token_count_gpt)

9it [00:01,  7.39it/s]                           

Token count 546410 545681 1.0013359453600181





### Train tokenizer

In [9]:
from tokenizer import bytes_to_unicode
from tokenizers import Tokenizer, pre_tokenizers, processors, decoders, models, trainers

# Train the byte-level BPE reference tokenizer and save it to disk.
# NOTE(review): this rebinds the notebook-level name `tokenizer`, which the
# first cell binds to a SpaceTokenizer; re-run that cell before using the
# evaluation cells again after training.

# "<|endoftext|>" first, followed by every value of the byte-to-unicode table.
special_tokens = ["<|endoftext|>"] + list(bytes_to_unicode().values())

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.post_processor = processors.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=20257,
    min_frequency=0,
    special_tokens=special_tokens,
    show_progress=True,
)

tokenizer.train(["tokenizers/wikitext.txt"], trainer)
tokenizer.save("tokenizers/tokenizer-ref-20k.json")

In [10]:
# Rewrite the saved reference config in place, keeping only the first entry
# of its "added_tokens" list.
ref_config_path = "tokenizers/tokenizer-ref-20k.json"

with open(ref_config_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

data.update(added_tokens=[data["added_tokens"][0]])

with open(ref_config_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4))

In [5]:
# Downgrade huggingface tokenizer config, config is created from main branch of tokenizers
# Each merge is rewritten from a [left, right] pair into the "left right"
# string form and the result is saved under the non-"latest" file name.

with open("tokenizers/tokenizer-ref-20k-latest.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Entries that are already strings are kept as-is: indexing a string with
# [0] and [1] would pick single characters and silently corrupt the merge.
data["model"]["merges"] = [
    merge if isinstance(merge, str) else f"{merge[0]} {merge[1]}"
    for merge in data["model"]["merges"]
]

with open("tokenizers/tokenizer-ref-20k.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)