In [1]:
from tokenizer import SpaceTokenizer
from tokenizers import Tokenizer
from datasets import load_dataset
import json
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# TODO: also handle the suffixes "s", "es", "ed" and "ing"
# TODO: remove scaffolding tokens (10% @ 50k, 10% @ 25k?)

In [3]:
# First pass: build a tokenizer from the "model" -> "vocab" entry of the
# tokenizer-space config, only to print two sizes side by side.
with open('tokenizers/tokenizer-space.json', 'r', encoding='utf-8') as f:
    tokenizer_config = json.load(f)

vocab_size = 25000 + 257
tokenizer = SpaceTokenizer(tokenizer_config["model"]["vocab"], vocab_size)

# Decode-table length divided by 4, next to the length of the loaded vocab.
print(
    len(tokenizer.vocab_decode) // 4,
    len(tokenizer_config["model"]["vocab"]),
)

# Second pass: rebuild from the sorted vocab file with a smaller vocab size.
# This is the tokenizer that stays bound to `tokenizer` for the cells below.
with open('tokenizers/space-vocab-sorted.json', 'r', encoding='utf-8') as f:
    tokenizer_config = json.load(f)

vocab_size = 23000 + 257
tokenizer = SpaceTokenizer(tokenizer_config, vocab_size)

# Smoke test: encode one sentence, decode it again, and show every stage.
text = "The quick brown Fox jumps 1234 OVER the lazy Dog."
ids, tokens = tokenizer.encode(text, return_token_tuple=True)
decoded = tokenizer.decode(ids)

print(text, ids, tokens, decoded, sep="\n")

25257 50257
The quick brown Fox jumps 1234 OVER the lazy Dog.
[1037, 8598, 7638, 12427, 70294, 58618, 80, 92459, 89749, 1038, 90350, 12179, 56]
['The', 'Ġquick', 'Ġbrown', 'ĠFox', 'Ġjumps', 'Ġ123', '4', 'ĠOV', 'ER', 'Ġthe', 'Ġlazy', 'ĠDog', '.']
The quick brown Fox jumps 1234 OVER the lazy Dog.


In [4]:
# Round-trip a sample text file through the space tokenizer and time encoding.
# Read explicitly as UTF-8, like the other open() calls in this notebook;
# without it open() falls back to the platform's locale encoding.
with open('tokenizers/taylorswift.txt', 'r', encoding='utf-8') as file:
    content = file.read()

t0 = time.perf_counter()
ids, tokens = tokenizer.encode(content, return_token_tuple=True)
# The timed call is encode(), so the label says "Encode".
print("Encode time:", time.perf_counter() - t0)

decoded = tokenizer.decode(ids)
token_count = len(ids)

print(token_count)
# True when decoding the ids reproduces the input text exactly.
print(content == decoded)

Decode time: 0.10043619999851217
53219
True


In [5]:
# Baseline: encode the same `content` with the GPT-2 tokenizer and compare
# its token count with the space tokenizer's `token_count`.
# NOTE(review): this JSON file is written by the "downgrade" cell at the end
# of the notebook, so it must already exist on disk when this cell runs.
tokenizer_gpt = Tokenizer.from_file("tokenizers/tokenizer-gpt2-downgrade.json")

t0 = time.perf_counter()
output_gpt = tokenizer_gpt.encode(content)
# The timed call is encode(), so the label says "Encode".
print("Encode time:", time.perf_counter() - t0)
gpt_token_count = len(output_gpt.ids)

# Ratio below 1 means the space tokenizer produced fewer tokens than GPT-2.
print("Space:", token_count, "GPT2:", gpt_token_count, "Ratio:", token_count / gpt_token_count)

Decode time: 0.12846199999876262
Space: 53219 GPT2: 56758 Ratio: 0.9376475562916241


In [6]:
# Request the train, test and validation parts of wikitext-103-raw-v1
# through a single split expression.
dataset = load_dataset(
    "wikitext",
    "wikitext-103-raw-v1",
    split="train+test+validation",
)

def batch_iterator(batch_size=1000):
    """Yield the "text" field of consecutive slices of the global ``dataset``.

    Args:
        batch_size: Number of rows per slice; the last slice may be shorter.

    Yields:
        ``dataset[start:stop]["text"]`` for each window of ``batch_size`` rows.
    """
    row_count = len(dataset)
    for start in range(0, row_count, batch_size):
        stop = start + batch_size
        window = dataset[start:stop]
        yield window["text"]

In [7]:
# Tokenize the whole dataset with both tokenizers and compare total counts.
batch_size = 1000
# tqdm needs a whole number of iterations. Round up so a final partial batch
# is counted; a fractional total is exceeded by the last batch and the bar
# falls back to a bare "N it" counter.
num_batches = (len(dataset) + batch_size - 1) // batch_size

token_count, token_count_gpt = 0, 0
ids = []
for batch in tqdm(batch_iterator(batch_size), total=num_batches):
    # Join the batch into one string so both tokenizers see identical input.
    text = " ".join(batch)

    output = tokenizer_gpt.encode(text)
    token_count_gpt += len(output.ids)

    token_ids, tokens = tokenizer.encode(text, return_token_tuple=True)
    ids.extend(token_ids)
    token_count += len(tokens)

# Notes from earlier runs (presumably vocab size: space/GPT-2 ratio; verify):
# 30000: 1.003, 32000: 0.9997 35000: 0.9949, 37500: 0.991
# 25k: 0.9999
# After scaffold-removal-5%: 23.5k: 0.9982, 23k: 0.9993, 22.5k: 1.00075
print("Token count", token_count, token_count_gpt, token_count / token_count_gpt)

1810it [10:54,  2.77it/s]                              

Token count 113918614 113996585 0.9993160233703492





In [8]:
# Downgrade huggingface tokenizer config, config is created from main branch of tokenizers
# Each entry of model.merges is rewritten as a single "first second" string.

with open('tokenizers/tokenizer-gpt2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Entries that are already strings are kept as they are. Indexing a string
# would pick its first two characters and silently corrupt the merge.
data["model"]["merges"] = [
    merge if isinstance(merge, str) else f"{merge[0]} {merge[1]}"
    for merge in data["model"]["merges"]
]

# ensure_ascii=False writes non-ASCII characters literally instead of as
# \u escapes.
with open('tokenizers/tokenizer-gpt2-downgrade.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)