In [1]:
from tokenizers import Tokenizer
from datasets import load_dataset
import json

In [2]:
def pack_token(id, space, upper):
    """Pack a base token id and two flags into a single integer.

    For 0/1 (or ``bool``) flags the layout is: bit 0 holds ``upper``,
    bit 1 holds ``space`` and the remaining high bits hold ``id``.
    ``unpack_token`` reverses this.
    """
    upper_bit = upper << 0
    space_bit = space << 1
    id_bits = id << 2
    return id_bits + space_bit + upper_bit

def unpack_token(token):
    """Split a packed token into its ``(id, space, upper)`` components.

    Bit 0 is returned as the ``upper`` flag, bit 1 as the ``space`` flag
    (both as ``bool``), and everything above those two bits as the id.
    """
    upper_flag = bool(token & 0b01)
    space_flag = bool(token & 0b10)
    return (token >> 2, space_flag, upper_flag)

def upper_first(text):
    """Return ``text`` with its first character upper-cased.

    The rest of the string is left untouched. An empty string is returned
    unchanged (indexing ``text[0]`` would raise ``IndexError`` there, so
    slicing is used instead).
    """
    return text[:1].upper() + text[1:]

def lower_first(text):
    """Return ``text`` with its first character lower-cased.

    The rest of the string is left untouched. An empty string is returned
    unchanged (indexing ``text[0]`` would raise ``IndexError`` there, so
    slicing is used instead).
    """
    return text[:1].lower() + text[1:]

In [335]:
# Downgrade huggingface tokenizer config, config is created from main branch of tokenizers.
# Besides the downgrade, every vocab entry is expanded into four variants
# (plain / capitalised, each with and without a leading "Ġ") whose ids carry
# the space and upper flags in their two low bits (see pack_token).

with open('tokenizers/tokenizer-space.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

model = data["model"]
# Drop the key if present; presumably the older release does not accept it
# (TODO confirm against the target tokenizers version).
model.pop("ignore_merges", None)

data["normalizer"] = None

vocab = model["vocab"]
updated_vocab = {}
# `token_id` rather than `id`: a top-level loop variable named `id` would
# rebind the builtin for every later cell in the kernel.
for token, token_id in vocab.items():
    # The capitalised variants are written first so that, when capitalising is
    # a no-op (digits, punctuation, ...), the plain entry below overwrites
    # them and the token keeps upper=False.
    # NOTE(review): if the source vocab holds both "x" and "X", which entry
    # wins for the key "X" depends on vocab order - verify the vocab is
    # lower-case only.
    updated_vocab[f"{upper_first(token)}"] = pack_token(token_id, space=False, upper=True)
    updated_vocab[f"Ġ{upper_first(token)}"] = pack_token(token_id, space=True, upper=True)
    updated_vocab[f"{token}"] = pack_token(token_id, space=False, upper=False)
    updated_vocab[f"Ġ{token}"] = pack_token(token_id, space=True, upper=False)

model["vocab"] = updated_vocab

updated_merges = []
upper_merges = []  # never filled; kept because it is referenced by the disabled concatenation below
space_merges = []  # built for experimentation but NOT written to the output file
for merge in model["merges"]:
    # Accept both serialisations: a [left, right] pair and the legacy
    # "left right" string. Indexing a legacy string with [0]/[1] would
    # silently pick single characters instead of the two merge parts.
    left, right = merge.split(" ") if isinstance(merge, str) else merge
    capitalised_left = upper_first(left)

    updated_merges.append(f"{left} {right}")

    # Skip the capitalised variants when they equal the plain ones, otherwise
    # the same merge line would be emitted twice in a row.
    if capitalised_left != left:
        updated_merges.append(f"{capitalised_left} {right}")
        space_merges.append(f"Ġ {capitalised_left}{right}")

    space_merges.append(f"Ġ {left}{right}")

model["merges"] = updated_merges # + upper_merges + space_merges

with open('tokenizers/tokenizer-space-downgrade.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)


In [3]:
# Round-trip check for the downgraded "space" tokenizer: encode a sample
# sentence, decode the ids again and compare the result with the input.
tokenizer = Tokenizer.from_file("tokenizers/tokenizer-space-downgrade.json")

text = "The quick brown fox jumps over the Lazy Dog."
output = tokenizer.encode(text)
decodeed = tokenizer.decode(output.ids)

# One value per line: input, tokens, ids, decoded text, token count and
# whether decoding reproduced the input exactly.
print(
    text,
    output.tokens,
    output.ids,
    decodeed,
    len(output.ids),
    text == decodeed,
    sep="\n",
)

The quick brown fox jumps over the Lazy Dog.
['Ġ', 'The', 'Ġ', 'quick', 'Ġ', 'brown', 'Ġ', 'fox', 'Ġ', 'jumps', 'Ġ', 'over', 'Ġ', 'the', 'Ġ', 'Lazy', 'Ġ', 'Dog', '.']
[684, 833, 684, 8392, 684, 7432, 684, 12220, 684, 70088, 684, 1824, 684, 832, 684, 90145, 684, 11973, 56]
 The quick brown fox jumps over the Lazy Dog.
19
False


In [310]:
# Baseline: encode a sample sentence with the downgraded GPT-2 style
# tokenizer. Note that this rebinds `tokenizer` and `output`.
tokenizer = Tokenizer.from_file("tokenizers/tokenizer-gpt2-downgrade.json")

output = tokenizer.encode("The quick brown fox jumps over the lazy dog.")

# One value per line: tokens, ids and token count.
print(output.tokens, output.ids, len(output.ids), sep="\n")

['ĠThe', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
[276, 2402, 4866, 13854, 19761, 551, 210, 30933, 4738, 14]
10
