In [1]:
from huggingface_hub import snapshot_download

# Restrict the download to the tokenizer-related files; they are written to the
# working directory so later cells can import/load them from ".".
TOKENIZER_FILES = [
    "hy.tiktoken",
    "tokenization_hy.py",
    "tokenizer_config.json",
    "special_tokens_map.json",
]

snapshot_download(
    "mlx-community/Hunyuan-7B-Instruct-3bit",
    local_dir=".",
    allow_patterns=TOKENIZER_FILES,
)
print("Downloaded")

Downloaded


In [2]:
# Import only the names this notebook actually uses (instead of a wildcard) so it
# is clear where HYTokenizer, PAT_STR and SPECIAL_TOKENS come from.
from tokenization_hy import HYTokenizer, PAT_STR, SPECIAL_TOKENS

In [3]:
# Load the reference tokenizer from the files downloaded into the working directory.
original = HYTokenizer.from_pretrained(".")

In [4]:
from transformers.convert_slow_tokenizer import TikTokenConverter

In [5]:
# SPECIAL_TOKENS entries are (id, token) pairs; the converter only needs the strings.
special_token_strings = [token for _token_id, token in SPECIAL_TOKENS]

# Build the converter from the tiktoken vocab file and the original split pattern.
converter = TikTokenConverter(
    vocab_file="hy.tiktoken",
    pattern=PAT_STR,
    additional_special_tokens=special_token_strings,
)

In [6]:
# Run the conversion; the result is used below as the fast tokenizer's `tokenizer_object`.
converted = converter.converted()

In [7]:
from tokenizers import normalizers

# Set the converted tokenizer's normalizer to Unicode NFC.
# NOTE(review): presumably mirrors normalization applied by the original tokenizer — confirm in tokenization_hy.py.
converted.normalizer = normalizers.NFC()

In [10]:
from transformers import PreTrainedTokenizerFast

# Maximum sequence length advertised by the fast tokenizer (256 * 1024 = 262,144).
MODEL_MAX_LENGTH = 256 * 1024

# Wrap the converted backend in a transformers fast tokenizer, reusing the
# reference tokenizer's model input names and disabling decode-time space cleanup.
t_fast = PreTrainedTokenizerFast(
    tokenizer_object=converted,
    model_input_names=original.model_input_names,
    model_max_length=MODEL_MAX_LENGTH,
    clean_up_tokenization_spaces=False,
)

In [11]:
# Sanity check: plain ASCII input through the reference tokenizer.
original.encode("hello")

[15339]

In [12]:
# Same ASCII input through the converted fast tokenizer; ids should equal the reference tokenizer's.
t_fast.encode("hello")

[15339]

In [13]:
# Non-ASCII (Devanagari) input through the reference tokenizer.
original.encode("ड़")

[5619, 94, 5619, 120]

In [14]:
# Same non-ASCII input through the fast tokenizer; ids should equal the reference tokenizer's.
t_fast.encode("ड़")

[5619, 94, 5619, 120]

In [15]:
# Inspect the last special-token entry (an (id, token) pair).
SPECIAL_TOKENS[-1]

(128166, '<|extra_204|>')

In [16]:
# Encode that special token's string with the reference tokenizer.
original.encode(SPECIAL_TOKENS[-1][-1])

[128166]

In [17]:
# Encode the same special-token string with the fast tokenizer; ids should equal the reference tokenizer's.
t_fast.encode(SPECIAL_TOKENS[-1][-1])

[128166]

In [18]:
from datasets import load_dataset
from tqdm import tqdm

In [19]:
# Load the validation split of XNLI ("all_languages" config) as multilingual test text.
xnli = load_dataset("xnli", "all_languages", split="validation")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
def verify(lang, text, skip_special_tokens=False):
    """Assert that the fast tokenizer agrees with the reference tokenizer on ``text``.

    Encodes ``text`` with the notebook-level ``original`` and ``t_fast``
    tokenizers and compares the ids, then decodes each id list and compares
    the resulting strings.

    Args:
        lang: Label for the sample; only used in the failure message.
        text: Input string to encode and decode.
        skip_special_tokens: Passed unchanged to *both* ``decode`` calls so the
            two tokenizers are compared under identical settings.

    Raises:
        AssertionError: If the encoded ids or the decoded strings differ.
    """
    encoded_original = original.encode(text)
    encoded_fast = t_fast.encode(text)
    assert encoded_fast == encoded_original, f"Fast encode error: {lang} - {text}"
    # Use the same flag on both sides: decoding one tokenizer with special tokens
    # kept and the other with them stripped reports a spurious mismatch for any
    # text that encodes to a special token id.
    decoded = original.decode(encoded_original, skip_special_tokens=skip_special_tokens)
    decoded_fast = t_fast.decode(encoded_fast, skip_special_tokens=skip_special_tokens)
    assert decoded_fast == decoded, f"Fast decode error: {lang} - {text}"

In [21]:
# Each premise maps a language label to its text; check every translation.
for premise_by_lang in tqdm(xnli["premise"]):
    for lang, text in premise_by_lang.items():
        verify(lang, text)

100%|███████████████████████████████████████████████████████████████████████████████| 2490/2490 [00:05<00:00, 431.38it/s]


Testing on codeparrot subset

In [22]:
# Stream the train split of codeparrot/github-code rather than downloading it up front.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading script to run locally — review it before executing.
ds = load_dataset("codeparrot/github-code", streaming=True, trust_remote_code=True, split="train")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
# Number of streamed code files to check.
N_CODE_SAMPLES = 1000
# Optional size cap: set to an int (e.g. 1000) to skip larger files; None checks every file.
# This replaces the previous hard-wired `if False and ...` branch, which could never run.
MAX_CODE_SIZE = None

skipped = 0
# zip(range(...), ds) stops after N_CODE_SAMPLES items and also ends cleanly if the
# stream is shorter, instead of raising StopIteration from a bare next().
for _, item in tqdm(zip(range(N_CODE_SAMPLES), ds), total=N_CODE_SAMPLES):
    code = item["code"]
    lang = item["language"]
    if MAX_CODE_SIZE is not None and item["size"] > MAX_CODE_SIZE:
        skipped += 1
        continue
    verify(lang, code)

100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:06<00:00, 166.30it/s]


In [27]:
# Copy the reference tokenizer's chat template onto the fast tokenizer before publishing.
t_fast.chat_template = original.chat_template

In [28]:
# Upload the fast tokenizer to the Hugging Face Hub under this repo name
# (the repo is created under the account that is currently logged in).
t_fast.push_to_hub("Hunyuan-7B-Instruct-tokenizer")

README.md: 0.00B [00:00, ?B/s]

Uploading...:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pcuenq/Hunyuan-7B-Instruct-tokenizer/commit/201270fda5446f747ad4eb441d9645365ba4ba8e', commit_message='Upload tokenizer', commit_description='', oid='201270fda5446f747ad4eb441d9645365ba4ba8e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pcuenq/Hunyuan-7B-Instruct-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='pcuenq/Hunyuan-7B-Instruct-tokenizer'), pr_revision=None, pr_num=None)