In [3]:
import srsly
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.trainers import BpeTrainer


In [3]:
# Special token handed to the trainer so it is reserved in the learned vocabulary.
eos_token = "|endoftext|"

# Start from an empty BPE model and attach the byte-level components
# (pre-tokenizer, post-processor and decoder).
tokenizer = Tokenizer(BPE())
tokenizer.decoder = ByteLevelDecoder()
tokenizer.post_processor = ByteLevelProcessor()
tokenizer.pre_tokenizer = ByteLevel(
    add_prefix_space=False,
    trim_offsets=True,
    use_regex=True,
)

# Trainer configuration: target vocabulary size, minimum pair frequency,
# reserved special tokens, and progress reporting.
trainer = BpeTrainer(
    show_progress=True,
    special_tokens=[eos_token],
    min_frequency=2,
    vocab_size=2000,
)

In [4]:
# Load the test file into a list with one entry per line (line endings kept).
# The encoding is pinned so decoding does not depend on the platform's locale
# default. NOTE(review): assumes the file is UTF-8 — confirm.
with open("tokenizers/modified_bpe/data/test_small.txt", encoding="utf-8") as fl:
    lines = fl.readlines()

In [10]:
# Sampling budget: number of documents to take from the stream and the number
# of documents grouped into each batch.
num_docs = 10_000
batch_size = 1_000

# Open the "sample-10BT" configuration of the train split in streaming mode.
dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    "sample-10BT",
    split="train",
    streaming=True,
    cache_dir=".data_cache",
)

# Keep the first `num_docs` rows, retain only the "text" column, then batch.
dataset = dataset.take(num_docs)
dataset = dataset.select_columns(["text"])
dataset = dataset.batch(batch_size=batch_size)

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

In [12]:
# Feed the trainer one batch of texts at a time. `length` is the total number
# of sequences (documents) the iterator will yield across all batches, not the
# number of batches; it is only used for progress reporting.
tokenizer.train_from_iterator(
    (batch["text"] for batch in dataset),
    trainer=trainer,
    length=num_docs,
)






In [2]:
import srsly
import polars as pl
from pathlib import Path

In [42]:
from tokenizers import Tokenizer
from pathlib import Path
from transformers import PreTrainedTokenizerFast
import json

In [44]:
def _truncate_bpe_model(model: dict, vocab_size: int) -> None:
    """Shrink the ``model`` section of a tokenizer config, in place, to ``vocab_size`` tokens."""
    # Number of vocabulary entries that are not accounted for by a merge.
    # Assumes every merge contributed exactly one vocabulary entry.
    len_alphabet = len(model["vocab"]) - len(model["merges"])
    if vocab_size < len_alphabet:
        # Without this guard the merges slice bound below would be negative and
        # slice from the end, keeping merges whose tokens were just removed.
        raise ValueError(
            f"vocab_size={vocab_size} is smaller than the {len_alphabet} "
            "vocabulary entries that are not produced by merges"
        )

    # Keep the lowest-id tokens. Sorting by id makes this independent of the
    # key order in the JSON file (assumes vocab maps token -> integer id).
    by_id = sorted(model["vocab"].items(), key=lambda item: item[1])
    model["vocab"] = dict(by_id[:vocab_size])
    model["merges"] = model["merges"][: vocab_size - len_alphabet]


def load_tokenizer_with_vocab_size(path: str | Path, vocab_size: int) -> PreTrainedTokenizerFast:
    """Load a trained tokenizer, truncated to a smaller vocabulary.

    Reads ``tokenizer.json`` and ``metadata.yaml`` from ``path``, trims the
    model's vocabulary and merges to ``vocab_size`` entries, and wraps the
    result in a ``PreTrainedTokenizerFast`` with left padding and the EOS
    token taken from the metadata file.

    Args:
        path: Directory containing ``tokenizer.json`` and ``metadata.yaml``.
        vocab_size: Number of vocabulary entries to keep. Values larger than
            the stored vocabulary leave it unchanged.

    Returns:
        The truncated tokenizer.

    Raises:
        ValueError: If ``vocab_size`` is smaller than the number of vocabulary
            entries that are not produced by merges.
    """
    path = Path(path)

    # Edit conf to adapt to the new vocab_size
    conf = srsly.read_json(path / "tokenizer.json")
    _truncate_bpe_model(conf["model"], vocab_size)

    # Instantiate tokenizer using tokenizers library
    backend_tok = Tokenizer.from_str(json.dumps(conf))
    eos_token = srsly.read_yaml(path / "metadata.yaml")["eos_token"]

    # Instantiate PreTrainedTokenizerFast from object
    # NOTE: we do not instantiate from file directly due to compatibility
    # https://github.com/huggingface/tokenizers/issues/1562#issuecomment-2315349846
    tok = PreTrainedTokenizerFast(tokenizer_object=backend_tok)
    # NOTE(review): these are set as attributes after construction; confirm
    # that `save_pretrained` persists `padding_side` when set this way.
    tok.padding_side = "left"
    tok.eos_token = eos_token

    return tok


In [45]:
# Target vocabulary size and the directory the tokenizer files are read from.
vocab_size = 500
path = Path("outputs/tokenizers/2024-08-28T16-34-11")

# Build the truncated tokenizer.
tok = load_tokenizer_with_vocab_size(path=path, vocab_size=vocab_size)

In [48]:
# Write the truncated tokenizer to a sub-directory named after its vocab size;
# the call's return value (the written file paths) is the cell output.
tok.save_pretrained(save_directory=path / f"tok-vocab{vocab_size}")

('outputs/tokenizers/2024-08-28T16-34-11/tok-vocab500/tokenizer_config.json',
 'outputs/tokenizers/2024-08-28T16-34-11/tok-vocab500/special_tokens_map.json',
 'outputs/tokenizers/2024-08-28T16-34-11/tok-vocab500/tokenizer.json')

In [34]:
vocab_size = 500

# edit conf to adapt to the new vocab_size
conf = srsly.read_json(path / "tokenizer.json")
# conf["padding"] = "left"

# Number of vocabulary entries that are not accounted for by a merge.
len_alphabet = len(conf["model"]["vocab"]) - len(conf["model"]["merges"])
if vocab_size < len_alphabet:
    # A negative slice bound below would slice from the end and keep merges
    # whose tokens have just been dropped from the vocabulary.
    raise ValueError(
        f"vocab_size={vocab_size} is smaller than the {len_alphabet} "
        "vocabulary entries that are not produced by merges"
    )

# Keep the lowest-id tokens; sorting by id avoids relying on JSON key order
# (assumes vocab maps token -> integer id).
conf["model"]["vocab"] = dict(
    sorted(conf["model"]["vocab"].items(), key=lambda item: item[1])[:vocab_size]
)
conf["model"]["merges"] = conf["model"]["merges"][: vocab_size - len_alphabet]

In [39]:
# Serialize the edited config to a JSON string and build a tokenizer from it.
conf_json = json.dumps(conf)
t = Tokenizer.from_str(conf_json)

In [38]:
# Show which fields the "model" section of the tokenizer config contains.
model_conf = conf["model"]
model_conf.keys()

dict_keys(['type', 'dropout', 'unk_token', 'continuing_subword_prefix', 'end_of_word_suffix', 'fuse_unk', 'byte_fallback', 'ignore_merges', 'vocab', 'merges'])

In [41]:
# Check the size of the rebuilt tokenizer's vocabulary (added tokens included,
# which is the library default).
t.get_vocab_size(with_added_tokens=True)

500

----