In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

from transformers import PreTrainedTokenizerFast

from datasets import load_dataset

In [3]:
eos_token = "|endoftext|"

# Define the trainer
trainer = BpeTrainer(
    vocab_size=2000,
    min_frequency=2,
    special_tokens=[eos_token],
    show_progress=True,
)

# Define the tokenizer
tokenizer = Tokenizer(BPE())

# Set up the tokenizer components
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True)
tokenizer.post_processor = ByteLevelProcessor()
tokenizer.decoder = ByteLevelDecoder()

In [4]:
with open("tokenizers/modified_bpe/data/test_small.txt") as fl:
    lines = fl.readlines()

In [10]:
num_docs = 10_000
batch_size = 1_000

dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train", streaming=True, cache_dir=".data_cache")
dataset = dataset.take(num_docs).select_columns(["text"]).batch(batch_size=batch_size)

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

In [12]:
tokenizer.train_from_iterator(iter(x["text"] for x in dataset), trainer, int(num_docs / batch_size))






In [13]:
tokenizer.get_vocab()

{'ular': 671,
 'ĠWhen': 1474,
 'man': 1206,
 'Ġthat': 273,
 '´': 112,
 'Ġam': 693,
 'ĠZ': 1486,
 'In': 762,
 'als': 722,
 'itions': 1166,
 'Ġgrowth': 1986,
 'ĠUniversity': 1175,
 'ission': 1295,
 'ection': 1010,
 'Ġcell': 980,
 'ad': 282,
 'Ġser': 1107,
 'ored': 1669,
 'ween': 783,
 'ĠB': 314,
 'Ġcomp': 467,
 'oy': 859,
 'uch': 788,
 'ural': 825,
 'Ġrese': 796,
 'Ġbuild': 1240,
 'Ġu': 275,
 'iel': 1061,
 'ĠAfric': 1581,
 'ĠEurope': 1595,
 'ore': 335,
 'Ġlower': 1919,
 'overed': 1997,
 'Ġchar': 1097,
 'Ġind': 612,
 'dition': 1115,
 'Ġmuch': 913,
 'ublic': 1956,
 'ertain': 1269,
 'Ġmight': 1224,
 'ens': 539,
 'ph': 588,
 'ah': 1092,
 'Ġadv': 1204,
 'Ġgoing': 1969,
 'Ġcho': 1774,
 'urop': 1473,
 'omen': 981,
 'Ġm': 229,
 'Ġhaving': 1886,
 'ople': 590,
 'Ġhealth': 863,
 'il': 241,
 'Ġ1': 297,
 'itt': 829,
 'rid': 1532,
 'row': 743,
 'Ġexpl': 1033,
 'Wh': 1171,
 'Ģ': 165,
 'ale': 1003,
 'ĠEx': 1455,
 'ial': 378,
 'Ġflow': 1633,
 'Ġfol': 882,
 '±': 109,
 'Ġv': 345,
 'ear': 412,
 'are': 433,


In [22]:
tokenizer.save("tokenizer.json", pretty=True)

In [23]:
backend_tok = Tokenizer.from_file("tokenizer.json")

tok = PreTrainedTokenizerFast(tokenizer_object=backend_tok)
tok.padding_side = "left"
tok.eos_token = eos_token