
# Test created tokenizers

In [1]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import Pool
import os

from datasets import load_dataset
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast

In [2]:
# The candidate tokenizer files are numbered 1 through 9 and share one naming pattern.
tokenizer_paths = [f"portuguese_bpe_tokenizer_{idx}.json" for idx in range(1, 10)]

# Corpus the tokenizers are compared on: the train split of TucanoBR/wikipedia-PT,
# with "../data" passed as the cache directory.
dataset = load_dataset(
    "TucanoBR/wikipedia-PT",
    split="train",
    cache_dir="../data",
)

In [3]:
def load_tokenizers(tokenizer_paths):
    """Build one ``(file name, tokenizer)`` pair per tokenizer JSON file.

    Args:
        tokenizer_paths: Iterable of paths to tokenizer JSON files.

    Returns:
        A list of ``(basename, PreTrainedTokenizerFast)`` tuples, in the same
        order as ``tokenizer_paths``.

    Raises:
        FileNotFoundError: As soon as a path that does not exist is reached.
    """
    def _load_one(json_path):
        # Verify the file is there before handing the path to the tokenizer class.
        if not os.path.exists(json_path):
            raise FileNotFoundError(f"Tokenizer file {json_path} not found.")
        loaded = PreTrainedTokenizerFast(tokenizer_file=json_path)
        return os.path.basename(json_path), loaded

    return [_load_one(json_path) for json_path in tokenizer_paths]

In [5]:
# Load every tokenizer once; the result is a list of (file name, tokenizer) pairs.
tokenizers = load_tokenizers(tokenizer_paths)
# print(tokenizers)  # uncomment to inspect the loaded tokenizers

[('portuguese_bpe_tokenizer_1.json', PreTrainedTokenizerFast(name_or_path='', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)), ('portuguese_bpe_tokenizer_2.json', PreTrainedTokenizerFast(name_or_path='', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)), ('portuguese_bpe_tokenizer_3.json', PreTrainedTokenizerFast(name_or_path='', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', speci

In [7]:
def _get_text(sample, text_column='text'):
    if isinstance(sample, dict):
        return sample.get(text_column)
    return getattr(sample, text_column, None)

In [8]:
def split_tokenizers(tokenizers, n):
    """Partition ``tokenizers`` into at most ``n`` contiguous, near-equal groups.

    ``n`` is clamped to the range ``1..len(tokenizers)``. When the items do not
    divide evenly, the leading groups each receive one extra item. Empty groups
    are never returned, so an empty input yields an empty list.
    """
    total = len(tokenizers)
    n_groups = max(1, min(n, total))
    base, extra = divmod(total, n_groups)

    chunks = []
    start = 0
    for idx in range(n_groups):
        # The first `extra` groups absorb the remainder, one item each.
        stop = start + base + (1 if idx < extra else 0)
        chunk = tokenizers[start:stop]
        if chunk:
            chunks.append(chunk)
        start = stop
    return chunks

In [9]:
def count_tokens_for_subset(tokenizer_subset, dataset, text_column='text', max_samples=None):
    """Count the tokens each tokenizer in ``tokenizer_subset`` produces over ``dataset``.

    Each worker iterates the ENTIRE dataset but only counts tokens for its
    tokenizer_subset.

    Args:
        tokenizer_subset: Sequence of ``(name, tokenizer)`` pairs. Each tokenizer
            must provide ``encode(text, add_special_tokens=False)`` returning a
            sized collection of tokens.
        dataset: Collection of samples supporting ``len()`` and integer indexing.
        text_column: Key (or attribute name) under which a sample stores its text.
        max_samples: Upper bound on how many leading samples are visited.
            ``None`` visits the whole dataset; ``0`` visits nothing.

    Returns:
        A ``(local_counts, sample_count)`` tuple: ``local_counts`` maps each
        tokenizer name to its total token count, and ``sample_count`` is the
        number of visited samples whose text was a ``str``.
    """
    local_counts = {name: 0 for name, _ in tokenizer_subset}
    sample_count = 0

    n_total = len(dataset)
    # Compare against None explicitly: a plain truthiness test would treat
    # max_samples=0 as "no limit" and scan the whole dataset.
    if max_samples is not None:
        n_total = min(n_total, max_samples)

    for i in range(n_total):
        text = _get_text(dataset[i], text_column)
        # Samples without a usable text are skipped and not counted.
        if not isinstance(text, str):
            continue
        for name, tokenizer in tokenizer_subset:
            local_counts[name] += len(tokenizer.encode(text, add_special_tokens=False))
        sample_count += 1

    return local_counts, sample_count

In [10]:
# Parameters
max_samples = None  # set to an integer to limit processed samples
n_workers = min(len(tokenizers), 6)  # never more workers than tokenizers
subsets = split_tokenizers(tokenizers, n_workers)

# Accumulators for the threaded run: each thread processes the entire dataset
# but only its own subset of tokenizers, and its counts are added in here.
total_counts = dict.fromkeys((name for name, _ in tokenizers), 0)
sample_count = None

In [11]:
%%time

# Start from zeroed accumulators so that re-running this cell never adds onto
# the totals left behind by a previous run.
total_counts = {name: 0 for name, _ in tokenizers}
sample_count = None

# One thread per tokenizer subset; every thread walks the whole dataset but
# only encodes with its own tokenizers. ThreadPoolExecutor rejects
# max_workers=0, hence the floor of 1 when there are no subsets.
with ThreadPoolExecutor(max_workers=max(1, len(subsets))) as executor:
    futures = [
        executor.submit(count_tokens_for_subset, subset, dataset, 'text', max_samples)
        for subset in subsets
    ]
    for fut in as_completed(futures):
        local_counts, local_samples = fut.result()
        for name, n_tokens in local_counts.items():
            total_counts[name] += n_tokens
        # All workers scan the same samples, so the first finished result is kept.
        if sample_count is None:
            sample_count = local_samples

print("Finished threaded counting over full dataset per tokenizer subset")
# Printed inline: no `print_count` helper is defined in this notebook.
for name, n_tokens in total_counts.items():
    print(f"for tokenizer: {name} total tokens are: {n_tokens:,}")
print("sample_count:", sample_count)

Finished threaded counting over full dataset per tokenizer subset
for tokenizer: portuguese_bpe_tokenizer_1.json total tokens are: 608,321,068
for tokenizer: portuguese_bpe_tokenizer_2.json total tokens are: 604,103,063
for tokenizer: portuguese_bpe_tokenizer_3.json total tokens are: 603,705,401
for tokenizer: portuguese_bpe_tokenizer_4.json total tokens are: 674,601,344
for tokenizer: portuguese_bpe_tokenizer_5.json total tokens are: 661,595,424
for tokenizer: portuguese_bpe_tokenizer_6.json total tokens are: 661,736,917
for tokenizer: portuguese_bpe_tokenizer_7.json total tokens are: 561,148,010
for tokenizer: portuguese_bpe_tokenizer_8.json total tokens are: 559,658,771
for tokenizer: portuguese_bpe_tokenizer_9.json total tokens are: 560,224,561
sample_count: 1103446
CPU times: total: 2h 10min 25s
Wall time: 31min 29s


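Total token count produced by each tokenizer over the 1,103,446 samples of the `TucanoBR/wikipedia-PT` train split (results of the run above). `Name` is the number N in `portuguese_bpe_tokenizer_N.json`.

| Name | Vocab size | Sample size | Token count    |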
|------|------------|-------------|----------------|
| 1    |   30,000   | 679,609     | 608,321,068    |
| 2    |   30,000   | 657,685     | 604,103,063    |
| 3    |   30,000   | 200,000     | 603,705,401    |
| 4    |   15,000   | 679,609     | 674,601,344    |
| 5    |   15,000   | 657,685     | 661,595,424    |
| 6    |   15,000   | 200,000     | 661,736,917    |
| 7    |   60,000   | 679,609     | 561,148,010    |
| 8    |   60,000   | 657,685     | 559,658,771    |
| 9    |   60,000   | 200,000     | 560,224,561    |