# Comparing custom tokenizers

In [2]:
DATASET_ID = "mhurhangee/patent-ind-claim-en"
SPLITS     = ["validation", "test"]  # will be combined
BATCH_SIZE = 1024

TOKENIZERS = {
    "bpe-8k": "mhurhangee/patent-claims-tokenizer-8000",
    "bpe-4k": "mhurhangee/patent-claims-tokenizer-4000",
    "bpe-16k": "mhurhangee/patent-claims-tokenizer-16000",
    "tiktoken-gpt2":  "tiktoken:gpt2",  # special prefix for tiktoken
}

SPECIALS = {"<EOS>", "<IDX>", "[UNK]", "[PAD]", "[CLS]", "[SEP]"}

In [6]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer
import tiktoken
from collections import Counter
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [7]:
ds_parts = [load_dataset(DATASET_ID, split=s) for s in SPLITS]
ds = concatenate_datasets(ds_parts)

# Get cleaned text list
texts = [t.strip() for t in ds["text"] if t and t.strip()]
print(f"Analysis corpus size: {len(texts):,} claims")

Analysis corpus size: 55,862 claims


In [9]:
def analyze_tokenizer(name, tok):
    n = len(texts)
    lens_tokens = []
    lens_chars  = []
    freq = Counter()
    special_ids = {}
    unk_id = None

    if hasattr(tok, "get_vocab"):  # HF tokenizer
        vocab = tok.get_vocab()
        special_ids = {s: tok.convert_tokens_to_ids(s) for s in SPECIALS if s in vocab}
        unk_id = tok.unk_token_id

        for i in tqdm(range(0, n, BATCH_SIZE), desc=f"{name} tokenizing"):
            batch = texts[i:i+BATCH_SIZE]
            enc = tok(batch, add_special_tokens=False)
            ids_lists = enc["input_ids"]
            lens_tokens.extend(len(x) for x in ids_lists)
            lens_chars.extend(len(s) for s in batch)
            for ids in ids_lists:
                freq.update(ids)

    else:  # tiktoken encoder
        enc = tok
        for i in tqdm(range(0, n, BATCH_SIZE), desc=f"{name} tokenizing"):
            batch = texts[i:i+BATCH_SIZE]
            ids_lists = [enc.encode(s) for s in batch]
            lens_tokens.extend(len(x) for x in ids_lists)
            lens_chars.extend(len(s) for s in batch)
            for ids in ids_lists:
                freq.update(ids)

    total_tokens = sum(lens_tokens)
    uniq_tokens  = len(freq)
    unk_count    = freq.get(unk_id, 0) if unk_id is not None else 0
    counts = np.array(list(freq.values()))
    rare_1 = int((counts == 1).sum())
    rare_5 = int((counts <= 5).sum())

    tp = np.array(lens_tokens, dtype=np.int32)
    cp = np.array(lens_chars, dtype=np.int32)

    metrics = {
        "tokens_total": total_tokens,
        "tokens_per_claim_avg": tp.mean(),
        "tokens_per_claim_med": np.median(tp),
        "tokens_per_claim_p95": np.percentile(tp, 95),
        "chars_per_token_avg": cp.sum() / total_tokens,
        "vocab_used": uniq_tokens,
        "rare_tokens_freq==1": rare_1,
        "rare_tokens_freq<=5": rare_5,
        "unk_rate_%": 100.0 * unk_count / total_tokens if total_tokens else 0.0,
    }
    return metrics

In [10]:
results = {}
for name, ident in TOKENIZERS.items():
    if ident.startswith("tiktoken:"):
        enc_name = ident.split(":", 1)[1]
        enc = tiktoken.get_encoding(enc_name)
        results[name] = analyze_tokenizer(name, enc)
    else:
        tok = AutoTokenizer.from_pretrained(ident)
        results[name] = analyze_tokenizer(name, tok)

df = pd.DataFrame(results).T
display(df)

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/531k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

bpe-8k tokenizing:   0%|          | 0/55 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/255k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

bpe-4k tokenizing:   0%|          | 0/55 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

bpe-16k tokenizing:   0%|          | 0/55 [00:00<?, ?it/s]

tiktoken-gpt2 tokenizing:   0%|          | 0/55 [00:00<?, ?it/s]

Unnamed: 0,tokens_total,tokens_per_claim_avg,tokens_per_claim_med,tokens_per_claim_p95,chars_per_token_avg,vocab_used,rare_tokens_freq==1,rare_tokens_freq<=5,unk_rate_%
bpe-8k,21188171.0,379.294887,360.0,706.0,5.115194,7796.0,21.0,106.0,9.9e-05
bpe-4k,22783603.0,407.855125,386.0,759.0,4.757,3879.0,9.0,51.0,9.2e-05
bpe-16k,20520901.0,367.349916,349.0,684.0,5.281523,15483.0,68.0,356.0,0.000102
tiktoken-gpt2,21951768.0,392.964233,373.0,726.0,4.937261,25883.0,1374.0,5917.0,0.0


### Tokenizer Analysis Summary

**Sequence Length Efficiency**  
- **bpe-16k**: Shortest sequences on average (367 tokens/claim).  
- **bpe-8k**: Close behind at 379 tokens/claim.  
- **bpe-4k** and **tiktoken-gpt2**: Longer sequences (~408 and ~393 tokens/claim).  

**Compression Trade-offs**  
- Larger vocabularies reduce token counts: **bpe-16k** > **bpe-8k** > **bpe-4k**.  
- `chars_per_token_avg` rises with vocab size — larger merges yield chunkier tokens.  

**Vocab Usage**  
- **bpe-4k**/**bpe-8k**: Nearly full vocab usage (≥97%).  
- **bpe-16k**: ~97% usage (15.5k of 16k tokens).  
- **tiktoken-gpt2**: ~52% usage (25.9k of 50k tokens) — lots of unused tokens.  

**Rare Token Tail**  
- Custom BPEs: Minimal rare tokens (≤356 tokens occur ≤5 times).  
- GPT-2: Large rare tail (5,917 tokens ≤5 occurrences) — much vocab wasted on unused patterns.  

**UNK Rate**  
- All custom BPEs: ~0% unknown tokens — vocab covers domain well.  
- GPT-2: 0% by design.  

**Conclusion**  
Custom BPE tokenizers are far more domain-efficient for patent claims than GPT-2’s general-purpose tokenizer.  
They yield shorter sequences, higher vocab utilization, and minimal rare-token waste.  
The **16k vocab** gives the best compression, but **8k** is close and may be preferable for smaller LLMs due to a smaller embedding matrix.
