In [19]:
import os

# --- Source corpus (Hugging Face Hub dataset) ---
DATASET_ID = "mhurhangee/patent-ind-claim-en"  # dataset is expected to expose a "text" column
SPLIT = "train"  # "train+validation" is an alternative when more data is wanted
SEED = 42  # NOTE(review): not referenced by any cell visible in this notebook

# --- Tokenizer training hyper-parameters ---
VOCAB_SIZE = 16000
MIN_FREQUENCY = 2
SPECIAL_TOKENS = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "<IDX>", "<EOS>"]

# --- Destination tokenizer repo on the Hugging Face Hub ---
TOKENIZER_REPO_ID = "mhurhangee/patent-claims-tokenizer-16000"
PRIVATE = True
# Read from the process environment (None when unset). This lookup does not
# parse a .env file by itself, so the variable must already be exported to the
# kernel -- NOTE(review): confirm how .env gets loaded.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [20]:
from huggingface_hub import create_repo
from tokenizers import Tokenizer, normalizers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import NFKC, Lowercase, StripAccents, Strip
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset

In [21]:
# Load the configured split of the dataset from the Hugging Face Hub.
ds = load_dataset(DATASET_ID, split=SPLIT)

# Build the training corpus as a plain list of stripped, non-empty strings.
# Rows whose "text" is None (a missing value) are skipped instead of raising
# AttributeError on .strip(), and each row is stripped only once.
texts = [
    stripped
    for stripped in (raw.strip() for raw in ds["text"] if raw is not None)
    if stripped
]

In [22]:
# Text normalization applied to every input before pre-tokenization.
# StripAccents only removes combining marks, so the text has to be decomposed
# first: after NFKC alone an accented letter such as "é" is a single composed
# code point and would pass through StripAccents unchanged.
# NOTE: this changes the normalized form of accented text, so a tokenizer
# trained with the previous pipeline has to be retrained to match.
normalizer = normalizers.Sequence([
    normalizers.NFKD(),  # compatibility-decompose: "é" -> "e" + combining acute
    Lowercase(),
    StripAccents(),      # drop the combining marks exposed by NFKD
    NFKC(),              # recompose whatever remains so output stays in NFKC form
    Strip(),             # trim leading/trailing whitespace
])

# BPE model with "[UNK]" as the unknown-token fallback.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.normalizer = normalizer
tokenizer.pre_tokenizer = Whitespace()

# Trainer: target vocabulary size, minimum pair frequency for a merge, and the
# special tokens to reserve in the vocabulary.
trainer = BpeTrainer(
    vocab_size=VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    special_tokens=SPECIAL_TOKENS,
)

In [23]:
# Fit the BPE vocabulary and merges on the in-memory corpus.
tokenizer.train_from_iterator(
    texts,
    trainer=trainer,
)






In [24]:
# Wrap the trained tokenizer in the transformers fast-tokenizer interface and
# declare which vocabulary entries fill the unk / pad / cls / sep / eos roles.
special_token_roles = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "eos_token": "<EOS>",
}
tok = PreTrainedTokenizerFast(tokenizer_object=tokenizer, **special_token_roles)

# Local output directory: ../data/<repo name without the owner prefix>.
local_dir = "../data/" + TOKENIZER_REPO_ID.split("/")[1]
tok.save_pretrained(local_dir)  # last expression, so the cell displays the written file paths

('../data/patent-claims-tokenizer-16000/tokenizer_config.json',
 '../data/patent-claims-tokenizer-16000/special_tokens_map.json',
 '../data/patent-claims-tokenizer-16000/tokenizer.json')

In [25]:
    # Create the target model repo (exist_ok=True tolerates an existing one),
    # upload the tokenizer files, then report the repo URL.
    create_repo(
        TOKENIZER_REPO_ID,
        repo_type="model",
        private=PRIVATE,
        exist_ok=True,
        token=HF_TOKEN,
    )
    tok.push_to_hub(TOKENIZER_REPO_ID, token=HF_TOKEN)
    print(f"Tokenizer pushed to https://huggingface.co/{TOKENIZER_REPO_ID}")


Tokenizer pushed to https://huggingface.co/mhurhangee/patent-claims-tokenizer-16000
