In [None]:
# 1_tokenizer_training.ipynb
"""
Train a custom tokenizer for a polysynthetic Indigenous language using Hugging Face Tokenizers.
"""

from pathlib import Path

from tokenizers import Tokenizer, models, pre_tokenizers, trainers, normalizers

In [None]:
# Step 1: Load your text corpus
corpus_path = "../datasets/sample_corpus.txt"

# Read the corpus as UTF-8 and keep one whitespace-stripped string per
# non-blank line; blank lines are dropped.
with open(corpus_path, "r", encoding="utf-8") as f:
    lines = [stripped for stripped in (line.strip() for line in f) if stripped]

# Fail early with a clear message instead of silently training on nothing:
# an empty `lines` list would otherwise flow into the training step and the
# notebook would still report success.
if not lines:
    raise ValueError(f"No non-empty lines found in corpus file: {corpus_path}")

In [None]:
# Step 2: Setup the tokenizer model
# Special tokens handed to the trainer so they are part of the vocabulary.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

# Register "[UNK]" as the BPE model's unknown token. Declaring it only as a
# trainer special token is not enough: the model itself needs `unk_token` to
# have a fallback for symbols that are not in the learned vocabulary.
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Normalization runs in order: Unicode NFD first, then lowercasing.
# NOTE(review): NFD splits precomposed characters into base + combining marks;
# confirm this is the desired form for this language's orthography.
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFD(),
    normalizers.Lowercase(),
])

# NOTE(review): the Whitespace pre-tokenizer also separates punctuation from
# word characters; verify that apostrophe-like letters used in the orthography
# are not split off as punctuation.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.BpeTrainer(vocab_size=4000, show_progress=True, special_tokens=special_tokens)

In [None]:
# Step 3: Train the tokenizer
# Feed the in-memory corpus lines to the tokenizer, using the trainer
# configured in the previous step.
tokenizer.train_from_iterator(
    lines,
    trainer=trainer,
)

In [None]:
# Step 4: Save the tokenizer
output_path = Path("../tokenizer/custom_tokenizer.json")

# Create the output directory if it does not exist yet, so saving does not
# fail on a fresh checkout where ../tokenizer/ has not been created.
output_path.parent.mkdir(parents=True, exist_ok=True)

# `save` is given a plain string path, as in the original call.
tokenizer.save(str(output_path))

print("✅ Tokenizer training complete. File saved as custom_tokenizer.json")