In [None]:
# ============================================================================
#
# tokenizer_builder.ipynb
# A Jupyter notebook to build and save Hugging Face compatible tokenizers from your YAML and corpus.
#
# Author: 
#   MoniGarr (Monica Peters), monigarr@MoniGarr.com
#
# This repository supports language revival & retention for
#     Polysynthetic, Low-Resource Indigenous Languages that
#       might lack industry standard language ISO codes.
#
# License: Apache 2.0
# 
# For technical consulting, collaboration, or mentorship on Indigenous
# Language Revival & Retention Tech Solutions (AI, XR, 3D, Cultural Protocols)
# contact:
#   MoniGarr (Monica Peters) – monigarr@monigarr.com
#   Founder of MoniGarr.com LLC and MohawkLanguage.ca
#   Akwesasne-based Onkwehonwe (Indigenous, Kanien’kéhake, Mohawk of Akwesasne)
#   https://www.linkedin.com/in/3dtechartist
#
# ============================================================================

In [None]:
# 📓 tokenizer_builder.ipynb

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, normalizers
import json
import yaml

In [None]:
# 🔧 Setup — every value a reader might want to tune lives in this cell.

# Input files (relative paths).
yaml_path = "../datasets/kanienkeha_vocab_rules.yaml"  # dialects and rules
corpus_path = "../datasets/sample_corpus.txt"          # text the tokenizer is trained on

# Output file for the trained tokenizer and the vocabulary size given to the trainer.
save_path = "custom_tokenizer.json"
vocab_size = 32_000

In [None]:
# 📖 Load dialects and rules
# Reads the YAML rules file and derives the list of dialect names from its
# optional `dialects` section.
with open(yaml_path, "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty mapping
    # so the lookups below do not fail with AttributeError.
    vocab_data = yaml.safe_load(f) or {}

if not isinstance(vocab_data, dict):
    raise ValueError(
        f"Expected a mapping at the top level of {yaml_path}, "
        f"got {type(vocab_data).__name__}"
    )

# `dialects` may be missing, null, a mapping keyed by dialect name, or a plain
# list of names. list() over a mapping yields its keys, so both shapes produce
# the same result; a bare string is wrapped so it is not split into characters.
dialects_section = vocab_data.get("dialects") or {}
if isinstance(dialects_section, str):
    dialects_section = [dialects_section]
dialects = list(dialects_section)
print("📚 Dialects:", dialects)


In [None]:
# 🧪 Build tokenizer
# Trains a BPE tokenizer on the corpus file and saves it as a single JSON file.

# Tell the BPE model which token stands in for unknown input. "<unk>" is reserved
# through the trainer's special tokens below, but unless it is also passed here
# the model has no unknown-token fallback for symbols it never saw in training.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# Normalization: decompose (NFD), lowercase, then strip the combining accents.
# NOTE(review): StripAccents folds accented letters to their base letters, so any
# diacritics in the corpus are lost before training — confirm this is intended
# for the target language.
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.NFD(),
        normalizers.Lowercase(),
        normalizers.StripAccents(),
    ]
)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Reserved tokens: the five generic markers plus one tag per dialect name.
trainer = trainers.BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=["<unk>", "<pad>", "<s>", "</s>", "<mask>"]
    + [f"<dialect_{d}>" for d in dialects],
)

tokenizer.train([corpus_path], trainer)
tokenizer.save(save_path)
print(f"✅ Saved tokenizer to {save_path}")