In [None]:
# 8_custom_tokenizer_from_vocab.ipynb
"""
Generate a custom tokenizer for a low-resource language (e.g., Kanien’kéha)
using vocab + token frequency + YAML rule matching.

Converts token frequencies + vocab rules into a SentencePiece tokenizer for use in LLM training.

Outputs (loadable from Hugging Face / Transformers-compatible scripts):
    ../tokenizer/kanienkeha_tokenizer.model 
    ../tokenizer/kanienkeha_tokenizer.vocab


"""


In [None]:

# 📦 Step 1: Install SentencePiece
!pip install sentencepiece pyyaml pandas

In [None]:

# 🧠 Step 2: Import modules
import sentencepiece as spm
import pandas as pd
import yaml
from pathlib import Path

In [None]:

# 📄 Step 3: Load frequency data + rules
df = pd.read_csv("../datasets/vocab_analysis/token_frequencies.csv")
rules_path = "../datasets/kanienkeha_vocab_rules.yaml"

# Read the rules as UTF-8 explicitly: the platform default encoding is not
# UTF-8 everywhere and would garble non-ASCII characters in the rules.
with open(rules_path, "r", encoding="utf-8") as f:
    rules = yaml.safe_load(f)

# safe_load returns None for an empty file; fail here with a clear message
# rather than later, when the rules are indexed by key.
if not isinstance(rules, dict):
    raise ValueError(
        f"Expected a YAML mapping in {rules_path}, got {type(rules).__name__}"
    )

In [None]:

# 📝 Step 4: Write training input file (1 token per line, weighted)
token_file = "../datasets/kanienkeha_tokenizer_input.txt"

# Rows with a missing token or count cannot be written. Drop them up front
# instead of crashing halfway through the file (pandas parses cells such as
# "NA" or "null" as NaN by default).
weighted_tokens = df.dropna(subset=["token", "count"])

n_dropped = len(df) - len(weighted_tokens)
if n_dropped:
    print(f"⚠️ Skipped {n_dropped} row(s) with a missing token or count.")

with open(token_file, "w", encoding="utf-8") as f:
    # Each token is repeated `count` times so its frequency acts as a weight.
    # A count <= 0 writes nothing, matching an empty range().
    for token, count in zip(weighted_tokens["token"], weighted_tokens["count"]):
        f.write(f"{token}\n" * int(count))

In [None]:

# 🧪 Step 5: Train SentencePiece model
model_prefix = "../tokenizer/kanienkeha_tokenizer"

# Make sure the output directory exists before the trainer writes its files.
Path(model_prefix).parent.mkdir(parents=True, exist_ok=True)

# A morpheme listed as both a prefix and a suffix should be passed only once;
# dict.fromkeys drops repeats while keeping first-seen order.
user_symbols = list(dict.fromkeys(rules["prefixes"] + rules["suffixes"]))

spm.SentencePieceTrainer.Train(
    input=token_file,
    model_prefix=model_prefix,
    vocab_size=800,
    character_coverage=1.0,  # keep every character seen in the input
    model_type="unigram",  # better for morphologically rich languages
    user_defined_symbols=user_symbols,
)

print("✅ Custom tokenizer trained. Files saved to ../tokenizer/")