In [None]:
# ============================================================================
#
# tokenizer_tests.ipynb
# An interactive visualization notebook for sentence → morpheme segmentations.
#
# Author: 
#   MoniGarr (Monica Peters), monigarr@MoniGarr.com
#
# This repository supports language revival & retention for
#     Polysynthetic, Low-Resource Indigenous Languages that
#       might lack industry standard language ISO codes.
#
# License: Apache 2.0
# 
# For technical consulting, collaboration, or mentorship on Indigenous
# Language Revival & Retention Tech Solutions (AI, XR, 3D, Cultural Protocols)
# contact:
#   MoniGarr (Monica Peters) – monigarr@monigarr.com
#   Founder of MoniGarr.com LLC and MohawkLanguage.ca
#   Akwesasne-based Onkwehonwe (Indigenous, Kanien’kéhake, Mohawk of Akwesasne)
#   https://www.linkedin.com/in/3dtechartist
#
# ============================================================================

In [None]:
# 📓 tokenizer_tests.ipynb

import json
from pathlib import Path

import pandas as pd
from transformers import AutoTokenizer, PreTrainedTokenizerFast

In [None]:

# Load tokenizer
# `tokenizer_path` names a single serialized tokenizer file, whereas
# AutoTokenizer.from_pretrained() expects a Hub model id or a directory of
# tokenizer files. A lone file is therefore loaded through
# PreTrainedTokenizerFast(tokenizer_file=...); any other path (for example a
# directory) keeps the original AutoTokenizer behaviour.
# NOTE(review): assumes the JSON is a `tokenizers`-library serialization — confirm.
tokenizer_path = "../tokenizer/custom_tokenizer.json"
if Path(tokenizer_path).is_file():
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
else:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [None]:

# Load test cases
# Read the JSON fixture as UTF-8 text and decode it into `test_data`.
test_set_path = "../tokenizer/tests/tokenizer_test_set.json"
with open(test_set_path, mode="r", encoding="utf-8") as test_set_file:
    test_data = json.loads(test_set_file.read())


In [None]:
# Visual test
def _evaluate_case(case):
    """Tokenize one test case and report it next to its expected tokens.

    Reads the "input", "expected_tokens" and "dialect" keys of `case` and
    returns one row for the results table; "Match" is True only when the
    tokenizer output equals the expected token list exactly.
    """
    sentence = case["input"]
    wanted = case["expected_tokens"]
    produced = tokenizer.tokenize(sentence)
    return {
        "Dialect": case["dialect"],
        "Input": sentence,
        "Expected Tokens": wanted,
        "Tokenized Output": produced,
        "Match": produced == wanted,
    }


# One row per test case, in the order the cases appear in the test set.
results = [_evaluate_case(case) for case in test_data]

In [None]:
# Display as DataFrame
df = pd.DataFrame(results)


def _match_cell_style(value):
    """Return the CSS for one "Match" cell: green when truthy, red otherwise.

    Uses truthiness rather than `is True` so that non-builtin booleans
    (for example NumPy booleans) are coloured correctly as well.
    """
    return "background-color: #aaffaa" if bool(value) else "background-color: #ffaaaa"


# Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map;
# use the new name when it exists and fall back on older pandas versions.
_styler = df.style
_style_cellwise = _styler.map if hasattr(_styler, "map") else _styler.applymap
_style_cellwise(_match_cell_style, subset=["Match"])
