In [None]:
# ============================================================================
#
# tokenizer_segmentation_scores.ipynb
# A Jupyter notebook that lets language teams and developers:
# ✅ Run tokenizer segmentation tests from a .csv
# ✅ Compute precision, recall, and F1 score
# ✅ Visualize token-by-token comparison
# ✅ Highlight mismatches for easy human review
# ✅ Export results to .csv, .ods (LibreOffice), and .html formats
#
# Author: 
#   MoniGarr (Monica Peters), monigarr@MoniGarr.com
#
# This repository supports language revival & retention for
#     Polysynthetic, Low-Resource Indigenous Languages that
#       might lack industry standard language ISO codes.
#
# License: Apache 2.0
# 
# For technical consulting, collaboration, or mentorship on Indigenous
# Language Revival & Retention Tech Solutions (AI, XR, 3D, Cultural Protocols)
# contact:
#   MoniGarr (Monica Peters) – monigarr@monigarr.com
#   Founder of MoniGarr.com LLC and MohawkLanguage.ca
#   Akwesasne-based Onkwehonwe (Indigenous, Kanien’kéhake, Mohawk of Akwesasne)
#   https://www.linkedin.com/in/3dtechartist
#
# ============================================================================

In [None]:
# 🧠 Features Summary
# Feature                               Included ✅
# Token-by-token comparison             ✅
# Precision, Recall, F1 per sentence    ✅
# Visual diff highlighting (HTML)       ✅
# ODS export for offline Elders         ✅
# HTML export for easy review           ✅

In [None]:
# 📓 tokenizer_segmentation_scores.ipynb

import difflib
import html
import os
from collections import Counter
from urllib.parse import quote

import numpy as np
import pandas as pd
from IPython.display import HTML, display
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import AutoTokenizer, PreTrainedTokenizerFast

In [None]:
# 🔧 CONFIG
# Both locations are relative paths, so they resolve against the directory
# the notebook kernel is running in.
csv_path, tokenizer_path = (
    "../tokenizer/tests/tokenizer_test_set.csv",  # segmentation test cases
    "../tokenizer/custom_tokenizer.json",         # tokenizer under test
)


In [None]:
# 📚 Load data
df = pd.read_csv(csv_path)

# Fail early, with a readable message, if the test set lacks a column that
# the scoring loop reads from every row.
required_columns = {"id", "dialect", "input", "expected_tokens"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{csv_path} is missing required column(s): {sorted(missing_columns)}"
    )

# tokenizer_path points at a single tokenizer JSON file, while
# AutoTokenizer.from_pretrained() expects a model id or a directory.
# A file is therefore loaded directly as a fast tokenizer; a directory or
# model id still goes through AutoTokenizer as before.
if os.path.isfile(tokenizer_path):
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
else:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [None]:
# URL encode helper
from urllib.parse import quote

In [None]:
# 📊 Prepare result table
# GitHub repository that receives the "Suggest correction" issues linked
# from each result row.
github_org = "monigarr"  # the stray second closing quote here was a SyntaxError
github_repo = "mini-indig-llm-kit"

# Column order of the result table. Passing it to the DataFrame constructor
# keeps the columns in place even when the test set has no rows, so the
# display and export cells do not fail on a missing column.
RESULT_COLUMNS = [
    "ID", "Dialect", "Input", "Expected", "Predicted",
    "Diff Highlight", "Precision", "Recall", "F1",
]


def _parse_expected_tokens(raw_cell):
    """Split a comma-separated ``expected_tokens`` cell into a token list.

    Surrounding whitespace is stripped from each token. A missing cell (NaN)
    yields an empty list, and empty fragments (e.g. from a trailing comma)
    are dropped instead of becoming an empty-string token.
    """
    if pd.isna(raw_cell):
        return []
    fragments = (piece.strip() for piece in str(raw_cell).split(","))
    return [piece for piece in fragments if piece]


def _score_tokens(expected_tokens, predicted_tokens):
    """Return ``(precision, recall, f1)`` for one test case.

    Overlap is counted as a multiset: a token that occurs twice in the
    expected list must also be predicted twice to count as two matches.
    Plain sets would collapse repeats and give a perfect score to a
    prediction that drops a repeated token. Each ratio is 0.0 when its
    denominator is zero.
    """
    matched = sum((Counter(expected_tokens) & Counter(predicted_tokens)).values())
    precision = matched / len(predicted_tokens) if predicted_tokens else 0.0
    recall = matched / len(expected_tokens) if expected_tokens else 0.0
    total = precision + recall
    f1 = 2 * precision * recall / total if total else 0.0
    return precision, recall, f1


def _token_span(token, color):
    """Wrap one token in a colored ``<span>``, escaping HTML metacharacters."""
    return f"<span style='color:{color}'>{html.escape(str(token))}</span>"


def _render_diff_html(expected_tokens, predicted_tokens):
    """Build the token-by-token highlight for one test case.

    Matching runs are green. For every differing run, the expected tokens are
    shown in red followed by the predicted tokens in orange.
    """
    matcher = difflib.SequenceMatcher(None, expected_tokens, predicted_tokens)
    spans = []
    for opcode, a0, a1, b0, b1 in matcher.get_opcodes():
        if opcode == "equal":
            spans.extend(_token_span(tok, "green") for tok in expected_tokens[a0:a1])
        else:
            spans.extend(_token_span(tok, "red") for tok in expected_tokens[a0:a1])
            spans.extend(_token_span(tok, "orange") for tok in predicted_tokens[b0:b1])
    return " ".join(spans)


def _annotation_link(case_id, dialect, input_text, expected_tokens, predicted_tokens):
    """Return an HTML link that opens a pre-filled GitHub issue for this case.

    The issue is created in the repository named by the notebook-level
    ``github_org`` / ``github_repo`` settings.
    """
    issue_title = f"Annotation for Test {case_id}"
    issue_body = (
        f"**Input:** {input_text}\n"
        f"**Expected Tokens:** {', '.join(expected_tokens)}\n"
        f"**Predicted Tokens:** {', '.join(predicted_tokens)}\n"
        f"**Dialect:** {dialect}"
    )
    issue_url = (
        f"https://github.com/{github_org}/{github_repo}/issues/new?"
        f"title={quote(issue_title)}&body={quote(issue_body)}"
    )
    # The URL sits inside an HTML attribute, so its "&" separator is escaped.
    return (
        f'<br><a href="{html.escape(issue_url)}" target="_blank" rel="noopener" '
        f'style="font-size:small;">📝 Suggest correction</a>'
    )


results = []

for row in df.to_dict("records"):
    case_id = row["id"]
    dialect = row["dialect"]
    # A missing input is scored as an empty sentence rather than crashing
    # the tokenizer on NaN.
    input_text = "" if pd.isna(row["input"]) else str(row["input"])
    expected_tokens = _parse_expected_tokens(row["expected_tokens"])
    predicted_tokens = tokenizer.tokenize(input_text)

    precision, recall, f1 = _score_tokens(expected_tokens, predicted_tokens)

    results.append({
        "ID": case_id,
        "Dialect": dialect,
        "Input": input_text,
        "Expected": ", ".join(expected_tokens),
        "Predicted": ", ".join(predicted_tokens),
        "Diff Highlight": (
            _render_diff_html(expected_tokens, predicted_tokens)
            + _annotation_link(case_id, dialect, input_text, expected_tokens, predicted_tokens)
        ),
        "Precision": round(precision, 2),
        "Recall": round(recall, 2),
        "F1": round(f1, 2),
    })

results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)

In [None]:
# 📈 Display Scores
# Identification and metric columns only, one row per test case.
results_df.loc[:, ["ID", "Dialect", "Precision", "Recall", "F1"]]

In [None]:
# Display all annotated rows (DataFrame index hidden).
# to_html() shortens any cell longer than display.max_colwidth (50 characters
# by default), which would cut the highlight markup off mid-tag, so the limit
# is lifted while rendering. "Input" is raw text from the test set and is
# escaped; "Diff Highlight" is already HTML and is left as-is.
with pd.option_context("display.max_colwidth", None):
    annotated_table = results_df[["ID", "Input", "Diff Highlight"]].to_html(
        escape=False,
        index=False,
        formatters={"Input": lambda value: html.escape(str(value))},
    )

display(HTML(annotated_table))

In [None]:
# 🖼️ HTML Render
# Annotated rows again, this time with the DataFrame index column shown.
# display.max_colwidth is lifted so to_html() does not truncate the highlight
# markup (cells are cut at 50 characters by default). "Input" is raw text from
# the test set and is escaped; "Diff Highlight" is already HTML.
with pd.option_context("display.max_colwidth", None):
    indexed_table = results_df[["ID", "Input", "Diff Highlight"]].to_html(
        escape=False,
        formatters={"Input": lambda value: html.escape(str(value))},
    )

display(HTML(indexed_table))

In [None]:
# 🧾 EXPORTS
# All three files are written next to the test set.
export_dir = "../tokenizer/tests"
os.makedirs(export_dir, exist_ok=True)

# Plain CSV for tech review
results_df.to_csv(os.path.join(export_dir, "segmentation_results.csv"), index=False)

# LibreOffice Version for Language Teams
# NOTE(review): pandas picks the writer from the .ods extension; this
# presumably needs the odfpy package installed — confirm in the environment.
results_df.to_excel(os.path.join(export_dir, "segmentation_results.ods"), index=False)

# Visual diff to share / view in browser
# display.max_colwidth is lifted so to_html() does not cut the highlight
# markup at 50 characters. The plain-text columns are escaped; the
# "Diff Highlight" column is already HTML and is written as-is.
escape_text = lambda value: html.escape(str(value))
with pd.option_context("display.max_colwidth", None):
    results_table = results_df.to_html(
        escape=False,
        index=False,
        formatters={
            "Dialect": escape_text,
            "Input": escape_text,
            "Expected": escape_text,
            "Predicted": escape_text,
        },
    )

# A bare <table> carries no charset declaration, so a browser may guess the
# wrong encoding for non-ASCII text. Wrap it in a minimal UTF-8 document.
html_document = (
    "<!DOCTYPE html>\n"
    "<html>\n"
    "<head>\n"
    '<meta charset="utf-8">\n'
    "<title>Tokenizer segmentation results</title>\n"
    "</head>\n"
    "<body>\n"
    f"{results_table}\n"
    "</body>\n"
    "</html>\n"
)
with open(os.path.join(export_dir, "segmentation_results.html"), "w", encoding="utf-8") as html_file:
    html_file.write(html_document)

print("✅ Exported results to CSV, ODS, and HTML formats.")