# NENA Tokenization

## Train tokenizers

In [35]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace

# Define the path to your foreign language corpus
dataset = "khan2016"
corpus_file = f"datasets/{dataset}/all.txt"

# Create tokenizer instances for BPE, WordPiece, and Unigram
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
wordpiece_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
unigram_tokenizer = Tokenizer(Unigram())

# Define trainers for each tokenizer
bpe_trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
wordpiece_trainer = WordPieceTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
unigram_trainer = UnigramTrainer(unk_token="[UNK]")

# Pre-tokenizer that splits the text into words (whitespace)
pre_tokenizer = Whitespace()
bpe_tokenizer.pre_tokenizer = pre_tokenizer
wordpiece_tokenizer.pre_tokenizer = pre_tokenizer
unigram_tokenizer.pre_tokenizer = pre_tokenizer

# Train tokenizers on the corpus
bpe_tokenizer.train([corpus_file], bpe_trainer)
wordpiece_tokenizer.train([corpus_file], wordpiece_trainer)
unigram_tokenizer.train([corpus_file], unigram_trainer)

# Save the vocabularies of each tokenizer to files
with open(f"vocab/{dataset}_vocab_bpe.txt", "w", encoding="utf-8") as bpe_file, \
     open(f"vocab/{dataset}_vocab_wordpiece.txt", "w", encoding="utf-8") as wordpiece_file, \
     open(f"vocab/{dataset}_vocab_unigram.txt", "w", encoding="utf-8") as unigram_file:

    # Write BPE vocabulary to file
    for token, _ in bpe_tokenizer.get_vocab().items():
        bpe_file.write(token + "\n")

    # Write WordPiece vocabulary to file
    for token, _ in wordpiece_tokenizer.get_vocab().items():
        wordpiece_file.write(token + "\n")

    # Write Unigram vocabulary to file
    for token, _ in unigram_tokenizer.get_vocab().items():
        unigram_file.write(token + "\n")

print("Tokenization complete. Vocabularies saved in respective files.")











## Test tokenizers on samples

In [36]:
# Sample text for testing the tokenizers
sample_words = [
    "⁺dára",
    "⁺pála",
    "⁺tála",
    "bnátə",
    "bráta",
    "dára",
    "máta",
    "maváy",
    "pála",
    "savə́lta",
    "šə́mma",
    "šə́mmu",
    "šə́mmuna",
    "sólə",
    "tála",
    "yéməš",
    "yémišu",
    "ṱánṱən",
    "ṱanṱə́nna",
    "ṱanṱə́nla",
    "ṱanṱə́nna",
    "ṱanṱənnála",
    "ṱunṱə́nla",
    "ṱunṱə́nna",
    "ṱunṱə́nnana",
    "ṱanṱúnələ",
    "ṱunṱə́nnola",
    "ṱanṱanta",
    "ṱanṱannána",
    "ʾávəd",
    "ʾavəd",
    "ʾoda",
    "və́dli",
    "vəttéla",
    "vúdun",
    "váda",
    "ʾodána",
]

for sample_word in sample_words:
    # Tokenize the sample text with each tokenizer and save to files
    with open(f"samples/{dataset}_{sample_word}_tokens_bpe.txt", "w", encoding="utf-8") as bpe_file, \
        open(f"samples/{dataset}_{sample_word}_tokens_wordpiece.txt", "w", encoding="utf-8") as wordpiece_file, \
        open(f"samples/{dataset}_{sample_word}_tokens_unigram.txt", "w", encoding="utf-8") as unigram_file:

        bpe_tokens = bpe_tokenizer.encode(sample_word).tokens
        wordpiece_tokens = wordpiece_tokenizer.encode(sample_word).tokens
        unigram_tokens = unigram_tokenizer.encode(sample_word).tokens

        # Write tokens to their respective files
        for token in bpe_tokens:
            bpe_file.write(token + "\n")

        for token in wordpiece_tokens:
            wordpiece_file.write(token + "\n")

        for token in unigram_tokens:
            unigram_file.write(token + "\n")

print("Tokenization complete. Tokens saved in respective files.")

Tokenization complete. Tokens saved in respective files.


## Convert samples to a markdown file

In [38]:
import os
import re

# Directory containing the sample files
directory = 'samples'

# List to store (word, dataset, tokenizer, tokens) tuples
data = []

# Iterate over all files in the directory
for filename in os.listdir(directory):
    # Check if the filename matches the expected pattern
    match = re.match(r'(.+)_(.+)_(.+)_(.+)\.txt', filename)
    if match:
        dataset, word, _, tokenizer = match.groups()

        # Read the tokenized results from the file
        with open(os.path.join(directory, filename), 'r') as file:
            tokens = file.read().splitlines()

        # Format the tokens as a comma-separated string
        tokens_str = ', '.join(tokens)

        # Add the data to the list
        data.append((word, dataset, tokenizer, tokens_str))

# Sort the data by word, then dataset, then tokenizer
data.sort(key=lambda x: (x[0], x[1], x[2]))

# Prepare the markdown table header
markdown_table = ["| Word | Dataset | Tokenizer | Tokens |", "|------|---------|-----------|--------|"]

# Add sorted data to the markdown table
for word, dataset, tokenizer, tokens in data:
    markdown_table.append(f"| {word} | {dataset} | {tokenizer} | {tokens} |")

# Print the markdown table
for line in markdown_table:
    print(line)


| Word | Dataset | Tokenizer | Tokens |
|------|---------|-----------|--------|
| bnátə | khan2016 | bpe | b, n, [UNK], tə |
| bnátə | khan2016 | unigram | b, n, á, t, ə |
| bnátə | khan2016 | wordpiece | [UNK] |
| bnátə | nazari2023 | bpe | bn, [UNK], tə |
| bnátə | nazari2023 | unigram | b, n, á, t, ə |
| bnátə | nazari2023 | wordpiece | [UNK] |
| bráta | khan2016 | bpe | br, [UNK], ta |
| bráta | khan2016 | unigram | b, r, á, ta |
| bráta | khan2016 | wordpiece | [UNK] |
| bráta | nazari2023 | bpe | br, [UNK], ta |
| bráta | nazari2023 | unigram | b, r, á, t, a |
| bráta | nazari2023 | wordpiece | [UNK] |
| dára | khan2016 | bpe | d, [UNK], ra |
| dára | khan2016 | unigram | d, á, ra |
| dára | khan2016 | wordpiece | [UNK] |
| dára | nazari2023 | bpe | d, [UNK], ra |
| dára | nazari2023 | unigram | d, á, ra |
| dára | nazari2023 | wordpiece | [UNK] |
| maváy | khan2016 | bpe | mav, [UNK], y |
| maváy | khan2016 | unigram | mav, á, y |
| maváy | khan2016 | wordpiece | [UNK] |
| maváy