# SentencePiece BPE Tokenizer for Armenian

# Part 1: Training Script

Import the library

In [5]:
import sentencepiece as spm

Train the BPE model

In [6]:
spm.SentencePieceTrainer.train(
    input='corpus.txt',
    model_prefix='hy_bpe', # output: hy_bpe.model + hy_bpe.vocab
    vocab_size=300, # vocabulary size
    model_type='bpe', # Byte Pair Encoding
    character_coverage=1.0, # include ALL Armenian Unicode characters
    pad_id=0, # <pad>  Padding token
    unk_id=1, # <unk>  Unknown token
    bos_id=2, # <s>    Beginning Of Sentence token
    eos_id=3, # </s>   End Of Sentence token
)

print("Generated files: hy_bpe.model,  hy_bpe.vocab")

Generated files: hy_bpe.model,  hy_bpe.vocab


 Load the trained model and inspect the vocabulary

In [7]:
sp = spm.SentencePieceProcessor()
sp.load('hy_bpe.model')

total_vocab = sp.get_piece_size()
print(f"Total tokens in vocabulary: {total_vocab}")

Total tokens in vocabulary: 300


Print first 30 vocabulary entries

In [8]:
print("First 30 vocabulary entries")
print(f"{'ID':>5}  {'Token'}\n")
for i in range(30):
    piece = sp.id_to_piece(i)
    print(f"{i:>5}  {piece}")

First 30 vocabulary entries
   ID  Token

    0  <pad>
    1  <unk>
    2  <s>
    3  </s>
    4  ’∏÷Ç
    5  ’°’∂
    6  ’°’µ
    7  ’•÷Ä
    8  ’°÷Ä
    9  ’∏÷Ç’∂
   10  ‚ñÅ’∞
   11  ’∏÷Ç’¥
   12  ’°’Ø
   13  ’∏÷Ç’©
   14  ‚ñÅ’ß
   15  ’∏÷Ç’©’µ
   16  ’•’∂
   17  ’∏÷Ç’©’µ’∏÷Ç’∂
   18  ‚ñÅ’Ä
   19  ’∂’•÷Ä
   20  ’°’Ω
   21  ‚ñÅ’Ä’°’µ
   22  ‚ñÅ’Ø
   23  ’∏÷Ä
   24  ’°’¥
   25  ’°’Ø’°’∂
   26  ’•÷Ç
   27  ’°’ø
   28  ‚ñÅ’•’∂
   29  ‚ñÅ’¥


Print last 30 vocabulary entries

In [9]:
print("Last 30 vocabulary entries")
print(f"{'ID':>5}  {'Token'}\n")

for i in range(total_vocab - 30, total_vocab):
    piece = sp.id_to_piece(i)
    print(f"{i:>5}  {piece}")

Last 30 vocabulary entries
   ID  Token

  270  ‘≤
  271  ’π
  272  ’ª
  273  ÷É
  274  ‘ø
  275  ’Ü
  276  ’è
  277  ’±
  278  ‘µ
  279  ’Ñ
  280  ‘≥
  281  ‘¥
  282  ‘æ
  283  ’ä
  284  ÷Ö
  285  ‘π
  286  ‘º
  287  ‘Ω
  288  ’á
  289  ’å
  290  ’ç
  291  ’é
  292  ’ñ
  293  ,
  294  ‘∏
  295  ‘ª
  296  ’Å
  297  ’Ç
  298  ’à
  299  ’ã


Observations guide

In [10]:
print("""
Observation Guide
‚Ä¢ IDs 0-3: special tokens: <pad>, <unk>, <s>, </s>
‚Ä¢ Short entries: single Armenian characters (base alphabet coverage)
‚Ä¢ Medium entries: morpheme fragments / common suffixes (BPE merges)
‚Ä¢ Long entries: frequent full words found in the corpus
""")


Observation Guide
‚Ä¢ IDs 0-3: special tokens: <pad>, <unk>, <s>, </s>
‚Ä¢ Short entries: single Armenian characters (base alphabet coverage)
‚Ä¢ Medium entries: morpheme fragments / common suffixes (BPE merges)
‚Ä¢ Long entries: frequent full words found in the corpus



# Part 2: Encoding and Decoding Script

Import the library

In [11]:
import sentencepiece as spm

Load the trained model

In [12]:
sp = spm.SentencePieceProcessor()
sp.load('hy_bpe.model')
print("Model loaded: hy_bpe.model")

Model loaded: hy_bpe.model


Test sentences

In [13]:
sentences = {
    "S1": "’Ä’°’µ’°’Ω’ø’°’∂’∂ ’∏÷Ç’∂’´ ’∞’°÷Ä’∏÷Ç’Ω’ø ’∫’°’ø’¥’∏÷Ç’©’µ’∏÷Ç’∂÷â",
    "S2": "‘±÷Ä’∞’•’Ω’ø’°’Ø’°’∂ ’¢’°’∂’°’Ø’°’∂’∏÷Ç’©’µ’∏÷Ç’∂’® ’°÷Ä’°’£ ’¶’°÷Ä’£’°’∂’∏÷Ç’¥ ’ß÷â",
    "S3": "‘æ÷Ä’°’£÷Ä’°’æ’∏÷Ä’∏÷Ç’¥’® ’Ø’°÷Ä÷á’∏÷Ä ’∞’¥’ø’∏÷Ç’©’µ’∏÷Ç’∂ ’ß ’°’∫’°’£’°’µ’´ ’∞’°’¥’°÷Ä÷â",
    "S4": "‘æ÷Ä’°’£÷Ä’°’æ’∏÷Ä’∏÷Ç’¥’® ’Ø’°÷Ä’•÷Ç’∏÷Ä ’∞’¥’ø’∏÷Ç’©’µ’∏÷Ç’∂ ’ß ’°’∫’°’£’°’µ’´ ’∞’°’¥’°÷Ä÷â"
}


Encode, decode, and verify each sentence

In [14]:
for label, sentence in sentences.items():
    print(f"{label}: {sentence}")

    # Encode to token pieces (subword strings)
    pieces = sp.encode(sentence, out_type=str)
    print(f"\nToken pieces: {pieces}")

    # Encode to token IDs (integers)
    ids = sp.encode(sentence, out_type=int)
    print(f"Token IDs: {ids}")

    # Decode IDs back to text
    decoded = sp.decode(ids)
    print(f"Decoded text: {decoded}")

    # Verify round-trip
    match = (decoded == sentence)
    print(f"Match original: {match}")
    print()

S1: ’Ä’°’µ’°’Ω’ø’°’∂’∂ ’∏÷Ç’∂’´ ’∞’°÷Ä’∏÷Ç’Ω’ø ’∫’°’ø’¥’∏÷Ç’©’µ’∏÷Ç’∂÷â

Token pieces: ['‚ñÅ’Ä’°’µ’°’Ω’ø’°’∂', '’∂', '‚ñÅ’∏÷Ç’∂’´', '‚ñÅ’∞’°÷Ä’∏÷Ç’Ω’ø', '‚ñÅ’∫', '’°’ø', '’¥', '’∏÷Ç’©’µ’∏÷Ç’∂', '÷â']
Token IDs: [36, 236, 61, 222, 96, 27, 242, 17, 246]
Decoded text: ’Ä’°’µ’°’Ω’ø’°’∂’∂ ’∏÷Ç’∂’´ ’∞’°÷Ä’∏÷Ç’Ω’ø ’∫’°’ø’¥’∏÷Ç’©’µ’∏÷Ç’∂÷â
Match original: True

S2: ‘±÷Ä’∞’•’Ω’ø’°’Ø’°’∂ ’¢’°’∂’°’Ø’°’∂’∏÷Ç’©’µ’∏÷Ç’∂’® ’°÷Ä’°’£ ’¶’°÷Ä’£’°’∂’∏÷Ç’¥ ’ß÷â

Token pieces: ['‚ñÅ‘±÷Ä', '’∞', '’•’Ω’ø', '’°’Ø’°’∂', '‚ñÅ’¢', '’°’∂', '’°’Ø’°’∂’∏÷Ç’©’µ’∏÷Ç’∂’®', '‚ñÅ’°÷Ä’°’£', '‚ñÅ’¶’°÷Ä’£', '’°’∂’∏÷Ç’¥', '‚ñÅ’ß', '÷â']
Token IDs: [150, 247, 99, 25, 57, 5, 230, 162, 133, 159, 14, 246]
Decoded text: ‘±÷Ä’∞’•’Ω’ø’°’Ø’°’∂ ’¢’°’∂’°’Ø’°’∂’∏÷Ç’©’µ’∏÷Ç’∂’® ’°÷Ä’°’£ ’¶’°÷Ä’£’°’∂’∏÷Ç’¥ ’ß÷â
Match original: True

S3: ‘æ÷Ä’°’£÷Ä’°’æ’∏÷Ä’∏÷Ç’¥’® ’Ø’°÷Ä÷á’∏÷Ä ’∞’¥’ø’∏÷Ç’©’µ’∏÷Ç’∂ ’ß ’°’∫’°’£’°’µ’´ ’∞’°’¥’°÷Ä÷â

Token pieces: ['‚ñÅ‘æ', '÷Ä’°’£', '÷Ä’°', '’æ', '’∏÷Ä', '’∏÷Ç’¥’®', '‚ñÅ’Ø’°÷Ä’•÷Ç’∏÷Ä', '‚ñÅ’∞', '’¥', '’ø', '’

In [15]:
# ’Ü’Ø’°’ø’•÷Å’´ ’∏÷Ä "÷á" ÷á "’•÷Ç" ’ø’°÷Ä’¢’•÷Ä’∏÷Ç’©’µ’°’∂ ’∫’°’ø’≥’°’º’∏’æ 3-÷Ä’§ ÷Ö÷Ä’´’∂’°’Ø’® False ’ß ’Ω’ø’°’∂’∏÷Ç’¥, ’æ’°÷Ä’Ø’°’Æ’® ’Ω’ø’∏÷Ç’£’•÷Å’´ ’°’æ’•’¨’°÷Å’∂’•’¨’∏’æ 4-÷Ä’§ ÷Ö÷Ä’´’∂’°’Ø’®

#  Part 3: Vocabulary Analysis Script

Imports

In [16]:
import sentencepiece as spm
import collections

Load model and corpus

In [17]:
sp = spm.SentencePieceProcessor()
sp.load('hy_bpe.model')
print("Model loaded: hy_bpe.model")

with open('corpus.txt', encoding='utf-8') as f:
    corpus_text = f.read()

total_vocab = sp.get_piece_size()

Model loaded: hy_bpe.model


Categorise vocabulary entries by length

In [18]:
single_chars  = [] # length == 1
subword_frags = [] # length 2‚Äì4
full_words    = [] # length >= 5

# Special token IDs to skip (pad, unk, bos, eos)
SPECIAL_IDS = {0, 1, 2, 3}

for i in range(total_vocab):
    if i in SPECIAL_IDS:
        continue
    piece = sp.id_to_piece(i)
    # Strip the SentencePiece space prefix ‚ñÅ for length measurement
    clean = piece.lstrip('‚ñÅ')
    length = len(clean)

    if length == 1:
        single_chars.append(piece)
    elif 2 <= length <= 4:
        subword_frags.append(piece)
    else:
        full_words.append(piece)

print("Vocabulary Structure")
print(f"\nSingle characters (length = 1): {len(single_chars)}")
print(f"Subword fragments (length 2‚Äì4): {len(subword_frags)}")
print(f"Full words (length 5+): {len(full_words)}")
print(f"Special tokens (pad/unk/bos/eos): {len(SPECIAL_IDS)}")
print(f"\nTotal vocabulary: {total_vocab}")

Vocabulary Structure

Single characters (length = 1): 100
Subword fragments (length 2‚Äì4): 155
Full words (length 5+): 41
Special tokens (pad/unk/bos/eos): 4

Total vocabulary: 300


10 most frequent token pieces across the entire corpus

In [19]:
print("Top 10 Most Frequent Token Pieces (full corpus)")

token_counts: collections.Counter = collections.Counter()

for line in corpus_text.splitlines():
    line = line.strip()
    if line:
        pieces = sp.encode(line, out_type=str)
        token_counts.update(pieces)

print(f"  {'Rank':>4}  {'Token':<3}  {'Count':>4}\n")

for rank, (piece, count) in enumerate(token_counts.most_common(10), start=1):
    print(f"  {rank:>4}  {piece:<3}  {count:>4}")


Top 10 Most Frequent Token Pieces (full corpus)
  Rank  Token  Count

     1  ÷â      93
     2  ‚ñÅ’ß     53
     3  ‚ñÅ      41
     4  ’∂      35
     5  ’°’∂     33
     6  ’´      33
     7  ‚ñÅ’•’∂    27
     8  ’®      25
     9  ÷Ä      22
    10  ’∏÷Ç’¥    20


Sample tokens from each category (for illustration)

In [20]:
print("Sample Single Characters:")
print("  ", single_chars[:20])

print("\nSample Subword Fragments:")
print("  ", subword_frags[:20])

print("\nSample Full Words:")
print("  ", full_words[:20])

Sample Single Characters:
   ['‚ñÅ’∞', '‚ñÅ’ß', '‚ñÅ’Ä', '‚ñÅ’Ø', '‚ñÅ’¥', '‚ñÅ’°', '‚ñÅ’¨', '‚ñÅ’∑', '‚ñÅ’£', '‚ñÅ’§', '‚ñÅ‘±', '‚ñÅ’¢', '‚ñÅ’™', '‚ñÅ‘≤', '‚ñÅ’ø', '‚ñÅ‘ø', '‚ñÅ’Ü', '‚ñÅ’è', '‚ñÅ’¶', '‚ñÅ’∫']

Sample Subword Fragments:
   ['’∏÷Ç', '’°’∂', '’°’µ', '’•÷Ä', '’°÷Ä', '’∏÷Ç’∂', '’∏÷Ç’¥', '’°’Ø', '’∏÷Ç’©', '’∏÷Ç’©’µ', '’•’∂', '’∂’•÷Ä', '’°’Ω', '‚ñÅ’Ä’°’µ', '’∏÷Ä', '’°’¥', '’°’Ø’°’∂', '’•÷Ç', '’°’ø', '‚ñÅ’•’∂']

Sample Full Words:
   ['’∏÷Ç’©’µ’∏÷Ç’∂', '’∏÷Ç’©’µ’∏÷Ç’∂’®', '’°’Ω’ø’°’∂', '‚ñÅ’Ä’°’µ’°’Ω’ø’°’∂', '‚ñÅ’Ø’°÷Ä’•÷Ç’∏÷Ä', '‚ñÅ’Ä’°’µ’°’Ω’ø’°’∂’∏÷Ç’¥', '’Ø’°’Ø’°’∂', '’∏÷Ç’©’µ’°’∂', '‚ñÅ’°’æ’°’∂’§', '‚ñÅ’∏÷Ç’∂’•’∂', '‚ñÅ’°’∑’≠’°÷Ä’∞', '’∏÷Ç’©’µ’∏÷Ç’∂’∂', '‚ñÅ’™’∏’≤’∏’æ', '‚ñÅ’°’æ’°’∂’§’°’Ø’°’∂', '’°’∂’∏÷Ç’¥', '’´÷Ä’∏÷Ç’¥', '’°’Ø’°’Ø’´÷Å', '‚ñÅ’™’°’¥’°’∂', '‚ñÅ’∞’°’¥’°÷Ä', '‚ñÅ’¨’•’¶’∏÷Ç’∂']


Observations:

In Armenian "’∏÷Ç" and "’•÷Ç" are a single letter, but there are two unicode characters, and they counted as subwords. Frequently
occurring suffixes and grammatical endings, such as "’∏÷Ç’¥" merged early by BPE. Common
high-frequency words like "’∞’°’µ" survive as full tokens because
they appear repeatedly across many sentences without being broken further. Longer,
less-frequent words like
"’°÷Ä’∞’•’Ω’ø’°’Ø’°’∂" or "’Æ÷Ä’°’£÷Ä’°’æ’∏÷Ä’∏÷Ç’¥’®" are split into multiple subword pieces,
showing that BPE trades off between compression and coverage. Overall, the 300-token
vocabulary manages to represent all Armenian Unicode characters (character_coverage=1.0)
while still learning meaningful morphological units from even this small 93-sentence corpus.


# Home Task: Large-Scale Training
SentencePiece BPE Tokenizer ‚Äî CC-100 Armenian Dataset

Imports

In [21]:
import sentencepiece as spm
import re
import unicodedata
from datasets import load_dataset
from collections import Counter
from huggingface_hub import hf_hub_download
import lzma
import os

Downloading CC-100 Armenian dataset

In [24]:
RAW_FILE_XZ  = 'hy.txt.xz' # compressed download
RAW_FILE_TXT = 'hy.txt' # decompressed plain text

if not os.path.exists(RAW_FILE_TXT):
    if not os.path.exists(RAW_FILE_XZ):
        os.system(
            "wget -q --show-progress "
            "https://data.statmt.org/cc-100/hy.txt.xz "
            f"-O {RAW_FILE_XZ}"
        )
        print("Download complete.")

    with lzma.open(RAW_FILE_XZ, 'rt', encoding='utf-8') as f_in, \
         open(RAW_FILE_TXT, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            f_out.write(line)
    print(f"Decompressed to: {RAW_FILE_TXT}")
else:
    print(f"Found cached file: {RAW_FILE_TXT} (skipping download)")

# Count total lines for information
with open(RAW_FILE_TXT, encoding='utf-8') as f:
    total_lines = sum(1 for line in f if line.strip())

print(f"Total available sentences in CC-100 Armenian: {total_lines:,}")

Found cached file: hy.txt (skipping download)
Total available sentences in CC-100 Armenian: 307,594


Extract 50,000 Sentences and Save to File

In [34]:
TARGET = 50_000
LARGE_CORPUS = 'corpus_large.txt'

sentences_large = []

with open(RAW_FILE_TXT, encoding='utf-8') as f:
    for raw in f:
        if len(sentences_large) >= TARGET:
            break
        line = raw.strip()
        if line:
            sentences_large.append(line)

print(f"Sentences collected : {len(sentences_large):,}")

# Save to file ‚Äî SentencePiece reads from a file path
with open(LARGE_CORPUS, 'w', encoding='utf-8') as f:
    f.write('\n'.join(sentences_large))

print(f"\nSaved to: {LARGE_CORPUS}")

# Quick preview
print("\nSample sentences from CC-100")
for i, line in enumerate(sentences_large[:5]):
    print(f"{i+1}) {line[:90]}{'...' if len(line) > 90 else ''}")

Sentences collected : 50,000

Saved to: corpus_large.txt

Sample sentences from CC-100
1) ‘±’∫’°’£’°’≤’∏÷Ç’©’°÷Å’∏÷Ç’¥. 2-÷Ä’§ ’¥’°’Ω. ’é’°÷Ä’∏÷Ç’™’°’∂ ‘±’æ’•’ø’´’Ω’µ’°’∂’´ ’∞’∏’§’æ’°’Æ’®
2) Nov 24, 2017 Comments Off on ‘±’∫’°’£’°’≤’∏÷Ç’©’°÷Å’∏÷Ç’¥. 2-÷Ä’§ ’¥’°’Ω. ’é’°÷Ä’∏÷Ç’™’°’∂ ‘±’æ’•’ø’´’Ω’µ’°’∂’´ ’∞’∏’§’æ’°’Æ’® ’Ä’°’µ’•’¨’´
3) ’Ñ’°’Ω 2-÷Ä’§ (’Ñ’°’Ω 1-’´’∂’®’ù ’°’µ’Ω’ø’•’≤ )
4) ‘±’∫’°’£’°’≤’∏÷Ç’©’°÷Å’∏÷Ç’¥-Armexit-’´ ’£’∏÷Ä’Æ’®’∂’©’°÷Å’∂ ’Ω’Ø’Ω’•’¨’∏÷Ç ’∞’°’¥’°÷Ä ’∂’°’≠ ’°’∂’∞÷Ä’°’™’•’∑’ø ’ß ’Ω’ø’•’≤’Æ’•’¨ ’∞’°’µ’Ø’°’Ø’°’∂ ’Ω’∏÷Ç’¢’µ’•’Ø...
5) ’Ñ’•÷Ä ’∫’°÷Ä’°’£’°’µ’∏÷Ç’¥ ’£’°’≤’∏÷Ç’©’°’µ’´’∂ ’æ’°÷Ä’π’°’Ø’°’¶’¥’´’∂ ’∞’∂’°÷Ä’°’æ’∏÷Ä ’ß ’∞’•’º’°÷Å’∂’•’¨ ’¥’´’°’µ’∂ ’™’∏’≤’∏’æ÷Ä’§’°’µ’´’∂ ’®’∂’§’æ’¶’¥’°’∂ ’¥’´’ª’∏÷Å...


Train Large SentencePiece BPE Model

In [33]:
spm.SentencePieceTrainer.train(
    input=LARGE_CORPUS,
    model_prefix='hy_bpe_large', # output: hy_bpe_large.model + hy_bpe_large.vocab
    vocab_size=8000, # in small was only 300
    model_type='bpe', # Byte Pair Encoding
    character_coverage=1.0,  # cover 100% of Armenian Unicode characters
    pad_id=0, # <pad> Padding token
    unk_id=1, # <unk> Unknown token
    bos_id=2, # <s> Beginning Of Sentence
    eos_id=3, # </s> End Of Sentence
)

print("Generated files: hy_bpe_large.model,  hy_bpe_large.vocab")

Generated files: hy_bpe_large.model,  hy_bpe_large.vocab


Load large model

In [35]:
sp_large = spm.SentencePieceProcessor()
sp_large.load('hy_bpe_large.model')

large_vocab_size = sp_large.get_piece_size()
print(f"\nLarge model vocabulary size: {large_vocab_size:,}")


Large model vocabulary size: 8,000


Inspect first and last 30 entries

In [37]:
print("\nFirst 30 vocabulary entries (large model)")
print(f"  {'ID':>5}  {'Token'}\n")

for i in range(30):
    print(f"  {i:>5}  {sp_large.id_to_piece(i)}")

print("\nLast 30 vocabulary entries (large model)")
print(f"{'ID':>5}  {'Token'}\n")

for i in range(large_vocab_size - 30, large_vocab_size):
    print(f"{i:>5}  {sp_large.id_to_piece(i)}")


First 30 vocabulary entries (large model)
     ID  Token

      0  <pad>
      1  <unk>
      2  <s>
      3  </s>
      4  ’∏÷Ç
      5  ’°’∂
      6  ’°÷Ä
      7  ’•÷Ä
      8  ’°’Ø
      9  ’°’µ
     10  ‚ñÅ’∞
     11  ’´’∂
     12  ’∏÷Ä
     13  ’∏÷Ç’¥
     14  ’•’¨
     15  ’∏÷Ç’©
     16  ’∂’•÷Ä
     17  ‚ñÅ’ß
     18  ’∏÷Ç’©’µ
     19  ‚ñÅ’¥
     20  ’°’¥
     21  ’•’∂
     22  ’∏÷Ç’∂
     23  ‚ñÅ’Ø
     24  ’°’ø
     25  ’°’Ω
     26  ’•÷Ç
     27  ’°’æ
     28  ’°’Ø’°’∂
     29  ‚ñÅ’∂

Last 30 vocabulary entries (large model)
   ID  Token

 7970  Â±±
 7971  Â¥é
 7972  ÊÉÖ
 7973  Êîø
 7974  Êï∞
 7975  Êñá
 7976  Êùë
 7977  Ê≠¶
 7978  Á¥∞
 7979  Ëîµ
 7980  Ë°å
 7981  Ë©≥
 7982  Èáé
 7983  Í∏∞
 7984  ÍπÄ
 7985  Î≤î
 7986  ÏÑú
 7987  Ïö∏
 7988  Ï∞Ω
 7989  ÌÇ§
 7990  Ìèâ
 7991  ÔÄ≠
 7992  Ô∏è
 7993  üåπ
 7994  üéÅ
 7995  üíù
 7996  üíü
 7997  üì¢
 7998  üòÄ
 7999  üòâ


Encode and Decode 5 Sentences

In [41]:
# 5 Armenian sentences chosen to cover different topics and word lengths
my_sentences = [
    "’Ä’°’µ’°’Ω’ø’°’∂’® ’£’•’≤’•÷Å’´’Ø ’•÷Ä’Ø’´÷Ä ’ß ‘ø’∏’æ’Ø’°’Ω’∏÷Ç’¥÷â", # Geography
    "‘±÷Ä’∞’•’Ω’ø’°’Ø’°’∂ ’¢’°’∂’°’Ø’°’∂’∏÷Ç’©’µ’∏÷Ç’∂’® ÷É’∏’≠’∏÷Ç’¥ ’ß ’°’∑’≠’°÷Ä’∞’®÷â", # Technology / AI
    "‘µ’Ω ’Ω’´÷Ä’∏÷Ç’¥ ’•’¥ ’∞’°’µ’Ø’°’Ø’°’∂ ’•÷Ä’°’™’∑’ø’∏÷Ç’©’µ’∏÷Ç’∂’®÷â", # Culture / personal
    "‘≥’´’ø’∏÷Ç’©’µ’∏÷Ç’∂’® ’¥’°÷Ä’§’Ø’∏÷Ç’©’µ’°’∂’® ’∂’∏÷Ä ’∞’∂’°÷Ä’°’æ’∏÷Ä’∏÷Ç’©’µ’∏÷Ç’∂’∂’•÷Ä ’ß ’¢’°÷Å’∏÷Ç’¥÷â", # Science
    "‘º’•’¶’∏÷Ç’∂ ’™’∏’≤’∏’æ÷Ä’§’´ ’∞’∏’£’´’∂ ’ß ÷á ’´’∂÷Ñ’∂’∏÷Ç’©’µ’°’∂ ’∞’´’¥÷Ñ’®÷â", # Language / identity
    "‘º’•’¶’∏÷Ç’∂ ’™’∏’≤’∏’æ÷Ä’§’´ ’∞’∏’£’´’∂ ’ß ’•÷Ç ’´’∂÷Ñ’∂’∏÷Ç’©’µ’°’∂ ’∞’´’¥÷Ñ’®÷â", # Language / identity
]

for idx, sentence in enumerate(my_sentences, start=1):
    print(f"\nSentence {idx}: {sentence}")

    pieces = sp_large.encode(sentence, out_type=str)
    ids    = sp_large.encode(sentence, out_type=int)
    decoded = sp_large.decode(ids)
    match  = (decoded == sentence)

    print(f"Token pieces: {pieces}")
    print(f"Token IDs: {ids}")
    print(f"Decoded: {decoded}")
    print(f"Match original: {match}")


Sentence 1: ’Ä’°’µ’°’Ω’ø’°’∂’® ’£’•’≤’•÷Å’´’Ø ’•÷Ä’Ø’´÷Ä ’ß ‘ø’∏’æ’Ø’°’Ω’∏÷Ç’¥÷â
Token pieces: ['‚ñÅ’Ä’°’µ’°’Ω’ø’°’∂’®', '‚ñÅ’£’•’≤’•÷Å’´’Ø', '‚ñÅ’•÷Ä’Ø’´÷Ä', '‚ñÅ’ß', '‚ñÅ‘ø’∏’æ’Ø’°’Ω', '’∏÷Ç’¥', '÷â']
Token IDs: [1411, 2879, 2006, 17, 6203, 13, 7578]
Decoded: ’Ä’°’µ’°’Ω’ø’°’∂’® ’£’•’≤’•÷Å’´’Ø ’•÷Ä’Ø’´÷Ä ’ß ‘ø’∏’æ’Ø’°’Ω’∏÷Ç’¥÷â
Match original: True

Sentence 2: ‘±÷Ä’∞’•’Ω’ø’°’Ø’°’∂ ’¢’°’∂’°’Ø’°’∂’∏÷Ç’©’µ’∏÷Ç’∂’® ÷É’∏’≠’∏÷Ç’¥ ’ß ’°’∑’≠’°÷Ä’∞’®÷â
Token pieces: ['‚ñÅ‘±÷Ä', '’∞', '’•’Ω’ø', '’°’Ø’°’∂', '‚ñÅ’¢’°’∂', '’°’Ø’°’∂', '’∏÷Ç’©’µ’∏÷Ç’∂’®', '‚ñÅ÷É’∏’≠', '’∏÷Ç’¥', '‚ñÅ’ß', '‚ñÅ’°’∑’≠’°÷Ä’∞’®', '÷â']
Token IDs: [182, 7537, 814, 28, 551, 28, 131, 370, 13, 17, 5409, 7578]
Decoded: ‘±÷Ä’∞’•’Ω’ø’°’Ø’°’∂ ’¢’°’∂’°’Ø’°’∂’∏÷Ç’©’µ’∏÷Ç’∂’® ÷É’∏’≠’∏÷Ç’¥ ’ß ’°’∑’≠’°÷Ä’∞’®÷â
Match original: True

Sentence 3: ‘µ’Ω ’Ω’´÷Ä’∏÷Ç’¥ ’•’¥ ’∞’°’µ’Ø’°’Ø’°’∂ ’•÷Ä’°’™’∑’ø’∏÷Ç’©’µ’∏÷Ç’∂’®÷â
Token pieces: ['‚ñÅ‘µ’Ω', '‚ñÅ’Ω’´÷Ä’∏÷Ç’¥', '‚ñÅ’•’¥', '‚ñÅ’∞’°’µ’Ø’°’Ø’°’∂', '‚ñÅ’•÷Ä’°’™’∑’ø', '’∏÷Ç’©’µ’∏÷Ç’∂’®', '÷â

Compare Average Tokens per Sentence

Small model (vocab=300) vs Large model (vocab=8000)

In [47]:
# Load the small model trained in Part 1
sp_small = spm.SentencePieceProcessor()
sp_small.load('hy_bpe.model')

# Use the original small corpus as a shared benchmark
with open('corpus.txt', encoding='utf-8') as f:
    benchmark_sentences = [line.strip() for line in f if line.strip()]

small_token_counts = []
large_token_counts = []

for sentence in benchmark_sentences:
    small_tokens = sp_small.encode(sentence, out_type=str)
    large_tokens = sp_large.encode(sentence, out_type=str)
    small_token_counts.append(len(small_tokens))
    large_token_counts.append(len(large_tokens))

avg_small = sum(small_token_counts) / len(small_token_counts)
avg_large = sum(large_token_counts) / len(large_token_counts)
reduction = ((avg_small - avg_large) / avg_small) * 100

print(f"\nCorpus: {len(benchmark_sentences)} sentences (corpus.txt)")
print(f"\n{'Model':<25}  {'Vocab Size':>10}  {'Avg Tokens/Sentence':>20}\n")
print(f"{'Small model (Part 1)':<25}  {sp_small.get_piece_size():>10,}  {avg_small:>20.2f}")
print(f"{'Large model (Home Task)':<25}  {sp_large.get_piece_size():>10,}  {avg_large:>20.2f}")
print(f"\nToken reduction with large model: {reduction:.1f}%")
print(f"(fewer tokens = more information packed per token = better efficiency)")

print("\nExamples:")
for sentence in benchmark_sentences[:3]:
    s_pieces = sp_small.encode(sentence, out_type=str)
    l_pieces = sp_large.encode(sentence, out_type=str)
    print(f"\nSentence: {sentence}")
    print(f"Small: {s_pieces}  ({len(s_pieces)} tokens)")
    print(f"Large: {l_pieces}  ({len(l_pieces)} tokens)")


Corpus: 93 sentences (corpus.txt)

Model                      Vocab Size   Avg Tokens/Sentence

Small model (Part 1)              300                 17.38
Large model (Home Task)         8,000                 10.91

Token reduction with large model: 37.2%
(fewer tokens = more information packed per token = better efficiency)

Examples:

Sentence: ’Ä’°’µ’°’Ω’ø’°’∂’® ÷É’∏÷Ñ÷Ä, ’¢’°’µ÷Å ’∞’°÷Ä’∏÷Ç’Ω’ø ’∫’°’ø’¥’∏÷Ç’©’µ’∏÷Ç’∂ ’∏÷Ç’∂’•÷Å’∏’≤ ’•÷Ä’Ø’´÷Ä ’ß÷â
Small: ['‚ñÅ’Ä’°’µ’°’Ω’ø’°’∂', '’®', '‚ñÅ÷É', '’∏', '÷Ñ÷Ä', ',', '‚ñÅ’¢', '’°’µ', '÷Å', '‚ñÅ’∞’°÷Ä’∏÷Ç’Ω’ø', '‚ñÅ’∫', '’°’ø', '’¥', '’∏÷Ç’©’µ’∏÷Ç’∂', '‚ñÅ’∏÷Ç’∂', '’•÷Å', '’∏’≤', '‚ñÅ’•÷Ä', '’Ø', '’´÷Ä', '‚ñÅ’ß', '÷â']  (22 tokens)
Large: ['‚ñÅ’Ä’°’µ’°’Ω’ø’°’∂’®', '‚ñÅ÷É’∏÷Ñ÷Ä', ',', '‚ñÅ’¢’°’µ÷Å', '‚ñÅ’∞’°÷Ä’∏÷Ç’Ω’ø', '‚ñÅ’∫’°’ø’¥’∏÷Ç’©’µ’∏÷Ç’∂', '‚ñÅ’∏÷Ç’∂’•÷Å’∏’≤', '‚ñÅ’•÷Ä’Ø’´÷Ä', '‚ñÅ’ß', '÷â']  (10 tokens)

Sentence: ‘µ÷Ä÷á’°’∂’® ’Ä’°’µ’°’Ω’ø’°’∂’´ ’¥’°’µ÷Ä’°÷Ñ’°’≤’°÷Ñ’∂ ’ß ÷á ’°’¥’•’∂’°’¥’•’Æ ÷Ñ’°’≤’°÷Ñ’®÷â
Small: ['‚ñÅ‘µ÷Ä', '’•

Observations:

Training on 50,000 CC-100 sentences (instead of 93 sentences) gives the BPE more information about which character sequences co-occur frequently
in Armenian. As a result, the large model learns
thousands of complete words and longer  combinations, whereas the small
model breaking even common words into many
fragments.

It is visible in token efficiency: the large model produces
fewer tokens per sentence on average. Fewer tokens means each
token carries more meaning, which reduces the sequence length.

Armenian is a rich language with many inflectional suffixes (
"-’∏÷Ç’¥", "-’∏÷Ç’©’µ’∏÷Ç’∂"). With only 300 tokens the small model splits words more.
The large model, has seen suffixes
thousands of times and merges them early, producing meaningful
subword units.

In examples we can see that "’¥’°’µ÷Ä’°÷Ñ’°’≤’°÷Ñ’∂" word was spleted into 7 tokens by small model and only 2 tokens by large model. "÷Ñ’°’≤’°÷Ñ’®" word into 6 tokens by small one, and 1 token by big one. It's huge.

On other hand, we also can see that both small and large models understand words like "’∞’°’µ" and "’Ä’°’µ’°’Ω’ø’°’∂’´" as one token.