In [2]:
# Part 1: Training Script (using existing corpus.txt)

import sentencepiece as spm

# Train SentencePiece BPE model
spm.SentencePieceTrainer.train(
    input="corpus.txt",       
    model_prefix="hy_bpe",
    vocab_size=300,
    model_type="bpe",
    character_coverage=1.0
)

# Load trained model
sp = spm.SentencePieceProcessor()
sp.load("hy_bpe.model")

# Vocabulary statistics
vocab_size = sp.get_piece_size()
print("Total vocabulary size:", vocab_size)

print("\nFirst 30 vocabulary entries:")
for i in range(30):
    print(i, sp.id_to_piece(i))

print("\nLast 30 vocabulary entries:")
for i in range(vocab_size - 30, vocab_size):
    print(i, sp.id_to_piece(i))

Total vocabulary size: 300

First 30 vocabulary entries:
0 <unk>
1 <s>
2 </s>
3 ու
4 ան
5 այ
6 եր
7 ար
8 ուն
9 ▁հ
10 ում
11 ակ
12 ութ
13 ▁է
14 ությ
15 են
16 ություն
17 ▁Հ
18 ներ
19 աս
20 ▁Հայ
21 ▁կ
22 որ
23 ամ
24 ական
25 եւ
26 ատ
27 ▁են
28 ▁մ
29 ▁հայ

Last 30 vocabulary entries:
270 Բ
271 չ
272 ջ
273 փ
274 Կ
275 Ն
276 Տ
277 ձ
278 Ե
279 Մ
280 Գ
281 Դ
282 Ծ
283 Պ
284 օ
285 Թ
286 Լ
287 Խ
288 Շ
289 Ռ
290 Ս
291 Վ
292 Ֆ
293 ,
294 Ը
295 Ի
296 Ձ
297 Ղ
298 Ո
299 Ջ


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: corpus.txt
  input_format: 
  model_prefix: hy_bpe
  model_type: BPE
  vocab_size: 300
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privac

In [3]:
# Part 2: Encoding and Decoding Script

import sentencepiece as spm

# Load trained model
sp = spm.SentencePieceProcessor()
sp.load("hy_bpe.model")

# Test sentences
sentences = {
    "S1": "Հայաստանն ունի հարուստ պատմություն։",
    "S2": "Արհեստական բանականությունը արագ զարգանում է։",
    "S3": "Ծրագրավորումը կարևոր հմտություն է ապագայի համար։"
}

for name, sentence in sentences.items():
    print(f"\n{name}: {sentence}")
    
    # Encode to token pieces
    pieces = sp.encode(sentence, out_type=str)
    print("Token pieces:", pieces)
    
    # Encode to token IDs
    ids = sp.encode(sentence, out_type=int)
    print("Token IDs:", ids)
    
    # Decode back
    decoded = sp.decode(ids)
    print("Decoded:", decoded)
    
    # Check equality
    print("Matches original:", decoded == sentence)


S1: Հայաստանն ունի հարուստ պատմություն։
Token pieces: ['▁Հայաստան', 'ն', '▁ունի', '▁հարուստ', '▁պ', 'ատ', 'մ', 'ություն', '։']
Token IDs: [35, 236, 60, 221, 95, 26, 242, 16, 246]
Decoded: Հայաստանն ունի հարուստ պատմություն։
Matches original: True

S2: Արհեստական բանականությունը արագ զարգանում է։
Token pieces: ['▁Ար', 'հ', 'եստ', 'ական', '▁բ', 'ան', 'ականությունը', '▁արագ', '▁զարգ', 'անում', '▁է', '։']
Token IDs: [149, 247, 98, 24, 56, 4, 229, 161, 132, 158, 13, 246]
Decoded: Արհեստական բանականությունը արագ զարգանում է։
Matches original: True

S3: Ծրագրավորումը կարևոր հմտություն է ապագայի համար։
Token pieces: ['▁Ծ', 'րագ', 'րա', 'վ', 'որ', 'ումը', '▁կարեւոր', '▁հ', 'մ', 'տ', 'ություն', '▁է', '▁ապագայի', '▁համար', '։']
Token IDs: [188, 202, 73, 252, 22, 208, 40, 9, 242, 244, 16, 13, 220, 165, 246]
Decoded: Ծրագրավորումը կարեւոր հմտություն է ապագայի համար։
Matches original: False


In [4]:
# Part 3: Vocabulary Analysis Script

import sentencepiece as spm
from collections import Counter

sp = spm.SentencePieceProcessor()
sp.load("hy_bpe.model")

vocab_size = sp.get_piece_size()

single_chars = 0
subwords = 0
full_words = 0

for i in range(vocab_size):
    piece = sp.id_to_piece(i)
    
    # Remove SentencePiece whitespace marker
    clean_piece = piece.replace("▁", "")
    length = len(clean_piece)
    
    if length == 1:
        single_chars += 1
    elif 2 <= length <= 4:
        subwords += 1
    elif length >= 5:
        full_words += 1

print("Single Armenian characters:", single_chars)
print("Subword fragments (2–4 chars):", subwords)
print("Full words (5+ chars):", full_words)

# Read entire corpus from file
with open("corpus.txt", "r", encoding="utf-8") as f:
    corpus = f.read()

# Encode full corpus
all_pieces = sp.encode(corpus, out_type=str)
counter = Counter(all_pieces)

print("\nTop 10 most frequent token pieces:")
for piece, freq in counter.most_common(10):
    print(piece, "->", freq)

Single Armenian characters: 100
Subword fragments (2–4 chars): 158
Full words (5+ chars): 41

Top 10 most frequent token pieces:
։ -> 93
▁է -> 53
▁ -> 41
ն -> 35
ան -> 33
ի -> 33
▁են -> 27
ը -> 25
ր -> 22
ում -> 20
