In [1]:
EMBEDDING_DIMENSION = 50
CONTEXT_WINDOW = 4
NEGATIVE_SAMPLES = 2
BATCH_SIZE = 32

In [2]:
from khmernltk import word_tokenize
from collections import Counter
import numpy as np
import tensorflow as tf

2025-01-26 20:04:02.966445: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load and preprocess text
with open("temples.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Clean text
ignore_words = ["។", "៕", "ៗ", "៘", "៙", "៚", "៛", "\n", " ", "[", "]", "(", ")", "!", "?", ",", ".", ":", ";", "-", "_", "=", "+", "*", "&", "^", "%", "$", "#", "@", "!", "~", "`", "|", "\\", "/", "{", "}", "<", ">", '"', "'","°","`","′","″"]
for word in ignore_words:
    text = text.replace(word, "")

# Remove all non-Khmer characters
text = "".join([char for char in text if char >= "ក" and char <= "៿"])

text[:99]

'ប្រាសាទអង្គរវត្តឬប្រាសាទអង្គរតូចមានទីតាំងស្ថិតនៅភាគខាងជើងនៃក្រុងសៀមរាបនៃខេត្តសៀមរាបប្រាសាទអង្គរវត្ត'

In [4]:
# Save text for later use
with open("data/temples_cleaned.txt", "w", encoding="utf-8") as file:
    file.write(text)

In [5]:
# Tokenize Khmer text
tokens = word_tokenize(text)

len(tokens), tokens[:10]

| 2025-01-26 20:04:08,123 | [1;32mINFO[0m | khmer-nltk | Loaded model from /Users/maohieng/master-degree/learn_ai/.venv/lib/python3.8/site-packages/khmernltk/word_tokenize/sklearn_crf_ner_10000.sav |


(9170,
 ['ប្រាសាទ',
  'អង្គរវត្ត',
  'ឬ',
  'ប្រាសាទ',
  'អង្គរ',
  'តូច',
  'មាន',
  'ទីតាំង',
  'ស្ថិត',
  'នៅ'])

In [6]:
# Filter out
word_counts = Counter(tokens)
print(f"Original vocab size: {len(word_counts)}")
filtered_tokens = [word for word in tokens if word_counts[word] >= 10 and word != " "]
vocab = set(filtered_tokens)
print(f"Cleaned vocab size: {len(vocab)}")
vocab

Original vocab size: 2211
Cleaned vocab size: 172


{'កណ្ដាល',
 'កណ្តាល',
 'កន្លែង',
 'កម្ពុជា',
 'កំពុង',
 'កំពែង',
 'ក៏',
 'ក្នុង',
 'ក្បាច់',
 'ក្រុង',
 'ក្រុម',
 'ខាង',
 'ខាងកើត',
 'ខាងក្នុង',
 'ខាងក្រៅ',
 'ខាងជើង',
 'ខាងត្បូង',
 'ខាងលិច',
 'ខេត្ត',
 'ខ្ពស់',
 'ខ្មែរ',
 'ខ្លួន',
 'ខ្លះ',
 'គឺ',
 'គឺជា',
 'គូ',
 'គេ',
 'គោ',
 'គ្នា',
 'ឃើញ',
 'ចម្លាក់',
 'ចុង',
 'ចូល',
 'ចេញ',
 'ចំនួន',
 'ចំពោះ',
 'ច្រក',
 'ច្រើន',
 'ឆ្នាំ',
 'ជញ្ជាំង',
 'ជា',
 'ជាង',
 'ជាច្រើន',
 'ជាប់',
 'ជាមួយ',
 'ជុំវិញ',
 'ឈើ',
 'ឈ្មោះ',
 'ដល់',
 'ដី',
 'ដូច',
 'ដូចជា',
 'ដើម',
 'ដើម្បី',
 'ដែរ',
 'ដែល',
 'ដោយ',
 'ដំបូង',
 'ដំបូល',
 'ដ៏',
 'តាម',
 'តូច',
 'តែ',
 'តំណាង',
 'តំបន់',
 'ត្រូវបាន',
 'ថា',
 'ថែវ',
 'ថ្ងៃ',
 'ថ្ម',
 'ទាំង',
 'ទាំងអស់',
 'ទិស',
 'ទី',
 'ទីក្រុង',
 'ទឹក',
 'ទៀត',
 'ទេ',
 'ទេវតា',
 'ទេសចរណ៍',
 'ទៅ',
 'ធំ',
 'ធ្វើ',
 'នគរ',
 'នានា',
 'និង',
 'នីមួយ',
 'នឹង',
 'នូវ',
 'នេះ',
 'នៃ',
 'នោះ',
 'នៅ',
 'នៅក្នុង',
 'នៅលើ',
 'ន័យ',
 'បង្ហាញ',
 'បន្ទាយស្រី',
 'បាន',
 'បារាំង',
 'បី',
 'បុរាណ',
 'បុរៈ',
 'បំផុត',
 'ប៉ម',
 'ប៉ុន្តែ',
 'ប្រជាជន',
 'ប្

In [7]:
# Save vocabulary for later use
with open("data/vocab.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(vocab))

In [8]:
# Create word to index and index to word mappings
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}

In [9]:
word_to_index

{'ជាប់': 0,
 'ដូចជា': 1,
 'សម័យ': 2,
 'មាន': 3,
 'មិន': 4,
 'ជាច្រើន': 5,
 'ដើម្បី': 6,
 'ព្រះបាទ': 7,
 'បុរៈ': 8,
 'ដំបូល': 9,
 'ឡើយ': 10,
 'ពិភពលោក': 11,
 'ខ្លះ': 12,
 'ក្នុង': 13,
 'សម្រាប់': 14,
 'ទៀត': 15,
 'សំណង់': 16,
 'និង': 17,
 'ក្បាច់': 18,
 'ឃើញ': 19,
 'មួយ': 20,
 'ទេសចរណ៍': 21,
 'អង្គរ': 22,
 'ដែល': 23,
 'ខាងក្នុង': 24,
 'ដំបូង': 25,
 'បាន': 26,
 'ប្រើ': 27,
 'ខ្លួន': 28,
 'បី': 29,
 'វរ្ម័ន': 30,
 'ថែវ': 31,
 'ដី': 32,
 'ដើម': 33,
 'អំពី': 34,
 'ឡើង': 35,
 'ចេញ': 36,
 'ទេវតា': 37,
 'នគរ': 38,
 'បំផុត': 39,
 'ទឹក': 40,
 'កំពែង': 41,
 'នេះ': 42,
 'ដោយ': 43,
 'ដែរ': 44,
 'ប្រទេស': 45,
 'គឺជា': 46,
 'ខាង': 47,
 'លក្ខណៈ': 48,
 'ទាំងអស់': 49,
 'ជញ្ជាំង': 50,
 'ខាងត្បូង': 51,
 'ពាក្យ': 52,
 'ចម្លាក់': 53,
 'គ្នា': 54,
 'ខាងលិច': 55,
 'ឈ្មោះ': 56,
 'វា': 57,
 'ផ្នែក': 58,
 'ឈើ': 59,
 'នៅ': 60,
 'បន្ទាយស្រី': 61,
 'នៅក្នុង': 62,
 'នូវ': 63,
 'រចនាបថ': 64,
 'ត្រូវបាន': 65,
 'កម្ពុជា': 66,
 'កំពុង': 67,
 'តែ': 68,
 'រាង': 69,
 'រាជធានី': 70,
 'មកពី': 71,
 'ស្រុក': 72,
 'សៀមរាប': 73,

In [10]:
index_to_word

{0: 'ជាប់',
 1: 'ដូចជា',
 2: 'សម័យ',
 3: 'មាន',
 4: 'មិន',
 5: 'ជាច្រើន',
 6: 'ដើម្បី',
 7: 'ព្រះបាទ',
 8: 'បុរៈ',
 9: 'ដំបូល',
 10: 'ឡើយ',
 11: 'ពិភពលោក',
 12: 'ខ្លះ',
 13: 'ក្នុង',
 14: 'សម្រាប់',
 15: 'ទៀត',
 16: 'សំណង់',
 17: 'និង',
 18: 'ក្បាច់',
 19: 'ឃើញ',
 20: 'មួយ',
 21: 'ទេសចរណ៍',
 22: 'អង្គរ',
 23: 'ដែល',
 24: 'ខាងក្នុង',
 25: 'ដំបូង',
 26: 'បាន',
 27: 'ប្រើ',
 28: 'ខ្លួន',
 29: 'បី',
 30: 'វរ្ម័ន',
 31: 'ថែវ',
 32: 'ដី',
 33: 'ដើម',
 34: 'អំពី',
 35: 'ឡើង',
 36: 'ចេញ',
 37: 'ទេវតា',
 38: 'នគរ',
 39: 'បំផុត',
 40: 'ទឹក',
 41: 'កំពែង',
 42: 'នេះ',
 43: 'ដោយ',
 44: 'ដែរ',
 45: 'ប្រទេស',
 46: 'គឺជា',
 47: 'ខាង',
 48: 'លក្ខណៈ',
 49: 'ទាំងអស់',
 50: 'ជញ្ជាំង',
 51: 'ខាងត្បូង',
 52: 'ពាក្យ',
 53: 'ចម្លាក់',
 54: 'គ្នា',
 55: 'ខាងលិច',
 56: 'ឈ្មោះ',
 57: 'វា',
 58: 'ផ្នែក',
 59: 'ឈើ',
 60: 'នៅ',
 61: 'បន្ទាយស្រី',
 62: 'នៅក្នុង',
 63: 'នូវ',
 64: 'រចនាបថ',
 65: 'ត្រូវបាន',
 66: 'កម្ពុជា',
 67: 'កំពុង',
 68: 'តែ',
 69: 'រាង',
 70: 'រាជធានី',
 71: 'មកពី',
 72: 'ស្រុក',
 73: 'សៀមរាប',

In [11]:
# Generate training data for skip-gram model
def generate_training_data(tokens, window_size, vocab):
    training_data = []
    for i, word in enumerate(tokens):
        if word in vocab:
            context_words = (
                tokens[max(0, i - window_size) : i]
                + tokens[i + 1 : i + window_size + 1]
            )
            context_words = [w for w in context_words if w in vocab]
            for context_word in context_words:
                training_data.append((word_to_index[word], word_to_index[context_word]))
    return training_data

In [12]:
training_data = generate_training_data(filtered_tokens, CONTEXT_WINDOW, vocab)
print(f"Number of training samples: {len(training_data)}")

Number of training samples: 41484


In [13]:
vocab_size = len(vocab)
vocab_size

172

In [14]:
# Skip-Gram Model
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIMENSION, input_length=1),
        tf.keras.layers.Dense(vocab_size, activation="softmax"),
    ]
)

In [15]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")


In [16]:
# Prepare the data for training
def generate_batch(training_data, batch_size, negative_samples):
    total_positive_samples = 0
    total_negative_samples = 0
    while True:
        x, y = [], []
        for _ in range(batch_size):
            target_word, context_word = training_data[
                np.random.randint(0, len(training_data))
            ]
            x.append(target_word)
            y.append(context_word)
            total_positive_samples += 1
            for _ in range(negative_samples):
                negative_word = np.random.randint(0, vocab_size)
                x.append(target_word)
                y.append(negative_word)
                total_negative_samples += 1
        print(f"Total positive samples: {total_positive_samples}")
        print(f"Total negative samples: {total_negative_samples}")
        yield np.array(x), np.array(y)


In [17]:
next(generate_batch(training_data, BATCH_SIZE, NEGATIVE_SAMPLES))

Total positive samples: 32
Total negative samples: 64


(array([118, 118, 118,  93,  93,  93, 116, 116, 116, 105, 105, 105,  65,
         65,  65,  26,  26,  26,  23,  23,  23, 171, 171, 171,  26,  26,
         26,  17,  17,  17, 102, 102, 102,  95,  95,  95,  43,  43,  43,
         66,  66,  66, 120, 120, 120, 126, 126, 126,  64,  64,  64,  23,
         23,  23,  60,  60,  60,   9,   9,   9,  92,  92,  92,  74,  74,
         74, 105, 105, 105, 118, 118, 118,   1,   1,   1, 157, 157, 157,
         33,  33,  33, 137, 137, 137, 159, 159, 159, 124, 124, 124, 111,
        111, 111,   6,   6,   6]),
 array([ 42,  66,  55,   2,  54,  41,  17,  75,  28,  17,  42, 148,  42,
        155,  48, 134, 105, 142,  26,  81, 102,  22,  42,  71,   2,  31,
          4,  54,  64,  10, 164, 122,  14,  54,  71, 171,  57,  61, 118,
         27,   2,   7, 123, 132,  72,  65, 166, 106,  23,  55,  64, 124,
        164,  21,  10, 124,  84, 133, 157, 128,  60,  77, 131,  42, 114,
        162, 151, 165, 142, 132,   7, 155,  60,  78,   1,  41,  87,  91,
         25, 160

In [18]:
steps_per_epoch = len(training_data) // BATCH_SIZE
steps_per_epoch

1296

In [19]:
model.fit(
    generate_batch(training_data, BATCH_SIZE, NEGATIVE_SAMPLES),
    steps_per_epoch=steps_per_epoch,
    epochs=20,
)

Total positive samples: 32
Total negative samples: 64
Epoch 1/20
Total positive samples: 64
Total negative samples: 128
Total positive samples: 96
Total negative samples: 192
   1/1296 [..............................] - ETA: 12:25 - loss: 5.1483Total positive samples: 128
Total negative samples: 256
Total positive samples: 160
Total negative samples: 320
Total positive samples: 192
Total negative samples: 384
Total positive samples: 224
Total negative samples: 448
Total positive samples: 256
Total negative samples: 512
Total positive samples: 288
Total negative samples: 576
Total positive samples: 320
Total negative samples: 640
Total positive samples: 352
Total negative samples: 704
Total positive samples: 384
Total negative samples: 768
Total positive samples: 416
Total negative samples: 832
Total positive samples: 448
Total negative samples: 896
Total positive samples: 480
Total negative samples: 960
Total positive samples: 512
Total negative samples: 1024
Total positive samples: 54

<keras.src.callbacks.History at 0x7fe09eb73430>

* Total positive samples: 829536
* Total negative samples: 1659072

In [20]:
# Extract the word embeddings
embeddings = model.layers[0].get_weights()[0]
embeddings

array([[ 0.10226771, -0.00744244,  0.10923031, ...,  0.10605022,
         0.21774758,  0.18077594],
       [ 0.20394632, -0.08855589, -0.06771737, ..., -0.1871466 ,
        -0.00121989,  0.2196817 ],
       [ 0.01537234, -0.00395851,  0.2151167 , ...,  0.06884751,
         0.07945223,  0.26297703],
       ...,
       [-0.13707477, -0.14259335,  0.28817305, ...,  0.29393882,
         0.05627891,  0.21917753],
       [ 0.08812016, -0.20878942,  0.07121568, ..., -0.23354512,
         0.07592183,  0.04676022],
       [-0.11039198,  0.09776252,  0.24466206, ...,  0.10718122,
         0.19718195, -0.14760989]], dtype=float32)

In [21]:
# Save the embeddings
np.save("data/word_embeddings.npy", embeddings)
np.save("data/word_to_index.npy", word_to_index)
np.save("data/index_to_word.npy", index_to_word)

In [22]:
# Save word  to embeddings
word_to_embeddings = {word: embeddings[i] for word, i in word_to_index.items()}
np.save("data/word_to_embeddings.npy", word_to_embeddings)

In [23]:
word_to_embeddings

{'ជាប់': array([ 0.10226771, -0.00744244,  0.10923031, -0.34008032, -0.28030527,
         0.2684023 ,  0.14439997,  0.31463847,  0.53040034, -0.09072138,
         0.28647617, -0.19860315, -0.16020623,  0.18188228, -0.40852645,
        -0.1423637 ,  0.08551092, -0.0982253 ,  0.24257547, -0.32194653,
         0.09737758,  0.16402394,  0.01842574,  0.64360803,  0.35086933,
         0.27417383, -0.10738769, -0.08436559, -0.27757713, -0.33825925,
        -0.3869431 ,  0.29002956,  0.09918953, -0.05536466,  0.37603432,
         0.20912877,  0.19247036, -0.16182612, -0.05543986,  0.2101659 ,
        -0.17720227,  0.17394833,  0.29489985,  0.2760776 , -0.1514967 ,
         0.05554482,  0.3853603 ,  0.10605022,  0.21774758,  0.18077594],
       dtype=float32),
 'ដូចជា': array([ 0.20394632, -0.08855589, -0.06771737, -0.02435153, -0.02087013,
         0.10777866,  0.08186459, -0.17050004, -0.05771196, -0.0008194 ,
        -0.04829808,  0.31301248,  0.21362793,  0.20667286, -0.08433729,
        -0