# Mini Project 3 - I. Word Embeddings

In [24]:
EMBEDDING_DIM = 50
CONTEXT_WINDOW = 4
NEGATIVE_SAMPLES = 2

UNKNOWN_TOKEN = "<UNK>"

In [2]:
from khmernltk import word_tokenize
from collections import Counter
import numpy as np
import tensorflow as tf

2025-01-28 10:30:41.177010: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Scratch: Read corpus and tokenizing

In [3]:
# Load and preprocess text
with open("temples.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [4]:
unclean_tokens = word_tokenize(text)
print(f"number of tokens: {len(unclean_tokens)}")
token_counters = Counter(unclean_tokens)
print(f"Original vocab size: {len(token_counters)}")

| 2025-01-28 10:32:17,580 | [1;32mINFO[0m | khmer-nltk | Loaded model from /Users/maohieng/master-degree/learn_ai/.venv/lib/python3.8/site-packages/khmernltk/word_tokenize/sklearn_crf_ner_10000.sav |


number of tokens: 11521
Original vocab size: 2276


### Clean the tokens

In [5]:
import re

cleaned_vocabs = Counter()

zero_width_space = '\u200B' # zero width space from Internet
zero_width_space2 = '\u200C' # zero space from experiment below
ignores = [" ","-", "—", ":", "(", ")", "។", "[", "]", "ៗ", "៕", "៛", "៚", "៙", "៘", "៖", "\"",""]
ignores.append(zero_width_space)
ignores.append(zero_width_space2)

for token, count in token_counters.items():
    if token.strip() and token not in ignores and not re.match(r"^\[.*$", token) and not re.match(r".*\]$", token) and not re.match(r"^\(.*$", token) and not re.match(r".*\)$", token) and not re.match(r"^\d+$", token) and not token.isascii():
        cleaned_vocabs[token] = count

print(f"Clean vocabs: {len(cleaned_vocabs)}")

Clean vocabs: 2091


In [74]:
def get_unicode_code_points(token):
    """
    Get the Unicode code points of each character in the token.
    """
    return [ord(char) for char in token]

def format_code_points_as_hex(code_points):
    """
    Format the Unicode code points as hexadecimal strings.
    """
    return [f"U+{code_point:04X}" for code_point in code_points]

In [77]:
# with open("cleaned_vocabs.txt", "w", encoding="utf-8") as file:
#     row = []
#     for token, count in cleaned_vocabs.items():
        # code_points = get_unicode_code_points(token)
        # hex_code_points = format_code_points_as_hex(code_points)
        # row.append(f"{token} {count}")

    # file.write("\n".join(row))

In [6]:
cleaned_tokens = []
for token in unclean_tokens:
    if token in cleaned_vocabs:
        cleaned_tokens.append(token)

print(f"number of cleaned tokens: {len(cleaned_tokens)}")
print(f"Cleaned tokens: {cleaned_tokens[:10]}")

number of cleaned tokens: 9086
Cleaned tokens: ['ប្រាសាទ', 'អង្គរវត្ត', 'ឬ', 'ប្រាសាទ', 'អង្គរ', 'តូច', 'មាន', 'ទីតាំង', 'ស្ថិត', 'នៅ']


In [7]:
with open("cleaned_tokens.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(cleaned_tokens))

In [9]:
vocabs_counter = Counter(cleaned_tokens)
print(f"vocab size: {len(vocabs_counter)}")

vocab size: 2091


## Alternative: Read Cleaned tokens

In [3]:
# Read cleaned tokens
with open("cleaned_tokens.txt", "r", encoding="utf-8") as file:
    cleaned_tokens = file.read().split()

In [4]:
print(f"number of cleaned tokens: {len(cleaned_tokens)}")
print(f"Cleaned tokens: {cleaned_tokens[:10]}")

number of cleaned tokens: 9086
Cleaned tokens: ['ប្រាសាទ', 'អង្គរវត្ត', 'ឬ', 'ប្រាសាទ', 'អង្គរ', 'តូច', 'មាន', 'ទីតាំង', 'ស្ថិត', 'នៅ']


In [5]:
vocabs_counter = Counter(cleaned_tokens)
print(f"vocab size: {len(vocabs_counter)}")

vocab size: 2091


In [6]:
print(vocabs_counter['ទីតាំង'])

7


## Skip-gram model
Given a Khmer text corpus “temples.txt” extracted from 3 Wikipedia pages1, build a skip-gram
model/classifier to find representation/embedding of each Khmer word in the corpus. Use the
following settings for your implementation:
- The word embedding has a dimension of 50
- For the skip-gram model, use context window $L$ = ±4 and negative sampling with $k$ = 2
- To tokenize word, you can use Khmer nltk2
- Your vocabulary should contain meaningful and frequent words so:
    - Words whose frequency is less than 10 are ignored
    - Spaces are considered a stop word and are also ignored

### Preparing dataset

#### Clean vocabs and less frequency words  (<10)

In [10]:
FREQURNCY_THRESHOLD = 10
limited_vocabs = set()
for token, count in vocabs_counter.items():
    if count >= FREQURNCY_THRESHOLD:
        limited_vocabs.add(token)
    else:
        limited_vocabs.add(UNKNOWN_TOKEN)

limited_vocabs = list(limited_vocabs)
print(f"vocab size after limiting: {len(limited_vocabs)}")
limited_vocabs

vocab size after limiting: 175


['ម៉ែត្រ',
 'មួយ',
 'ជាប់',
 'កម្ពុជា',
 'ឈ្មោះ',
 'ទិស',
 'សាសនា',
 'ដំបូង',
 'ទេសចរណ៍',
 'ស្ថិត',
 'វិញ',
 'រួម',
 'សម័យ',
 'ប្រាង្គ',
 'នឹង',
 'បុរាណ',
 'អំពី',
 'ទី',
 'ប្រវែង',
 'ចម្លាក់',
 'ប៉ម',
 'ផ្នែក',
 'ខាង',
 'អាច',
 'សំណង់',
 'ដូចជា',
 'ឡើយ',
 'ពិភពលោក',
 'មក',
 'សំខាន់',
 'កំពុង',
 'រាង',
 'អោយ',
 'អង្គរវត្ត',
 'ធ្វើ',
 'ស្ថាបត្យកម្ម',
 'ថា',
 'ក្នុង',
 'ថ្ម',
 'ខ្មែរ',
 'អង្គរ',
 'នៃ',
 'ភ្នំ',
 'ពួក',
 'ផងដែរ',
 'ប្រាសាទ',
 'តាម',
 'ស្រាល',
 'មិន',
 'ហៅ',
 'ខ្លួន',
 'ច្រើន',
 'ពីរ',
 'ទាំងអស់',
 'ព័ទ្ធ',
 'ឡើង',
 'នៅ',
 'គឺ',
 'ទៀត',
 'យ៉ាង',
 'តំបន់',
 'អ្នក',
 'របស់',
 'ដល់',
 'ដើម',
 'ចេញ',
 'ជាង',
 'ខេត្ត',
 'គ្នា',
 'នគរ',
 'ទីក្រុង',
 'ថ្ងៃ',
 'លើ',
 'ជញ្ជាំង',
 'ទេ',
 'ខាងលិច',
 'ចុង',
 'បារាំង',
 'ដំបូល',
 'ផ្សេង',
 'នូវ',
 'ក្បាច់',
 'ផ្លូវ',
 'ជា',
 'ចូល',
 'ធំ',
 'ខ្ពស់',
 'សតវត្ស',
 'ខាងក្នុង',
 'ខ្លះ',
 'ចំនួន',
 'នីមួយ',
 'ខាងត្បូង',
 'ថែវ',
 'ជុំវិញ',
 'លក្ខណៈ',
 'ខាងកើត',
 'ហើយ',
 'វា',
 'តូច',
 'មាន',
 'ដូច',
 'នោះ',
 'ប្រហែល',
 'បុរៈ',
 'ឬ',
 'រាជធានី'

In [23]:
vocab_size = len(limited_vocabs)
vocab_size

175

#### Word2Number

In [11]:
# Word to index and index to word
word_to_index = {word: i for i, word in enumerate(limited_vocabs)}
index_to_word = {i: word for word, i in word_to_index.items()}

In [12]:
print(len(word_to_index), word_to_index)
print(index_to_word)

175 {'ម៉ែត្រ': 0, 'មួយ': 1, 'ជាប់': 2, 'កម្ពុជា': 3, 'ឈ្មោះ': 4, 'ទិស': 5, 'សាសនា': 6, 'ដំបូង': 7, 'ទេសចរណ៍': 8, 'ស្ថិត': 9, 'វិញ': 10, 'រួម': 11, 'សម័យ': 12, 'ប្រាង្គ': 13, 'នឹង': 14, 'បុរាណ': 15, 'អំពី': 16, 'ទី': 17, 'ប្រវែង': 18, 'ចម្លាក់': 19, 'ប៉ម': 20, 'ផ្នែក': 21, 'ខាង': 22, 'អាច': 23, 'សំណង់': 24, 'ដូចជា': 25, 'ឡើយ': 26, 'ពិភពលោក': 27, 'មក': 28, 'សំខាន់': 29, 'កំពុង': 30, 'រាង': 31, 'អោយ': 32, 'អង្គរវត្ត': 33, 'ធ្វើ': 34, 'ស្ថាបត្យកម្ម': 35, 'ថា': 36, 'ក្នុង': 37, 'ថ្ម': 38, 'ខ្មែរ': 39, 'អង្គរ': 40, 'នៃ': 41, 'ភ្នំ': 42, 'ពួក': 43, 'ផងដែរ': 44, 'ប្រាសាទ': 45, 'តាម': 46, 'ស្រាល': 47, 'មិន': 48, 'ហៅ': 49, 'ខ្លួន': 50, 'ច្រើន': 51, 'ពីរ': 52, 'ទាំងអស់': 53, 'ព័ទ្ធ': 54, 'ឡើង': 55, 'នៅ': 56, 'គឺ': 57, 'ទៀត': 58, 'យ៉ាង': 59, 'តំបន់': 60, 'អ្នក': 61, 'របស់': 62, 'ដល់': 63, 'ដើម': 64, 'ចេញ': 65, 'ជាង': 66, 'ខេត្ត': 67, 'គ្នា': 68, 'នគរ': 69, 'ទីក្រុង': 70, 'ថ្ងៃ': 71, 'លើ': 72, 'ជញ្ជាំង': 73, 'ទេ': 74, 'ខាងលិច': 75, 'ចុង': 76, 'បារាំង': 77, 'ដំបូល': 78, 'ផ្សេង': 79, 'នូវ': 80, 'ក្បា

In [14]:
UNKNOWN_INDEX = word_to_index[UNKNOWN_TOKEN]
UNKNOWN_INDEX

129

In [15]:
limited_tokens = []
data = []
for token in cleaned_tokens:
    if token in word_to_index:
        limited_tokens.append(token)
        data.append(word_to_index[token])
    else:
        limited_tokens.append(UNKNOWN_TOKEN)
        data.append(word_to_index[UNKNOWN_TOKEN])

In [16]:
print(len(limited_tokens), limited_tokens)
print(len(data), data)

9086 ['ប្រាសាទ', 'អង្គរវត្ត', 'ឬ', 'ប្រាសាទ', 'អង្គរ', 'តូច', 'មាន', '<UNK>', 'ស្ថិត', 'នៅ', 'ភាគ', 'ខាងជើង', 'នៃ', 'ក្រុង', 'សៀមរាប', 'នៃ', 'ខេត្ត', 'សៀមរាប', 'ប្រាសាទ', 'អង្គរវត្ត', 'ជា', 'ប្រាសាទ', '<UNK>', 'សាសនា', 'ធំ', 'បំផុត', 'និង', 'ជា', '<UNK>', 'សាសនា', 'ដ៏', 'ធំ', 'បំផុត', 'នៅក្នុង', 'លោក', 'ប្រាសាទ', 'នេះ', 'ត្រូវបាន', '<UNK>', 'ដោយ', 'ព្រះបាទ', '<UNK>', 'វរ្ម័ន', 'ទី២', 'ដែល', 'ជា', '<UNK>', 'ដ៏', 'ធំ', '<UNK>', 'និង', 'មាន', 'ឈ្មោះ', '<UNK>', '<UNK>', '<UNK>', '<UNK>', 'ទៅ', '<UNK>', '<UNK>', 'លើ', 'ពិភពលោក', 'ប្រាសាទ', 'នេះ', '<UNK>', 'ឡើង', 'នៅ', 'ដើម', 'សតវត្ស', 'ទី', 'ដែល', 'ស្ថិត', 'នៅក្នុង', 'រាជធានី', '<UNK>', 'បុរៈ', 'ប្រាសាទ', 'អង្គរវត្ត', 'ជា', 'ប្រាសាទ', '<UNK>', 'ដើម្បី', '<UNK>', 'ដល់', 'ព្រះវិស្ណុ', 'ប្រាសាទ', 'នេះ', 'ជា', 'ប្រាសាទ', 'ដែល', 'នៅ', '<UNK>', '<UNK>', 'បំផុត', 'នៅក្នុង', 'តំបន់', 'អង្គរ', 'ប្រាសាទ', 'អង្គរវត្ត', 'ប្រើ', '<UNK>', '<UNK>', '<UNK>', 'ជាង', '<UNK>', '<UNK>', 'និង', '<UNK>', 'ជាង', '<UNK>', '<UNK>', 'និង', '<UNK>', 'ថ្ម', '<UNK>', '

In [17]:
np.save("word_to_index.npy", word_to_index)

In [48]:
def map_embbedings_to_word(embbedings, word_to_index):
    """
    Map the embeddings to the word.
    """
    word_to_embbedings = {}
    for word, index in word_to_index.items():
        word_to_embbedings[word] = embbedings[index]
    return word_to_embbedings

#### Create dataset

In [19]:
def generate_skipgram_data2(token_indices, window_size):
    """Generate skip-gram pairs."""
    skipgram_pairs = []
    for center_idx, center_word in enumerate(token_indices):
        context_range = range(max(center_idx - window_size, 0), 
                                min(center_idx + window_size + 1, len(token_indices)))
        for context_idx in context_range:
            if center_idx != context_idx:
                skipgram_pairs.append((center_word, token_indices[context_idx]))
    return skipgram_pairs

In [34]:
training_data = generate_skipgram_data2(data, CONTEXT_WINDOW)

In [35]:
print(f"Number of training data: {len(training_data)}")
print(f"Training data: {training_data[:10]}")

Number of training data: 72668
Training data: [(45, 33), (45, 105), (45, 45), (45, 40), (33, 45), (33, 105), (33, 45), (33, 40), (33, 99), (105, 45)]


### Skip-gram Model 1 (Inefficient) - Using pytorch

In [22]:
import torch
import torch.nn as nn

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super(SkipGramModel, self).__init__()
        self.in_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.out_embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_word, context_word, negative_samples):
        # Embeddings for the center, context, and negative samples
        center_embeds = self.in_embedding(center_word)  # (batch_size, embedding_dim)
        context_embeds = self.out_embedding(context_word)  # (batch_size, embedding_dim)
        negative_embeds = self.out_embedding(negative_samples)  # (batch_size, k, embedding_dim)

        # Positive score (center and context)
        pos_score = torch.sum(center_embeds * context_embeds, dim=1)  # (batch_size)
        pos_loss = -torch.log(torch.sigmoid(pos_score))  # (batch_size)

        # Negative score (center and negative samples)
        neg_score = torch.bmm(negative_embeds, center_embeds.unsqueeze(2)).squeeze(2)  # (batch_size, k)
        neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_score)), dim=1)  # (batch_size)

        return torch.mean(pos_loss + neg_loss)
    
    def get_word_emdedding(self, word_idx):
        return self.in_embedding(torch.tensor(word_idx, dtype=torch.long))

In [131]:
import torch.optim as optim
import random

def train_skipgram(data, vocab_size: int, embedding_dim=50, neg_samples=2, epochs=10, lr=0.01):
    model = SkipGramModel(vocab_size, embedding_dim)
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        total_loss = 0
        for center, context in data:
            negative_samples = random.choices(list(range(vocab_size)), k=neg_samples)

            # Convert to tensors
            center_tensor = torch.tensor(center, dtype=torch.long).unsqueeze(0)
            context_tensor = torch.tensor(context, dtype=torch.long).unsqueeze(0)
            negative_tensor = torch.tensor(negative_samples, dtype=torch.long).unsqueeze(0)

            # Zero gradients
            optimizer.zero_grad()

            # Compute loss and backpropagate
            loss = model(center_tensor, context_tensor, negative_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")
    
    return model

In [132]:
sg_model = train_skipgram(training_data, vocab_size, embedding_dim=EMBEDDING_DIM, neg_samples=NEGATIVE_SAMPLES, epochs=10, lr=0.01)

Epoch 1, Loss: 199999.3792
Epoch 2, Loss: 106418.1202
Epoch 3, Loss: 83332.6858
Epoch 4, Loss: 74421.3169
Epoch 5, Loss: 70855.0928
Epoch 6, Loss: 68358.0562
Epoch 7, Loss: 67068.0173
Epoch 8, Loss: 66072.9121
Epoch 9, Loss: 65542.6928
Epoch 10, Loss: 64900.8895


In [135]:
word_embedded = sg_model.get_word_emdedding(word_to_index['អង្គរវត្ត'])

In [136]:
print(word_embedded)

tensor([ 2.6869e-01, -8.0715e-01,  8.8348e-02, -1.5651e-02, -3.0536e-01,
        -4.5655e-01, -4.0824e-01, -3.7830e-01, -3.5524e-01, -3.0825e-01,
        -3.5478e-01,  1.5774e-01,  2.9116e-02,  3.4075e-04,  1.5851e-01,
        -2.0537e-01,  2.1613e-01,  8.4399e-01, -3.7808e-01, -2.4047e-01,
         3.3064e-01, -2.8880e-01, -5.3783e-02,  3.5023e-01,  6.1740e-01,
         2.9699e-01, -2.7800e-01,  3.6858e-01,  5.9942e-01, -4.4517e-01,
         6.4570e-01,  7.7118e-01,  1.8923e-01, -1.6945e-01, -1.5987e-01,
        -3.6847e-01, -4.2477e-01, -5.0249e-01, -3.0586e-01, -3.1183e-01,
         4.1797e-01, -2.2914e-01, -6.6145e-02,  2.6570e-01, -9.5747e-01,
         4.7499e-01,  3.9335e-03,  3.6768e-02,  5.3335e-01,  3.0709e-01],
       grad_fn=<EmbeddingBackward0>)


### Skip-Gram Model 2 - Simple using tensorflow

In [36]:
def generate_batch(training_data, vocab_size, batch_size=128, negative_samples=2):
    total_positive_samples = 0
    total_negative_samples = 0
    while True:
        x, y = [], []
        for _ in range(batch_size):
            target_word, context_word = training_data[
                np.random.randint(0, len(training_data))
            ]
            x.append(target_word)
            y.append(context_word)
            total_positive_samples += 1
            for _ in range(negative_samples):
                negative_word = np.random.randint(0, vocab_size)
                x.append(target_word)
                y.append(negative_word)
                total_negative_samples += 1
        # print(f"Total positive samples: {total_positive_samples}")
        # print(f"Total negative samples: {total_negative_samples}")
        yield np.array(x), np.array(y)

In [37]:
BATCH_SIZE = 128

In [None]:
next(generate_batch(training_data, vocab_size, BATCH_SIZE, NEGATIVE_SAMPLES))

(array([122, 122, 122, 171, 171, 171,  38,  38,  38, 136, 136, 136, 129,
        129, 129,  54,  54,  54, 167, 167, 167,  45,  45,  45, 129, 129,
        129,  19,  19,  19,  28,  28,  28,  40,  40,  40, 100, 100, 100,
        153, 153, 153, 129, 129, 129, 129, 129, 129,  26,  26,  26, 170,
        170, 170, 142, 142, 142,  38,  38,  38,  26,  26,  26, 129, 129,
        129, 129, 129, 129,  83,  83,  83, 109, 109, 109, 111, 111, 111,
        167, 167, 167,  37,  37,  37,  93,  93,  93, 129, 129, 129,  40,
         40,  40, 113, 113, 113,  56,  56,  56, 171, 171, 171, 129, 129,
        129, 129, 129, 129, 129, 129, 129,  41,  41,  41, 125, 125, 125,
        129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129,
        129, 129,  40,  40,  40, 129, 129, 129,  88,  88,  88, 129, 129,
        129, 129, 129, 129,  57,  57,  57, 129, 129, 129, 129, 129, 129,
        129, 129, 129, 143, 143, 143, 129, 129, 129,  93,  93,  93, 129,
        129, 129, 129, 129, 129, 129, 129, 129,   2

In [38]:
steps_per_epoch = len(training_data) // BATCH_SIZE
steps_per_epoch

567

In [41]:
sg_model2 = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM, input_length=1),
        tf.keras.layers.Dense(vocab_size, activation="softmax"),
    ]
)
sg_model2.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
sg_model2.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 1, 50)             8750      
                                                                 
 dense_4 (Dense)             (None, 1, 175)            8925      
                                                                 
Total params: 17675 (69.04 KB)
Trainable params: 17675 (69.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [42]:
sg_model2.fit(
    generate_batch(training_data, len(limited_vocabs), BATCH_SIZE, NEGATIVE_SAMPLES),
    steps_per_epoch=steps_per_epoch,
    epochs=30
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7fdc7a1d22e0>

In [43]:
# Extract the word embeddings
embeddings2 = sg_model2.layers[0].get_weights()[0]
embeddings2

array([[ 0.34314328,  0.16131228,  0.44718328, ...,  0.13729778,
         0.21720968, -0.43638453],
       [ 0.07011422,  0.24414092, -0.03252668, ...,  0.1172427 ,
        -0.2201246 , -0.17495075],
       [ 0.09402708,  0.11027738,  0.06284635, ..., -0.20585698,
         0.08602231,  0.28442106],
       ...,
       [ 0.41316658,  0.10325684,  0.28811285, ...,  0.42324513,
         0.02588182, -0.07002091],
       [ 0.17677775,  0.11788683,  0.0892663 , ...,  0.07262385,
         0.16315602, -0.18525146],
       [ 0.12497172,  0.25270548,  0.20184487, ..., -0.32456404,
         0.23078842, -0.24352162]], dtype=float32)

In [49]:
# Loop through the embeddings and map to words
word_to_embedding2 = map_embbedings_to_word(embeddings2, word_to_index)

In [50]:
word_to_embedding2[UNKNOWN_TOKEN]

array([ 0.16753983,  0.132449  ,  0.15013312,  0.11804002,  0.1578595 ,
       -0.11143141, -0.1021077 ,  0.14897558,  0.14881858,  0.09320206,
        0.00211084,  0.1704435 ,  0.1218603 , -0.05136535, -0.13168128,
       -0.07553571, -0.08030909,  0.14270206,  0.14497234, -0.14769045,
       -0.10816906, -0.12233432, -0.12564212, -0.12505579,  0.13124353,
        0.07932316,  0.11207277, -0.10654891, -0.02392155,  0.11998791,
        0.08446173,  0.06410007, -0.13074847, -0.14837801, -0.13433222,
        0.12091426,  0.08062878,  0.14520523,  0.08977051, -0.02435171,
        0.08754583,  0.1283506 , -0.19930771,  0.07290059, -0.14655863,
        0.12077398,  0.18148227,  0.12433725,  0.04860165, -0.06517328],
      dtype=float32)

In [51]:
np.save("word_to_embedding2.npy", word_to_embedding2)

### Skip Game Model 3 - Custom class using tensorflow

In [52]:
import random

def get_negative_samples(vocab_size, positive_samples, num_negative):
    """Generate negative samples."""
    negatives = []
    for _ in range(len(positive_samples)):
        negative_samples = random.choices(range(vocab_size), k=num_negative)
        negatives.append(negative_samples)
    return negatives

In [60]:
class SkipGramModel3(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel3, self).__init__()
        self.target_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=1, name="target_embedding")
        self.context_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=1, name="context_embedding")

    def call(self, inputs):
        target, context, negative_samples = inputs
        target_embed = self.target_embedding(target)
        context_embed = self.context_embedding(context)
        negative_embed = self.context_embedding(negative_samples)

        pos_similarity = tf.reduce_sum(target_embed * context_embed, axis=-1)
        neg_similarity = tf.reduce_sum(target_embed[:, None, :] * negative_embed, axis=-1)

        pos_loss = -tf.math.log(tf.keras.activations.sigmoid(pos_similarity))
        neg_loss = -tf.reduce_sum(tf.math.log(tf.keras.activations.sigmoid(-neg_similarity)), axis=-1)

        return tf.reduce_mean(pos_loss + neg_loss)


In [64]:
def train_skipgram_model3(model, skipgram_pairs, vocab_size, neg_samples=2, epochs=20, batch_size=256, learning_rate=0.01):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    
    targets, contexts = zip(*skipgram_pairs)

    # Generate negative samples
    negative_samples = get_negative_samples(vocab_size, skipgram_pairs, neg_samples)

    # Convert lists to NumPy arrays
    targets = np.array(targets, dtype=np.int32)
    contexts = np.array(contexts, dtype=np.int32)
    negative_samples = np.array(negative_samples, dtype=np.int32)

    # Ensure `negative_samples` has the shape (num_pairs, k) for proper batching
    negative_samples = np.reshape(negative_samples, (len(targets), -1))

    # Prepare the dataset
    dataset = tf.data.Dataset.from_tensor_slices((targets, contexts, negative_samples))
    dataset = dataset.shuffle(buffer_size=10000).batch(batch_size, drop_remainder=True)

    # Training loop
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataset:
            target_batch, context_batch, negative_batch = batch

            with tf.GradientTape() as tape:
                loss = model([target_batch, context_batch, negative_batch])

            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            total_loss += loss.numpy()

    return optimizer

In [55]:
BATCH_SIZE = 128

In [62]:
model3 = SkipGramModel3(vocab_size, EMBEDDING_DIM)

In [65]:
optimizer = train_skipgram_model3(model3, training_data, vocab_size, learning_rate=0.01, epochs=30, neg_samples=NEGATIVE_SAMPLES, batch_size=BATCH_SIZE)

In [66]:
embedding_layer = model3.get_layer("target_embedding")
embeddings3 = embedding_layer.get_weights()[0]
embeddings3

array([[ 0.78856117,  0.42537412, -0.10304266, ...,  1.0220801 ,
         0.6449248 , -0.98438585],
       [-0.14612241, -0.33671436,  0.15315543, ..., -0.13023931,
        -0.0189624 ,  0.1473515 ],
       [ 0.04548035,  0.19784392, -0.6987631 , ...,  0.5283081 ,
         0.5133444 ,  0.8902088 ],
       ...,
       [-0.8662698 ,  0.55937535,  0.07461716, ...,  0.16996843,
         0.9999869 ,  0.600572  ],
       [-0.52951014,  0.48686543, -0.5237877 , ...,  0.20237026,
        -0.9765463 ,  1.1163844 ],
       [ 0.26818374,  0.9866674 , -0.916924  , ..., -0.2782876 ,
         1.063252  ,  0.07396555]], dtype=float32)

In [67]:
word_to_embedding3 = map_embbedings_to_word(embeddings3, word_to_index)
word_to_embedding3

{'ម៉ែត្រ': array([ 0.78856117,  0.42537412, -0.10304266, -0.1694668 , -1.0403315 ,
         0.16671076,  0.9122989 ,  0.42843598,  0.23822822,  0.5104852 ,
        -0.33968967, -0.20061298,  0.4179288 , -0.85742867,  0.51336163,
        -0.55744237, -0.15684064, -0.60071325,  0.908374  , -0.16290338,
         0.17280239, -1.924011  ,  1.3662716 ,  0.4803774 , -0.51880527,
         0.3493336 ,  0.28587535,  0.8615672 , -0.36634883,  1.4486471 ,
         0.82032716,  0.6433435 , -0.29409298,  1.2066045 , -1.3981357 ,
        -0.29071587, -0.34819332, -0.771911  ,  0.5586823 ,  0.56688887,
        -0.72821486,  0.27798772,  1.2371739 ,  0.35497442, -0.41416073,
         0.62614113,  0.49539885,  1.0220801 ,  0.6449248 , -0.98438585],
       dtype=float32),
 'មួយ': array([-0.14612241, -0.33671436,  0.15315543,  0.2146338 , -0.19863823,
        -0.44332337, -0.18305634, -0.17982213,  0.07601891, -0.04121186,
        -0.31866348,  0.05402759, -0.29614162,  0.055171  ,  0.79900736,
         0

In [68]:
np.save("word_to_embedding3.npy", word_to_embedding3)

### Skip-Gram Model 4

In [69]:
import numpy as np

def skipgram(tokenized, vocab_size, window_size=4, negative_samples=2):
    """
    Generate skip-gram pairs with positive and negative examples manually.

    Args:
    - tokenized: List of word indices (tokenized corpus).
    - vocab_size: Size of the vocabulary.
    - window_size: Size of the context window (± window_size).
    - negative_samples: Number of negative samples to generate per positive pair.

    Returns:
    - targets: Target words (center words).
    - contexts: Context words (positive and negative examples).
    - labels: Labels for training (1 for positive pairs, 0 for negative pairs).
    """
    targets, contexts, labels = [], [], []

    for i, target in enumerate(tokenized):
        # Define context window boundaries
        start = max(i - window_size, 0)
        end = min(i + window_size + 1, len(tokenized))

        # Positive examples: Words within the context window
        positive_context = [tokenized[j] for j in range(start, end) if j != i]
        for context in positive_context:
            targets.append(target)
            contexts.append(context)
            labels.append(1)  # Positive pair

        # Negative sampling: Randomly sample words outside the context
        negative_context = np.random.choice(
            vocab_size, 
            size=negative_samples, 
            replace=False
        )
        for neg_context in negative_context:
            targets.append(target)
            contexts.append(neg_context)
            labels.append(0)  # Negative pair

    return np.array(targets), np.array(contexts), np.array(labels)

In [71]:
# Generate Skip-gram Data
targets, contexts, labels = skipgram(data, len(limited_vocabs), window_size=CONTEXT_WINDOW, negative_samples=NEGATIVE_SAMPLES)

In [72]:
targets.shape, contexts.shape, labels.shape

((90840,), (90840,), (90840,))

In [73]:
def build_skipgram_model(vocab_size, embedding_dim):
    """Build a skip-gram model using negative sampling."""
    # Inputs for target and context words
    target_input = tf.keras.Input(shape=(1,), name="target_input")
    context_input = tf.keras.Input(shape=(1,), name="context_input")
    
    # Embedding layers for target and context
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=vocab_size, 
        output_dim=embedding_dim, 
        input_length=1, 
        name="embedding"
    )
    
    target_embedding = embedding_layer(target_input)
    context_embedding = embedding_layer(context_input)
    
    # Compute dot product
    dot_product = tf.reduce_sum(target_embedding * context_embedding, axis=-1)
    output = tf.keras.layers.Activation("sigmoid")(dot_product)
    
    # Build and compile model
    model = tf.keras.Model(inputs=[target_input, context_input], outputs=output)
    model.compile(optimizer="adam", loss="binary_crossentropy")
    return model

In [74]:
model = build_skipgram_model(len(limited_vocabs), EMBEDDING_DIM)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 target_input (InputLayer)   [(None, 1)]                  0         []                            
                                                                                                  
 context_input (InputLayer)  [(None, 1)]                  0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1, 50)                8750      ['target_input[0][0]',        
                                                                     'context_input[0][0]']       
                                                                                                  
 tf.math.multiply (TFOpLamb  (None, 1, 50)                0         ['embedding[0][0]',       

In [75]:
epochs = 20
batch_size = 32
# Train the Model
history = model.fit(
    [targets, contexts],
    labels,
    epochs=epochs,
    batch_size=batch_size
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [76]:
# Extract Embeddings
embeddings4 = model.get_layer("embedding").get_weights()[0]
embeddings4

array([[-0.15629084, -0.8970666 ,  0.4800078 , ..., -0.54553217,
         0.5355184 , -0.42573494],
       [-0.7243208 , -0.65916896,  0.07415514, ..., -0.09845594,
        -0.31677756,  0.52318865],
       [ 0.2689868 ,  0.38010615,  0.25538456, ..., -0.0221277 ,
         0.24451256, -0.00276685],
       ...,
       [-0.8536216 , -0.8333387 , -0.3278065 , ..., -0.03215203,
         0.13493305,  0.6316623 ],
       [ 0.20597246,  0.43285722,  0.17723486, ...,  0.19524492,
         0.04884125,  0.20175783],
       [ 0.05033246, -0.06928261, -0.20199299, ..., -0.09024346,
        -0.11988508,  0.10561695]], dtype=float32)

In [77]:
# Loop through the embeddings and map to words
word_to_embedding4 = map_embbedings_to_word(embeddings4, word_to_index)

In [78]:
word_to_embedding4

{'ម៉ែត្រ': array([-0.15629084, -0.8970666 ,  0.4800078 ,  0.08314565,  0.22218458,
         0.3808433 ,  0.75632036,  0.27772215, -1.1926374 ,  0.09634368,
         0.3321798 , -0.1670536 ,  0.4282477 ,  0.01800672,  0.19031854,
         0.2645517 ,  0.11199795, -0.15306789, -0.06078188,  0.03340449,
        -0.36180997,  0.00928806,  0.3675855 , -0.20665288,  0.12785387,
         0.78322923,  0.04688639,  0.2464455 ,  0.3182812 ,  0.29023963,
        -0.5259692 ,  0.13871273,  0.17846605, -0.18147491,  0.87896454,
        -0.00691455, -1.1659542 ,  0.12654985,  0.13947435, -0.13368607,
         0.04631158,  0.1113827 ,  0.62899333,  0.4883759 ,  0.00312707,
        -0.20332108,  0.02057537, -0.54553217,  0.5355184 , -0.42573494],
       dtype=float32),
 'មួយ': array([-0.7243208 , -0.65916896,  0.07415514, -0.10682237,  1.7370589 ,
        -0.53759   ,  0.17332369, -0.07557572, -0.41624826, -0.88639754,
        -0.20751965, -0.45044145,  0.5410938 ,  0.15362747, -0.69487953,
        -1

In [79]:
np.save("word_to_embedding4.npy", word_to_embedding4)

## Read trained embeddings

In [81]:
file_word_to_embeddings = "word_to_embedding4.npy" # Change as you prefer

In [82]:
word_to_embeddings = np.load(file_word_to_embeddings, allow_pickle=True).item()
word_to_embeddings

{'ម៉ែត្រ': array([-0.15629084, -0.8970666 ,  0.4800078 ,  0.08314565,  0.22218458,
         0.3808433 ,  0.75632036,  0.27772215, -1.1926374 ,  0.09634368,
         0.3321798 , -0.1670536 ,  0.4282477 ,  0.01800672,  0.19031854,
         0.2645517 ,  0.11199795, -0.15306789, -0.06078188,  0.03340449,
        -0.36180997,  0.00928806,  0.3675855 , -0.20665288,  0.12785387,
         0.78322923,  0.04688639,  0.2464455 ,  0.3182812 ,  0.29023963,
        -0.5259692 ,  0.13871273,  0.17846605, -0.18147491,  0.87896454,
        -0.00691455, -1.1659542 ,  0.12654985,  0.13947435, -0.13368607,
         0.04631158,  0.1113827 ,  0.62899333,  0.4883759 ,  0.00312707,
        -0.20332108,  0.02057537, -0.54553217,  0.5355184 , -0.42573494],
       dtype=float32),
 'មួយ': array([-0.7243208 , -0.65916896,  0.07415514, -0.10682237,  1.7370589 ,
        -0.53759   ,  0.17332369, -0.07557572, -0.41624826, -0.88639754,
        -0.20751965, -0.45044145,  0.5410938 ,  0.15362747, -0.69487953,
        -1