# Mini Project 3 - I. Word Embeddings

In [1]:
EMBEDDING_DIMENSION = 50
CONTEXT_WINDOW = 4
NEGATIVE_SAMPLES = 2
BATCH_SIZE = 32
UNKNOWN_TOKEN = "<UNK>"

In [2]:
from khmernltk import word_tokenize
from collections import Counter
import numpy as np
import tensorflow as tf

2025-01-28 09:43:37.081575: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Scratch: Read corpus and tokenizing

In [47]:
# Load and preprocess text
with open("temples.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [48]:
unclean_tokens = word_tokenize(text)
print(f"number of tokens: {len(unclean_tokens)}")
token_counters = Counter(unclean_tokens)
print(f"Original vocab size: {len(token_counters)}")

number of tokens: 11521
Original vocab size: 2276


### Clean the tokens

In [76]:
import re

cleaned_vocabs = Counter()

zero_width_space = '\u200B' # zero width space from Internet
zero_width_space2 = '\u200C' # zero space from experiment below
ignores = [" ","-", "—", ":", "(", ")", "។", "[", "]", "ៗ", "៕", "៛", "៚", "៙", "៘", "៖", "\"",""]
ignores.append(zero_width_space)
ignores.append(zero_width_space2)

for token, count in token_counters.items():
    if token.strip() and token not in ignores and not re.match(r"^\[.*$", token) and not re.match(r".*\]$", token) and not re.match(r"^\(.*$", token) and not re.match(r".*\)$", token) and not re.match(r"^\d+$", token) and not token.isascii():
        cleaned_vocabs[token] = count

print(f"Clean vocabs: {len(cleaned_vocabs)}")

Clean vocabs: 2091


In [74]:
def get_unicode_code_points(token):
    """
    Get the Unicode code points of each character in the token.
    """
    return [ord(char) for char in token]

def format_code_points_as_hex(code_points):
    """
    Format the Unicode code points as hexadecimal strings.
    """
    return [f"U+{code_point:04X}" for code_point in code_points]

In [77]:
with open("cleaned_vocabs.txt", "w", encoding="utf-8") as file:
    row = []
    for token, count in cleaned_vocabs.items():
        # code_points = get_unicode_code_points(token)
        # hex_code_points = format_code_points_as_hex(code_points)
        row.append(f"{token} {count}")

    file.write("\n".join(row))

In [79]:
cleaned_tokens = []
for token in unclean_tokens:
    if token in cleaned_vocabs:
        cleaned_tokens.append(token)

print(f"number of cleaned tokens: {len(cleaned_tokens)}")
print(f"Cleaned tokens: {cleaned_tokens[:10]}")

number of cleaned tokens: 9086
Cleaned tokens: ['ប្រាសាទ', 'អង្គរវត្ត', 'ឬ', 'ប្រាសាទ', 'អង្គរ', 'តូច', 'មាន', 'ទីតាំង', 'ស្ថិត', 'នៅ']


In [80]:
with open("cleaned_tokens.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(cleaned_tokens))

## Alternative: Read Cleaned tokens

In [3]:
# Read cleaned tokens
with open("cleaned_tokens.txt", "r", encoding="utf-8") as file:
    cleaned_tokens = file.read().split()

In [4]:
print(f"number of cleaned tokens: {len(cleaned_tokens)}")
print(f"Cleaned tokens: {cleaned_tokens[:10]}")

number of cleaned tokens: 9086
Cleaned tokens: ['ប្រាសាទ', 'អង្គរវត្ត', 'ឬ', 'ប្រាសាទ', 'អង្គរ', 'តូច', 'មាន', 'ទីតាំង', 'ស្ថិត', 'នៅ']


In [5]:
vocabs_counter = Counter(cleaned_tokens)
print(f"vocab size: {len(vocabs_counter)}")

vocab size: 2091


In [6]:
print(vocabs_counter['ទីតាំង'])

7


## Skip-gram model
Given a Khmer text corpus “temples.txt” extracted from 3 Wikipedia pages1, build a skip-gram
model/classifier to find representation/embedding of each Khmer word in the corpus. Use the
following settings for your implementation:
- The word embedding has a dimension of 50
- For the skip-gram model, use context window $L$ = ±4 and negative sampling with $k$ = 2
- To tokenize word, you can use Khmer nltk2
- Your vocabulary should contain meaningful and frequent words so:
    - Words whose frequency is less than 10 are ignored
    - Spaces are considered a stop word and are also ignored

### Preparing dataset

#### Clean vocabs and less frequency words  (<10)

In [7]:
FREQURNCY_THRESHOLD = 10
limited_vocabs = set()
for token, count in vocabs_counter.items():
    if count >= FREQURNCY_THRESHOLD:
        limited_vocabs.add(token)
    else:
        limited_vocabs.add(UNKNOWN_TOKEN)

limited_vocabs = list(limited_vocabs)
print(f"vocab size after limiting: {len(limited_vocabs)}")
limited_vocabs

vocab size after limiting: 175


['កំពុង',
 'ថែវ',
 'កន្លែង',
 'ប្រទេស',
 'និង',
 'ជាច្រើន',
 'នានា',
 'សៀមរាប',
 'ចំ',
 'អំពី',
 'ផ្នែក',
 'ខាងលិច',
 'ឈើ',
 'ថ្ម',
 'ចេញ',
 'ឆ្នាំ',
 'ខាងក្នុង',
 'ទី២',
 'ម៉ែត្រ',
 'ភក់',
 'លក្ខណៈ',
 'ន័យ',
 'ប្រាសាទ',
 'គេ',
 'ចុង',
 'សំខាន់',
 'ប្រវែង',
 'នេះ',
 'គ្នា',
 'សម្រាប់',
 'ភាគ',
 'សតវត្ស',
 'ដ៏',
 'ខ្លួន',
 'ខ្មែរ',
 'កំពែង',
 'ស្ថិត',
 'បន្ទាយស្រី',
 'តែ',
 'នឹង',
 'វិញ',
 'ទាំងអស់',
 'អាច',
 'ច្រក',
 'លើ',
 'ជុំវិញ',
 'រឺ',
 'ប៉ម',
 'ទេសចរណ៍',
 'របស់',
 'អង្គរវត្ត',
 'ក៏',
 'លោក',
 'ទីក្រុង',
 'ដែល',
 'ប្រាង្គ',
 'ខ្លះ',
 'ទី',
 'ក្រុង',
 'រួម',
 'ព្រះវិស្ណុ',
 'អោយ',
 'ទាំង',
 'គឺ',
 'ប្រើ',
 'មក',
 'ឃើញ',
 'បំផុត',
 'នៅក្នុង',
 'តំបន់',
 'មួយ',
 'ក្បាច់',
 'ផ្លូវ',
 'បី',
 'តាម',
 'ខេត្ត',
 'គោ',
 'ទឹក',
 'ថា',
 'ដល់',
 'សម័យ',
 'វា',
 'រាង',
 '<UNK>',
 'ចំពោះ',
 'ជាង',
 'នីមួយ',
 'អង្គរ',
 'ឬ',
 'ទេវតា',
 'ដូច',
 'ជាមួយ',
 'ក្នុង',
 'នគរ',
 'ដើម្បី',
 'ជាប់',
 'ចម្លាក់',
 'បុរាណ',
 'ប៉ុន្តែ',
 'ទិស',
 'ផ្សេង',
 'ពាក្យ',
 'សតវត្សរ៍',
 'កម្ពុជា',
 'ថ្ងៃ',
 'ស្រុក',
 '

#### Word2Number

In [8]:
# Word to index and index to word
word_to_index = {word: i for i, word in enumerate(limited_vocabs)}
index_to_word = {i: word for word, i in word_to_index.items()}

In [9]:
limited_tokens = []
indices = []
for token in cleaned_tokens:
    if token in word_to_index:
        limited_tokens.append(token)
        indices.append(word_to_index[token])
    else:
        limited_tokens.append(UNKNOWN_TOKEN)
        indices.append(word_to_index[UNKNOWN_TOKEN])

In [10]:
print(len(word_to_index), word_to_index)
print(index_to_word)
print(len(limited_tokens), limited_tokens)
print(len(indices), indices)

175 {'កំពុង': 0, 'ថែវ': 1, 'កន្លែង': 2, 'ប្រទេស': 3, 'និង': 4, 'ជាច្រើន': 5, 'នានា': 6, 'សៀមរាប': 7, 'ចំ': 8, 'អំពី': 9, 'ផ្នែក': 10, 'ខាងលិច': 11, 'ឈើ': 12, 'ថ្ម': 13, 'ចេញ': 14, 'ឆ្នាំ': 15, 'ខាងក្នុង': 16, 'ទី២': 17, 'ម៉ែត្រ': 18, 'ភក់': 19, 'លក្ខណៈ': 20, 'ន័យ': 21, 'ប្រាសាទ': 22, 'គេ': 23, 'ចុង': 24, 'សំខាន់': 25, 'ប្រវែង': 26, 'នេះ': 27, 'គ្នា': 28, 'សម្រាប់': 29, 'ភាគ': 30, 'សតវត្ស': 31, 'ដ៏': 32, 'ខ្លួន': 33, 'ខ្មែរ': 34, 'កំពែង': 35, 'ស្ថិត': 36, 'បន្ទាយស្រី': 37, 'តែ': 38, 'នឹង': 39, 'វិញ': 40, 'ទាំងអស់': 41, 'អាច': 42, 'ច្រក': 43, 'លើ': 44, 'ជុំវិញ': 45, 'រឺ': 46, 'ប៉ម': 47, 'ទេសចរណ៍': 48, 'របស់': 49, 'អង្គរវត្ត': 50, 'ក៏': 51, 'លោក': 52, 'ទីក្រុង': 53, 'ដែល': 54, 'ប្រាង្គ': 55, 'ខ្លះ': 56, 'ទី': 57, 'ក្រុង': 58, 'រួម': 59, 'ព្រះវិស្ណុ': 60, 'អោយ': 61, 'ទាំង': 62, 'គឺ': 63, 'ប្រើ': 64, 'មក': 65, 'ឃើញ': 66, 'បំផុត': 67, 'នៅក្នុង': 68, 'តំបន់': 69, 'មួយ': 70, 'ក្បាច់': 71, 'ផ្លូវ': 72, 'បី': 73, 'តាម': 74, 'ខេត្ត': 75, 'គោ': 76, 'ទឹក': 77, 'ថា': 78, 'ដល់': 79, 'សម័យ': 80, 'វា':

In [11]:
np.save("I_word_to_index.npy", word_to_index)
np.save("I_indices.npy", np.array(indices))

with open("limited_tokens.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(limited_tokens))

#### Create dataset

In [12]:
def generate_skipgram_data1(token_indices, window_size):
    """Generate skip-gram pairs."""
    data = []
    for center_idx in range(len(token_indices)):
        for context_idx in range(-window_size, window_size + 1):
            if context_idx == 0 or center_idx + context_idx < 0 or center_idx + context_idx >= len(token_indices):
                continue
            data.append((token_indices[center_idx], token_indices[center_idx + context_idx]))
    return data

In [13]:
def generate_skipgram_data2(token_indices, window_size):
    """Generate skip-gram pairs."""
    skipgram_pairs = []
    for center_idx, center_word in enumerate(token_indices):
        context_range = range(max(center_idx - window_size, 0), 
                                min(center_idx + window_size + 1, len(token_indices)))
        for context_idx in context_range:
            if center_idx != context_idx:
                skipgram_pairs.append((center_word, token_indices[context_idx]))
    return skipgram_pairs

In [14]:
training_data = generate_skipgram_data1(indices, CONTEXT_WINDOW)

In [15]:
print(f"Number of training data: {len(training_data)}")
print(f"Training data: {training_data[:10]}")

Number of training data: 72668
Training data: [(22, 50), (22, 88), (22, 22), (22, 87), (50, 22), (50, 88), (50, 22), (50, 87), (50, 120), (88, 22)]


In [16]:
training_data2 = generate_skipgram_data2(indices, CONTEXT_WINDOW)

In [17]:
print(f"Number of training data: {len(training_data2)}")
print(f"Training data: {training_data2[:10]}")

Number of training data: 72668
Training data: [(22, 50), (22, 88), (22, 22), (22, 87), (50, 22), (50, 88), (50, 22), (50, 87), (50, 120), (88, 22)]


### Skip-gram Model 1 - Using pytorch

In [129]:
import torch
import torch.nn as nn

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super(SkipGramModel, self).__init__()
        self.in_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.out_embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_word, context_word, negative_samples):
        # Embeddings for the center, context, and negative samples
        center_embeds = self.in_embedding(center_word)  # (batch_size, embedding_dim)
        context_embeds = self.out_embedding(context_word)  # (batch_size, embedding_dim)
        negative_embeds = self.out_embedding(negative_samples)  # (batch_size, k, embedding_dim)

        # Positive score (center and context)
        pos_score = torch.sum(center_embeds * context_embeds, dim=1)  # (batch_size)
        pos_loss = -torch.log(torch.sigmoid(pos_score))  # (batch_size)

        # Negative score (center and negative samples)
        neg_score = torch.bmm(negative_embeds, center_embeds.unsqueeze(2)).squeeze(2)  # (batch_size, k)
        neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_score)), dim=1)  # (batch_size)

        return torch.mean(pos_loss + neg_loss)
    
    def get_word_emdedding(self, word_idx):
        return self.in_embedding(torch.tensor(word_idx, dtype=torch.long))

In [131]:
import torch.optim as optim
import random

def train_skipgram(data, vocab_size: int, embedding_dim=50, neg_samples=2, epochs=10, lr=0.01):
    model = SkipGramModel(vocab_size, embedding_dim)
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        total_loss = 0
        for center, context in data:
            negative_samples = random.choices(list(range(vocab_size)), k=neg_samples)

            # Convert to tensors
            center_tensor = torch.tensor(center, dtype=torch.long).unsqueeze(0)
            context_tensor = torch.tensor(context, dtype=torch.long).unsqueeze(0)
            negative_tensor = torch.tensor(negative_samples, dtype=torch.long).unsqueeze(0)

            # Zero gradients
            optimizer.zero_grad()

            # Compute loss and backpropagate
            loss = model(center_tensor, context_tensor, negative_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")
    
    return model

In [132]:
sg_model = train_skipgram(training_data, limited_vocab_size)

Epoch 1, Loss: 199999.3792
Epoch 2, Loss: 106418.1202
Epoch 3, Loss: 83332.6858
Epoch 4, Loss: 74421.3169
Epoch 5, Loss: 70855.0928
Epoch 6, Loss: 68358.0562
Epoch 7, Loss: 67068.0173
Epoch 8, Loss: 66072.9121
Epoch 9, Loss: 65542.6928
Epoch 10, Loss: 64900.8895


In [135]:
word_embedded = sg_model.get_word_emdedding(word_to_index['អង្គរវត្ត'])

In [136]:
print(word_embedded)

tensor([ 2.6869e-01, -8.0715e-01,  8.8348e-02, -1.5651e-02, -3.0536e-01,
        -4.5655e-01, -4.0824e-01, -3.7830e-01, -3.5524e-01, -3.0825e-01,
        -3.5478e-01,  1.5774e-01,  2.9116e-02,  3.4075e-04,  1.5851e-01,
        -2.0537e-01,  2.1613e-01,  8.4399e-01, -3.7808e-01, -2.4047e-01,
         3.3064e-01, -2.8880e-01, -5.3783e-02,  3.5023e-01,  6.1740e-01,
         2.9699e-01, -2.7800e-01,  3.6858e-01,  5.9942e-01, -4.4517e-01,
         6.4570e-01,  7.7118e-01,  1.8923e-01, -1.6945e-01, -1.5987e-01,
        -3.6847e-01, -4.2477e-01, -5.0249e-01, -3.0586e-01, -3.1183e-01,
         4.1797e-01, -2.2914e-01, -6.6145e-02,  2.6570e-01, -9.5747e-01,
         4.7499e-01,  3.9335e-03,  3.6768e-02,  5.3335e-01,  3.0709e-01],
       grad_fn=<EmbeddingBackward0>)


### Skip-Gram Model 2 - Simple using tensorflow

In [23]:
def generate_batch(training_data, vocab_size, batch_size, negative_samples):
    total_positive_samples = 0
    total_negative_samples = 0
    while True:
        x, y = [], []
        for _ in range(batch_size):
            target_word, context_word = training_data[
                np.random.randint(0, len(training_data))
            ]
            x.append(target_word)
            y.append(context_word)
            total_positive_samples += 1
            for _ in range(negative_samples):
                negative_word = np.random.randint(0, vocab_size)
                x.append(target_word)
                y.append(negative_word)
                total_negative_samples += 1
        # print(f"Total positive samples: {total_positive_samples}")
        # print(f"Total negative samples: {total_negative_samples}")
        yield np.array(x), np.array(y)

In [24]:
next(generate_batch(training_data, len(limited_vocabs), BATCH_SIZE, NEGATIVE_SAMPLES))

(array([153, 153, 153,  20,  20,  20,  62,  62,  62,  40,  40,  40,   7,
          7,   7,  38,  38,  38, 132, 132, 132,   8,   8,   8,  25,  25,
         25,  27,  27,  27,  75,  75,  75,  72,  72,  72,  79,  79,  79,
        173, 173, 173,  97,  97,  97, 115, 115, 115, 169, 169, 169, 171,
        171, 171, 162, 162, 162, 125, 125, 125,  59,  59,  59,   8,   8,
          8, 165, 165, 165, 101, 101, 101,  93,  93,  93,  55,  55,  55,
        113, 113, 113, 165, 165, 165,  57,  57,  57,  72,  72,  72,   7,
          7,   7,  57,  57,  57]),
 array([ 99, 126,   9, 125,  23, 129,  84, 141, 104,  79, 107,  71,  90,
        111,   5, 170,  71, 106,  57, 147, 164, 164,  37,  43,   8,  60,
         90,  81, 156,  40,   8,  69, 122,  10,  71, 121,  55, 168,  28,
          8,  19, 102, 110,  72,  37,   8,  48, 173,  43,  46, 161, 170,
        136, 115,   7, 156,  54,  66,  49,  78, 168, 101,  39, 171,  75,
          6, 162,   5,  23,  15,  58, 101,   9,  41,  28,  99,  10,  70,
        123, 102

In [25]:
steps_per_epoch = len(training_data) // BATCH_SIZE
steps_per_epoch

1310

In [26]:
sg_model2 = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(len(limited_vocabs), EMBEDDING_DIMENSION, input_length=1),
        tf.keras.layers.Dense(len(limited_vocabs), activation="softmax"),
    ]
)
sg_model2.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

In [27]:
sg_model2.fit(
    generate_batch(training_data, len(limited_vocabs), BATCH_SIZE, NEGATIVE_SAMPLES),
    steps_per_epoch=steps_per_epoch,
    epochs=30
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7fe0c2f26850>

In [30]:
# Extract the word embeddings
embeddings2 = sg_model2.layers[0].get_weights()[0]
embeddings2

array([[-0.11182633,  0.06225012,  0.03999365, ...,  0.09716614,
        -0.22655903, -0.31317216],
       [-0.15385886, -0.16094327, -0.18639304, ...,  0.41144848,
        -0.27976117,  0.04344371],
       [-0.03213493, -0.11641014, -0.01208379, ..., -0.4148307 ,
         0.10783792,  0.03114374],
       ...,
       [ 0.08485515,  0.20776129,  0.27263108, ..., -0.09409647,
        -0.20459221,  0.13095443],
       [ 0.01761165, -0.06932025, -0.10412469, ...,  0.1424022 ,
         0.07342461,  0.01953124],
       [ 0.01401601,  0.00989925, -0.02110025, ..., -0.04379189,
        -0.01745558, -0.02654355]], dtype=float32)

In [31]:
# Save the embeddings
np.save("I_embeddings2.npy", embeddings2)

### Skip Game Model 3 - Custom class using tensorflow

In [33]:
import random

def get_negative_samples(vocab_size, positive_samples, num_negative):
    """Generate negative samples."""
    negatives = []
    for _ in range(len(positive_samples)):
        negative_samples = random.choices(range(vocab_size), k=num_negative)
        negatives.append(negative_samples)
    return negatives

In [28]:
class SkipGramModel3(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel3, self).__init__()
        self.target_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=1, name="target_embedding")
        self.context_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=1, name="context_embedding")

    def call(self, inputs):
        target, context, negative_samples = inputs
        target_embed = self.target_embedding(target)
        context_embed = self.context_embedding(context)
        negative_embed = self.context_embedding(negative_samples)

        pos_similarity = tf.reduce_sum(target_embed * context_embed, axis=-1)
        neg_similarity = tf.reduce_sum(target_embed[:, None, :] * negative_embed, axis=-1)

        pos_loss = -tf.math.log(tf.keras.activations.sigmoid(pos_similarity))
        neg_loss = -tf.reduce_sum(tf.math.log(tf.keras.activations.sigmoid(-neg_similarity)), axis=-1)

        return tf.reduce_mean(pos_loss + neg_loss)


In [36]:
def train_skipgram_model3(skipgram_pairs, vocab_size, embedding_dim=50, neg_samples=2, epochs=20, batch_size=256, learning_rate=0.01):
    # Build the Skip-gram model
    model = SkipGramModel3(vocab_size, embedding_dim)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    
    targets, contexts = zip(*skipgram_pairs)

    # Generate negative samples
    negative_samples = get_negative_samples(vocab_size, skipgram_pairs, neg_samples)

    # Convert lists to NumPy arrays
    targets = np.array(targets, dtype=np.int32)
    contexts = np.array(contexts, dtype=np.int32)
    negative_samples = np.array(negative_samples, dtype=np.int32)

    # Ensure `negative_samples` has the shape (num_pairs, k) for proper batching
    negative_samples = np.reshape(negative_samples, (len(targets), -1))

    # Prepare the dataset
    dataset = tf.data.Dataset.from_tensor_slices((targets, contexts, negative_samples))
    dataset = dataset.shuffle(buffer_size=10000).batch(batch_size, drop_remainder=True)

    # Training loop
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataset:
            target_batch, context_batch, negative_batch = batch

            with tf.GradientTape() as tape:
                loss = model([target_batch, context_batch, negative_batch])

            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            total_loss += loss.numpy()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    return model

In [43]:
sg_model3 = train_skipgram_model3(training_data2, len(limited_vocabs), learning_rate=0.01, epochs=30)

Epoch 1/30, Loss: 279.6546
Epoch 2/30, Loss: 259.9659
Epoch 3/30, Loss: 252.6989
Epoch 4/30, Loss: 247.5729
Epoch 5/30, Loss: 243.6836
Epoch 6/30, Loss: 240.7338
Epoch 7/30, Loss: 238.6875
Epoch 8/30, Loss: 237.0988
Epoch 9/30, Loss: 235.6580
Epoch 10/30, Loss: 234.7808
Epoch 11/30, Loss: 233.8327
Epoch 12/30, Loss: 233.2568
Epoch 13/30, Loss: 232.6561
Epoch 14/30, Loss: 232.1166
Epoch 15/30, Loss: 231.7462
Epoch 16/30, Loss: 231.3378
Epoch 17/30, Loss: 231.2093
Epoch 18/30, Loss: 230.7712
Epoch 19/30, Loss: 230.5703
Epoch 20/30, Loss: 230.3621
Epoch 21/30, Loss: 230.0591
Epoch 22/30, Loss: 229.8899
Epoch 23/30, Loss: 229.8050
Epoch 24/30, Loss: 229.4494
Epoch 25/30, Loss: 229.2482
Epoch 26/30, Loss: 229.2772
Epoch 27/30, Loss: 229.2135
Epoch 28/30, Loss: 229.0219
Epoch 29/30, Loss: 228.9130
Epoch 30/30, Loss: 228.7991


In [45]:
embedding_layer = sg_model3.get_layer("target_embedding")
embeddings3 = embedding_layer.get_weights()[0]
embeddings3

array([[ 0.09214092,  0.2639317 , -0.01208984, ..., -0.25854397,
         0.00940623, -0.15930103],
       [-0.12530985, -0.4615725 , -0.01678889, ...,  0.1533431 ,
         0.119219  , -0.02646273],
       [-0.81657416, -1.0885906 ,  0.78610325, ...,  0.13089012,
         0.55973065,  0.23671834],
       ...,
       [ 0.38961536, -1.7982954 , -1.3742402 , ..., -0.10934696,
         0.03096619,  0.93433195],
       [ 0.14754331, -0.01242159,  0.03215573, ...,  1.2177708 ,
        -0.04930114,  0.18251729],
       [-0.18653333,  1.574383  ,  0.42919046, ..., -0.15763019,
         0.12777998, -0.10498662]], dtype=float32)

In [46]:
# Save the embeddings
np.save("embeddings3.npy", embeddings3)

### Skip-Gram Model 4

In [18]:
import numpy as np

def skipgram(tokenized, vocab_size, window_size=4, negative_samples=2):
    """
    Generate skip-gram pairs with positive and negative examples manually.

    Args:
    - tokenized: List of word indices (tokenized corpus).
    - vocab_size: Size of the vocabulary.
    - window_size: Size of the context window (± window_size).
    - negative_samples: Number of negative samples to generate per positive pair.

    Returns:
    - targets: Target words (center words).
    - contexts: Context words (positive and negative examples).
    - labels: Labels for training (1 for positive pairs, 0 for negative pairs).
    """
    targets, contexts, labels = [], [], []

    for i, target in enumerate(tokenized):
        # Define context window boundaries
        start = max(i - window_size, 0)
        end = min(i + window_size + 1, len(tokenized))

        # Positive examples: Words within the context window
        positive_context = [tokenized[j] for j in range(start, end) if j != i]
        for context in positive_context:
            targets.append(target)
            contexts.append(context)
            labels.append(1)  # Positive pair

        # Negative sampling: Randomly sample words outside the context
        negative_context = np.random.choice(
            vocab_size, 
            size=negative_samples, 
            replace=False
        )
        for neg_context in negative_context:
            targets.append(target)
            contexts.append(neg_context)
            labels.append(0)  # Negative pair

    return np.array(targets), np.array(contexts), np.array(labels)

In [19]:
# Generate Skip-gram Data
targets, contexts, labels = skipgram(indices, len(limited_vocabs), window_size=CONTEXT_WINDOW, negative_samples=NEGATIVE_SAMPLES)
targets[:2], contexts[:2], labels[:2]

(array([22, 22]), array([50, 88]), array([1, 1]))

In [17]:
def build_skipgram_model(vocab_size, embedding_dim):
    """Build a skip-gram model using negative sampling."""
    # Inputs for target and context words
    target_input = tf.keras.Input(shape=(1,), name="target_input")
    context_input = tf.keras.Input(shape=(1,), name="context_input")
    
    # Embedding layers for target and context
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=vocab_size, 
        output_dim=embedding_dim, 
        input_length=1, 
        name="embedding"
    )
    
    target_embedding = embedding_layer(target_input)
    context_embedding = embedding_layer(context_input)
    
    # Compute dot product
    dot_product = tf.reduce_sum(target_embedding * context_embedding, axis=-1)
    output = tf.keras.layers.Activation("sigmoid")(dot_product)
    
    # Build and compile model
    model = tf.keras.Model(inputs=[target_input, context_input], outputs=output)
    model.compile(optimizer="adam", loss="binary_crossentropy")
    return model

In [18]:
model = build_skipgram_model(len(limited_vocabs), EMBEDDING_DIMENSION)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 target_input (InputLayer)   [(None, 1)]                  0         []                            
                                                                                                  
 context_input (InputLayer)  [(None, 1)]                  0         []                            
                                                                                                  
 embedding (Embedding)       (None, 1, 50)                8750      ['target_input[0][0]',        
                                                                     'context_input[0][0]']       
                                                                                                  
 tf.math.multiply (TFOpLamb  (None, 1, 50)                0         ['embedding[0][0]',       

In [19]:
epochs = 20
batch_size = 32
# Train the Model
history = model.fit(
    [targets, contexts],
    labels,
    epochs=epochs,
    batch_size=batch_size
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
# Extract Embeddings
embeddings4 = model.get_layer("embedding").get_weights()[0]
embeddings4

array([[ 0.17700568,  0.167575  , -0.4565982 , ..., -0.3633192 ,
         1.3828644 ,  0.25987756],
       [-0.52396274,  0.10379477, -0.3350835 , ..., -0.21254343,
         0.18124491, -0.00421848],
       [ 0.68996924, -0.31943733,  0.23552766, ...,  0.40455407,
         0.17505044, -0.34202713],
       ...,
       [ 0.27136925,  0.39427167,  0.01233269, ..., -0.446789  ,
         0.12509896,  0.54302114],
       [-0.41140708,  0.17523365, -0.01412673, ..., -0.38838515,
         0.4091903 , -0.31206805],
       [ 0.17653717, -0.24588977, -0.43444896, ...,  0.04948792,
        -0.01961078, -0.11755933]], dtype=float32)

In [21]:
# Loop through the embeddings and map to words
word_to_embedding4 = {}
for i, word in index_to_word.items():
    word_to_embedding4[word] = embeddings4[i]

In [22]:
word_to_embedding4

{'<UNK>': array([ 0.17700568,  0.167575  , -0.4565982 , -0.12468477, -0.16406532,
        -0.2894348 ,  0.10196926, -0.16389488,  0.11904307,  0.2527621 ,
        -0.5365126 ,  0.14401014,  0.09015285, -0.07594073, -0.20686638,
         0.14518945,  0.18605575,  0.12100258,  0.12381797,  0.24827866,
        -0.1160071 , -0.21443748, -0.15278058, -0.17781441, -1.0892442 ,
        -0.12274525,  0.19193815, -0.13220717,  0.14577834, -0.4401892 ,
         0.23904003,  0.37154558, -0.15388237, -0.24147114,  0.35301903,
         0.1357985 , -0.09185273,  0.3691543 , -0.11072241,  0.28609943,
        -0.18654402, -0.2582477 , -0.14672914, -0.1396053 ,  0.10261047,
        -0.08844615, -0.41337252, -0.3633192 ,  1.3828644 ,  0.25987756],
       dtype=float32),
 'ឬ': array([-0.52396274,  0.10379477, -0.3350835 , -0.51454204,  0.1473687 ,
         0.10425425, -0.8012706 , -0.2076964 , -0.24390541, -0.03804474,
        -0.23433842, -1.0017115 ,  0.60725194, -0.19806425, -0.04846774,
        -0.39

In [23]:
np.save("I_word_to_embedding4.npy", word_to_embedding4)

## Read trained embeddings

In [27]:
# file_embbedings = "I_embeddings2.npy" # Change as you prefer
# file_word_to_embeddings = "I_word_to_embeddings2.npy" # Change as you prefer

file_word_to_embeddings = "I_word_to_embedding4.npy" # Change as you prefer

In [28]:
word_to_embeddings = np.load(file_word_to_embeddings, allow_pickle=True).item()
word_to_embeddings

{'<UNK>': array([ 0.17700568,  0.167575  , -0.4565982 , -0.12468477, -0.16406532,
        -0.2894348 ,  0.10196926, -0.16389488,  0.11904307,  0.2527621 ,
        -0.5365126 ,  0.14401014,  0.09015285, -0.07594073, -0.20686638,
         0.14518945,  0.18605575,  0.12100258,  0.12381797,  0.24827866,
        -0.1160071 , -0.21443748, -0.15278058, -0.17781441, -1.0892442 ,
        -0.12274525,  0.19193815, -0.13220717,  0.14577834, -0.4401892 ,
         0.23904003,  0.37154558, -0.15388237, -0.24147114,  0.35301903,
         0.1357985 , -0.09185273,  0.3691543 , -0.11072241,  0.28609943,
        -0.18654402, -0.2582477 , -0.14672914, -0.1396053 ,  0.10261047,
        -0.08844615, -0.41337252, -0.3633192 ,  1.3828644 ,  0.25987756],
       dtype=float32),
 'ឬ': array([-0.52396274,  0.10379477, -0.3350835 , -0.51454204,  0.1473687 ,
         0.10425425, -0.8012706 , -0.2076964 , -0.24390541, -0.03804474,
        -0.23433842, -1.0017115 ,  0.60725194, -0.19806425, -0.04846774,
        -0.39