# Mini Project 3 - I. Word Embeddings

In [4]:
EMBEDDING_DIMENSION = 50
CONTEXT_WINDOW = 4
NEGATIVE_SAMPLES = 2
BATCH_SIZE = 32

In [5]:
from khmernltk import word_tokenize
from collections import Counter
import numpy as np
import tensorflow as tf

2025-01-27 15:33:03.013418: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Scratch: Read corpus and tokenizing

In [47]:
# Load and preprocess text
with open("temples.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [48]:
unclean_tokens = word_tokenize(text)
print(f"number of tokens: {len(unclean_tokens)}")
token_counters = Counter(unclean_tokens)
print(f"Original vocab size: {len(token_counters)}")

number of tokens: 11521
Original vocab size: 2276


### Clean the tokens

In [76]:
import re

cleaned_vocabs = Counter()

zero_width_space = '\u200B' # zero width space from Internet
zero_width_space2 = '\u200C' # zero space from experiment below
ignores = [" ","-", "—", ":", "(", ")", "។", "[", "]", "ៗ", "៕", "៛", "៚", "៙", "៘", "៖", "\"",""]
ignores.append(zero_width_space)
ignores.append(zero_width_space2)

for token, count in token_counters.items():
    if token.strip() and token not in ignores and not re.match(r"^\[.*$", token) and not re.match(r".*\]$", token) and not re.match(r"^\(.*$", token) and not re.match(r".*\)$", token) and not re.match(r"^\d+$", token) and not token.isascii():
        cleaned_vocabs[token] = count

print(f"Clean vocabs: {len(cleaned_vocabs)}")

Clean vocabs: 2091


In [74]:
def get_unicode_code_points(token):
    """
    Get the Unicode code points of each character in the token.
    """
    return [ord(char) for char in token]

def format_code_points_as_hex(code_points):
    """
    Format the Unicode code points as hexadecimal strings.
    """
    return [f"U+{code_point:04X}" for code_point in code_points]

In [77]:
with open("cleaned_vocabs.txt", "w", encoding="utf-8") as file:
    row = []
    for token, count in cleaned_vocabs.items():
        # code_points = get_unicode_code_points(token)
        # hex_code_points = format_code_points_as_hex(code_points)
        row.append(f"{token} {count}")

    file.write("\n".join(row))

In [79]:
cleaned_tokens = []
for token in unclean_tokens:
    if token in cleaned_vocabs:
        cleaned_tokens.append(token)

print(f"number of cleaned tokens: {len(cleaned_tokens)}")
print(f"Cleaned tokens: {cleaned_tokens[:10]}")

number of cleaned tokens: 9086
Cleaned tokens: ['ប្រាសាទ', 'អង្គរវត្ត', 'ឬ', 'ប្រាសាទ', 'អង្គរ', 'តូច', 'មាន', 'ទីតាំង', 'ស្ថិត', 'នៅ']


In [80]:
with open("cleaned_tokens.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(cleaned_tokens))

## Alternative: Read Cleaned tokens

In [8]:
# Read cleaned tokens
with open("cleaned_tokens.txt", "r", encoding="utf-8") as file:
    cleaned_tokens = file.read().split()

In [9]:
print(f"number of cleaned tokens: {len(cleaned_tokens)}")
print(f"Cleaned tokens: {cleaned_tokens[:10]}")

number of cleaned tokens: 9086
Cleaned tokens: ['ប្រាសាទ', 'អង្គរវត្ត', 'ឬ', 'ប្រាសាទ', 'អង្គរ', 'តូច', 'មាន', 'ទីតាំង', 'ស្ថិត', 'នៅ']


In [10]:
vocabs_counter = Counter(cleaned_tokens)
print(f"vocab size: {len(vocabs_counter)}")

vocab size: 2091


In [11]:
print(vocabs_counter['ទីតាំង'])

7


## Skip-gram model
Given a Khmer text corpus “temples.txt” extracted from 3 Wikipedia pages1, build a skip-gram
model/classifier to find representation/embedding of each Khmer word in the corpus. Use the
following settings for your implementation:
- The word embedding has a dimension of 50
- For the skip-gram model, use context window $L$ = ±4 and negative sampling with $k$ = 2
- To tokenize word, you can use Khmer nltk2
- Your vocabulary should contain meaningful and frequent words so:
    - Words whose frequency is less than 10 are ignored
    - Spaces are considered a stop word and are also ignored

### Preparing dataset

#### Clean vocabs and less frequency words  (<10)

In [12]:
FREQURNCY_THRESHOLD = 10
limited_vocabs = set()
for token, count in vocabs_counter.items():
    if count >= FREQURNCY_THRESHOLD:
        limited_vocabs.add(token)
    else:
        limited_vocabs.add("<UNK>")

limited_vocabs = list(limited_vocabs)
print(f"vocab size after limiting: {len(limited_vocabs)}")
limited_vocabs

vocab size after limiting: 175


['ថែវ',
 'ប្រមាណ',
 'ប៉ុន្តែ',
 'ប្រជាជន',
 'រាង',
 'ក្បាច់',
 'ចំនួន',
 'និង',
 'ដែល',
 'យ៉ាង',
 'បង្ហាញ',
 'សំណង់',
 'ជាង',
 'មកពី',
 'មួយ',
 'ក៏',
 'ទិស',
 'ដូចជា',
 'ប្រទេស',
 'ថ្ងៃ',
 'ឆ្នាំ',
 'ខាងត្បូង',
 'ចម្លាក់',
 'នូវ',
 'អោយ',
 'ត្រូវបាន',
 'ឈើ',
 'សៀមរាប',
 'ស្ថិត',
 'ស្រុក',
 'នគរ',
 'ទី២',
 'ចំ',
 'ភ្នំ',
 'មិន',
 'អាច',
 'កណ្តាល',
 'តូច',
 'មាន',
 'ន័យ',
 'តំបន់',
 'ចុង',
 'បី',
 'នីមួយ',
 'ច្រើន',
 'ធ្វើ',
 'ព័ទ្ធ',
 'បុរៈ',
 'ទីក្រុង',
 'កន្លែង',
 'ធំ',
 'ចូល',
 'អង្គរ',
 'ស្រាល',
 'ព្រះបាទ',
 'អង្គរវត្ត',
 'កម្ពុជា',
 'ឈ្មោះ',
 'ប្រើ',
 'ប្រាសាទ',
 'ដំបូង',
 'កំពែង',
 'ខាងក្នុង',
 'តំណាង',
 'ម៉ែត្រ',
 'សម័យ',
 'គេ',
 'ដល់',
 'ឬ',
 'ក្រុម',
 'រចនាបថ',
 'ផ្នែក',
 'កណ្ដាល',
 'ខេត្ត',
 'សម្រាប់',
 'លៀន',
 'នានា',
 'នោះ',
 'មក',
 'នៅ',
 'ដី',
 'ផ្សេង',
 'ជាច្រើន',
 'ដោយ',
 'ពួក',
 'ពេល',
 'ប្រហែល',
 'វិញ',
 'វរ្ម័ន',
 'ផ្លូវ',
 'បាន',
 'រួម',
 'រឺ',
 'ដើម',
 'ជាប់',
 'លក្ខណៈ',
 'ក្នុង',
 'គ្នា',
 'ប៉ម',
 'ភាគ',
 'ភក់',
 'របស់',
 'បុរាណ',
 'អ្នក',
 'បារាំង',
 'ពាក្យ',
 'គោ

#### Word2Number

In [13]:
# Word to index and index to word
word_to_index = {word: i for i, word in enumerate(limited_vocabs)}
index_to_word = {i: word for word, i in word_to_index.items()}

In [14]:
limited_tokens = []
indices = []
for token in cleaned_tokens:
    if token in word_to_index:
        limited_tokens.append(token)
        indices.append(word_to_index[token])

In [15]:
print(len(word_to_index), word_to_index)
print(index_to_word)
print(len(limited_tokens), limited_tokens)
print(len(indices), indices)

175 {'ថែវ': 0, 'ប្រមាណ': 1, 'ប៉ុន្តែ': 2, 'ប្រជាជន': 3, 'រាង': 4, 'ក្បាច់': 5, 'ចំនួន': 6, 'និង': 7, 'ដែល': 8, 'យ៉ាង': 9, 'បង្ហាញ': 10, 'សំណង់': 11, 'ជាង': 12, 'មកពី': 13, 'មួយ': 14, 'ក៏': 15, 'ទិស': 16, 'ដូចជា': 17, 'ប្រទេស': 18, 'ថ្ងៃ': 19, 'ឆ្នាំ': 20, 'ខាងត្បូង': 21, 'ចម្លាក់': 22, 'នូវ': 23, 'អោយ': 24, 'ត្រូវបាន': 25, 'ឈើ': 26, 'សៀមរាប': 27, 'ស្ថិត': 28, 'ស្រុក': 29, 'នគរ': 30, 'ទី២': 31, 'ចំ': 32, 'ភ្នំ': 33, 'មិន': 34, 'អាច': 35, 'កណ្តាល': 36, 'តូច': 37, 'មាន': 38, 'ន័យ': 39, 'តំបន់': 40, 'ចុង': 41, 'បី': 42, 'នីមួយ': 43, 'ច្រើន': 44, 'ធ្វើ': 45, 'ព័ទ្ធ': 46, 'បុរៈ': 47, 'ទីក្រុង': 48, 'កន្លែង': 49, 'ធំ': 50, 'ចូល': 51, 'អង្គរ': 52, 'ស្រាល': 53, 'ព្រះបាទ': 54, 'អង្គរវត្ត': 55, 'កម្ពុជា': 56, 'ឈ្មោះ': 57, 'ប្រើ': 58, 'ប្រាសាទ': 59, 'ដំបូង': 60, 'កំពែង': 61, 'ខាងក្នុង': 62, 'តំណាង': 63, 'ម៉ែត្រ': 64, 'សម័យ': 65, 'គេ': 66, 'ដល់': 67, 'ឬ': 68, 'ក្រុម': 69, 'រចនាបថ': 70, 'ផ្នែក': 71, 'កណ្ដាល': 72, 'ខេត្ត': 73, 'សម្រាប់': 74, 'លៀន': 75, 'នានា': 76, 'នោះ': 77, 'មក': 78, 'នៅ': 79, 'ដី':

In [16]:
np.save("I_word_to_index.npy", word_to_index)
np.save("I_indices.npy", np.array(indices))

with open("limited_tokens.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(limited_tokens))

#### Create dataset

In [17]:
def generate_skipgram_data1(token_indices, window_size):
    """Generate skip-gram pairs."""
    data = []
    for center_idx in range(len(token_indices)):
        for context_idx in range(-window_size, window_size + 1):
            if context_idx == 0 or center_idx + context_idx < 0 or center_idx + context_idx >= len(token_indices):
                continue
            data.append((token_indices[center_idx], token_indices[center_idx + context_idx]))
    return data

In [18]:
def generate_skipgram_data2(token_indices, window_size):
    """Generate skip-gram pairs."""
    skipgram_pairs = []
    for center_idx, center_word in enumerate(token_indices):
        context_range = range(max(center_idx - window_size, 0), 
                                min(center_idx + window_size + 1, len(token_indices)))
        for context_idx in context_range:
            if center_idx != context_idx:
                skipgram_pairs.append((center_word, token_indices[context_idx]))
    return skipgram_pairs

In [19]:
training_data = generate_skipgram_data1(indices, CONTEXT_WINDOW)

In [20]:
print(f"Number of training data: {len(training_data)}")
print(f"Training data: {training_data[:10]}")

Number of training data: 41940
Training data: [(59, 55), (59, 68), (59, 59), (59, 52), (55, 59), (55, 68), (55, 59), (55, 52), (55, 37), (68, 59)]


In [21]:
training_data2 = generate_skipgram_data2(indices, CONTEXT_WINDOW)

In [22]:
print(f"Number of training data: {len(training_data2)}")
print(f"Training data: {training_data2[:10]}")

Number of training data: 41940
Training data: [(59, 55), (59, 68), (59, 59), (59, 52), (55, 59), (55, 68), (55, 59), (55, 52), (55, 37), (68, 59)]


### Skip-gram Model 1 - Using pytorch

In [129]:
import torch
import torch.nn as nn

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super(SkipGramModel, self).__init__()
        self.in_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.out_embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_word, context_word, negative_samples):
        # Embeddings for the center, context, and negative samples
        center_embeds = self.in_embedding(center_word)  # (batch_size, embedding_dim)
        context_embeds = self.out_embedding(context_word)  # (batch_size, embedding_dim)
        negative_embeds = self.out_embedding(negative_samples)  # (batch_size, k, embedding_dim)

        # Positive score (center and context)
        pos_score = torch.sum(center_embeds * context_embeds, dim=1)  # (batch_size)
        pos_loss = -torch.log(torch.sigmoid(pos_score))  # (batch_size)

        # Negative score (center and negative samples)
        neg_score = torch.bmm(negative_embeds, center_embeds.unsqueeze(2)).squeeze(2)  # (batch_size, k)
        neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_score)), dim=1)  # (batch_size)

        return torch.mean(pos_loss + neg_loss)
    
    def get_word_emdedding(self, word_idx):
        return self.in_embedding(torch.tensor(word_idx, dtype=torch.long))

In [131]:
import torch.optim as optim
import random

def train_skipgram(data, vocab_size: int, embedding_dim=50, neg_samples=2, epochs=10, lr=0.01):
    model = SkipGramModel(vocab_size, embedding_dim)
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        total_loss = 0
        for center, context in data:
            negative_samples = random.choices(list(range(vocab_size)), k=neg_samples)

            # Convert to tensors
            center_tensor = torch.tensor(center, dtype=torch.long).unsqueeze(0)
            context_tensor = torch.tensor(context, dtype=torch.long).unsqueeze(0)
            negative_tensor = torch.tensor(negative_samples, dtype=torch.long).unsqueeze(0)

            # Zero gradients
            optimizer.zero_grad()

            # Compute loss and backpropagate
            loss = model(center_tensor, context_tensor, negative_tensor)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")
    
    return model

In [132]:
sg_model = train_skipgram(training_data, limited_vocab_size)

Epoch 1, Loss: 199999.3792
Epoch 2, Loss: 106418.1202
Epoch 3, Loss: 83332.6858
Epoch 4, Loss: 74421.3169
Epoch 5, Loss: 70855.0928
Epoch 6, Loss: 68358.0562
Epoch 7, Loss: 67068.0173
Epoch 8, Loss: 66072.9121
Epoch 9, Loss: 65542.6928
Epoch 10, Loss: 64900.8895


In [135]:
word_embedded = sg_model.get_word_emdedding(word_to_index['អង្គរវត្ត'])

In [136]:
print(word_embedded)

tensor([ 2.6869e-01, -8.0715e-01,  8.8348e-02, -1.5651e-02, -3.0536e-01,
        -4.5655e-01, -4.0824e-01, -3.7830e-01, -3.5524e-01, -3.0825e-01,
        -3.5478e-01,  1.5774e-01,  2.9116e-02,  3.4075e-04,  1.5851e-01,
        -2.0537e-01,  2.1613e-01,  8.4399e-01, -3.7808e-01, -2.4047e-01,
         3.3064e-01, -2.8880e-01, -5.3783e-02,  3.5023e-01,  6.1740e-01,
         2.9699e-01, -2.7800e-01,  3.6858e-01,  5.9942e-01, -4.4517e-01,
         6.4570e-01,  7.7118e-01,  1.8923e-01, -1.6945e-01, -1.5987e-01,
        -3.6847e-01, -4.2477e-01, -5.0249e-01, -3.0586e-01, -3.1183e-01,
         4.1797e-01, -2.2914e-01, -6.6145e-02,  2.6570e-01, -9.5747e-01,
         4.7499e-01,  3.9335e-03,  3.6768e-02,  5.3335e-01,  3.0709e-01],
       grad_fn=<EmbeddingBackward0>)


### Skip-Gram Model 2 - Simple using tensorflow

In [23]:
def generate_batch(training_data, vocab_size, batch_size, negative_samples):
    total_positive_samples = 0
    total_negative_samples = 0
    while True:
        x, y = [], []
        for _ in range(batch_size):
            target_word, context_word = training_data[
                np.random.randint(0, len(training_data))
            ]
            x.append(target_word)
            y.append(context_word)
            total_positive_samples += 1
            for _ in range(negative_samples):
                negative_word = np.random.randint(0, vocab_size)
                x.append(target_word)
                y.append(negative_word)
                total_negative_samples += 1
        # print(f"Total positive samples: {total_positive_samples}")
        # print(f"Total negative samples: {total_negative_samples}")
        yield np.array(x), np.array(y)

In [24]:
next(generate_batch(training_data, len(limited_vocabs), BATCH_SIZE, NEGATIVE_SAMPLES))

(array([153, 153, 153,  20,  20,  20,  62,  62,  62,  40,  40,  40,   7,
          7,   7,  38,  38,  38, 132, 132, 132,   8,   8,   8,  25,  25,
         25,  27,  27,  27,  75,  75,  75,  72,  72,  72,  79,  79,  79,
        173, 173, 173,  97,  97,  97, 115, 115, 115, 169, 169, 169, 171,
        171, 171, 162, 162, 162, 125, 125, 125,  59,  59,  59,   8,   8,
          8, 165, 165, 165, 101, 101, 101,  93,  93,  93,  55,  55,  55,
        113, 113, 113, 165, 165, 165,  57,  57,  57,  72,  72,  72,   7,
          7,   7,  57,  57,  57]),
 array([ 99, 126,   9, 125,  23, 129,  84, 141, 104,  79, 107,  71,  90,
        111,   5, 170,  71, 106,  57, 147, 164, 164,  37,  43,   8,  60,
         90,  81, 156,  40,   8,  69, 122,  10,  71, 121,  55, 168,  28,
          8,  19, 102, 110,  72,  37,   8,  48, 173,  43,  46, 161, 170,
        136, 115,   7, 156,  54,  66,  49,  78, 168, 101,  39, 171,  75,
          6, 162,   5,  23,  15,  58, 101,   9,  41,  28,  99,  10,  70,
        123, 102

In [25]:
steps_per_epoch = len(training_data) // BATCH_SIZE
steps_per_epoch

1310

In [26]:
sg_model2 = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(len(limited_vocabs), EMBEDDING_DIMENSION, input_length=1),
        tf.keras.layers.Dense(len(limited_vocabs), activation="softmax"),
    ]
)
sg_model2.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

In [27]:
sg_model2.fit(
    generate_batch(training_data, len(limited_vocabs), BATCH_SIZE, NEGATIVE_SAMPLES),
    steps_per_epoch=steps_per_epoch,
    epochs=30
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7fe0c2f26850>

In [30]:
# Extract the word embeddings
embeddings2 = sg_model2.layers[0].get_weights()[0]
embeddings2

array([[-0.11182633,  0.06225012,  0.03999365, ...,  0.09716614,
        -0.22655903, -0.31317216],
       [-0.15385886, -0.16094327, -0.18639304, ...,  0.41144848,
        -0.27976117,  0.04344371],
       [-0.03213493, -0.11641014, -0.01208379, ..., -0.4148307 ,
         0.10783792,  0.03114374],
       ...,
       [ 0.08485515,  0.20776129,  0.27263108, ..., -0.09409647,
        -0.20459221,  0.13095443],
       [ 0.01761165, -0.06932025, -0.10412469, ...,  0.1424022 ,
         0.07342461,  0.01953124],
       [ 0.01401601,  0.00989925, -0.02110025, ..., -0.04379189,
        -0.01745558, -0.02654355]], dtype=float32)

In [31]:
# Save the embeddings
np.save("I_embeddings2.npy", embeddings2)

### Skip Game Model 3 - Custom class using tensorflow

In [33]:
import random

def get_negative_samples(vocab_size, positive_samples, num_negative):
    """Generate negative samples."""
    negatives = []
    for _ in range(len(positive_samples)):
        negative_samples = random.choices(range(vocab_size), k=num_negative)
        negatives.append(negative_samples)
    return negatives

In [28]:
class SkipGramModel3(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel3, self).__init__()
        self.target_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=1, name="target_embedding")
        self.context_embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=1, name="context_embedding")

    def call(self, inputs):
        target, context, negative_samples = inputs
        target_embed = self.target_embedding(target)
        context_embed = self.context_embedding(context)
        negative_embed = self.context_embedding(negative_samples)

        pos_similarity = tf.reduce_sum(target_embed * context_embed, axis=-1)
        neg_similarity = tf.reduce_sum(target_embed[:, None, :] * negative_embed, axis=-1)

        pos_loss = -tf.math.log(tf.keras.activations.sigmoid(pos_similarity))
        neg_loss = -tf.reduce_sum(tf.math.log(tf.keras.activations.sigmoid(-neg_similarity)), axis=-1)

        return tf.reduce_mean(pos_loss + neg_loss)


In [36]:
def train_skipgram_model3(skipgram_pairs, vocab_size, embedding_dim=50, neg_samples=2, epochs=20, batch_size=256, learning_rate=0.01):
    # Build the Skip-gram model
    model = SkipGramModel3(vocab_size, embedding_dim)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    
    targets, contexts = zip(*skipgram_pairs)

    # Generate negative samples
    negative_samples = get_negative_samples(vocab_size, skipgram_pairs, neg_samples)

    # Convert lists to NumPy arrays
    targets = np.array(targets, dtype=np.int32)
    contexts = np.array(contexts, dtype=np.int32)
    negative_samples = np.array(negative_samples, dtype=np.int32)

    # Ensure `negative_samples` has the shape (num_pairs, k) for proper batching
    negative_samples = np.reshape(negative_samples, (len(targets), -1))

    # Prepare the dataset
    dataset = tf.data.Dataset.from_tensor_slices((targets, contexts, negative_samples))
    dataset = dataset.shuffle(buffer_size=10000).batch(batch_size, drop_remainder=True)

    # Training loop
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataset:
            target_batch, context_batch, negative_batch = batch

            with tf.GradientTape() as tape:
                loss = model([target_batch, context_batch, negative_batch])

            grads = tape.gradient(loss, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))
            total_loss += loss.numpy()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    return model

In [43]:
sg_model3 = train_skipgram_model3(training_data2, len(limited_vocabs), learning_rate=0.01, epochs=30)

Epoch 1/30, Loss: 279.6546
Epoch 2/30, Loss: 259.9659
Epoch 3/30, Loss: 252.6989
Epoch 4/30, Loss: 247.5729
Epoch 5/30, Loss: 243.6836
Epoch 6/30, Loss: 240.7338
Epoch 7/30, Loss: 238.6875
Epoch 8/30, Loss: 237.0988
Epoch 9/30, Loss: 235.6580
Epoch 10/30, Loss: 234.7808
Epoch 11/30, Loss: 233.8327
Epoch 12/30, Loss: 233.2568
Epoch 13/30, Loss: 232.6561
Epoch 14/30, Loss: 232.1166
Epoch 15/30, Loss: 231.7462
Epoch 16/30, Loss: 231.3378
Epoch 17/30, Loss: 231.2093
Epoch 18/30, Loss: 230.7712
Epoch 19/30, Loss: 230.5703
Epoch 20/30, Loss: 230.3621
Epoch 21/30, Loss: 230.0591
Epoch 22/30, Loss: 229.8899
Epoch 23/30, Loss: 229.8050
Epoch 24/30, Loss: 229.4494
Epoch 25/30, Loss: 229.2482
Epoch 26/30, Loss: 229.2772
Epoch 27/30, Loss: 229.2135
Epoch 28/30, Loss: 229.0219
Epoch 29/30, Loss: 228.9130
Epoch 30/30, Loss: 228.7991


In [45]:
embedding_layer = sg_model3.get_layer("target_embedding")
embeddings3 = embedding_layer.get_weights()[0]
embeddings3

array([[ 0.09214092,  0.2639317 , -0.01208984, ..., -0.25854397,
         0.00940623, -0.15930103],
       [-0.12530985, -0.4615725 , -0.01678889, ...,  0.1533431 ,
         0.119219  , -0.02646273],
       [-0.81657416, -1.0885906 ,  0.78610325, ...,  0.13089012,
         0.55973065,  0.23671834],
       ...,
       [ 0.38961536, -1.7982954 , -1.3742402 , ..., -0.10934696,
         0.03096619,  0.93433195],
       [ 0.14754331, -0.01242159,  0.03215573, ...,  1.2177708 ,
        -0.04930114,  0.18251729],
       [-0.18653333,  1.574383  ,  0.42919046, ..., -0.15763019,
         0.12777998, -0.10498662]], dtype=float32)

In [46]:
# Save the embeddings
np.save("embeddings3.npy", embeddings3)

## Read trained embeddings

In [36]:
file_embbedings = "I_embeddings2.npy" # Change as you prefer
file_word_to_embeddings = "I_word_to_embeddings2.npy" # Change as you prefer

In [37]:
embeddings = np.load(file_embbedings)
embeddings

array([[-0.11182633,  0.06225012,  0.03999365, ...,  0.09716614,
        -0.22655903, -0.31317216],
       [-0.15385886, -0.16094327, -0.18639304, ...,  0.41144848,
        -0.27976117,  0.04344371],
       [-0.03213493, -0.11641014, -0.01208379, ..., -0.4148307 ,
         0.10783792,  0.03114374],
       ...,
       [ 0.08485515,  0.20776129,  0.27263108, ..., -0.09409647,
        -0.20459221,  0.13095443],
       [ 0.01761165, -0.06932025, -0.10412469, ...,  0.1424022 ,
         0.07342461,  0.01953124],
       [ 0.01401601,  0.00989925, -0.02110025, ..., -0.04379189,
        -0.01745558, -0.02654355]], dtype=float32)

In [38]:
word_to_embeddings = {word: embeddings[i] for word, i in word_to_index.items()}
word_to_embeddings

{'ថែវ': array([-0.11182633,  0.06225012,  0.03999365,  0.11941579, -0.30701733,
        -0.11291081, -0.08435429, -0.09988827,  0.07295563, -0.26975915,
         0.25357652, -0.03080184,  0.26745665, -0.06211999,  0.18378237,
         0.3197929 ,  0.42985633,  0.07373494,  0.23360533, -0.20695128,
        -0.11331622,  0.11008541, -0.18423977, -0.08820968, -0.01617474,
        -0.03839512,  0.07004926,  0.08079888,  0.03478885,  0.10471139,
         0.06070754, -0.1657328 ,  0.2188062 , -0.17392518, -0.11617279,
        -0.04367946,  0.01765901,  0.14790802, -0.01988765,  0.01934716,
        -0.18898992,  0.2616369 , -0.16194248,  0.11806435,  0.14261317,
        -0.05453829,  0.27322534,  0.09716614, -0.22655903, -0.31317216],
       dtype=float32),
 'ប្រមាណ': array([-0.15385886, -0.16094327, -0.18639304, -0.28830844,  0.09492248,
         0.07436807, -0.0058606 , -0.07937295,  0.08957793, -0.06966694,
        -0.3062197 , -0.17192289,  0.19771844, -0.1563863 , -0.34480113,
         0

In [39]:
np.save(file_word_to_embeddings, word_to_embeddings)