In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: four short sentences from which the embeddings are learned.
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are great pets",
    "the mat is soft and warm",
]

# Fit the word -> integer-id vocabulary on the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Each sentence becomes a list of integer word ids.
sequences = tokenizer.texts_to_sequences(corpus)

# One slot per vocabulary word, plus one extra slot for padding.
total_words = 1 + len(tokenizer.word_index)

In [3]:
def generate_training_data(sequences, window_size=2):
    """Build CBOW (context, target) training pairs from tokenised sentences.

    For every position that has ``window_size`` tokens on both sides, the
    surrounding ``2 * window_size`` tokens form the context and the token at
    that position is the target. Positions too close to either end of a
    sequence are skipped, so sequences shorter than ``2 * window_size + 1``
    contribute no samples.

    Args:
        sequences: Iterable of token sequences (lists, tuples or 1-D arrays).
        window_size: Number of context tokens taken on each side of the
            target. Must be non-negative.

    Returns:
        A ``(contexts, targets)`` tuple of NumPy arrays with shapes
        ``(n_samples, 2 * window_size)`` and ``(n_samples,)``.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    contexts = []
    targets = []

    for sequence in sequences:
        # Work on a plain list: `+` concatenates lists, but would add NumPy
        # arrays element-wise and silently produce wrong contexts.
        tokens = list(sequence)
        for i in range(window_size, len(tokens) - window_size):
            left = tokens[i - window_size:i]
            right = tokens[i + 1:i + window_size + 1]
            contexts.append(left + right)
            targets.append(tokens[i])

    if not contexts:
        # np.array([]) would be a float64 array of shape (0,); return empty
        # integer arrays with the documented shapes instead.
        return (
            np.empty((0, 2 * window_size), dtype=int),
            np.empty((0,), dtype=int),
        )

    return np.array(contexts), np.array(targets)

# Number of context words taken on each side of the target word.
WINDOW_SIZE = 2

# X holds the context word ids for each sample, y the centre (target) word id.
X, y = generate_training_data(sequences, window_size=WINDOW_SIZE)

# Each context has WINDOW_SIZE words on the left plus WINDOW_SIZE on the
# right, so the padded width is derived from it rather than hard-coded.
X = pad_sequences(X, maxlen=2 * WINDOW_SIZE)

In [4]:
# Seed Python, NumPy and TensorFlow so that weight initialisation (and hence
# the learned embeddings) is repeatable on Restart & Run All.
tf.keras.utils.set_random_seed(42)

# CBOW: embed every context word, average the context embeddings, then
# predict the centre word with a softmax over the vocabulary.
model = Sequential()
# An explicit Input replaces Embedding's deprecated `input_length` argument;
# the width is read from X so it cannot drift from the training data.
model.add(tf.keras.Input(shape=(X.shape[1],), dtype="int32"))
model.add(Embedding(input_dim=total_words, output_dim=10))
# Built-in mean over the context axis (axis 1). This avoids wrapping a Python
# lambda in a Lambda layer, which Keras treats as unsafe to deserialise when
# a saved model is reloaded.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(Dense(total_words, activation='softmax'))

# Targets are integer word ids, hence the sparse variant of cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X, y, epochs=100)

Epoch 1/100





1/1 ━━━━━━━━━━━━━━━━━━━━ 2s 2s/step - accuracy: 0.0000e+00 - loss: 2.8335
Epoch 2/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 45ms/step - accuracy: 0.1250 - loss: 2.8304
Epoch 3/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 32ms/step - accuracy: 0.1250 - loss: 2.8272
Epoch 4/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step - accuracy: 0.2500 - loss: 2.8241
Epoch 5/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 30ms/step - accuracy: 0.2500 - loss: 2.8210
Epoch 6/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 48ms/step - accuracy: 0.2500 - loss: 2.8178
Epoch 7/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step - accuracy: 0.2500 - loss: 2.8147
Epoch 8/100
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step - accuracy: 0.2500 - loss: 2.8116
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x1888f9eaab0>

In [5]:
# Word -> integer-id mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Weight matrix of the first layer of the model; it is indexed by word id below.
word_embeddings = model.layers[0].get_weights()[0]

print('Vocabulary Size:', len(word_index))
print('Vocabulary Sample:', list(word_index.items())[:10],"\n\n")

# Look up each word's embedding row by its integer id.
embeddings_dict = {
    token: word_embeddings[token_id]
    for token, token_id in word_index.items()
}

# Two-column table: word (left-aligned, width 10) and its rounded vector.
print(f"{'Word':<10} | {'Embedding'}")
print(40 * "-")
for word, embedding in embeddings_dict.items():
    print(f"{word:<10} | {np.round(embedding, 3)}")


Vocabulary Size: 16
Vocabulary Sample: [('the', 1), ('sat', 2), ('on', 3), ('mat', 4), ('and', 5), ('cat', 6), ('dog', 7), ('log', 8), ('cats', 9), ('dogs', 10)] 


Word       | Embedding
----------------------------------------
the        | [ 0.07   0.276  0.057 -0.107 -0.088 -0.186 -0.252  0.348 -0.02  -0.08 ]
sat        | [-0.14   0.152  0.173  0.165 -0.118 -0.179 -0.186  0.169  0.138  0.129]
on         | [ 0.175  0.112 -0.151 -0.189  0.08  -0.175 -0.128  0.191 -0.197 -0.189]
mat        | [-0.125  0.161  0.01   0.098 -0.068  0.116 -0.06  -0.061  0.004  0.229]
and        | [-0.147  0.174 -0.003  0.036 -0.066 -0.056 -0.087  0.1   -0.053  0.214]
cat        | [ 0.052  0.121  0.072 -0.002 -0.071 -0.106 -0.092  0.117  0.057 -0.005]
dog        | [-0.02   0.126  0.107 -0.018 -0.038 -0.192 -0.136  0.111  0.036  0.005]
log        | [-0.131  0.07   0.103  0.128 -0.103 -0.156 -0.057  0.137  0.077  0.075]
cats       | [-0.121  0.112 -0.13   0.12  -0.078 -0.144 -0.079  0.099  0.172  0.044]
dogs  