In [1]:
# Data Preparation
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K

corpus = [
    "the quick brown fox jumps over the lazy dog",
    "i love programming in python",
    "natural language processing is fascinating",
    "i enjoy learning new things every day"
]

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1


In [2]:
# Generate Training Data
def generate_cbow_data(corpus, window_size=2):
    contexts, targets = [], []
    for sentence in corpus:
        tokens = tokenizer.texts_to_sequences([sentence])[0]
        for i, word in enumerate(tokens):
            context = [tokens[j] for j in range(max(0, i-window_size), min(len(tokens), i+window_size+1)) if j != i]
            target = word
            contexts.append(context)
            targets.append(target)
    contexts = pad_sequences(contexts, padding='post')  # Ensure all contexts are of the same length
    return np.array(contexts), np.array(targets)

contexts, targets = generate_cbow_data(corpus)


In [6]:
# Train the Model
embedding_dim = 10

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=contexts.shape[1]),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(embedding_dim,)),
    Dense(vocab_size, activation='softmax')
])

In [4]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(contexts, targets, epochs=100, verbose=1)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.3462 - loss: 3.0111
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.3462 - loss: 3.0083
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.3462 - loss: 3.0055
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - accuracy: 0.3462 - loss: 3.0027
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step - accuracy: 0.3462 - loss: 2.9999
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - accuracy: 0.3462 - loss: 2.9971
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.3462 - loss: 2.9942
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step - accuracy: 0.3462 - loss: 2.9913
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x1b881e8c2f0>

In [7]:
def predict_word(context):
    context_seq = tokenizer.texts_to_sequences([context])[0]
    context_seq = pad_sequences([context_seq], maxlen=contexts.shape[1], padding='post')
    predicted_index = np.argmax(model.predict(context_seq), axis=-1)
    return tokenizer.index_word.get(predicted_index[0], "unknown")

context = ["the", "quick", "brown","over"]
predicted_word = predict_word(context)
print(f"Predicted word for context {context}: {predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 448ms/step
Predicted word for context ['the', 'quick', 'brown', 'over']: fox
