In [75]:
import re

import numpy as np
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Input, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [77]:
# Step 1: Data Preparation
text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""

# Clean and tokenize the text: strip punctuation, lowercase, split on whitespace
text = re.sub(r'[^\w\s]', '', text).lower()
words = text.split()

# Create vocabulary mappings.
# Fit on the already-split token list rather than on the raw string: when given
# a list of tokens the Tokenizer uses them as-is instead of re-splitting with
# its own filters, so every entry of `words` is guaranteed an id in `word2id`.
# (A token containing "_" survives the regex above but would be split by the
# Tokenizer's default filters, making the lookup below raise KeyError.)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([words])
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}
vocab_size = len(word2id) + 1  # ids start at 1; slot 0 is reserved (padding)

# Convert words to token IDs
tokenized_text = [word2id[word] for word in words]



In [79]:
# Show both vocabulary mappings; the "\n" item joined with sep="\n" reproduces
# the blank lines between the two dictionaries.
print(word2id, "\n", id2word, sep="\n")

{'the': 1, 'of': 2, 'a': 3, 'processes': 4, 'we': 5, 'are': 6, 'to': 7, 'computational': 8, 'process': 9, 'abstract': 10, 'called': 11, 'about': 12, 'study': 13, 'idea': 14, 'beings': 15, 'that': 16, 'inhabit': 17, 'computers': 18, 'as': 19, 'they': 20, 'evolve': 21, 'manipulate': 22, 'other': 23, 'things': 24, 'data': 25, 'evolution': 26, 'is': 27, 'directed': 28, 'by': 29, 'pattern': 30, 'rules': 31, 'program': 32, 'people': 33, 'create': 34, 'programs': 35, 'direct': 36, 'in': 37, 'effect': 38, 'conjure': 39, 'spirits': 40, 'computer': 41, 'with': 42, 'our': 43, 'spells': 44}


{1: 'the', 2: 'of', 3: 'a', 4: 'processes', 5: 'we', 6: 'are', 7: 'to', 8: 'computational', 9: 'process', 10: 'abstract', 11: 'called', 12: 'about', 13: 'study', 14: 'idea', 15: 'beings', 16: 'that', 17: 'inhabit', 18: 'computers', 19: 'as', 20: 'they', 21: 'evolve', 22: 'manipulate', 23: 'other', 24: 'things', 25: 'data', 26: 'evolution', 27: 'is', 28: 'directed', 29: 'by', 30: 'pattern', 31: 'rules', 32: 'p

In [91]:
# Step 2: Generate Training Data
window_size = 2  # number of context tokens taken on each side of the target

# Every position that has a full window on both sides yields one example:
# the surrounding tokens are the context, the centre token is the target.
centers = range(window_size, len(tokenized_text) - window_size)
X_train = [
    tokenized_text[c - window_size:c] + tokenized_text[c + 1:c + 1 + window_size]
    for c in centers
]
y_train = [tokenized_text[c] for c in centers]

# Convert to NumPy arrays; targets are one-hot encoded over vocab_size classes
X_train = np.array(X_train)
y_train = to_categorical(y_train, num_classes=vocab_size)

X_train

array([[ 5,  6,  7, 13],
       [ 6, 12, 13,  1],
       [12,  7,  1, 14],
       [ 7, 13, 14,  2],
       [13,  1,  2,  3],
       [ 1, 14,  3,  8],
       [14,  2,  8,  9],
       [ 2,  3,  9,  8],
       [ 3,  8,  8,  4],
       [ 8,  9,  4,  6],
       [ 9,  8,  6, 10],
       [ 8,  4, 10, 15],
       [ 4,  6, 15, 16],
       [ 6, 10, 16, 17],
       [10, 15, 17, 18],
       [15, 16, 18, 19],
       [16, 17, 19, 20],
       [17, 18, 20, 21],
       [18, 19, 21,  4],
       [19, 20,  4, 22],
       [20, 21, 22, 23],
       [21,  4, 23, 10],
       [ 4, 22, 10, 24],
       [22, 23, 24, 11],
       [23, 10, 11, 25],
       [10, 24, 25,  1],
       [24, 11,  1, 26],
       [11, 25, 26,  2],
       [25,  1,  2,  3],
       [ 1, 26,  3,  9],
       [26,  2,  9, 27],
       [ 2,  3, 27, 28],
       [ 3,  9, 28, 29],
       [ 9, 27, 29,  3],
       [27, 28,  3, 30],
       [28, 29, 30,  2],
       [29,  3,  2, 31],
       [ 3, 30, 31, 11],
       [30,  2, 11,  3],
       [ 2, 31,  3, 32],


In [83]:
# Step 3: Define and Train the Model
embedding_dim = 10

# CBOW Model using Keras Sequential API:
#   context ids -> embeddings -> mean over the context -> softmax over the vocabulary
model = Sequential([
    # An explicit Input layer declares the context length; Embedding's
    # `input_length` argument is deprecated in Keras 3.
    Input(shape=(window_size * 2,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # Built-in pooling layer: the same mean over axis 1 that a Lambda wrapping
    # K.mean(x, axis=1) would compute, but it serializes with the model.
    GlobalAveragePooling1D(),  # Average embeddings for context
    Dense(vocab_size, activation='softmax')
])

# Compile and train the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, verbose=1)



Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.0115 - loss: 3.8079     
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0449 - loss: 3.8059 
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0449 - loss: 3.8042 
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0668 - loss: 3.8025 
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0553 - loss: 3.8006
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step - accuracy: 0.0460 - loss: 3.7988      
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step - accuracy: 0.1002 - loss: 3.7970  
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0s/step - accuracy: 0.1325 - loss: 3.7951  
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x17716925490>

In [84]:
# Step 4: Prediction
# Function to predict the target word for a given context
def predict_word(context_words):
    """Predict the most likely target word for a list of context words.

    Words missing from ``word2id`` are skipped. The remaining ids are passed to
    the global ``model`` as a single batch row, and the highest-scoring
    vocabulary entry is mapped back to its word via ``id2word``.

    Args:
        context_words: Iterable of context word strings.

    Returns:
        The predicted word as a string.

    Raises:
        ValueError: If none of the context words are in the vocabulary.
    """
    context_ids = [word2id[word] for word in context_words if word in word2id]
    if not context_ids:
        # An empty batch row would otherwise fail obscurely inside the model.
        raise ValueError("None of the context words are in the vocabulary.")
    context_ids = np.array(context_ids).reshape(1, -1)  # Reshape for prediction
    prediction = model.predict(context_ids)
    # Output slot 0 is the reserved padding id and has no entry in id2word, so
    # rank only slots 1 and above and shift the winner back by one.
    predicted_word_id = int(np.argmax(prediction[0, 1:])) + 1
    return id2word[predicted_word_id]

# Try the trained model on a sample four-word context
test_context = ["computational", "process", "abstract", "beings"]
predicted_word = predict_word(test_context)
print(
    f"Predicted word for the context {test_context}: {predicted_word}"
)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Predicted word for the context ['computational', 'process', 'abstract', 'beings']: a
