In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer

# --- a. Data preparation ---
# Builds (context, target) id pairs for CBOW training from a single sentence.
text = "machine learning is fun when you understand machine learning"
window = 2  # look 2 words left and 2 right
t = Tokenizer()
t.fit_on_texts([text])
word2id = t.word_index
vocab_size = len(word2id) + 1   # +1: Tokenizer ids start at 1, id 0 is reserved

# Take the token stream from the tokenizer itself instead of text.split().
# Tokenizer lowercases and strips punctuation by default, so raw split() tokens
# such as "Machine" or "fun," would raise KeyError when looked up in word2id.
token_ids = t.texts_to_sequences([text])[0]
tokens = [t.index_word[i] for i in token_ids]   # normalized words, same order as token_ids

# build context -> target pairs (CBOW style)
# Only positions with a full window on both sides are used, so the first and
# last `window` words never appear as targets.
contexts = []
targets = []
for i in range(window, len(token_ids) - window):
    ctx = token_ids[i - window:i] + token_ids[i + 1:i + window + 1]
    contexts.append(ctx)            # list of length 2*window
    targets.append(token_ids[i])    # target id

# The explicit reshape keeps X two-dimensional even when the text is too short
# to yield any pair, so X.shape[1] is always 2*window.
X = np.array(contexts, dtype="int32").reshape(len(contexts), 2 * window)   # shape (N, 2*window)
y = np.array(targets, dtype="int32")                                       # shape (N,)

# --- c. Build CBOW model (train to predict target from context) ---
# Fix the global seeds before any weights are created so that weight
# initialisation and training order are repeatable across runs on the same setup.
SEED = 42
tf.keras.utils.set_random_seed(SEED)   # seeds Python's random, NumPy and TensorFlow

EMB = 10                    # embedding dimension
context_size = X.shape[1]   # number of context words per sample (2*window)

context_input = Input(shape=(context_size,), dtype='int32')
emb = Embedding(vocab_size, EMB)         # one embedding table shared by all context slots
emb_ctx = emb(context_input)             # (batch, context_size, EMB)

# average context embeddings
# GlobalAveragePooling1D takes the mean over axis 1, exactly what the CBOW
# averaging step needs, and unlike a Lambda wrapping a Python lambda it can be
# saved and reloaded without custom-object handling.
avg = tf.keras.layers.GlobalAveragePooling1D()(emb_ctx)      # (batch, EMB)
out = Dense(vocab_size, activation='softmax')(avg)           # predict target id

model = Model(inputs=context_input, outputs=out)
# Targets are integer ids (not one-hot), hence the sparse cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# --- d. Train (short run, for exam/demo purposes) ---
model.fit(X, y, epochs=50, verbose=0)   # increase epochs if desired
# Read the matrix from the embedding layer itself. model.get_weights()[0] only
# returns it while the embedding happens to be the model's first weight.
embeddings = emb.get_weights()[0]       # embedding matrix, one row per word id

# Report the learned vector for every vocabulary word.
# A single word can be inspected with embeddings[word2id['machine']].

# Take the vector length from the matrix itself so the header stays correct
# if the embedding dimension is changed.
print(f"\nWord Embeddings (Each word → vector of {embeddings.shape[1]} values):")
for word, idx in word2id.items():
    print(f"{word:12s} → {embeddings[idx]}")





Word Embeddings (Each word → vector of 10 values):
machine      → [-0.11543908 -0.01945515  0.02464952 -0.03414501 -0.03975163 -0.09832981
 -0.01888901 -0.06747308 -0.05062633 -0.00416391]
learning     → [-0.02966433 -0.10022102  0.03092061 -0.04376037  0.08314127 -0.11535399
  0.04611897 -0.0119428   0.02665767  0.08213678]
is           → [ 0.0617182  -0.08048289  0.0658352   0.00374401  0.05601666 -0.00064834
  0.01900298  0.07589655 -0.0078079  -0.02476015]
fun          → [-0.01471815 -0.03119268 -0.09974968  0.01303967 -0.0149772  -0.00337561
 -0.06032557 -0.09344924  0.03048224 -0.07893804]
when         → [-0.00387253 -0.04420397  0.03748788 -0.04735139  0.03595938 -0.1187416
  0.00888482 -0.06287563 -0.01897783  0.08889901]
you          → [ 0.09051117 -0.08704613  0.0659629  -0.07421072  0.07282966 -0.0489095
  0.03947134 -0.01350609 -0.02389773 -0.05695556]
understand   → [-0.02705354 -0.03270413 -0.0659134   0.00766318  0.01751989  0.00895919
  0.01211833 -0.05278099 -0.092257