### Import dependencies and load word2vec

In [1]:
import gensim.downloader as api
# Load the pretrained GoogleNews vectors through the gensim downloader API
# imported above as `api`; the result is kept in `word2vec` for later cells.
print("Loading GoogleNews Word2Vec (300d)...")
word2vec = api.load(
    name="word2vec-google-news-300",  # ~1.6GB
)
print("Model loaded!")


Loading GoogleNews Word2Vec (300d)...
Model loaded!


### Define vocabulary and word-to-index mapping

In [2]:
# Toy vocabulary for the demo; a word's position in this list is its integer id.
vocabulary = [
    "cat", "dog", "apple", "banana", "computer",
    "keyboard", "python", "java", "music", "punk",
    "science", "school", "city", "vampire", "monster",
    "machine", "robot", "artificial", "intelligence", "network",
]
# Word -> id lookup; ids start at 0 and follow the order of `vocabulary`.
word2idx = dict(zip(vocabulary, range(len(vocabulary))))


### Create embedding matrix

In [3]:
import numpy as np

# Build a (len(vocabulary), embedding_dim) matrix whose row `idx` holds the
# vector for the word with id `idx` in `word2idx`.
embedding_dim = 300
embedding_matrix = np.zeros((len(vocabulary), embedding_dim))

# Dedicated seeded generator: any randomly initialised OOV row is then the same
# on every Restart & Run All, instead of depending on the global NumPy RNG state.
oov_rng = np.random.default_rng(42)

for word, idx in word2idx.items():
    if word in word2vec:
        # Known word: copy its pretrained vector into the matching row.
        embedding_matrix[idx] = word2vec[word]
    else:
        # OOV handling: fall back to a reproducible random vector.
        embedding_matrix[idx] = oov_rng.normal(scale=0.6, size=(embedding_dim,))

### Build Keras embedding layer and create model

In [4]:
import tensorflow as tf

# Frozen lookup table initialised from the pretrained `embedding_matrix`.
# The matrix is passed through `embeddings_initializer` (the pattern used in the
# Keras pretrained-embeddings guide) rather than the legacy `weights=`
# constructor argument, which is not accepted by every Keras release.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(vocabulary),
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False  # Set to True to fine-tune embeddings
)

# Minimal binary classifier: variable-length sequence of word ids -> embeddings
# -> mean over the sequence axis -> single sigmoid unit.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(None,), dtype='int32'),
    embedding_layer,
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

2025-04-09 09:21:14.349470: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-09 09:21:14.639289: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744183274.744855    2064 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744183274.774549    2064 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 09:21:15.025917: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### Test it

In [5]:
test_sentence = ["cat", "loves", "banana", "and", "robot"]

# Drop out-of-vocabulary words instead of mapping them to index 0: index 0 is
# the id of "cat" (first vocabulary entry), so a fallback of 0 would silently
# turn every unknown word into "cat".
input_indices = [word2idx[word] for word in test_sentence if word in word2idx]
if not input_indices:
    raise ValueError("test_sentence contains no in-vocabulary words")

# No fixed `maxlen`: pad_sequences fills with 0 (again the id of "cat") and the
# model's average pooling is unmasked, so padding to a fixed length would pull
# the sentence mean towards "cat". With a single sentence in the batch the
# sequence is padded to its own length, i.e. no pad tokens are inserted.
input_padded = tf.keras.preprocessing.sequence.pad_sequences([input_indices])

output = model(input_padded)
print("Model output:", output.numpy())

Model output: [[0.4351716]]


In [6]:
# Look up a single word's vector by calling the embedding layer directly.
word = "robot"
word_index = word2idx[word]  # row of this word in the embedding matrix

# Feed a length-1 batch of ids, then drop the size-1 axis from the result.
embedding_vector = (
    embedding_layer(tf.constant([word_index]))  # shape: (1, embedding_dim)
    .numpy()
    .squeeze()
)

print(f"Embedding for '{word}':")
print(embedding_vector)
print("Shape:", embedding_vector.shape)

Embedding for 'robot':
[-4.29687500e-02  1.02050781e-01 -4.17480469e-02  3.36914062e-02
 -4.27246094e-02 -8.54492188e-03  9.37500000e-02  1.10839844e-01
  6.73828125e-02 -4.55078125e-01 -1.37939453e-02  6.34765625e-02
  1.68945312e-01 -1.78710938e-01  1.53808594e-02  1.21093750e-01
 -2.34375000e-01 -2.36511230e-03  2.07519531e-02 -1.88476562e-01
  1.71875000e-01  9.81445312e-02 -4.32128906e-02  1.24511719e-01
 -1.43554688e-01  5.54199219e-02 -1.50390625e-01  6.98242188e-02
  1.68457031e-02 -1.02539062e-01 -1.10839844e-01 -2.81250000e-01
 -1.92382812e-01  1.54418945e-02 -2.58789062e-02 -2.41210938e-01
 -3.63769531e-02  2.50000000e-01 -1.44653320e-02 -3.19824219e-02
  4.33349609e-03 -1.89453125e-01  1.70898438e-01  4.96093750e-01
 -9.08203125e-02 -1.52343750e-01  4.58984375e-02  1.37695312e-01
  3.35937500e-01 -1.01562500e-01 -3.69140625e-01 -2.23388672e-02
  2.22167969e-02 -8.20312500e-02 -9.22851562e-02 -1.34277344e-02
  4.04296875e-01 -1.94335938e-01  2.81250000e-01  1.38671875e-01
 -