In [1]:
import os
import json

from tqdm.notebook import tqdm
import numpy as np
import nltk
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras import Sequential

### References

1. [Implementing Word2Vec in Tensorflow](https://medium.com/analytics-vidhya/implementing-word2vec-in-tensorflow-44f93cf2665f)
2. [Word2Vec with TensorFlow](https://www.scaler.com/topics/tensorflow/tensorflow-word2vwc/)

In [2]:
def read_file(file_path):
    """Return the full text content of the file at ``file_path``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The entire file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

In [3]:
# Load the raw text of one cleaned subject document.
document = read_file("./subjects_cleaned/text/33130.txt")
# Show only a short preview: echoing the whole text bloats the saved notebook.
document[:500]



In [4]:
# Split the document string into tokens with NLTK's word tokenizer.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# downloaded beforehand — confirm for fresh environments.
tokens = nltk.word_tokenize(document)

In [5]:
# Keep purely alphabetic tokens longer than one character, lower-cased.
tokens = [
    token.lower()
    for token in tokens
    if len(token) > 1 and token.isalpha()
]

In [6]:
# Map each distinct token to an integer id; id 0 is reserved for "<pad>".
# The distinct tokens are sorted first because set iteration order for strings
# changes between interpreter runs, which would make the word -> id assignment
# (and everything saved from it) differ on every Restart & Run All.
vocab = {"<pad>": 0} | {word: i+1 for i, word in enumerate(sorted(set(tokens)))}

In [7]:
# Number of entries in the vocabulary mapping (this count includes "<pad>").
vocab_size = len(vocab)
vocab_size

515

In [8]:
# Save vocab as a JSON file with indent = 2.
vocab_path = "./word2vec_embeddings/vocab.json"
# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
with open(vocab_path, "w") as file:
    json.dump(vocab, file, indent=2)

In [9]:
window_size = 2

# Build (center, context) pairs. Only positions with a full window on both
# sides are used as centers; for each distance the left neighbour is emitted
# before the right neighbour.
train_samples = [
    (tokens[center], tokens[center + offset])
    for center in range(window_size, len(tokens) - window_size)
    for distance in range(1, window_size + 1)
    for offset in (-distance, distance)
]

In [10]:
# Sanity check: show the first five (center, context) pairs.
print(train_samples[:5])

[('browser', 'modern'), ('browser', 'that'), ('browser', 'using'), ('browser', 'supports'), ('that', 'browser')]


In [11]:
def get_one_hot_vector(data_point_index, vocab_size):
    """Build a one-hot encoding of ``data_point_index``.

    Args:
        data_point_index: Position to set to 1.
        vocab_size: Length of the returned vector.

    Returns:
        A 1-D NumPy array of length ``vocab_size`` that is zero everywhere
        except at ``data_point_index``, where it is 1.
    """
    encoded = np.zeros(shape=vocab_size)
    encoded[data_point_index] = 1.0
    return encoded

In [12]:
# Inputs are the vocabulary ids of the center words; targets are one-hot
# vectors (length vocab_size) for the corresponding context words.
x_train = np.asarray([vocab[center] for center, _ in train_samples])
y_train = np.asarray(
    [get_one_hot_vector(vocab[context], vocab_size) for _, context in train_samples]
)

In [13]:
# Report the shapes of the input ids and the one-hot targets.
print(f"Number of training samples (word-target pairs): {x_train.shape} {y_train.shape}")

Number of training samples (word-target pairs): (5424,) (5424, 515)


In [14]:
# Build the Word2Vec model using TensorFlow
embedding_dim = 100  # Adjust the dimensionality as needed

# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so initialisation and training-time shuffling are repeatable
# across Restart & Run All. (GPU kernels may still cause small numeric
# differences between runs.)
random_seed = 42
tf.keras.utils.set_random_seed(random_seed)

# Architecture: word id -> embedding vector -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1))
# Each sample is a single word id, so flatten the (1, embedding_dim) output
# into a plain vector for the dense layer.
model.add(Flatten())
model.add(Dense(vocab_size, activation='softmax'))

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam')

2024-02-26 16:34:41.435165: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-02-26 16:34:41.435187: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-02-26 16:34:41.435191: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-02-26 16:34:41.435223: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-26 16:34:41.435238: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [15]:
# Fit the model on (center word id -> one-hot context word) pairs.
num_epochs = 10  # Adjust the number of epochs as needed

model.fit(x=x_train, y=y_train, epochs=num_epochs)

Epoch 1/10


2024-02-26 16:34:41.830333: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x29257c880>

In [30]:
# Inspect the layer objects that make up the model, in order.
model.layers

[<keras.src.layers.core.embedding.Embedding at 0x29257ea40>,
 <keras.src.layers.reshaping.flatten.Flatten at 0x29257f670>,
 <keras.src.layers.core.dense.Dense at 0x29257f6a0>]

In [31]:
# Scratch check of NumPy's shape convention: two rows of three values.
np.array([[1, 2, 3], [4, 5, 6]]).shape

(2, 3)

In [37]:
# Compare the weight-matrix shapes of the first (embedding) and last (dense) layers.
input_weights_shape = model.layers[0].get_weights()[0].shape
output_weights_shape = model.layers[-1].get_weights()[0].shape
print(f"Word to intermediate representation shape: {input_weights_shape}")
print(f"Intermediate representation to context shape: {output_weights_shape}")

Word to intermediate representation shape: (515, 100)
Intermediate representation to context shape: (100, 515)


In [16]:
# The first layer's weight matrix holds one embedding row per vocabulary id.
word_embeddings = model.layers[0].get_weights()[0]

# Get the embedding vector for a specific word (replace 'word' with your target word)
target_word = 'mathematical'
# Use .get() so an out-of-vocabulary word yields None and reaches the
# "not found" branch below; plain indexing would raise KeyError first.
word_index = vocab.get(target_word)
if word_index is not None:
    word_embedding_vector = word_embeddings[word_index]
    print(f"Embedding for '{target_word}': {word_embedding_vector}")
else:
    print(f"'{target_word}' not found in vocabulary.")

Embedding for 'mathematical': [-0.25038496 -0.12074625 -0.25046253 -0.11962021  0.2914548   0.3539963
  0.2338442   0.27389252  0.20226124  0.31001252 -0.28537768 -0.25672868
  0.03425189  0.26329643 -0.2073902   0.20299643  0.18937117 -0.38715443
  0.160799   -0.1122127   0.26695576  0.00865394 -0.00332121  0.08428309
  0.03746206  0.08183785 -0.33986992  0.0545951   0.2038086   0.11077999
  0.2830405  -0.14372191 -0.37783775 -0.17305006 -0.16241601  0.04340811
 -0.06724421 -0.39739466 -0.11676113 -0.2910056  -0.00044849  0.13629049
  0.23495102 -0.05917903  0.19355355 -0.18640693 -0.00192927 -0.10809742
 -0.3647978   0.2045662   0.01070358 -0.03838889 -0.02877319  0.00853972
 -0.34833577  0.16886403 -0.18489802  0.09745418 -0.05802905  0.28081337
  0.05990114  0.03270217 -0.34254116 -0.08073702 -0.33526438 -0.22425586
 -0.08522201  0.33048713 -0.3255536  -0.17234625 -0.17222409  0.10100684
  0.10930716 -0.08554956 -0.24872912  0.06837241  0.17450558  0.24357124
  0.12187509 -0.123668

In [17]:
# Display the embedding weight matrix (NumPy abbreviates large arrays).
word_embeddings

array([[ 0.02979138, -0.00868454, -0.01192099, ..., -0.04319798,
         0.03033732,  0.01863689],
       [ 0.01668987, -0.10443415, -0.16142431, ..., -0.03786098,
         0.00874935,  0.26523963],
       [ 0.01370137,  0.12623726,  0.28781334, ...,  0.06898063,
        -0.11818239, -0.1377626 ],
       ...,
       [-0.0612663 ,  0.07626855,  0.1095778 , ...,  0.38879883,
        -0.1381475 ,  0.06042026],
       [-0.05715266,  0.06305658, -0.06303331, ...,  0.01656135,
         0.20961371,  0.00727249],
       [-0.34127975,  0.04580623,  0.22724192, ..., -0.05154541,
        -0.21177986,  0.14540702]], dtype=float32)

In [20]:
# Pair every vocabulary word with its embedding row, converted to a plain
# Python list so the mapping can be serialised to JSON.
word_to_word_embedding = {
    token: word_embeddings[row].tolist() for token, row in vocab.items()
}

In [21]:
# Write the word -> embedding mapping to disk as JSON indented by 2 spaces.
serialized_embeddings = json.dumps(word_to_word_embedding, indent=2)
with open("./word2vec_embeddings/word_embeddings.json", "w") as out_file:
    out_file.write(serialized_embeddings)