In [16]:
import os
import json

from tqdm.notebook import tqdm
import numpy as np
import nltk
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Flatten
from tensorflow.keras import Sequential

### References

1. [Implementing Word2Vec in Tensorflow](https://medium.com/analytics-vidhya/implementing-word2vec-in-tensorflow-44f93cf2665f)
2. [Word2Vec with TensorFlow](https://www.scaler.com/topics/tensorflow/tensorflow-word2vwc/)

In [None]:
# Create the directory that receives the per-document embedding JSON files.
# exist_ok=True makes this a no-op when the directory is already present.
os.makedirs("./word2vec_embeddings/subjects", exist_ok=True)

In [17]:
def read_file(file_path, encoding="utf-8"):
    """Return the entire text content of a file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

In [18]:
def get_one_hot_vector(data_point_index, vocab_size):
    """Build a one-hot encoded vector.

    Args:
        data_point_index: Position that is set to 1.
        vocab_size: Length of the returned vector.

    Returns:
        A float64 NumPy array of length ``vocab_size`` that is 0 everywhere
        except at ``data_point_index``.
    """
    encoded = np.zeros(vocab_size, dtype=np.float64)
    encoded[data_point_index] = 1.0
    return encoded

In [19]:
def _tokenize_document(document):
    """Return the lower-cased alphabetic tokens of ``document`` longer than one character."""
    return [
        token.lower()
        for token in nltk.word_tokenize(document)
        if token.isalpha() and len(token) > 1
    ]


def _skipgram_pairs(token_ids, window_size):
    """Return (center, context) id pairs for every token and its neighbours within ``window_size``."""
    pairs = []
    last_position = len(token_ids) - 1
    for center_position, center_id in enumerate(token_ids):
        # Clamp the window at the document boundaries so the first and last
        # tokens are still used as center words (with a shorter context).
        start = max(0, center_position - window_size)
        stop = min(last_position, center_position + window_size)
        for context_position in range(start, stop + 1):
            if context_position != center_position:
                pairs.append((center_id, token_ids[context_position]))
    return pairs


def get_embeddings(document: str, filename: str, window_size: int = 2,
                   embedding_dim: int = 100, num_epochs: int = 10):
    """Train a skip-gram style model on one document and save its word embeddings.

    The document is tokenized, every word is paired with its neighbours inside
    the context window, and an Embedding -> Flatten -> Dense(softmax) network is
    trained to predict the neighbour from the word. The weights of the
    Embedding layer are then written to
    ``./word2vec_embeddings/subjects/{filename}.json`` as a mapping from word to
    a list of floats. The mapping also contains the ``"<pad>"`` entry (index 0).

    Args:
        document: Raw text of the document.
        filename: Name of the output file, without the ``.json`` extension.
        window_size: Number of neighbours considered on each side of a word.
        embedding_dim: Dimensionality of the learned embeddings.
        num_epochs: Number of training epochs.

    Raises:
        ValueError: If the document yields no training pairs (fewer than two
            usable tokens, or a non-positive ``window_size``).
    """
    tokens = _tokenize_document(document)

    # Sorting makes the word -> index mapping reproducible; iterating over a
    # bare set of strings may yield a different order on every interpreter run.
    vocab = {"<pad>": 0} | {
        word: index for index, word in enumerate(sorted(set(tokens)), start=1)
    }
    vocab_size = len(vocab)

    pairs = _skipgram_pairs([vocab[token] for token in tokens], window_size)
    if not pairs:
        # Fail early with a clear message instead of an obscure error from fit().
        raise ValueError(
            f"Document for '{filename}' produced no training pairs; "
            "it needs at least two usable tokens."
        )

    samples = np.asarray(pairs, dtype=np.int64)
    x_train = samples[:, :1]  # center word ids, shape (n_samples, 1)
    # Integer class ids instead of one-hot rows: the sparse loss below computes
    # the same cross-entropy without a dense (n_samples, vocab_size) matrix.
    y_train = samples[:, 1]

    # A new model is built on every call; clearing the session releases the
    # state Keras keeps for models created by previous calls.
    tf.keras.backend.clear_session()

    # Build the Word2Vec model using TensorFlow
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=1))
    model.add(Flatten())
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

    # Train the Word2Vec model
    model.fit(x_train, y_train, epochs=num_epochs)

    word_embeddings = model.layers[0].get_weights()[0]
    word_to_word_embedding = {
        word: word_embeddings[index].tolist() for word, index in vocab.items()
    }

    # Save the word embeddings as a JSON file with indent = 2. The directory is
    # created here so the function does not rely on an earlier cell having run.
    output_path = f"./word2vec_embeddings/subjects/{filename}.json"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(word_to_word_embedding, file, indent=2)

**TODO:** Embeddings still need to be generated for the last 75 documents.

In [20]:
# Generate embeddings for every document in the input directory.
# Documents whose JSON output already exists are skipped, so an interrupted run
# can be resumed; delete an output file to force it to be regenerated.
TEXT_DIR = "./subjects_cleaned/text"
EMBEDDINGS_DIR = "./word2vec_embeddings/subjects"

for filename in tqdm(sorted(os.listdir(TEXT_DIR))):
  document_path = f"{TEXT_DIR}/{filename}"
  if not os.path.isfile(document_path):
    # Skip sub-directories (e.g. checkpoint folders); they cannot be read as text.
    continue
  # Strip only a trailing ".txt"; str.replace would also remove it mid-name.
  document_name = filename.removesuffix(".txt")
  if os.path.exists(f"{EMBEDDINGS_DIR}/{document_name}.json"):
    continue
  document = read_file(document_path)
  get_embeddings(document, document_name)

  0%|          | 0/265 [00:00<?, ?it/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

KeyboardInterrupt: 