# Final Assignment - Word2Vec

#### Imports

In [1]:
from pathlib import Path

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import tqdm

#### Constants

In [2]:
# Seed handed to the negative-sampling candidate sampler.
SEED = 42
# Passed to `prefetch` when the text dataset is vectorized.
AUTOTUNE = tf.data.AUTOTUNE

# Skip-gram window size used when generating positive pairs.
WINDOW_SIZE = 5
# Number of negative samples drawn for every positive (target, context) pair.
NUM_NS = 15
# Output dimension of both embedding layers of the model.
EMBEDDING_DIM = 128

## Preprocessing - Preparing the dataset

In [18]:
file_path = "reviews_data.txt"
# Read the whole corpus into memory: one review per list element.
lines = Path(file_path).read_text(encoding='utf-8').splitlines()

Print the first few lines:

In [19]:
# Show the first three reviews exactly as stored in the file.
for review in lines[:3]:
    print(review)

oct nice trendy hotel location not too bad stayed in this hotel for one night as this is fairly new place some of the taxi drivers did not know where it was and or did not want to drive there once have eventually arrived at the hotel was very pleasantly surprised with the decor of the lobby ground floor area it was very stylish and modern found the reception staff geeting me with aloha bit out of place but guess they are briefed to say that to keep up the coroporate image as have starwood preferred guest member was given small gift upon check in it was only couple of fridge magnets in gift box but nevertheless nice gesture my room was nice and roomy there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by bliss the location is not great it is at the last metro stop and you then need to take taxi but if you are not planning on going to see the historic sites in beijing then you will be ok chose to have some breakfast in the 

The reviews appear to be already lowercased and stripped of punctuation.
Let us first find the vocabulary size and decide which words to filter out:

In [20]:
from collections import defaultdict

# Per-review token counts and corpus-wide occurrence counts per word.
seq_lengths = []
word_frequencies = defaultdict(int)

for review in lines:
    tokens = review.split()
    seq_lengths.append(len(tokens))
    for token in tokens:
        word_frequencies[token] += 1

In [21]:
# Rank (word, count) pairs from most to least frequent; the sort is stable,
# so words with equal counts keep their first-seen order.
sorted_word_frequencies = sorted(word_frequencies.items(), key=lambda entry: -entry[1])
# With a descending sort, the last ten entries are the rarest words.
sorted_word_frequencies[-10:]

[('the', 2812098),
 ('and', 1472767),
 ('to', 1077721),
 ('was', 903010),
 ('in', 748274),
 ('we', 660041),
 ('of', 614458),
 ('hotel', 565672),
 ('for', 544389),
 ('is', 528043)]

In [22]:
# Keep only the words that occur at least five times in the corpus.
filtered_sorted_word_frequencies = [
    entry for entry in sorted_word_frequencies if entry[1] > 4
]
filtered_sorted_word_frequencies[-10:]

[('the', 2812098),
 ('and', 1472767),
 ('to', 1077721),
 ('was', 903010),
 ('in', 748274),
 ('we', 660041),
 ('of', 614458),
 ('hotel', 565672),
 ('for', 544389),
 ('is', 528043)]

In [23]:
import nltk
from nltk.corpus import stopwords
# Make sure the stop-word corpus is available locally before it is loaded.
nltk.download('stopwords')
# English stop words, as a list of strings.
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\misha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
# Drop stop words from the frequency table. Membership is tested against a
# set, so each lookup is O(1) instead of a linear scan of the stop-word list.
stop_word_set = set(stop_words)
filtered_sorted_word_frequencies = [
    entry for entry in filtered_sorted_word_frequencies if entry[0] not in stop_word_set
]

In [25]:
# Split the (word, count) pairs into two parallel tuples, still ordered by descending count.
vocab_by_freq, freq = zip(*filtered_sorted_word_frequencies)

In [26]:
# The vocabulary is every word that survived the frequency and stop-word filters.
VOCAB_SIZE = len(vocab_by_freq)
# Cap sequences at one standard deviation above the mean review length.
review_lengths = np.asarray(seq_lengths)
MAX_SEQ_LEN = int(review_lengths.mean() + review_lengths.std())

In [27]:
# Report the two sizes that parameterize the vectorization layer.
print(f'VOCAB_SIZE={VOCAB_SIZE}, MAX_SEQ_LEN={MAX_SEQ_LEN}')

VOCAB_SIZE=38436, MAX_SEQ_LEN=306


In [28]:
# Dataset over the corpus file; it is batched and vectorized further below.
text_ds = tf.data.TextLineDataset(file_path)

Now we will map each review to a sequence of integer token ids using a `tf.keras.layers.TextVectorization` layer built from our vocabulary.

In [29]:
# Integer-encode text with the precomputed vocabulary. Two slots are reserved
# on top of VOCAB_SIZE because the layer's vocabulary starts with '' and '[UNK]'.
vectorize_layer = layers.TextVectorization(
    vocabulary=vocab_by_freq,
    max_tokens=VOCAB_SIZE + 2,
    output_mode='int',
    output_sequence_length=MAX_SEQ_LEN,
)

# The vocabulary is supplied at construction, so the layer is not adapted:
# vectorize_layer.adapt(text_ds.batch(1024))

Save the inverse vocabulary to look it up later:

In [30]:
# Id -> token list: position i holds the token that is encoded as id i.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'hotel', 'room', 'great', 'stay', 'good', 'staff', 'would', 'location', 'rooms', 'one', 'nice', 'stayed', 'us', 'clean', 'night', 'quot', 'breakfast', 'service']


In [31]:
# Persist the id -> token list so later sessions can decode ids without
# rebuilding the vectorization layer.
import pickle  # public module; `_pickle` is its private C implementation

with open('inverse_vocab', 'wb') as f:
    pickle.dump(inverse_vocab, f)

In [15]:
# Vectorize the data in text_ds: encode in batches of 1024 lines, then
# flatten back to one encoded review per dataset element.
batched_text_ds = text_ds.batch(1024).prefetch(AUTOTUNE)
text_vector_ds = batched_text_ds.map(vectorize_layer).unbatch()

NameError: name 'text_ds' is not defined

### Obtain Sequences from the Dataset

In [56]:
# Materialize every encoded review in memory as a NumPy array.
sequences = [encoded for encoded in text_vector_ds.as_numpy_iterator()]
print(len(sequences))

255404


In [57]:
# Show the first encoded reviews next to their decoded tokens.
for token_ids in sequences[:3]:
    decoded = [inverse_vocab[token_id] for token_id in token_ids]
    print(f"{token_ids} => {decoded}")

[  122    12  1139     2     9     1     1   143    13     1     1     2
     1    11    16     1     1     1   559    53    24     1     1     1
   302  1625     1     1   193     1     1     1     1     1     1     1
    85     1   933     1     1     1  1284   123     1     1     2     1
     1   806   480     1     1   347     1     1    93   846    34    40
     1     1     1  1191     1   217    73     1   138     7     1     1
     1 12951    72     1     1    24     1   709     1     1 24543     1
   125     1     1   455     1     1     1  4517     1     1  1881  1552
   451   718     1   226    23  1609   382    43     1     1     1     1
   240     1   450 20219     1  1609  1338     1  3167    12  4001     1
     3     1    12     1  1415     1     1   265     1   103   371     1
     1     3     1     1    20    36   397  1324     1   112   315     1
   673     1  3405     1     9     1     1     4     1     1     1     1
   174   574   367     1     1     1   134     1   

### Generate training examples from sequences

In [58]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, sampling_table):
    """Build skip-gram training examples with negative sampling.

    Args:
        sequences: Iterable of int-encoded sentences. Each context id taken
            from a sequence must support ``.reshape`` (presumably NumPy
            integer scalars, i.e. the sequences are NumPy arrays).
        window_size: Skip-gram window size forwarded to ``skipgrams``.
        num_ns: Number of negative samples drawn per positive pair.
        vocab_size: Vocabulary size; forwarded to ``skipgrams`` and used as
            ``range_max`` of the negative sampler.
        sampling_table: Sampling table forwarded to ``skipgrams``.

    Returns:
        ``(targets, contexts, labels)``: three parallel lists with one entry
        per positive pair. ``targets`` holds the target id, ``contexts`` a
        ``(num_ns + 1, 1)`` tensor with the true context id followed by the
        sampled negatives, and ``labels`` an int64 tensor ``[1, 0, ..., 0]``
        of length ``num_ns + 1``.
    """
    targets, contexts, labels = [], [], []

    for sequence in tqdm.tqdm(sequences):
        # Positive (target, context) pairs only; negatives are drawn below.
        positive_pairs, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        for target_word, context_word in positive_pairs:
            true_context = context_word.reshape(1, 1)
            sampled_ids, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=true_context,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=SEED,
                name="negative_sampling")

            # Column vector of negatives, stacked under the true context id.
            negatives = tf.expand_dims(sampled_ids, 1)
            targets.append(target_word)
            contexts.append(tf.concat([true_context, negatives], 0))

        # Position 0 is always the true context, so every pair of this
        # sequence shares the same label vector.
        pair_label = tf.constant([1] + [0] * num_ns, dtype="int64")
        labels.extend([pair_label] * len(positive_pairs))

    return targets, contexts, labels

Since the full training set is too large to build in RAM at once, we generate it in chunks and save each chunk to disk:

In [15]:
# Generate the training examples chunk by chunk and cache each chunk on disk,
# so an interrupted run can resume without redoing finished chunks.
train_data_path = Path('train_data_w5_ns15')
train_data_path.mkdir(exist_ok=True)

sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(len(inverse_vocab))

num_chunks = 10
step = len(sequences) // num_chunks
for i in range(num_chunks):
    targets_path = train_data_path / f'targets{i}.npy'
    contexts_path = train_data_path / f'contexts{i}.npy'
    labels_path = train_data_path / f'labels{i}.npy'

    # Skip chunks that were fully written by a previous run.
    if targets_path.exists() and contexts_path.exists() and labels_path.exists():
        continue

    print(f'{i=}')

    # The last chunk runs to the end of `sequences`; otherwise the
    # len(sequences) % num_chunks trailing sequences would be silently dropped.
    start = i * step
    stop = len(sequences) if i == num_chunks - 1 else start + step

    targets, contexts, labels = generate_training_data(
        sequences=sequences[start:stop],
        window_size=WINDOW_SIZE,
        num_ns=NUM_NS,
        vocab_size=len(inverse_vocab),
        sampling_table=sampling_table,
    )

    targets = np.array(targets)
    # Each context is a (NUM_NS + 1, 1) tensor; drop the trailing unit axis.
    contexts = np.array(contexts)[:, :, 0]
    labels = np.array(labels)

    print('\n')
    print(f"targets.shape: {targets.shape}")
    print(f"contexts.shape: {contexts.shape}")
    print(f"labels.shape: {labels.shape}")

    np.save(targets_path, targets)
    np.save(contexts_path, contexts)
    np.save(labels_path, labels)

NameError: name 'sequences' is not defined

Now, concatenate all chunks into single big arrays, and take advantage of knowing the vocabulary size by changing the dtype:

In [16]:
train_data_path = Path('train_data_w5_ns15')
# np.save appends '.npy' to names without that extension, so the guard below
# must test the file names that actually end up on disk.
targets_path = train_data_path / 'targets.npy'
contexts_path = train_data_path / 'contexts.npy'
labels_path = train_data_path / 'labels.npy'

num_chunks = 10
# Token ids go up to the vocabulary size minus one, which is above the int16
# maximum (32767) for this corpus; uint16 holds ids up to 65535 at the same
# memory cost.
ID_DTYPE = np.uint16


def _concat_ids(chunks):
    """Concatenate id arrays as ID_DTYPE, refusing ids that would not fit."""
    limits = np.iinfo(ID_DTYPE)
    for chunk in chunks:
        if chunk.size and (chunk.min() < limits.min or chunk.max() > limits.max):
            raise ValueError(
                f'token ids span [{chunk.min()}, {chunk.max()}], '
                f'which does not fit in {np.dtype(ID_DTYPE).name}'
            )
    # The range was verified above, so the signed -> unsigned cast is lossless.
    return np.concatenate(chunks, axis=0, dtype=ID_DTYPE, casting='unsafe')


if not (targets_path.exists() and contexts_path.exists() and labels_path.exists()):
    targets_list = [np.load(train_data_path / f'targets{i}.npy') for i in range(num_chunks)]
    contexts_list = [np.load(train_data_path / f'contexts{i}.npy') for i in range(num_chunks)]
    labels_list = [np.load(train_data_path / f'labels{i}.npy') for i in range(num_chunks)]

    targets = _concat_ids(targets_list)
    contexts = _concat_ids(contexts_list)
    # Labels are only 0/1, so a narrow signed dtype is safe.
    labels = np.concatenate(labels_list, axis=0, dtype=np.int16)

    np.save(targets_path, targets)
    np.save(contexts_path, contexts)
    np.save(labels_path, labels)

## Train a Word2Vec model

#### Load the training data

In [3]:
train_data_path = Path('train_data_w5_ns15')

# Load the concatenated arrays saved by the preprocessing step.
targets = np.load(train_data_path / 'targets.npy')
contexts = np.load(train_data_path / 'contexts.npy')
labels = np.load(train_data_path / 'labels.npy')
# Inspect one example: the target's shape, the context row's shape, and the label vector.
print(targets[0].shape)
print(contexts[0].shape)
print(labels[0])

# Total number of training examples.
print(len(targets))

()
(16,)
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
39166039


In [7]:
BATCH_SIZE = 4096  # an earlier setting was 1024
BUFFER_SIZE = 10000

# Pair the (target, context) inputs with their labels, shuffle within a buffer
# of BUFFER_SIZE examples, and emit full batches only.
examples = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = examples.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

#### Defining the Model

In [5]:
import pickle

# Reload the id -> token list written during preprocessing.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open('inverse_vocab', 'rb') as f:
    inverse_vocabulary = pickle.load(f)

# Size the embedding tables from the list that was just loaded, so this cell
# does not depend on `inverse_vocab` surviving from the preprocessing session.
VOCAB_SIZE = len(inverse_vocabulary)

In [6]:
# Inputs: one target id per example and NUM_NS + 1 candidate context ids.
input_target = layers.Input(shape=())
input_context = layers.Input(shape=(NUM_NS + 1,))
# Separate embedding tables for target and context words. The target table is
# named so its weights can be fetched later via get_layer('w2v_embedding').
embedding_target = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=1, name="w2v_embedding")(input_target)
embedding_context = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=NUM_NS + 1)(input_context)

# Contract axis 1 of the target embedding with axis 2 of the context
# embedding: one dot product (used as a logit) per candidate context word.
dotted = layers.Dot(axes=[1, 2])([embedding_target, embedding_context])
model = models.Model(inputs=[input_target, input_context], outputs=dotted)

#### Compile and run

In [10]:
# The model outputs raw dot products, hence from_logits=True; the loss compares
# them with the label vector over the NUM_NS + 1 candidates.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="my_logs")

# Checkpoint the model, keeping only the epoch with the best *training*
# accuracy: no validation data is passed to fit(), so there is no validation
# metric to monitor.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(filepath="word2vec_model_w5_ns15_ckpt.h5", monitor='accuracy', save_best_only=True)

# Stop once training accuracy has not improved for 10 epochs.
# restore_best_weights - Whether to restore model weights from
# the epoch with the best value of the monitored quantity.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor='accuracy')

history = model.fit(dataset, epochs=100, callbacks=[tensorboard_callback, checkpoint_cb, early_stopping_cb])

Epoch 1/100


ResourceExhaustedError: Graph execution error:

Detected at node 'Adam/Adam/update_1/mul_1' defined at (most recent call last):
    File "C:\Users\misha\miniconda3\envs\orly310\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\misha\miniconda3\envs\orly310\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\misha\miniconda3\envs\orly310\lib\asyncio\base_events.py", line 595, in run_forever
      self._run_once()
    File "C:\Users\misha\miniconda3\envs\orly310\lib\asyncio\base_events.py", line 1881, in _run_once
      handle._run()
    File "C:\Users\misha\miniconda3\envs\orly310\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell
      await result
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\IPython\core\interactiveshell.py", line 2768, in run_cell
      result = self._run_cell(
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\IPython\core\interactiveshell.py", line 2814, in _run_cell
      return runner(coro)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\IPython\core\interactiveshell.py", line 3012, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\IPython\core\interactiveshell.py", line 3191, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\IPython\core\interactiveshell.py", line 3251, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\misha\AppData\Local\Temp\ipykernel_15112\4178320667.py", line 16, in <module>
      history = model.fit(dataset, epochs=100, callbacks=[tensorboard_callback, checkpoint_cb, early_stopping_cb])
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\engine\training.py", line 863, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 532, in minimize
      return self.apply_gradients(grads_and_vars, name=name)
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 671, in apply_gradients
      return tf.__internal__.distribute.interim.maybe_merge_call(
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 716, in _distributed_apply
      update_op = distribution.extended.update(
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 694, in apply_grad_to_update_var
      return self._resource_apply_sparse_duplicate_indices(
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\optimizer_v2\optimizer_v2.py", line 1280, in _resource_apply_sparse_duplicate_indices
      return self._resource_apply_sparse(summed_grad, handle, unique_indices,
    File "C:\Users\misha\miniconda3\envs\orly310\lib\site-packages\keras\optimizer_v2\adam.py", line 201, in _resource_apply_sparse
      m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'],
Node: 'Adam/Adam/update_1/mul_1'
failed to allocate memory
	 [[{{node Adam/Adam/update_1/mul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_209173]

#### Saving the results

Save the model

In [11]:
# Save the trained model together with its optimizer state.
model.save("word2vec_model_w5_ns15.h5", include_optimizer=True)

Retrieve the learned embeddings

In [61]:
# Weight matrix of the target-word embedding layer: one row per token id.
weights = model.get_layer('w2v_embedding').get_weights()[0]
print(f'weights shape is {weights.shape}')

import pickle  # public module; `_pickle` is its private C implementation

# Id -> token list saved during preprocessing; row i of `weights` belongs to
# the token stored at position i.
with open('inverse_vocab', 'rb') as f:
    inverse_vocabulary = pickle.load(f)

Now, save the vectors to disk in a `.parquet` file with a word column as key:

In [43]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# One row per real token, keyed by the word itself. The first two vocabulary
# entries and their weight rows are skipped.
df = pd.DataFrame(
    {'word': inverse_vocabulary[2:], 'embedding': list(weights[2:])}
).set_index('word')

table = pa.Table.from_pandas(df)
pq.write_table(table, 'word2vec_embeddings.parquet')

In [44]:
# Preview the first rows of the word -> embedding table.
df.head()

Unnamed: 0_level_0,embedding
word,Unnamed: 1_level_1
hotel,"[-0.12856515, 0.15770362, 0.06505656, 0.234301..."
room,"[-0.08665641, 0.22643305, 0.17230006, 0.242368..."
great,"[0.18022697, -0.051985245, 0.3708178, -0.03778..."
stay,"[-0.048565857, 0.11214562, -0.022309598, 0.208..."
good,"[0.07658313, -0.07445546, 0.08375567, -0.03007..."


## Analyze the learned Word Embeddings

### Find Most Similar

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

embeddings_path = 'word2vec_embeddings.parquet'
# Read the saved embeddings back into a DataFrame.
embeddings_arrow = pq.read_table(embeddings_path)
word_vectors_table = embeddings_arrow.to_pandas()

def find_most_similar(word: str, k: int = 10, word_vectors: pd.DataFrame = word_vectors_table):
    """Return the `k` words whose embeddings are most similar to `word`'s.

    Args:
        word: Query word; must be present in the index of `word_vectors`.
        k: Number of neighbours to return.
        word_vectors: DataFrame indexed by word with an `embedding` column
            holding one vector per row.

    Returns:
        A list of `(neighbour_word, cosine_similarity)` tuples, ordered from
        most to least similar; the query word itself is never included.

    Raises:
        KeyError: If `word` is not in the index of `word_vectors`.
    """
    word_vector = word_vectors.loc[word].embedding

    # Build the matrix from the table that was passed in (not the global one),
    # so that row positions line up with `word_vectors.index`.
    weights = np.asarray(list(word_vectors.embedding.values))
    similarities = cosine_similarity(word_vector.reshape(1, -1), weights).flatten()

    # Rank all rows, then drop the query word by label rather than assuming
    # it always comes first (ties with identical vectors could reorder it).
    ranked = np.argsort(similarities)[::-1]
    is_other_word = np.asarray(word_vectors.index != word)
    most_similar_idxs = ranked[is_other_word[ranked]][:k]
    return [(word_vectors.index[idx], similarities[idx]) for idx in most_similar_idxs]

In [12]:
# Example query: the ten nearest neighbours of 'car'.
find_most_similar('car')

[('vehicle', 0.6907699),
 ('van', 0.60751176),
 ('cars', 0.6057887),
 ('parked', 0.5898296),
 ('suv', 0.5631365),
 ('valet', 0.53012586),
 ('rental', 0.52415824),
 ('trolley', 0.5229506),
 ('taxi', 0.51901245),
 ('shuttle', 0.51762193)]

### Visualize with the `Tensorboard Embedding Projector`

Load the vectors from the `.parquet` file to a 2d numpy array

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import pyarrow.parquet as pq
import numpy as np

embeddings_path = 'word2vec_embeddings.parquet'
word_vectors_table = pq.read_table(embeddings_path).to_pandas()

# Stack the per-word vectors into one 2-D array (one row per word).
embedding_rows = word_vectors_table["embedding"].tolist()
weights = np.asarray(embedding_rows)
weights.shape

(38436, 128)

Save vectors metadata in `.tsv` format for the use of the `Embedding Projector`

In [4]:
from pathlib import Path

logs_path = Path('my_logs')
if not logs_path.exists():
    logs_path.mkdir()

# One word per line, in the row order of the embeddings table.
with open(logs_path / 'metadata.tsv', 'w', encoding='utf-8') as metadata_file:
    metadata_file.writelines(word + "\n" for word in word_vectors_table.index)

Produce a checkpoint for the `Embedding Projector`

In [5]:
import tensorflow as tf

# Wrap the embedding matrix in a variable so a checkpoint can track it.
# Note that this rebinds `weights` from a NumPy array to a tf.Variable.
weights = tf.Variable(weights)

# The keyword name `embedding` is the prefix of the tensor name that the
# projector configuration refers to.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(logs_path / "embedding.ckpt")

'my_logs\\embedding.ckpt-1'

Configure and run the `Embedding Projector`

In [7]:
from tensorboard.plugins import projector

# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# File name only: metadata.tsv was written into `logs_path`, the same
# directory that is passed to visualize_embeddings below.
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(logs_path, config)

In [None]:
# Launch TensorBoard inline, pointed at the log directory written above.
%load_ext tensorboard
%tensorboard --logdir 'my_logs'

### Find Clusters using K-Means