# Final Assignment - Word2Vec

#### Imports

In [72]:
from pathlib import Path

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import tqdm

#### Constants

In [33]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

WINDOW_SIZE = 4
NUM_NS = 4
EMBEDDING_DIM = 128

## Preparing the dataset

In [19]:
file_path = "reviews_data.txt"
with open(file_path, 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

Print the first few lines:

In [20]:
for line in lines[:3]:
    print(line)

oct nice trendy hotel location not too bad stayed in this hotel for one night as this is fairly new place some of the taxi drivers did not know where it was and or did not want to drive there once have eventually arrived at the hotel was very pleasantly surprised with the decor of the lobby ground floor area it was very stylish and modern found the reception staff geeting me with aloha bit out of place but guess they are briefed to say that to keep up the coroporate image as have starwood preferred guest member was given small gift upon check in it was only couple of fridge magnets in gift box but nevertheless nice gesture my room was nice and roomy there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by bliss the location is not great it is at the last metro stop and you then need to take taxi but if you are not planning on going to see the historic sites in beijing then you will be ok chose to have some breakfast in the 

So, it seems like the reviews data is already all lower case and with no punctuation.
Let us find the vocabulary size first, and determine which words we want to subsample:

In [38]:
vocab = set()
seq_lengths = []

from collections import defaultdict
word_frequencies = defaultdict(int)

for line in lines:
    words = line.split()
    seq_lengths.append(len(words))
    for word in words:
        vocab.add(word)
        word_frequencies[word] += 1

In [61]:
sorted(word_frequencies.items(), reverse=True, key=lambda item: item[1])

[('the', 2812098),
 ('and', 1472767),
 ('to', 1077721),
 ('was', 903010),
 ('in', 748274),
 ('we', 660041),
 ('of', 614458),
 ('hotel', 565672),
 ('for', 544389),
 ('is', 528043),
 ('it', 509630),
 ('room', 445320),
 ('at', 361647),
 ('but', 349456),
 ('on', 335747),
 ('you', 335174),
 ('were', 330943),
 ('with', 320517),
 ('that', 310995),
 ('very', 307420),
 ('this', 307138),
 ('not', 283312),
 ('had', 260005),
 ('there', 241531),
 ('our', 225339),
 ('are', 211395),
 ('as', 210432),
 ('great', 209799),
 ('have', 207932),
 ('my', 204468),
 ('from', 203631),
 ('stay', 189007),
 ('they', 183122),
 ('good', 169934),
 ('all', 169433),
 ('so', 169008),
 ('be', 168374),
 ('staff', 161593),
 ('would', 157965),
 ('location', 154375),
 ('if', 146917),
 ('rooms', 138413),
 ('one', 137352),
 ('which', 127852),
 ('nice', 122123),
 ('no', 120443),
 ('stayed', 119642),
 ('out', 115511),
 ('us', 111793),
 ('clean', 111578),
 ('an', 110669),
 ('night', 110545),
 ('just', 110285),
 ('quot', 109413),
 

In [22]:
np.median(seq_lengths)

131.0

In [23]:
VOCAB_SIZE = len(vocab)
MAX_SEQ_LEN = int(np.mean(seq_lengths) + np.std(seq_lengths))

In [24]:
print(f'{VOCAB_SIZE=}, {MAX_SEQ_LEN=}')

VOCAB_SIZE=150053, MAX_SEQ_LEN=306


In [25]:
text_ds = tf.data.TextLineDataset(file_path)

Now, we will vectorize the vocabulary using a `tf.keras.layers.TextVectorization` layer.

In [26]:
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE + 2,
    output_mode='int',
    output_sequence_length=MAX_SEQ_LEN,
    vocabulary=list(vocab))

# vectorize_layer.adapt(text_ds.batch(1024))

Save the inverse vocabulary to look it up later:

In [68]:
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'rattan', 'restrikted', 'thh', 'confernce', 'immenties', 'opining', 'expolre', 'handouts', 'effcient', 'ȡh', 'designhotel', 'pulses', 'mangaged', 'themuseums', 'rtas', 'surveys', 'эb', 'pluspoints']


In [69]:
print(inverse_vocab[:200])

['', '[UNK]', 'rattan', 'restrikted', 'thh', 'confernce', 'immenties', 'opining', 'expolre', 'handouts', 'effcient', 'ȡh', 'designhotel', 'pulses', 'mangaged', 'themuseums', 'rtas', 'surveys', 'эb', 'pluspoints', 'sanctum', 'picad', 'unpleasent', 'zvbqc', 'asumin', 'pillowinternet', 'othertimes', 'citadines', 'banderas', 'amenitiesyour', 'couponsreturned', 'upholstery', 'hotelsin', 'practice', 'mountaineering', 'anisha', 'showerfood', 'wouldnt', 'facilites', 'solicitation', 'semblance', 'artsey', 'horned', 'cuising', 'morshed', 'fouton', 'friar', 'torte', 'busseness', 'klahani', 'roomswill', 'timeyou', 'survellanced', 'utube', 'barder', 'materialised', 'themselvesc', 'shubert', 'havanna', 'trimming', 'heehee', 'consiidered', 'convivia', 'forcefull', 'montrachet', 'damn', 'prague', 'reportst', 'supplying', 'prepeared', 'accured', 'rathna', 'cashout', 'gbk', 'replacemnet', 'presentazione', 'hoitey', 'burn', 'outnumbered', 'urbanes', 'nusband', 'conveninces', 'islamaphobic', 'degraded', '

In [28]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

### Obtain Sequences from the Dataset

In [29]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

255404


In [30]:
for seq in sequences[:3]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[100025  48398  18320 136594  58478  84900 137733 133356  97218  22238
   6376 136594  55223  31363  67270  47090   6376 114927 148479 118487
 133017 111882  69496 128181 143719  15102  31628  84900  37303  95790
 141536  18336  53909  14182  31628  84900 126162  90216   9159  45535
  14540 139515  97717  94528   5598 128181 136594  18336 105383   9720
  55306  90309 128181 124584  69496 128181  29792 117063  72110  54527
 141536  18336 105383  60048  53909  54703 133718 128181 142657  58908
 143167  46530  90309 106801  16798 100636  69496 133017  41419  30181
 105469   9348 140061  90216  90277 146070  90216  38909 126585 128181
 100797 118153  47090 139515  48526 144154  50673  19851  18336  68127
 121355 113728 149258 130207  22238 141536  18336  49787 125682  69496
 136831 108469  22238 113728  20566  41419  60025  48398   9040  33146
 134215  18336  48398  53909  48285  45535   9348  85913  53909  49854
 106462  22238 105499 134215  53909  56553 135580  36807  73365  93235
  6949

### Generate training examples from sequences

In [53]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(VOCAB_SIZE)
sampling_table[130000]

1.0

In [31]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:

      # context_class = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)

      context_class = context_word.reshape(1, 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=SEED,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      # label = tf.constant([1] + [0] * num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
    labels += [tf.constant([1] + [0] * num_ns, dtype="int64")] * len(positive_skip_grams)

  return targets, contexts, labels

Since the data is too big (RAM wise) we will generate training data in chunks:

In [32]:
train_data_path = Path('train_data')
if not train_data_path.exists():
    train_data_path.mkdir()

num_chunks = 100
step = len(sequences) // num_chunks
for i in range(num_chunks):
    targets_path = train_data_path / f'targets{i}.npy'
    contexts_path = train_data_path / f'contexts{i}.npy'
    labels_path = train_data_path / f'labels{i}.npy'

    if targets_path.exists() and contexts_path.exists() and labels_path.exists():
        continue

    print(f'{i=}')

    targets, contexts, labels = generate_training_data(
        sequences=sequences[i * step:(i + 1) * step],
        window_size=WINDOW_SIZE,
        num_ns=NUM_NS,
        vocab_size=VOCAB_SIZE + 2)

    targets = np.array(targets)
    contexts = np.array(contexts)[:,:,0]
    labels = np.array(labels)

    print('\n')
    print(f"targets.shape: {targets.shape}")
    print(f"contexts.shape: {contexts.shape}")
    print(f"labels.shape: {labels.shape}")

    np.save(targets_path, targets)
    np.save(contexts_path, contexts)
    np.save(labels_path, labels)

Now, we have the training data in chunks. Meaning, 100 .npy files with targets, contexts and labels, each.

## Defining the Model

In [71]:
a = np.load(train_data_path / 'targets0.npy')
b = np.load(train_data_path / 'contexts0.npy')
c = np.load(train_data_path / 'labels0.npy')
print(a[0].shape)
print(b[0].shape)
print(c[0])

()
(5,)
[1 0 0 0 0]


In [73]:
input_target = layers.Input(shape=())
input_context = layers.Input(shape=(NUM_NS + 1,))
embedding_target = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=1, name="w2v_embedding")(input_target)
embedding_context = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=NUM_NS + 1)(input_context)

dotted = layers.Dot(axes=-1)([embedding_target, embedding_context])
model = models.Model(inputs=[input_target, input_context], outputs=dotted)

In [77]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((a, b), c))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

model.compile(optimizer='adam', loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
model.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20

In [75]:
input_target(a[0])

TypeError: 'KerasTensor' object is not callable

In [74]:
model((a[0], b[0]))

AttributeError: Exception encountered when calling layer "model" (type Functional).

'numpy.int64' object has no attribute '_keras_mask'

Call arguments received:
  • inputs=('21570', 'tf.Tensor(shape=(5,), dtype=int64)')
  • training=None
  • mask=None