# The tensorfow implementation of the Word2Vec paper




In [24]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [25]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [26]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [30]:
# Generates a skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []
    
    #Build the sampling table for vocab_size tokens
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)
    
    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):
        
        # Generate positive skip=gram pairs for a squences (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence, 
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)
        
        # Iterate over each positive skip-gram pair to produce training examples
        # with a positive context word and negative samples
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"),1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")
            # Build context and label vectors (for one target word)
            context = tf.concat([tf.squeeze(context_class, 1), negative_sampling_candidates], 0) 
            label = tf.constant([1] + [0]*num_ns, dtype="int64")

            # Append each element from the training example to global lists
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)
    
    return targets, contexts, labels

In [31]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [32]:
with open(path_to_file) as f:
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [33]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [34]:
# print elements from the dataset to see empty lines have been removed
# The b's printed in the lines below signify
# bytes = b'...' literals = a sequence of octets (integers between 0 and 255)
i = 0
# print(text_ds.as_numpy_iterator())
for elem in text_ds.as_numpy_iterator():
    print(elem)
    i+=1
    if i == 5:
        break

b'First Citizen:'
b'Before we proceed any further, hear me speak.'
b'All:'
b'Speak, speak.'
b'First Citizen:'


In [35]:
# A custom standardization function  to lowercase the text and remove punctuation
def custom_standardization(input_data):
    input_lowercased = tf.strings.lower(input_data)

    # Escape special characters in pattern. 
    # and then use regex_replace to replace them with '' in the input string
    return tf.strings.regex_replace(input_lowercased, 
                                   '[%s]' % re.escape(string.punctuation), '')

# Define the vocabulary size and the number of words in a sequence 
vocab_size = 4096
sequence_length = 10

# Use the 'TextVectorization'  layer to normalize, split and map string to
# integers. Set the 'output_sequence_length' lentght 
# to pad all samples to the same length

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [36]:
# call TextVectorization.adapt  on the text dataset to create vocabulary
vectorize_layer.adapt(text_ds.batch(1024))

In [37]:
#  Save the created vocabulary for reference 
inverse_vocab = vectorize_layer.get_vocabulary()
print(type(inverse_vocab))
print(inverse_vocab[:40])

<class 'list'>
['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your', 'his', 'this', 'but', 'he', 'have', 'as', 'thou', 'him', 'so', 'what', 'thy', 'will', 'no', 'by', 'all', 'king', 'we', 'shall', 'her', 'if']


In [38]:
# Vectorize the data in text_ds
text_vector_ds =  text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()
text_vector_ds

<_UnbatchDataset shapes: (10,), types: tf.int64>

Print the first ten elements to see how the TextVectorization transformed them.

In [39]:
list(text_ds)[:10]

[<tf.Tensor: shape=(), dtype=string, numpy=b'First Citizen:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Before we proceed any further, hear me speak.'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'All:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Speak, speak.'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'First Citizen:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'You are all resolved rather to die than to famish?'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'All:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'Resolved. resolved.'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'First Citizen:'>,
 <tf.Tensor: shape=(), dtype=string, numpy=b'First, you know Caius Marcius is chief enemy to the people.'>]

In [40]:
list(text_vector_ds)[:10]

[<tf.Tensor: shape=(10,), dtype=int64, numpy=array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([138,  36, 982, 144, 673, 125,  16, 106,   0,   0])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([106, 106,   0,   0,   0,   0,   0,   0,   0,   0])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([   7,   41,   34, 1286,  344,    4,  200,   64,    4, 3690])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([1286, 1286,    0,    0,    0,    0,    0,    0,    0,    0])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0])>,
 <tf.Tensor: shape=(10,), dtype=int64, numpy=array([  8

#### The multiplication of the target word with it's context using einsum
<p>
Here is a small example showing how the matrices are multiplied with einsum in the word2vec model.
The structure shown here is similar to the tf.Data.Dataset that is fed into the Word2Vec Model
just with lest data and smaller embedding dimension (embedding_dim==2).
A (3,2) matrix 3 words each word has 2 dimensional embedding, 
we multiply each word with its context, meaning each word with the words from it's context
and add the columns up into one column.
</p>

In [41]:
target_word = tf.constant([
                 [1, 0.5],
                 [1, 2],
                 [1, 3.0]
                 ], dtype=tf.float32)
# 3,5,2 
# 3 contexts 
# each context has 5 words 
# each word has 2 dimensional embedding 
context = tf.constant([
                [[1, 0],
                 [1, -2],
                 [4, 2],
                 [2, 2],
                 [4,5]],
                [[0, 2],
                 [3, -1],
                 [7, -2],
                 [1, 2],
                 [1,3]],
                [[11, 2],
                 [1, 12],
                 [1, 5],
                 [7, 6],
                 [5,0]]
                ],dtype=tf.float32)

#### Some more notes on Einsum - Einstein-Summation

The firs matrix m1 you can see that each context is multiplied by the corresponding word for that context
repeating letters in the einsum notation are multiplied
and in the second matrix m2 you can see that the columns of those embedings are added up
'be,bce->bc' because the 'e' does not appear in the result of the einsum, letters that appear in the first
part of the einsum but not after the '->' are added up.

In [42]:
m1 = tf.einsum('be,bce->bce', target_word, context)
m2 = tf.einsum('be,bce->bc', target_word, context) 
print('m1: \n',m1)
print('\n m2: \n',m2)

m1: 
 tf.Tensor(
[[[ 1.   0. ]
  [ 1.  -1. ]
  [ 4.   1. ]
  [ 2.   1. ]
  [ 4.   2.5]]

 [[ 0.   4. ]
  [ 3.  -2. ]
  [ 7.  -4. ]
  [ 1.   4. ]
  [ 1.   6. ]]

 [[11.   6. ]
  [ 1.  36. ]
  [ 1.  15. ]
  [ 7.  18. ]
  [ 5.   0. ]]], shape=(3, 5, 2), dtype=float32)

 m2: 
 tf.Tensor(
[[ 1.   0.   5.   3.   6.5]
 [ 4.   1.   3.   5.   7. ]
 [17.  37.  16.  25.   5. ]], shape=(3, 5), dtype=float32)


In [43]:
context_for_first_word = context[0]
context_for_first_word_transposed = tf.transpose(context_for_first_word)
target_word0_expanded = tf.expand_dims(target_word[0], axis=0)
# tf.matmul(target[0],  m1context)
# m1context
target_word0_expanded

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[1. , 0.5]], dtype=float32)>

In [44]:
context_for_first_word_transposed

<tf.Tensor: shape=(2, 5), dtype=float32, numpy=
array([[ 1.,  1.,  4.,  2.,  4.],
       [ 0., -2.,  2.,  2.,  5.]], dtype=float32)>

####  The same multiplication as above but done with matrix multiplication and some extra explanation on word2vec
You can see here that there is a matrix multiplication performed for each word embedding with its context
and thus the new matrix is created
and these are the final values in the word2vec model 
against these values the loss function is performed.
First value in each row representing the true class and the rest the negative classes.
In the backward pass the weights of the hidden layer are adjusted to minimize the  loss
and these weights from the hidden layer represent the word embeddings.
Note that the output here are some values that are larger than 1
because values choosen in this example are also larger than 1 so it is easier to follow the calculations. 
Embeddings output by the embedding layer of keras will be between -0.05 and 0.05, because the RandomUniform initializer has been used.

In [46]:
first_row = tf.matmul(target_word0_expanded, context_for_first_word_transposed)
first_row, m2

(<tf.Tensor: shape=(1, 5), dtype=float32, numpy=array([[1. , 0. , 5. , 3. , 6.5]], dtype=float32)>,
 <tf.Tensor: shape=(3, 5), dtype=float32, numpy=
 array([[ 1. ,  0. ,  5. ,  3. ,  6.5],
        [ 4. ,  1. ,  3. ,  5. ,  7. ],
        [17. , 37. , 16. , 25. ,  5. ]], dtype=float32)>)

In [47]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


In [48]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")



[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']


In [49]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED
)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f'targets.shape: {targets.shape}')
print(f'contexts.shape: {contexts.shape}')
print(f'labels.shape: {labels.shape}')
      

100%|███████████████████████████████████| 32777/32777 [00:17<00:00, 1883.92it/s]




targets.shape: (65392,)
contexts.shape: (65392, 5)
labels.shape: (65392, 5)


In [50]:
dataset1 = tf.data.Dataset.from_tensor_slices([1, 2, 3])
for el in dataset1:
    print(el)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)


In [51]:
def times(n):
    return lambda x: x * n
double = times(2)
double(10)

20

In [52]:
print(targets.shape,contexts.shape, labels.shape)

(65392,) (65392, 5) (65392, 5)


In [53]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
num_ns = 4
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: (((1024,), (1024, 5)), (1024, 5)), types: ((tf.int64, tf.int64), tf.int64)>


In [54]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5)), (1024, 5)), types: ((tf.int64, tf.int64), tf.int64)>


In [55]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size,
                                                 embedding_dim,
                                                 input_length=1,
                                                 name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size,
                                                 embedding_dim,
                                                 input_length=num_ns+1)
    
    def call(self, pair):
        target, context = pair
        #target: (batch, dummy?) #the dummy axis doesn't exists in TF2.7+
        #context :(batch, context)
        # target: (batch,)
        if len(target.shape) == 2:
            print('len(target.shape) == 2')
            target = tf.squeeze(target, axis=1)
        # word_emb: (batch, embed)
        word_emb = self.target_embedding(target)
        print(f'word_emb{word_emb}')
        # context_emb: (batch, context, embed)
        context_emb = self.context_embedding(context)

        #dots: (batch, context)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        
        return dots
        

In [56]:
# define loss function
def custom_loss(x_logit, y_true):
    return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [57]:

embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

In [58]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")


2023-01-26 18:48:28.437588: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2023-01-26 18:48:28.437627: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2023-01-26 18:48:28.438069: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.


In [59]:

target_embedding = layers.Embedding(10,10,
                                                 input_length=1,
                                                 name="w2v")
enc_1  = target_embedding(1)


In [60]:
enc2 = target_embedding(1)

In [61]:
enc_1,enc2

(<tf.Tensor: shape=(10,), dtype=float32, numpy=
 array([ 0.04993533,  0.04133791, -0.01003022, -0.03616273,  0.01099525,
        -0.00558387, -0.046495  ,  0.00755589,  0.04464337, -0.04282141],
       dtype=float32)>,
 <tf.Tensor: shape=(10,), dtype=float32, numpy=
 array([ 0.04993533,  0.04133791, -0.01003022, -0.03616273,  0.01099525,
        -0.00558387, -0.046495  ,  0.00755589,  0.04464337, -0.04282141],
       dtype=float32)>)

In [62]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
len(target.shape) == 2
word_embTensor("word2_vec/w2v_embedding/embedding_lookup/Identity_1:0", shape=(1024, 128), dtype=float32)
len(target.shape) == 2
word_embTensor("word2_vec/w2v_embedding/embedding_lookup/Identity_1:0", shape=(1024, 128), dtype=float32)
 9/63 [===>..........................] - ETA: 1s - loss: 1.6094 - accuracy: 0.2063

2023-01-26 18:48:29.408103: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2023-01-26 18:48:29.408132: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2023-01-26 18:48:29.429178: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2023-01-26 18:48:29.445056: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2023-01-26 18:48:29.463546: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: logs/train/plugins/profile/2023_01_26_18_48_29

2023-01-26 18:48:29.472960: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to logs/train/plugins/profile/2023_01_26_18_48_29/neo.local.trace.json.gz
2023-01-26 18:48:29.482829: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: logs/train/plugins/profile/2023_01_26_18_48_29

2023-01-26 18:48:29.483681: I te

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x14e6346d0>

In [63]:
#docs_infra: no_execute
%tensorboard --logdir logs

In [64]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()
weights, vocab[:10]

(array([[-0.04869585, -0.00979846,  0.01616162, ...,  0.04642085,
         -0.01661394,  0.02395758],
        [-0.18934697,  0.06779505, -0.00595527, ..., -0.15223621,
          0.31419426,  0.02651004],
        [ 0.05358925,  0.15102428, -0.02173385, ...,  0.06489161,
         -0.20461024, -0.20701258],
        ...,
        [-0.17118086,  0.14345002,  0.04594244, ..., -0.00836325,
          0.08011571, -0.18411285],
        [-0.2470236 , -0.00500658, -0.22775547, ..., -0.19098292,
         -0.04500531,  0.30172408],
        [-0.03835887,  0.04480712,  0.20331377, ...,  0.27478793,
          0.15031911,  0.08332097]], dtype=float32),
 ['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a'])

In [66]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [67]:
# Create and save the vectors and metadata files:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')


for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

out_v.close()
out_m.close()



#### Some resources that filled in the gaps for me 
Word2Vec TensorFlow Tutorial Walkthrough by Ya Xiao - https://www.youtube.com/watch?v=dxTwaUveedo
<br>Making Computation Easier with Cool Numpy Tricks by Kirit Thadaka  - https://www.youtube.com/watch?v=poD8ud4MxOY
<br> And the original tensorflow tutorial itself: https://www.tensorflow.org/tutorials/text/word2vec
    