### 3.2.3 Word2Vec (Word Embedding)

- Author: Phanxuan Phuc
- Project: https://github.com/phanxuanphucnd/TensorFlow-2.0-Tutorial

In [1]:
import os
import random
import zipfile
import numpy as np
import collections
import urllib.request
import tensorflow as tf

2021-07-07 00:37:27.708141: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-07 00:37:27.708166: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# --- Training parameters ---
lr = 0.1                   # Learning rate handed to the optimizer
bs = 128                   # Batch size requested from next_batch
eval_step = 200_000        # Print nearest neighbours every `eval_step` steps
display_step = 10_000      # Print the loss every `display_step` steps
num_steps = 3_000_000      # Total number of training steps

# --- Evaluation parameters ---
# Probe words whose nearest neighbours are printed during training
eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']

# --- Word2Vec parameters ---
embedding_size = 200       # Dimension of the embedding vector
max_vocab_size = 50_000    # Maximum number of different words in the vocabulary
min_occurrence = 10        # Drop every word that appears fewer than n times
skip_window = 3            # How many words to consider to the left and to the right
num_skips = 2              # How many times to reuse an input to generate a label
num_sampled = 64           # Number of negative examples to sample

### Download Dataset

- Text8 Dataset: [Download](http://mattmahoney.net/dc/text8.zip)

In [3]:
# Read the corpus straight out of the zip archive. The text has already been
# processed, so it only needs lower-casing and splitting on whitespace.

data_path = '../../data/text8.zip'

with zipfile.ZipFile(data_path) as archive:
    # The first member of the archive is taken to be the corpus file.
    corpus_member = archive.namelist()[0]
    text_words = archive.read(corpus_member).lower().split()

In [4]:
# Turn the raw byte tokens into strings. Tokens that are already `str` are
# passed through unchanged, so re-running this cell without reloading the
# corpus no longer fails with "'str' object has no attribute 'decode'".
text_words = [w.decode('utf-8') if isinstance(w, bytes) else w for w in text_words]

In [5]:
# Build the dictionary and replace rare words with the UNK token.
# Index 0 is reserved for 'UNK'; its count is a placeholder until the corpus
# has been encoded further down.
count = [('UNK', -1)]

# Retrieve the most common words (most_common returns them ordered from the
# most frequent to the least frequent).
word_counter = collections.Counter(text_words)
count.extend(word_counter.most_common(max_vocab_size - 1))

# Remove words with fewer than 'min_occurrence' occurrences.
# The scan stops at index 1 so the UNK placeholder (count -1) can never be
# dropped, even when no real word is frequent enough.
for i in range(len(count) - 1, 0, -1):
    if count[i][1] < min_occurrence:
        count.pop(i)
    else:
        # The list is ordered by frequency, so stop at the first word that
        # reaches 'min_occurrence'.
        break

# Compute the vocabulary size
vocab_size = len(count)

# Assign an id to each word (its position in `count`)
word2id = {word: i for i, (word, _) in enumerate(count)}

# Encode the corpus; words missing from the dictionary map to id 0 ('UNK')
data = [word2id.get(word, 0) for word in text_words]

# Record how many tokens were mapped to 'UNK' in place of the -1 placeholder
unk_count = data.count(0)
count[0] = ('UNK', unk_count)

# Reverse mapping: id -> word
id2word = {index: word for word, index in word2id.items()}


print(f"Words count: {len(text_words)}")
# The Counter has one key per distinct word, so the corpus is not scanned again
print(f"Unique words: {len(word_counter)}")
print(f"Vocaburary size: {vocab_size}")

Words count: 17005207
Unique words: 253854
Vocaburary size: 47135


In [6]:
data_index = 0


# Generate training batch for the skip-gram model
def next_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram batch from the module-level `data` sequence.

    Reads and advances the global cursor `data_index`, wrapping around to the
    start of `data` when the end is reached.

    Args:
        batch_size: number of (input, label) pairs to produce; must be a
            multiple of `num_skips`.
        num_skips: how many context words are drawn for each centre word;
            must not exceed `2 * skip_window`.
        skip_window: number of words considered on each side of the centre.

    Returns:
        A tuple `(batch, labels)` of int32 arrays with shapes `(batch_size,)`
        and `(batch_size, 1)`: the centre word ids and the sampled context
        word ids.

    Raises:
        AssertionError: if the constraints on `batch_size` / `num_skips` /
            `skip_window` above are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    # Every slot is written in the loop below, so uninitialised storage is fine
    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)

    # Window size: words on the left and right plus the current one
    span = 2 * skip_window + 1
    if data_index + span > len(data):
        data_index = 0

    # Sliding window over `data`; appending evicts the oldest entry
    window = collections.deque(data[data_index: data_index + span], maxlen=span)
    data_index += span

    # Positions inside the window that are not the centre word
    context_positions = [pos for pos in range(span) if pos != skip_window]

    for target in range(batch_size // num_skips):
        row = target * num_skips
        picked = random.sample(context_positions, num_skips)
        for offset, pos in enumerate(picked):
            batch[row + offset] = window[skip_window]
            labels[row + offset, 0] = window[pos]

        # Slide the window by one word, restarting from the beginning of
        # `data` once the cursor reaches the end
        if data_index == len(data):
            window.extend(data[:span])
            data_index = span
        else:
            window.append(data[data_index])
            data_index += 1

    # Backtrack a little bit to avoid skipping words at the end of a batch
    data_index = (data_index - span + len(data)) % len(data)

    return batch, labels

In [7]:
with tf.device('/cpu:0'):
    # Embedding table: one row per vocabulary entry, each row is that word's
    # embedding vector
    embedding = tf.Variable(tf.random.normal(shape=[vocab_size, embedding_size]))

    # Parameters consumed by the NCE loss: one weight row and one bias per
    # vocabulary entry
    nce_weights = tf.Variable(tf.random.normal(shape=[vocab_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros(shape=[vocab_size]))
    
def get_embedding(x):
    """Return the rows of the global `embedding` table selected by the ids in `x`."""
    with tf.device('/cpu:0'):
        # One embedding vector is fetched for each sample in x
        return tf.nn.embedding_lookup(embedding, x)

def nce_loss(embeded, y):
    """Compute the average NCE loss for a batch.

    Args:
        embeded: embedding vectors of the input words (the `inputs` of
            `tf.nn.nce_loss`).
        y: label ids; cast to int64 before being passed on.

    Returns:
        The mean of the `tf.nn.nce_loss` values, computed with the global
        `nce_weights` / `nce_biases`, `num_sampled` negative samples and
        `vocab_size` classes.
    """
    with tf.device('/cpu:0'):
        labels_int64 = tf.cast(y, tf.int64)
        per_example_loss = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=labels_int64,
            inputs=embeded,
            num_sampled=num_sampled,
            num_classes=vocab_size,
        )
        # Average over the batch
        return tf.reduce_mean(per_example_loss)
    
def evaluate(embeded):
    """Cosine similarity between each input vector and every embedding row.

    Args:
        embeded: embedding vectors of the query words, one vector per row
            (cast to float32 here).

    Returns:
        A tensor whose entry [i, j] is the cosine similarity between query
        row i and row j of the global `embedding` table.
    """
    with tf.device('/cpu:0'):
        embeded = tf.cast(embeded, tf.float32)
        # Normalise every query vector by its OWN L2 norm. Reducing without
        # an axis would divide by the norm of the whole batch, which yields
        # scaled dot products rather than cosine similarities.
        embeded_norm = embeded / tf.sqrt(
            tf.reduce_sum(tf.square(embeded), axis=-1, keepdims=True))
        # Row-wise L2 normalisation of the embedding table. tf.sqrt takes only
        # (x, name), so no dtype is passed as a second positional argument.
        embedding_norm = embedding / tf.sqrt(
            tf.reduce_sum(tf.square(embedding), axis=1, keepdims=True))
        cosine_sim_op = tf.matmul(embeded_norm, embedding_norm, transpose_b=True)

        return cosine_sim_op
    
# Stochastic gradient descent optimizer using the configured learning rate
optimizer = tf.optimizers.SGD(learning_rate=lr)

2021-07-07 00:37:42.830976: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-07-07 00:37:42.831001: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-07-07 00:37:42.831020: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (phucphan-ThinkPad): /proc/driver/nvidia/version does not exist
2021-07-07 00:37:42.831280: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Optimization process
def run_optimization(x, y):
    """Apply one optimizer update computed from the NCE loss of batch (x, y).

    Args:
        x: input word ids, passed to `get_embedding`.
        y: label ids, passed to `nce_loss`.
    """
    with tf.device('/cpu:0'):
        # Variables updated by this step
        trainable = [embedding, nce_weights, nce_biases]

        # Record the forward pass on a GradientTape for automatic differentiation
        with tf.GradientTape() as tape:
            loss = nce_loss(get_embedding(x), y)

        # Compute the gradients of the loss w.r.t. each variable
        grads = tape.gradient(loss, trainable)

        # Update the variables following the gradients
        optimizer.apply_gradients(zip(grads, trainable))

In [None]:
# Ids of the probe words whose nearest neighbours are printed during training
x_test = np.array([word2id[w] for w in eval_words])


# Run training for the given number of steps
for step in range(1, num_steps + 1):
    batch_x, batch_y = next_batch(bs, num_skips, skip_window)
    run_optimization(batch_x, batch_y)

    is_first_step = step == 1

    # Report the loss on the batch that was just used for the update
    if step % display_step == 0 or is_first_step:
        loss = nce_loss(get_embedding(batch_x), batch_y)
        print("Step: %i, loss: %.4f" % (step, loss))

    if step % eval_step == 0 or is_first_step:
        print("Evaluation...")
        sim = evaluate(get_embedding(x_test)).numpy()
        top_k = 8    # Number of nearest neighbors
        for i, query_word in enumerate(eval_words):
            # Sort by decreasing similarity and skip the first (best-scoring) entry
            nearest = (-sim[i, :]).argsort()[1: top_k + 1]
            neighbours = ''.join(' %s, ' % id2word[nearest[k]] for k in range(top_k))
            log_str = '"%s" nearest neighbors: ' % query_word + neighbours

            print(log_str)

Step: 1, loss: 32.8903
Evaluation...
"five" nearest neighbors:  six,  four,  three,  eight,  one,  two,  seven,  zero, 
"of" nearest neighbors:  and,  a,  in,  is,  this,  or,  for,  to, 
"going" nearest neighbors:  god,  united,  second,  life,  old,  form,  any,  within, 
"hardware" nearest neighbors:  english,  later,  during,  same,  number,  list,  some,  level, 
"american" nearest neighbors:  by,  but,  many,  its,  at,  was,  which,  all, 
"britain" nearest neighbors:  her,  d,  year,  same,  people,  number,  states,  being, 
