In [1]:
""" starter code for word2vec skip-gram model with NCE loss
CS 20: "TensorFlow for Deep Learning Research"
cs20.stanford.edu
Chip Huyen (chiphuyen@cs.stanford.edu)
Lecture 04
"""

import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import tensorflow as tf

import utils
import word2vec_utils

  from ._conv import register_converters as _register_converters


In [2]:
# Model hyperparameters, grouped by role.

# -- data / batching --
VOCAB_SIZE = 50000          # vocabulary size; also the row count of the embedding and NCE weight matrices
BATCH_SIZE = 128            # examples per batch produced by the generator
SKIP_WINDOW = 1             # the context window

# -- model --
EMBED_SIZE = 128            # dimension of the word embedding vectors
NUM_SAMPLED = 64            # number of negative examples to sample

# -- optimisation --
LEARNING_RATE = 1.0         # step size handed to the gradient descent optimizer
NUM_TRAIN_STEPS = 100000    # number of iterations of the training loop

# -- reporting / output --
SKIP_STEP = 5000            # the average loss is printed once every SKIP_STEP steps
VISUAL_FLD = 'visualization'  # folder passed to the batch generator and to visualize()

In [3]:
# Settings for fetching the corpus and for the later visualization step.
NUM_VISUALIZE = 3000        # number of tokens to visualize
EXPECTED_BYTES = 31344016   # expected size in bytes, handed to word2vec_utils.batch_gen with the URL
DOWNLOAD_URL = 'http://mattmahoney.net/dc/text8.zip'

### loading dataset

In [4]:
def gen():
    """Yield every batch produced by word2vec_utils.batch_gen.

    Takes no arguments so it can be handed to tf.data.Dataset.from_generator;
    all settings are read from the notebook-level constants.
    """
    batch_gen_args = (
        DOWNLOAD_URL,
        EXPECTED_BYTES,
        VOCAB_SIZE,
        BATCH_SIZE,
        SKIP_WINDOW,
        VISUAL_FLD,
    )
    yield from word2vec_utils.batch_gen(*batch_gen_args)

In [5]:
# Wrap the Python generator in a tf.data pipeline. Each element is a pair of int32
# tensors with static shapes [BATCH_SIZE] and [BATCH_SIZE, 1].
dataset = tf.data.Dataset.from_generator(
    generator=gen,
    output_types=(tf.int32, tf.int32),
    output_shapes=(
        tf.TensorShape([BATCH_SIZE]),
        tf.TensorShape([BATCH_SIZE, 1]),
    ),
)

In [6]:
# Show the dataset repr to confirm the element shapes and dtypes declared above.
dataset 

<FlatMapDataset shapes: ((128,), (128, 1)), types: (tf.int32, tf.int32)>

### Building the word2vec model and training it

In [7]:
# Step 1: get the input and output batches from the dataset.
with tf.name_scope('data'):
    iterator = dataset.make_initializable_iterator()
    center_words, target_words = iterator.get_next()

# Steps 2 + 3: define the weights and the embedding lookup.
# In word2vec, it's actually the weights that we care about.
with tf.name_scope('embed'):
    embed_matrix = tf.get_variable(
        'embed_matrix',
        shape=[VOCAB_SIZE, EMBED_SIZE],
        initializer=tf.random_normal_initializer(),
    )
    # Select the rows of embed_matrix indexed by the center-word ids.
    embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embedding')

# Step 4: create the variables used by the NCE loss and define the loss itself.
with tf.name_scope('loss'):
    nce_stddev = 1.0 / (EMBED_SIZE ** 0.5)
    nce_weight = tf.get_variable(
        'nce_weight',
        shape=[VOCAB_SIZE, EMBED_SIZE],
        initializer=tf.truncated_normal_initializer(stddev=nce_stddev),
    )
    nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))

    # The training objective is the mean of the values returned by tf.nn.nce_loss.
    nce_losses = tf.nn.nce_loss(
        weights=nce_weight,
        biases=nce_bias,
        labels=target_words,
        inputs=embed,
        num_sampled=NUM_SAMPLED,
        num_classes=VOCAB_SIZE,
    )
    loss = tf.reduce_mean(nce_losses, name='nce_loss_with_mean')

# Step 5: define the optimizer. `optimizer` is the op that applies one update step.
with tf.name_scope('optimizer'):
    sgd = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
    optimizer = sgd.minimize(loss)

utils.safe_mkdir('checkpoints')

# Saver covering every variable in the graph. Without it the trained weights are
# discarded when the session below closes and nothing is ever written to
# 'checkpoints/', so visualize() has no checkpoint from this run to restore.
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(iterator.initializer)
    sess.run(tf.global_variables_initializer())

    total_loss = 0.0  # accumulated loss over the current window of SKIP_STEP steps
    writer = tf.summary.FileWriter('./graphs/word2vec', sess.graph)

    # try/finally so the event file is flushed and closed even if training is
    # interrupted or raises.
    try:
        for index in range(NUM_TRAIN_STEPS):
            try:
                loss_batch, _ = sess.run([loss, optimizer])
            except tf.errors.OutOfRangeError:
                # The iterator is exhausted: restart it and move on to the next step.
                sess.run(iterator.initializer)
                continue

            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
                # Persist the weights at every reporting interval.
                saver.save(sess, 'checkpoints/skip-gram', global_step=index + 1)

        # Make sure the final weights are saved even when NUM_TRAIN_STEPS is not a
        # multiple of SKIP_STEP (otherwise the last periodic save already covers it).
        if NUM_TRAIN_STEPS % SKIP_STEP != 0:
            saver.save(sess, 'checkpoints/skip-gram', global_step=NUM_TRAIN_STEPS)
    finally:
        writer.close()

Downloading http://mattmahoney.net/dc/text8.zip
Successfully downloaded data/text8.zip
Average loss at step 4999:  75.0
Average loss at step 9999:  23.7
Average loss at step 14999:  13.2
Average loss at step 19999:   9.2
Average loss at step 24999:   7.4
Average loss at step 29999:   6.7
Average loss at step 34999:   6.3
Average loss at step 39999:   5.8
Average loss at step 44999:   5.7
Average loss at step 49999:   5.4
Average loss at step 54999:   5.4
Average loss at step 59999:   5.3
Average loss at step 64999:   5.1
Average loss at step 69999:   5.1
Average loss at step 74999:   5.0
Average loss at step 79999:   5.0
Average loss at step 84999:   5.0
Average loss at step 89999:   5.0
Average loss at step 94999:   4.9
Average loss at step 99999:   4.9


## Visualization in TensorBoard

In [9]:
# Settings consumed by visualize() below.
# NOTE(review): both names are also assigned, with the same values, in the
# hyperparameter and download-parameter cells earlier in the notebook; keep them in sync.
NUM_VISUALIZE = 3000        # number of tokens to visualize
VISUAL_FLD = 'visualization'

In [14]:
def visualize(visual_fld, num_visualize):
    """Export the first `num_visualize` embedding rows for the TensorBoard projector.

    Run `tensorboard --logdir=visualization` (or whichever folder was passed as
    `visual_fld`) to see the embeddings.

    Args:
        visual_fld: folder that receives the projector config and the embedding
            checkpoint; also passed to word2vec_utils.most_common_words.
        num_visualize: number of leading rows of `embed_matrix` to export.

    Reads the notebook-level `embed_matrix` variable and, when one is available,
    the latest checkpoint under 'checkpoints/'.
    """

    # create the list of num_visualize most common words to visualize
    word2vec_utils.most_common_words(visual_fld, num_visualize)

    # Restore only the variable that is actually read here. A Saver over the whole
    # graph would also cover the 'embedding' variable created by an earlier call to
    # this function, which is absent from the training checkpoint, so a second call
    # would fail on restore.
    saver = tf.train.Saver([embed_matrix])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))

        # if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            # Keep the original best-effort behaviour, but make it visible that the
            # exported embeddings are just the freshly initialized values.
            print('WARNING: no checkpoint found in checkpoints/; '
                  'visualizing untrained (randomly initialized) embeddings')

        final_embed_matrix = sess.run(embed_matrix)

        # you have to store embeddings in a new variable
        embedding_var = tf.Variable(final_embed_matrix[:num_visualize], name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter(visual_fld)
        try:
            # add embedding to the config file
            embedding = config.embeddings.add()
            embedding.tensor_name = embedding_var.name

            # link this tensor to its metadata file, in this case the first
            # num_visualize words of vocab
            embedding.metadata_path = 'vocab_' + str(num_visualize) + '.tsv'

            # saves a configuration file that TensorBoard will read during startup.
            projector.visualize_embeddings(summary_writer, config)
        finally:
            # The writer was previously left open; close it so its file handle is released.
            summary_writer.close()

        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, os.path.join(visual_fld, 'model.ckpt'), 1)

In [15]:
# Write the projector files for the first NUM_VISUALIZE embedding rows into VISUAL_FLD.
visualize(VISUAL_FLD, NUM_VISUALIZE)