In [1]:
# python 3
# -*- coding: utf-8 -*-
from __future__ import division, print_function, absolute_import

import collections
import os
import random
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

In [2]:
# ---------------------------------------------------------------------------
# Training schedule
# ---------------------------------------------------------------------------
# Learning rate handed to the optimizer
learning_rate = 0.1
# Number of (input word, context word) pairs per training step
batch_size = 128
# Total number of training steps
num_steps = 3000000
# The average loss is reported every `display_step` steps
display_step = 10000
# Nearest neighbours of `eval_words` are reported every `eval_step` steps
eval_step = 200000

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
# Words whose nearest neighbours are inspected (bytes literals)
eval_words = [
    b'five',
    b'of',
    b'going',
    b'hardware',
    b'american',
    b'britain',
]

# ---------------------------------------------------------------------------
# Word2Vec model
# ---------------------------------------------------------------------------
# Dimension of the embedding vector
embedding_size = 200
# Upper bound on the number of distinct words kept in the vocabulary
max_vocabulary_size = 50000
# Words that appear fewer than this many times are dropped
min_occurrence = 10
# How many words to consider to the left and to the right of the input word
skip_window = 3
# How many times an input word is reused to generate a label
num_skips = 2
# Number of negative examples to sample
num_sampled = 64

In [3]:
# Download a small chunk of Wikipedia articles collection
url = 'http://mattmahoney.net/dc/text8.zip'
data_path = 'text8.zip'
if not os.path.exists(data_path):
    print("Downloading the dataset... (It may take some time)")
    # Download under a temporary name and rename once complete, so that an
    # interrupted transfer never leaves a truncated 'text8.zip' behind that
    # the existence check above would accept on the next run.
    partial_path = data_path + '.part'
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, data_path)
    print("Done!")
# Unzip the dataset file. Text has already been processed
with zipfile.ZipFile(data_path) as f:
    # Read the first archive member as bytes, lower-case it and split it on
    # whitespace: text_words is therefore a list of bytes tokens.
    text_words = f.read(f.namelist()[0]).lower().split()

# len(text_words) is the number of tokens read, not the size of the archive
print("Finish loading {}, number of words is {}.".format(data_path, len(text_words)))

Downloading the dataset... (It may take some time)
Done!
Finish loading text8.zip, Size of zipfile is 17005207.


In [4]:

# Build the dictionary and replace rare words with UNK token.
# Count every distinct token once; the counter is reused further down for the
# "Unique words" statistic instead of building a second set over the corpus.
word_counter = collections.Counter(text_words)

# 'UNK' always occupies index 0; its real count is filled in once known.
count = [('UNK', -1)]
# Keep the most common words that occur at least 'min_occurrence' times.
# most_common() is ordered by decreasing count, so filtering here drops
# exactly the rare tail -- and, unlike popping from the end of 'count',
# it can never remove the 'UNK' placeholder when every word is rare.
count.extend(
    (word, occurrences)
    for word, occurrences in word_counter.most_common(max_vocabulary_size - 1)
    if occurrences >= min_occurrence
)

# Compute the vocabulary size
vocabulary_size = len(count)
# Assign an id to each word (its position in 'count')
word2id = {word: i for i, (word, _) in enumerate(count)}

# Encode the corpus as ids; words missing from the dictionary get id 0 ('UNK')
data = [word2id.get(word, 0) for word in text_words]
# Id 0 is only ever produced by the fallback above: the 'UNK' key is a str
# while corpus tokens are bytes, so no token can match it directly.
unk_count = data.count(0)
count[0] = ('UNK', unk_count)
# Reverse mapping, id -> word
id2word = {i: word for word, i in word2id.items()}

print("Words count:", len(text_words))
print("Unique words:", len(word_counter))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:50])


Words count: 17005207
Unique words: 253854
Vocabulary size: 47135
Most common words: [('UNK', 444176), (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764), (b'in', 372201), (b'a', 325873), (b'to', 316376), (b'zero', 264975), (b'nine', 250430), (b'two', 192644), (b'is', 183153), (b'as', 131815), (b'eight', 125285), (b'for', 118445), (b's', 116710), (b'five', 115789), (b'three', 114775), (b'was', 112807), (b'by', 111831), (b'that', 109510), (b'four', 108182), (b'six', 102145), (b'seven', 99683), (b'with', 95603), (b'on', 91250), (b'are', 76527), (b'it', 73334), (b'from', 72871), (b'or', 68945), (b'his', 62603), (b'an', 61925), (b'be', 61281), (b'this', 58832), (b'which', 54788), (b'at', 54576), (b'he', 53573), (b'also', 44358), (b'not', 44033), (b'have', 39712), (b'were', 39086), (b'has', 37866), (b'but', 35358), (b'other', 32433), (b'their', 31523), (b'its', 29567), (b'first', 28810), (b'they', 28553), (b'some', 28161), (b'had', 28100)]


In [5]:
data_index = 0
# Generate training batch for the skip-gram model
def next_batch(batch_size, num_skips, skip_window):
    """Return the next (batch, labels) pair of skip-gram training samples.

    Reads word ids from the module-level list `data`, starting at the
    module-level cursor `data_index`, and advances that cursor (wrapping
    around at the end of `data`).

    Args:
        batch_size: number of samples to produce; must be a multiple of
            `num_skips`.
        num_skips: number of context words sampled for each input word;
            must not exceed `2 * skip_window`.
        skip_window: number of words considered on each side of the input
            word.

    Returns:
        batch: int32 array of shape (batch_size,) holding input word ids,
            each input word repeated `num_skips` times.
        labels: int32 array of shape (batch_size, 1) holding the sampled
            context word id for each entry of `batch`.

    Raises:
        AssertionError: if the batch_size / num_skips / skip_window
            constraints above are violated.
        ValueError: if `data` holds fewer words than one full window.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # get window size (words left and right + current one)
    span = 2 * skip_window + 1
    if len(data) < span:
        # A shorter corpus cannot fill the window; fail with a clear message
        # instead of an IndexError from deep inside the sampling loop.
        raise ValueError(
            "data holds %d words but a window needs %d" % (len(data), span))
    # Both arrays are fully overwritten below, so uninitialised storage is fine
    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)
    # Positions inside the window that are context (everything but the centre);
    # this does not depend on the loop variable, so build it once.
    context_words = [w for w in range(span) if w != skip_window]
    # Sliding window over `data`; maxlen makes append() drop the oldest word
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        # Pick `num_skips` distinct context positions for this centre word
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]

        if data_index == len(data):
            # Reached the end of the corpus: restart the window at the beginning
            buffer.extend(data[0:span])
            data_index = span
        else:
            # Slide the window one word to the right
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


In [6]:

# Input data: a 1-D batch of word ids
X = tf.placeholder(tf.int32, shape=[None])
# Input label: one context word id per input word
Y = tf.placeholder(tf.int32, shape=[None, 1])

# Ensure the following ops & var are assigned on CPU
# (some ops are not compatible on GPU)
with tf.device('/cpu:0'):
    # Create the embedding variable (each row represent a word embedding vector)
    embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))

    # Lookup the corresponding embedding vectors for each sample in X
    X_embed = tf.nn.embedding_lookup(embedding, X)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Compute the average NCE loss for the batch
loss_op = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=Y,
                   inputs=X_embed,
                   num_sampled=num_sampled,
                   num_classes=vocabulary_size))

# Define the optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluation
# Compute the cosine similarity between input data embedding and every embedding vectors.
# Each row must be divided by its OWN L2 norm (axis 1, keepdims so the division
# broadcasts per row). Reducing over the whole batch instead would scale every
# row by one shared constant and the result would not be a cosine similarity.
X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed), 1, keepdims=True))
embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
# Row i holds the similarity of input word i to every vocabulary word
cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)

In [None]:
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Prepare checkpointing BEFORE the long training run, so that a problem with
# the saver or the output directory surfaces immediately rather than after
# millions of steps. The directory is created explicitly because saving may
# fail when the parent directory is missing (depends on the TensorFlow
# version -- NOTE(review): confirm for the version in use).
checkpoint_path = "./model-wiki/model.ckpt"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
saver = tf.train.Saver()

# Number of nearest neighbors printed for each evaluation word
top_k = 8

with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    # Testing data: ids of the evaluation words
    # (raises KeyError if a word is not in the vocabulary)
    x_test = np.array([word2id[w] for w in eval_words])

    average_loss = 0
    # Number of losses accumulated in average_loss since the last report;
    # dividing by this (not by display_step) keeps the first window, which
    # only covers display_step - 1 steps after the step-1 report, exact.
    steps_since_display = 0
    for step in range(1, num_steps + 1):
        # Get a new batch of data
        batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
        # Run training op
        _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
        average_loss += loss
        steps_since_display += 1

        if step % display_step == 0 or step == 1:
            average_loss /= steps_since_display
            print("Step " + str(step) + ", Average Loss= " + \
                  "{:.4f}".format(average_loss))
            average_loss = 0
            steps_since_display = 0

        # Evaluation
        if step % eval_step == 0 or step == 1:
            print("Evaluation...")
            sim = sess.run(cosine_sim_op, feed_dict={X: x_test})
            for i in range(len(eval_words)):
                # Sort by decreasing similarity and skip the first entry,
                # which is expected to be the query word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = '"%s" nearest neighbors:' % eval_words[i]
                for k in range(top_k):
                    log_str = '%s %s,' % (log_str, id2word[nearest[k]])
                print(log_str)

    # Persist the trained variables
    saver.save(sess, checkpoint_path)

Step 1, Average Loss= 546.0007
Evaluation...
"b'five'" nearest neighbors: b'hawthorne', b'wargame', b'alternate', b'venturi', b'thaw', b'neighboring', b'overwhelming', b'ppg',
"b'of'" nearest neighbors: b'bureaucracies', b'cellular', b'crept', b'ghibli', b'covariance', b'amnon', b'stadiums', b'tidings',
"b'going'" nearest neighbors: b'trailers', b'indicted', b'captures', b'architect', b'heschel', b'characterise', b'refinement', b'hiram',
"b'hardware'" nearest neighbors: b'julia', b'intimacy', b'bertolt', b'radios', b'replacing', b'achievable', b'llc', b'habitat',
"b'american'" nearest neighbors: b'annealing', b'colombian', b'martinez', b'walters', b'hellish', b'crist', b'sligo', b'levee',
"b'britain'" nearest neighbors: b'correlatives', b'allegation', b'acceptability', b'adversarial', b'contrasting', b'chosen', b'proverbial', b'corn',
Step 10000, Average Loss= 199.4634
Step 20000, Average Loss= 94.1371
Step 30000, Average Loss= 65.3162
Step 40000, Average Loss= 50.9172
Step 50000, Aver

Step 1210000, Average Loss= 6.5289
Step 1220000, Average Loss= 6.3697
Step 1230000, Average Loss= 6.4436
Step 1240000, Average Loss= 6.4126
Step 1250000, Average Loss= 6.3676
Step 1260000, Average Loss= 6.3348
Step 1270000, Average Loss= 6.0910
Step 1280000, Average Loss= 6.3760
Step 1290000, Average Loss= 6.3235
Step 1300000, Average Loss= 6.4666
Step 1310000, Average Loss= 6.2316
Step 1320000, Average Loss= 6.2973
Step 1330000, Average Loss= 6.3601
Step 1340000, Average Loss= 6.2467
Step 1350000, Average Loss= 6.2259
Step 1360000, Average Loss= 6.2797
Step 1370000, Average Loss= 6.2300
Step 1380000, Average Loss= 6.2448
Step 1390000, Average Loss= 6.2611
Step 1400000, Average Loss= 6.2005
Evaluation...
"b'five'" nearest neighbors: b'four', b'three', b'six', b'eight', b'seven', b'two', b'one', b'zero',
"b'of'" nearest neighbors: b'the', b'including', b'and', b'its', b'for', b'especially', b'both', b'group',
"b'going'" nearest neighbors: b'their', b'human', b'business', b'to', b'for', 

Step 2410000, Average Loss= 5.6640
Step 2420000, Average Loss= 5.7203
Step 2430000, Average Loss= 5.7089
Step 2440000, Average Loss= 5.7183
