In [1]:
import collections
import re

import numpy as np
import pandas as pd
import tensorflow as tf

import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize

nltk.download('reuters')
nltk.download('punkt')

N_DOCS = 100 # Only use first N_DOCS Reuters docs
VOCAB_SIZE = 50000
TEXT_WINDOW_SIZE = 8
BATCH_SIZE = 10 * TEXT_WINDOW_SIZE
EMBEDDING_SIZE = 128
PV_TEST_SET_PERCENTAGE = 5
NUM_STEPS = 100001
LEARNING_RATE = 0.1
NUM_SAMPLED = 64
REPORT_EVERY_X_STEPS = 100

# Token integer ids for special tokens
UNK = 0
NULL = 1

[nltk_data] Downloading package reuters to /Users/richard/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /Users/richard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def accept(word):
    # Accept if not only Unicode non-word characters are present
    return re.sub(r'\W', '', word) != ''

In [3]:
def normalize(word):
    return word.lower()

In [4]:
def build_dataset():
    doc2words = {docid: [normalize(word) for word in word_tokenize(
            reuters.raw(fileid)) if accept(word)] \
            for docid, fileid in enumerate(
                    reuters.fileids()[:N_DOCS])}
    count = [['__UNK__', 0], ['__NULL__', 0]]
    count.extend(collections.Counter(
            [word for words in doc2words.values() \
                    for word in words]).most_common(VOCAB_SIZE - 2))
    assert not set(['__UNK__', '__NULL__']) & set(next(zip(
            *count[2:])))
    dictionary = {}
    for i, (word, _) in enumerate(count):
        dictionary[word] = i
    reverse_dictionary = dict(zip(dictionary.values(),
                                  dictionary.keys()))
    data = []
    doclens = []
    for docid, words in doc2words.items():
        for word in words:
            if word in dictionary:
                wordid = dictionary[word]
            else:
                wordid = UNK
                count[UNK][1] += 1
            data.append((docid, wordid))
        # Pad with NULL values if necessary
        doclen = len(words)
        doclens.append(doclen)
        if doclen < TEXT_WINDOW_SIZE:
            n_nulls = TEXT_WINDOW_SIZE - doclen
            data.extend([(docid, NULL)] * n_nulls)
            count[NULL][1] += n_nulls
    return data, count, doclens, dictionary, reverse_dictionary

In [5]:
data, count, doclens, dictionary, reverse_dictionary = \
        build_dataset()

In [6]:
print('Number of documents:', len(set(next(zip(*data)))))
print('Number of tokens:', len(data))
print('Number of unique tokens:', len(count))
assert len(data) == sum([i for _, i in count])
print('Most common words (+UNK and NULL):', count[:5])
print('Least common words:', count[-5:])
print('Sample data:', data[:5])

vocab_size = min(VOCAB_SIZE, len(count))

Number of documents: 100
Number of tokens: 15955
Number of unique tokens: 3321
Most common words (+UNK and NULL): [['__UNK__', 0], ['__NULL__', 0], ('the', 832), ('of', 466), ('to', 411)]
Least common words: [('computer-related', 1), ('context', 1), ('undertook', 1), ('seoul', 1), ('seriousness', 1)]
Sample data: [(0, 1471), (0, 846), (0, 2748), (0, 479), (0, 25)]


In [7]:
pd.Series(doclens).describe()

count    100.000000
mean     159.550000
std      167.645271
min       20.000000
25%       52.500000
50%      106.500000
75%      198.250000
max      779.000000
dtype: float64

In [8]:
def get_text_window_center_positions():
    # If TEXT_WINDOW_SIZE is even, then define text_window_center
    # as left-of-middle-pair
    doc_start_indexes = [0]
    last_docid = data[0][0]
    for i, (d, _) in enumerate(data):
        if d != last_docid:
            doc_start_indexes.append(i)
            last_docid = d
    twcp = []
    for i in range(len(doc_start_indexes) - 1):
        twcp.extend(list(range(
                doc_start_indexes[i] + (TEXT_WINDOW_SIZE - 1) // 2,
                doc_start_indexes[i + 1] - TEXT_WINDOW_SIZE // 2
                )))
    return doc_start_indexes, twcp

In [9]:
doc_start_indexes, twcp = get_text_window_center_positions()

In [10]:
def get_train_test():
    global twcp
    np.random.shuffle(twcp)
    split_point = (len(twcp) // 100) * (100 - PV_TEST_SET_PERCENTAGE)
    twcp_train = twcp[:split_point]

    # Test set data must come from known documents
    docids_train = set([data[i][0] for i in twcp_train])
    twcp_test = []
    twcp_test_reject = []
    for i in twcp[split_point:]:
        if data[i][0] in docids_train:
            twcp_test.append(i)
        else:
            twcp_test_reject.append(i)
    twcp_train.extend(twcp_test_reject)
    if not twcp_test:
        raise ValueError(
            'No test data, try increasing PV_TEST_SET_PERCENTAGE')
    return twcp_train, twcp_test

In [11]:
twcp_train, twcp_test = get_train_test()

In [12]:
print('Effective test set percentage: {} out of {}, {:.1f}%'.format(
        len(twcp_test), len(twcp_test) + len(twcp_train),
        100 * len(twcp_test) / (len(twcp_test) + len(twcp_train))))

Effective test set percentage: 811 out of 15061, 5.4%


In [13]:
np.random.shuffle(twcp_train)

In [14]:
twcp_train_index = 0

def generate_batch_single_twcp(twcp, i, batch, labels):
    tw_start = twcp - (TEXT_WINDOW_SIZE - 1) // 2
    tw_end = twcp + TEXT_WINDOW_SIZE // 2 + 1
    docids, wordids = zip(*data[tw_start:tw_end])
    batch_slice = slice(i * TEXT_WINDOW_SIZE,
                        (i+1) * TEXT_WINDOW_SIZE)
    batch[batch_slice] = docids
    labels[batch_slice, 0] = wordids
    
def generate_batch():
    global twcp_train_index
    batch = np.ndarray(shape=(BATCH_SIZE,), dtype=np.int32)
    labels = np.ndarray(shape=(BATCH_SIZE, 1), dtype=np.int32)
    for i in range(BATCH_SIZE // TEXT_WINDOW_SIZE):
        generate_batch_single_twcp(twcp_train[twcp_train_index],
                                   i, batch, labels)
        twcp_train_index = (twcp_train_index + TEXT_WINDOW_SIZE) \
                % len(twcp_train)
    return batch, labels

In [15]:
batch, labels = generate_batch()

In [16]:
test_dataset_ = np.ndarray(shape=(len(twcp_test) * TEXT_WINDOW_SIZE,),
                          dtype=np.int32)
test_labels_ = np.ndarray(shape=(len(twcp_test) * TEXT_WINDOW_SIZE,
                                1),
                         dtype=np.int32)
for i in range(len(twcp_test)):
    generate_batch_single_twcp(twcp_test[i], i, test_dataset_, 
                              test_labels_)

In [17]:
graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):
    
    # Input data
    train_dataset = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
    train_labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])
    test_dataset = tf.constant(test_dataset_, dtype=tf.int32)
    test_labels = tf.constant(test_labels_, dtype=tf.int32)
    
    # Weights
    embeddings = tf.Variable(
            tf.random_uniform([len(doclens), EMBEDDING_SIZE],
                              -1.0, 1.0))
    softmax_weights = tf.Variable(
            tf.truncated_normal(
                    [vocab_size, EMBEDDING_SIZE],
                    stddev=1.0 / np.sqrt(EMBEDDING_SIZE)))
    softmax_biases = tf.Variable(tf.zeros([vocab_size]))
    
    # Model
    # Look up embeddings for inputs
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative
    # labels each time
    loss = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(
                    softmax_weights, softmax_biases, embed,
                    train_labels, NUM_SAMPLED, vocab_size))
    
    # Optimizer
    optimizer = tf.train.AdagradOptimizer(LEARNING_RATE).minimize(
            loss)
    
    # Test loss
    test_embed = tf.nn.embedding_lookup(embeddings, test_dataset)
    test_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                    tf.matmul(test_embed, tf.transpose(
                              softmax_weights)) + softmax_biases,
                    test_labels[:, 0]))
    
    # Normalized embeddings (to use cosine similarity later on)
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1,
                                  keep_dims=True))
    normalized_embeddings = embeddings / norm

In [None]:
with tf.Session(graph=graph) as session:
    session.run(tf.global_variables_initializer())
    print('Initialized')
    avg_training_loss = 0
    for step in range(NUM_STEPS):
        batch_data, batch_labels = generate_batch()
        feed_dict = {train_dataset: batch_data,
                     train_labels: batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        avg_training_loss += l
        if step % REPORT_EVERY_X_STEPS == 0:
            if step > 0:
                avg_training_loss = \
                        avg_training_loss / REPORT_EVERY_X_STEPS
            # The average loss is an estimate of the loss over the
            # last REPORT_EVERY_X_STEPS batches
            print('Average loss at step {:d}: {:.1f}'.format(
                    step, avg_training_loss))
            avg_training_loss = 0
            test_l = test_loss.eval()
            print('Test loss at step {:d}: {:.1f}'.format(
                    step, test_l))

Initialized
Average loss at step 0: 5.7
Test loss at step 0: 8.2
Average loss at step 100: 5.4
Test loss at step 100: 7.7
Average loss at step 200: 4.9
Test loss at step 200: 7.3
Average loss at step 300: 4.6
Test loss at step 300: 7.0
Average loss at step 400: 4.3
Test loss at step 400: 6.8
Average loss at step 500: 4.2
Test loss at step 500: 6.7
Average loss at step 600: 4.0
Test loss at step 600: 6.5
Average loss at step 700: 3.9
Test loss at step 700: 6.4
Average loss at step 800: 3.7
Test loss at step 800: 6.3
Average loss at step 900: 3.6
Test loss at step 900: 6.2
Average loss at step 1000: 3.5
Test loss at step 1000: 6.2
Average loss at step 1100: 3.5
Test loss at step 1100: 6.1
Average loss at step 1200: 3.4
Test loss at step 1200: 6.0
Average loss at step 1300: 3.4
Test loss at step 1300: 6.0
Average loss at step 1400: 3.3
Test loss at step 1400: 5.9
Average loss at step 1500: 3.2
Test loss at step 1500: 5.9
Average loss at step 1600: 3.2
Test loss at step 1600: 5.8
Average l