In [28]:
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
import pandas as pd
import pickle

In [29]:
# Path of the pickled corpus that read_data() loads via pandas.read_pickle.
filename = 'data.en'

In [30]:
def read_data(filename):
    """Load a pickled collection of text lines and return a flat token list.

    Args:
        filename: Path of a pickle file readable by ``pandas.read_pickle``.
            Iterating the loaded object must yield strings.

    Returns:
        list of str: The lines joined with single spaces, stripped of leading
        and trailing whitespace, then split on the single-space character.
        Consecutive spaces therefore produce empty-string tokens, and an
        empty input produces ``['']``.

    Raises:
        TypeError: If an item of the loaded object is not a string.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    data = pd.read_pickle(filename)

    # A single join replaces the per-line `text = text + " " + line`, which
    # recopied the growing string on every iteration (quadratic in corpus size).
    text = " ".join(data)

    # split(' ') rather than split() is kept on purpose so that tokenisation
    # (and therefore the vocabulary built from it) is unchanged.
    return text.strip().split(' ')

# Tokenise the corpus once; `words` is the flat token list from which the
# vocabulary is built.
words = read_data(filename)
print("Data size %d" % len(words))

Data size 166908


In [31]:
# Number of distinct tokens in the corpus. build_dataset() reads this global and
# keeps the (vocabulary_size - 1) most frequent tokens next to the 'UNK' entry,
# so the least frequent distinct token falls outside the dictionary and is
# counted as 'UNK'.
vocabulary_size = len(set(words))

def build_dataset(words, vocab_size=None):
    """Encode a token sequence as integer ids, with id 0 reserved for 'UNK'.

    Args:
        words: Sequence of string tokens. It is iterated more than once, so it
            must not be a one-shot iterator.
        vocab_size: Total number of vocabulary slots including the 'UNK' slot;
            the ``vocab_size - 1`` most frequent tokens receive their own id.
            Defaults to the module-level ``vocabulary_size``.

    Returns:
        tuple: ``(data, count, dictionary, reverse_dictionary)`` where
            data: list of int ids, one per input token (0 for unknown tokens).
            count: list starting with ``['UNK', n_unknown]`` followed by
                ``(token, frequency)`` tuples in descending frequency.
            dictionary: dict mapping token -> id.
            reverse_dictionary: dict mapping id -> token.
    """
    if vocab_size is None:
        # Backward-compatible default: fall back to the notebook-level setting.
        vocab_size = vocabulary_size

    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocab_size - 1))

    # Ids follow the order of `count`: 0 is 'UNK', then tokens by frequency.
    # setdefault keeps the first id if a token repeats (i.e. a literal 'UNK'
    # token in the corpus); plain assignment would move 'UNK' off id 0 and hand
    # the same id to the next token as well.
    dictionary = {}
    for word, _ in count:
        dictionary.setdefault(word, len(dictionary))

    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            # Out-of-vocabulary token: share the 'UNK' id and tally it.
            index = 0
            unk_count += 1
        data.append(index)

    # Replace the -1 placeholder with the real number of unknown tokens.
    count[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary

# Encode the corpus: `data` holds one integer id per token and `count` the
# frequency entries with 'UNK' first.
data, count, dictionary, reverse_dictionary = build_dataset(words)
print("Most common words (+UNK)", count[:5])
print("Sample data", data[:10])
# Drop the raw token list; the cells shown below only use the id-encoded `data`.
del words

Most common words (+UNK) [['UNK', 1], ('the', 8396), ('to', 4939), ('a', 3631), ('s', 3525)]
Sample data [968, 81, 151, 26, 507, 1751, 697, 507, 1216, 1]


In [32]:
# Cursor into the global `data` list. generate_batch() advances it (wrapping at
# the end of `data`) so that successive calls walk through the corpus.
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram batch of (centre id, context id) pairs.

    Reads the module-level ``data`` sequence starting at ``data_index`` and
    advances ``data_index`` so that the next call continues with the centre
    word immediately after the last one used here.

    Args:
        batch_size: Number of (centre, context) pairs to emit. Must be a
            multiple of ``num_skips``.
        num_skips: Number of distinct context words drawn per centre word.
            Must not exceed ``2 * skip_window``.
        skip_window: Number of positions considered on each side of the
            centre word.

    Returns:
        tuple: ``(batch, labels)`` where ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding centre ids and ``labels`` is an int32 array
        of shape ``(batch_size, 1)`` holding the matching context ids.

    Raises:
        AssertionError: If the two argument constraints above are violated.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window | centre | skip_window ]
    buffer = collections.deque(maxlen=span)

    # Prime the sliding window with the first `span` ids.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        target = skip_window
        # Window positions already used for this centre (the centre itself first).
        targets_to_avoid = [skip_window]

        for j in range(num_skips):
            # Rejection-sample a window position that has not been used yet.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]

        # Slide the window one position; maxlen drops the oldest id.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Backtrack by one window. The next call re-primes its buffer from
    # data_index, so without this step `span` positions would never serve as
    # centre words between consecutive batches.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

# Show the first eight tokens so the sample batches below can be checked by eye.
print("Data:", [reverse_dictionary[di] for di in data[:8]])

# Generate one small batch per (num_skips, skip_window) setting. data_index is
# reset before each call so both settings start at the beginning of `data`; it
# is not reset afterwards, so later calls continue from wherever this leaves it.
# NOTE: the loop variables bind the module-level names num_skips/skip_window,
# which the hyperparameter cell assigns again afterwards.
for num_skips, skip_window in [(2, 1), (4, 2)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print("\nwith num_skips = %d and skip_window = %d:" % (num_skips, skip_window))
    print("    batch:", [reverse_dictionary[bi] for bi in batch])
    # reshape(8) flattens the (8, 1) label array; 8 matches batch_size above.
    print("    labels:", [reverse_dictionary[di] for di in labels.reshape(8)])

Data: ['give', 'your', 'application', 'an', 'accessibility', 'workout', 'accerciser', 'accessibility']

with num_skips = 2 and skip_window = 1:
    batch: ['your', 'your', 'application', 'application', 'an', 'an', 'accessibility', 'accessibility']
    labels: ['application', 'give', 'your', 'an', 'accessibility', 'application', 'an', 'workout']

with num_skips = 4 and skip_window = 2:
    batch: ['application', 'application', 'application', 'application', 'an', 'an', 'an', 'an']
    labels: ['accessibility', 'give', 'your', 'an', 'your', 'accessibility', 'application', 'workout']


In [33]:
# --- Hyperparameters -------------------------------------------------------
# Number of (centre, context) pairs fed per training step.
batch_size = 128
# Length of each embedding vector.
embedding_size = 128
# Context positions considered on each side of the centre word.
skip_window = 1
# Context words drawn per centre word.
num_skips = 2
# Number of words whose nearest neighbours are printed during training.
valid_size = 16
# Validation ids are drawn from range(valid_window); build_dataset assigns ids
# in descending frequency order, so these are among the most frequent entries.
valid_window = 100
# No random seed is set, so the chosen validation words differ between runs.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
# Passed as `num_sampled` to tf.nn.sampled_softmax_loss below.
num_sampled = 64

graph = tf.Graph()

# Build the whole model inside `graph`, with every op pinned to the CPU.
with graph.as_default(), tf.device('/cpu:0'):
    # Inputs: centre-word ids and their context-word ids, fed at each step.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # Trainable parameters: one embedding row per vocabulary id, plus the
    # weights and biases consumed by the sampled-softmax loss.
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Look up the embedding rows of the batch and average the sampled-softmax
    # loss over the batch.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                                    labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))
    
    # Adagrad with a learning rate of 1.0.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    
    # Divide every embedding row by its L2 norm; the dot products computed
    # below are then cosine similarities between each validation word and
    # every vocabulary entry.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

In [34]:
# Number of training steps; the extra 1 makes the final step (100000) hit the
# reporting conditions below.
num_steps = 100001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    # Running sum of per-step losses since the last report.
    average_loss = 0
    
    for step in range(num_steps):
        # One optimisation step on a freshly generated batch.
        batch_data, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_dataset:batch_data, train_labels:batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        
        # Every 2000 steps report the mean loss over that interval. At step 0
        # only a single loss has been accumulated, so it is printed undivided.
        if step % 2000 == 0:
            if step > 0:
                average_loss = average_loss / 2000
            
            print("Average loss at step %d: %f" % (step, average_loss))
            average_loss = 0
        
        # Every 10000 steps print the nearest neighbours of each validation word.
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8
                # Sort by descending similarity and skip position 0, which is
                # expected to be the word itself (cosine similarity 1).
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    # Materialise the L2-normalised embeddings while the session is still open.
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0: 5.591823
Nearest to target: replying, encountered, mb, duplicated, malformed, larger, profiler, number,
Nearest to video: beleaguered, occured, cmd, determines, occupied, ll, fill, composing,
Nearest to save: identified, brasero, defaults, startup, number, autocomplete, holding, wire,
Nearest to burning: bar, anchors, browser, initializer, searches, identify, handle, quatorze,
Nearest to not: burst, screens, press, content, duration, unknown, aunt, later,
Nearest to is: displaying, lear, like, reject, revious, dvdauthor, image, thirteen,
Nearest to or: sampleplugin, focusable, fill, zoom, streets, complie, delegated, larger,
Nearest to cd: fifth, un, wrong, classes, view, string, won, capable,
Nearest to remove: spanish, name, rounds, subscribed, crash, device, tags, mh,
Nearest to i: url, variable, gconf, specified, gr, leadin, know, bisect,
Nearest to replace: touch, bold, should, locked, explorer, redistribute, update, adds,
Nearest to plugin: cap

Average loss at step 52000: 1.582450
Average loss at step 54000: 1.536024
Average loss at step 56000: 1.523003
Average loss at step 58000: 1.560395
Average loss at step 60000: 1.519291
Nearest to target: group, descending, pascal, videos, scuffle, xml, associated, dubious,
Nearest to video: autocomplete, photo, drop, svcds, recordable, signora, downloads, unsaved,
Nearest to save: suggest, script, pixmap, natural, truncated, cdrkit, lu, show,
Nearest to burning: directly, euro, ongoing, integrity, ukrainian, grouping, normalization, listed,
Nearest to not: deactivated, choice, menus, looks, renamed, keywords, incomplete, possible,
Nearest to is: because, updated, splitting, marshaller, has, was, named, if,
Nearest to or: among, rw, numbers, keys, books, atoms, thirteen, streets,
Nearest to cd: choice, plain, genisoimage, infromation, bars, wrong, ctivate, ensure,
Nearest to remove: canfield, place, next, move, seven, ok, of, fours,
Nearest to i: japanese, normal, acceptable, bisect, en

In [35]:
# Bundle the trained vectors together with both vocabulary lookups for export.
output = dict(
    embeddings=final_embeddings,
    dictionary=dictionary,
    reverse_dictionary=reverse_dictionary,
)

In [36]:
# Persist the bundle with pickle; the file is opened in binary mode because
# pickle writes bytes.
with open('embeddings.en', 'wb') as f:
    pickle.dump(output, f)