In [50]:
#Source: http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/
#https://github.com/adventuresinML/adventures-in-ml-code/blob/master/keras_word2vec.py
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge, dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import urllib
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

In [51]:
def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        filename: Local path of the file. It is also appended to ``url`` to
            form the address the file is fetched from.
        url: Base URL (including the trailing slash) that hosts ``filename``.
        expected_bytes: Exact size in bytes the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size of the local file differs from
            ``expected_bytes``.
    """
    # A bare `import urllib` does not load the `urllib.request` submodule, so
    # import it explicitly here; otherwise the attribute lookup below can fail
    # with AttributeError on a fresh kernel.
    import urllib.request

    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found to help diagnose a bad download.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename

In [52]:
# Fetch the text8 archive from the base URL (or reuse the local copy) and
# check that its size matches the expected number of bytes.
url = 'http://mattmahoney.net/dc/'
filename = maybe_download('text8.zip', url, 31344016)

Found and verified text8.zip


In [53]:
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path to a zip archive.

    Returns:
        The whitespace-separated tokens of the archive's first member, decoded
        as UTF-8.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # Plain UTF-8 decoding; no need to route the bytes through TensorFlow.
        text = archive.read(first_member).decode('utf-8')
    return text.split()

In [54]:
# Load the corpus as a flat list of whitespace-separated tokens and preview
# the first seven of them.
vocabulary = read_data(filename)
print(vocabulary[:7])
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']


['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']

In [55]:
def build_dataset(words, n_words):
    """Turn a token sequence into integer ids plus lookup tables.

    Args:
        words: Sequence of tokens (iterated more than once).
        n_words: Vocabulary size including the 'UNK' slot at index 0.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)`` where
        ``data`` is the id of every token (0 for tokens outside the
        vocabulary), ``count`` holds ``['UNK', n_unknown]`` followed by the
        ``(word, frequency)`` pairs of the kept words, ``dictionary`` maps
        word -> id and ``reversed_dictionary`` maps id -> word.
    """
    count = [['UNK', -1]] + collections.Counter(words).most_common(n_words - 1)

    # Ids follow the order of `count`: 'UNK' first, then by falling frequency.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    # Tokens that did not make it into the vocabulary fall back to id 0.
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = sum(1 for token in words if token not in dictionary)

    reversed_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

In [56]:
def remove_stop_words(vocab):
    """Drop English stop words and very short tokens.

    The result is built in a fresh local list on every call. Accumulating into
    a module-level list would make repeated calls append duplicates and would
    keep the whole filtered corpus alive after callers delete their reference.

    Args:
        vocab: Iterable of word strings.

    Returns:
        A new list with the words of ``vocab`` that are not NLTK English stop
        words and are longer than two characters, in their original order.
    """
    stop_words = set(stopwords.words('english'))
    return [word for word in vocab
            if word not in stop_words and len(word) > 2]

In [57]:
def collect_data(vocabulary_size=10000):
    """Download text8, filter it and encode it as integer ids.

    Prints the first seven filtered words as a quick sanity check.

    Args:
        vocabulary_size: Number of vocabulary slots handed to
            ``build_dataset`` (including the 'UNK' slot).

    Returns:
        The ``(data, count, dictionary, reverse_dictionary)`` tuple produced
        by ``build_dataset`` for the filtered corpus.
    """
    archive_path = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    kept_words = remove_stop_words(read_data(archive_path))
    print(kept_words[:7])
    encoded = build_dataset(kept_words, vocabulary_size)
    del kept_words  # Hint to reduce memory.
    data, count, dictionary, reverse_dictionary = encoded
    return data, count, dictionary, reverse_dictionary

In [58]:
# Build the integer-encoded corpus. Words outside the kept vocabulary are
# mapped to index 0 ('UNK') by build_dataset.
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

Found and verified text8.zip
['anarchism', 'originated', 'term', 'abuse', 'first', 'used', 'early']
[5010, 2890, 97, 2941, 12, 15, 47]


In [59]:
# Context window size passed to skipgrams().
window_size = 3
# Length of each embedding vector.
vector_dim = 300
# Number of train_on_batch steps in the training loop; each step uses a single
# (target, context) pair, so this is not a count of passes over the corpus.
epochs = 200000

In [60]:
# Words used for the nearest-neighbour sanity check.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# Draws valid_size distinct word ids from 0..valid_window-1, i.e. the most
# frequent ids (0 is 'UNK'). NOTE(review): no random seed is set in the cells
# shown here, so the selection differs between runs.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [61]:
# Rank-based sampling probabilities used by skipgrams() to sub-sample words
# (see the Keras preprocessing docs for the exact formula).
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(
    data,
    vocab_size,
    window_size=window_size,
    sampling_table=sampling_table,
)
# Split the [target, context] pairs into two parallel int32 arrays.
word_target, word_context = (
    np.array(column, dtype="int32") for column in zip(*couples)
)

print(couples[:10], labels[:10])

[[9557, 3], [312, 3426], [5966, 8233], [2499, 4154], [3760, 8736], [8982, 4032], [3656, 1104], [2615, 3182], [458, 4], [845, 2140]] [1, 0, 0, 1, 0, 0, 1, 1, 1, 1]


In [62]:
# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

In [63]:
# setup a cosine similarity operation which will be output in a secondary model
# Both tensors have shape (batch, vector_dim, 1), so the embedding values live
# on axis 1. Axis 0 is the batch axis and must not be contracted or normalised
# over. normalize=True L2-normalises along the chosen axis, which turns the dot
# product into a cosine similarity.
similarity = dot([target, context], axes=1, normalize=True)

# now perform the dot product operation to get a similarity measure
dot_product = dot([target, context], axes=1, normalize=False)
dot_product = Reshape((1,))(dot_product)

# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)

# create the primary training model
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# create a secondary validation model to run our similarity checks during training
# It shares the embedding weights with `model` and has no trainable parts of
# its own, so it does not need to be compiled to be used for prediction.
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

In [None]:
class SimilarityCallback:
    """Prints the nearest neighbours of the validation words.

    Relies on the notebook-level names ``valid_size``, ``valid_examples``,
    ``reverse_dictionary``, ``vocab_size`` and ``validation_model``.
    """

    def run_sim(self):
        """Print, for every validation word, its closest words by similarity."""
        top_k = 8  # number of nearest neighbors
        for position in range(valid_size):
            word_idx = valid_examples[position]
            valid_word = reverse_dictionary[word_idx]
            scores = self._get_sim(word_idx)
            # Sort by falling similarity and skip the first hit, which is
            # expected to be the word itself.
            nearest = (-scores).argsort()[1:top_k + 1]
            pieces = ['Nearest to %s:' % valid_word]
            for rank in range(top_k):
                pieces.append('%s,' % reverse_dictionary[nearest[rank]])
            print(' '.join(pieces))

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return the similarity of one word against every vocabulary id.

        Runs ``validation_model`` once per vocabulary entry, each time on a
        single (word, candidate) pair.
        """
        target_batch = np.array([valid_word_idx], dtype="float64")
        context_batch = np.zeros((1,))
        sim = np.zeros((vocab_size,))
        for candidate in range(vocab_size):
            context_batch[0,] = candidate
            sim[candidate] = validation_model.predict_on_batch(
                [target_batch, context_batch])
        return sim
sim_cb = SimilarityCallback()

# Reusable single-element buffers: target word, context word and label.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    # randint's upper bound is exclusive, so pass len(labels) to make every
    # training pair (including the last one) eligible for sampling.
    idx = np.random.randint(0, len(labels))
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    # One gradient step on a single (target, context, label) sample.
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 100 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
#     if cnt % 10000 == 0:
#         sim_cb.run_sim()

In [64]:
# Display the word -> index mapping returned by collect_data; as the last
# expression of the cell, the whole dictionary is rendered as output.
dictionary

{'UNK': 0,
 'one': 1,
 'zero': 2,
 'nine': 3,
 'two': 4,
 'eight': 5,
 'five': 6,
 'three': 7,
 'four': 8,
 'six': 9,
 'seven': 10,
 'also': 11,
 'first': 12,
 'many': 13,
 'new': 14,
 'used': 15,
 'american': 16,
 'time': 17,
 'see': 18,
 'may': 19,
 'world': 20,
 'would': 21,
 'however': 22,
 'years': 23,
 'states': 24,
 'people': 25,
 'war': 26,
 'united': 27,
 'known': 28,
 'called': 29,
 'use': 30,
 'system': 31,
 'often': 32,
 'state': 33,
 'history': 34,
 'city': 35,
 'english': 36,
 'made': 37,
 'well': 38,
 'number': 39,
 'government': 40,
 'later': 41,
 'since': 42,
 'part': 43,
 'name': 44,
 'century': 45,
 'university': 46,
 'early': 47,
 'life': 48,
 'british': 49,
 'year': 50,
 'like': 51,
 'including': 52,
 'became': 53,
 'example': 54,
 'day': 55,
 'even': 56,
 'work': 57,
 'language': 58,
 'although': 59,
 'several': 60,
 'form': 61,
 'john': 62,
 'national': 63,
 'much': 64,
 'french': 65,
 'general': 66,
 'high': 67,
 'links': 68,
 'could': 69,
 'based': 70,
 'second