In [1]:
#Source: http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/
#https://github.com/adventuresinML/adventures-in-ml-code/blob/master/keras_word2vec.py
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge, dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import urllib
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

Using TensorFlow backend.


In [2]:
def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

In [3]:
url = 'http://mattmahoney.net/dc/'
filename = maybe_download('text8.zip', url, 31344016)

Found and verified text8.zip


In [4]:
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

In [5]:
vocabulary = read_data(filename)
print(vocabulary[:7])
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']


['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']

In [6]:
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [7]:
filtered_vocab = []
def remove_stop_words(vocab):
    stop_words = set(stopwords.words('english'))
    for word in vocab:
        if word not in stop_words and len(word) > 2:
                filtered_vocab.append(word)
    return filtered_vocab

In [10]:
def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    vocabulary = read_data(filename)
#     filtered_vocab = remove_stop_words(vocabulary)
#     print(filtered_vocab[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    #del filtered_vocab, vocabulary  # Hint to reduce memory.
    del vocabulary
    return data, count, dictionary, reverse_dictionary

In [11]:
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

Found and verified text8.zip
[5234, 3081, 12, 6, 195, 2, 3134]


In [12]:
window_size = 3
vector_dim = 300
epochs = 200000

In [13]:
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [14]:
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

[[292, 668], [5786, 5498], [7821, 2497], [3215, 7], [7490, 5635], [8, 3652], [3075, 2260], [14, 38], [1116, 24], [6309, 8348]] [1, 0, 0, 1, 0, 0, 0, 1, 1, 0]


In [15]:
dictionary

{'UNK': 0,
 'the': 1,
 'of': 2,
 'and': 3,
 'one': 4,
 'in': 5,
 'a': 6,
 'to': 7,
 'zero': 8,
 'nine': 9,
 'two': 10,
 'is': 11,
 'as': 12,
 'eight': 13,
 'for': 14,
 's': 15,
 'five': 16,
 'three': 17,
 'was': 18,
 'by': 19,
 'that': 20,
 'four': 21,
 'six': 22,
 'seven': 23,
 'with': 24,
 'on': 25,
 'are': 26,
 'it': 27,
 'from': 28,
 'or': 29,
 'his': 30,
 'an': 31,
 'be': 32,
 'this': 33,
 'which': 34,
 'at': 35,
 'he': 36,
 'also': 37,
 'not': 38,
 'have': 39,
 'were': 40,
 'has': 41,
 'but': 42,
 'other': 43,
 'their': 44,
 'its': 45,
 'first': 46,
 'they': 47,
 'some': 48,
 'had': 49,
 'all': 50,
 'more': 51,
 'most': 52,
 'can': 53,
 'been': 54,
 'such': 55,
 'many': 56,
 'who': 57,
 'new': 58,
 'used': 59,
 'there': 60,
 'after': 61,
 'when': 62,
 'into': 63,
 'american': 64,
 'time': 65,
 'these': 66,
 'only': 67,
 'see': 68,
 'may': 69,
 'than': 70,
 'world': 71,
 'i': 72,
 'b': 73,
 'would': 74,
 'd': 75,
 'no': 76,
 'however': 77,
 'between': 78,
 'about': 79,
 'over': 80

In [32]:
# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

In [33]:
# setup a cosine similarity operation which will be output in a secondary model
similarity = dot([target, context], axes = 0, normalize = True)
#similarity = merge([target, context], mode='cos', dot_axes=0)

# now perform the dot product operation to get a similarity measure
dot_product = dot([target, context], axes=1, normalize = False)
dot_product = Reshape((1,))(dot_product)

# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)

# create the primary training model
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# create a secondary validation model to run our similarity checks during training
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

In [None]:
class SimilarityCallback:
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
sim_cb = SimilarityCallback()

arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    idx = np.random.randint(0, len(labels)-1)
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 1000 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
#     if cnt % 10000 == 0:
#         sim_cb.run_sim()

Iteration 0, loss=0.6924301385879517
Iteration 1000, loss=0.7097952961921692
Iteration 2000, loss=0.7116062045097351
Iteration 3000, loss=0.7068607807159424
Iteration 4000, loss=0.6946460008621216
Iteration 5000, loss=0.6970285177230835
Iteration 6000, loss=0.6675528287887573
Iteration 7000, loss=0.6621053218841553
Iteration 8000, loss=0.6677365303039551
Iteration 9000, loss=0.7078180313110352
Iteration 10000, loss=0.7082197666168213
Iteration 11000, loss=0.6940311789512634
Iteration 12000, loss=0.6624252200126648
Iteration 13000, loss=0.6789559125900269
Iteration 14000, loss=0.683051347732544
Iteration 15000, loss=0.7093185186386108
Iteration 16000, loss=0.6986109018325806
Iteration 17000, loss=0.6468752026557922
Iteration 18000, loss=0.6591153144836426
Iteration 19000, loss=0.7077077031135559
Iteration 20000, loss=0.7078066468238831
Iteration 21000, loss=0.6901941299438477
Iteration 22000, loss=0.7145197987556458
Iteration 23000, loss=0.6526499390602112
Iteration 24000, loss=0.702916

In [67]:
sim_cb.run_sim()

Nearest to term: become, began, life, boeing, long, usage, australia, humans,
Nearest to external: links, six, nine, five, two, zero, one, four,
Nearest to nine: seven, one, zero, eight, six, two, three, five,
Nearest to four: six, five, one, seven, zero, eight, two, nine,
Nearest to two: one, zero, three, six, eight, seven, nine, five,
Nearest to still: seven, two, one, first, eight, five, six, zero,
Nearest to king: two, one, eight, louis, zero, three, six, new,
Nearest to university: cambridge, press, college, born, nine, eight, one, first,
Nearest to five: one, zero, four, six, eight, seven, two, nine,
Nearest to british: two, nine, seven, eight, zero, four, five, one,
Nearest to war: zero, nine, four, two, five, one, eight, also,
Nearest to german: seven, five, four, two, world, nine, three, six,
Nearest to international: zero, two, may, square, also, eight, nine, five,
Nearest to french: one, four, six, original, seven, also, language, spanish,
Nearest to government: one, seven, 