In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


In [3]:
import collections
import math
import os
import random
import zipfile

In [4]:
from six.moves import urllib
from six.moves import xrange

In [8]:
import numpy as np
import tensorflow as tf

In [10]:
# Record the NumPy and TensorFlow versions this notebook was executed with.
print(np.__version__)
print(tf.__version__)

1.16.4
1.14.0


In [17]:
DOWNLOADED_FILENAME = 'SampleText.zip'


def maybe_download(url_path, expected_bytes):
    """Download the archive once and verify its size on disk.

    The file is fetched from ``url_path`` into ``DOWNLOADED_FILENAME`` only when
    no file with that name exists yet. The size check runs in both cases, so a
    stale or truncated local file is reported as well.

    Args:
        url_path: URL the archive is fetched from.
        expected_bytes: Size in bytes the local file must have.

    Raises:
        Exception: If the size of the local file differs from ``expected_bytes``.
    """
    if not os.path.exists(DOWNLOADED_FILENAME):
        # urlretrieve returns (local_path, headers); neither is needed here.
        urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)

    actual_bytes = os.stat(DOWNLOADED_FILENAME).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found to help diagnose the mismatch.
        print(actual_bytes)
        raise Exception(
            'Failed to verify file from: ' + url_path +
            '. Can you get to it with a browser?')

    print('Found and verified file from this path: ', url_path)
    print('Downloaded filer: ', DOWNLOADED_FILENAME)

In [15]:
def read_words():
    """Return the whitespace-separated tokens of the archive's first member.

    Opens ``DOWNLOADED_FILENAME`` as a zip archive, reads the first entry
    reported by ``namelist()``, converts its bytes to ``str`` with
    ``tf.compat.as_str`` and splits the resulting text on whitespace.

    Returns:
        A list of string tokens in their original order.
    """
    with zipfile.ZipFile(DOWNLOADED_FILENAME) as archive:
        member_name = archive.namelist()[0]
        raw_bytes = archive.read(member_name)
        text = tf.compat.as_str(raw_bytes)
        return text.split()

In [18]:
# Source archive and the exact size in bytes the local copy must have.
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
FILESIZE = 31344016
# Downloads only when the local file is missing, then verifies its size.
maybe_download(URL_PATH, FILESIZE)

Found and verified file from this path:  http://mattmahoney.net/dc/text8.zip
Downloaded filer:  SampleText.zip


In [74]:
# Load the text as a flat list of whitespace-separated word tokens.
vocabulary = read_words()


In [75]:
# Total number of tokens in the text.
len(vocabulary)


17005207

In [76]:
# Peek at the first 24 tokens.
vocabulary[:24]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans']

In [77]:
def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a capped vocabulary.

    The vocabulary consists of an 'UNKNOWN' entry followed by the
    ``n_words - 1`` most frequent tokens. Ids are handed out in that order, so
    'UNKNOWN' normally gets id 0 and more frequent tokens get smaller ids.
    Every token outside the vocabulary is encoded as 0.

    Args:
        words: Sequence of tokens; it is iterated twice.
        n_words: Vocabulary size, counting the 'UNKNOWN' entry.

    Returns:
        A tuple ``(word_counts, word_indexes, dictionary, reversed_dictionary)``:
        ``word_counts`` is ``['UNKNOWN', <out-of-vocabulary count>]`` followed by
        ``(token, count)`` tuples; ``word_indexes`` is ``words`` encoded as ids;
        ``dictionary`` maps token -> id; ``reversed_dictionary`` maps id -> token.
    """
    most_frequent = collections.Counter(words).most_common(n_words - 1)
    # The leading entry is a list (not a tuple) because its count is patched
    # in once the out-of-vocabulary tokens have been tallied below.
    word_counts = [['UNKNOWN', -1]] + most_frequent

    # Ids follow the order of word_counts; using the current dict size as the
    # next id keeps ids dense even if a token were to repeat.
    dictionary = {}
    for entry in word_counts:
        dictionary[entry[0]] = len(dictionary)

    word_indexes = []
    unknown_total = 0
    lookup = dictionary.get
    for token in words:
        token_id = lookup(token)
        if token_id is None:
            # Out-of-vocabulary tokens share the 'UNKNOWN' id.
            token_id = 0
            unknown_total += 1
        word_indexes.append(token_id)

    word_counts[0][1] = unknown_total

    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}

    return word_counts, word_indexes, dictionary, reversed_dictionary

In [78]:
VOCABULARY_SIZE = 5000
# word_counts -> ['UNKNOWN', count] followed by (word, count) for the most common words
# word_indexes -> every word of the original text, replaced by its index
# dictionary -> word-to-index map; the most common words have the lowest index values
# reversed_dictionary -> index-to-word map
word_counts, word_indexes, dictionary, reversed_dictionary = build_dataset(vocabulary,VOCABULARY_SIZE)

In [33]:
# The 'UNKNOWN' entry followed by the nine most frequent words and their counts.
word_counts[:10]

[['UNKNOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [35]:
# The first ten words of the text in index form (0 means out-of-vocabulary).
word_indexes[:10]

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [38]:
# Spot-check ten random word -> index entries.
# `random` is already imported in the notebook's import cell, so it is not
# re-imported here.
for key in random.sample(list(dictionary), 10):
    print(key, ":", dictionary[key])

adoption : 4051
madrid : 4732
module : 4389
fire : 671
fifty : 4348
sister : 1933
space : 320
efficient : 2929
ethics : 2685
private : 818


In [39]:
# Spot-check ten random index -> word entries of the reverse mapping.
for key in random.sample(list(reversed_dictionary), 10):
    print (key, ":", reversed_dictionary[key])

3131 : seek
3761 : dynamic
3623 : ranges
2163 : vast
968 : variety
1373 : attempts
1957 : producer
4894 : unicode
86 : united
1970 : mr


In [40]:
# Drop the reference to the raw token list so its memory can be reclaimed;
# the encoded text lives on in word_indexes.
del vocabulary

In [79]:
# Read position into word_indexes; generate_batch reads and updates it, so it
# persists across batches.
global_index = 0

In [60]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    """Build one skip-gram batch starting at the module-level read position.

    A window of ``2 * skip_window + 1`` consecutive words slides over
    ``word_indexes``. For each window position the centre word is the input
    and ``num_skips`` distinct other positions of the window are picked at
    random as its context words. The module-level ``global_index`` is read and
    updated (wrapping around at the end of the text); on return it is moved
    back by one window so the words at the end of this batch are seen again
    by the next call.

    Args:
        word_indexes: The text as a sequence of integer word ids.
        batch_size: Number of (input, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: Number of context words picked per input word; at most
            ``2 * skip_window``.
        skip_window: Number of neighbouring words considered on each side of
            the input word.

    Returns:
        ``(batch, labels)``: int32 arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding the input word ids and the matching
        context word ids.
    """
    global global_index

    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Window layout: [skip_window words, input word, skip_window words].
    span = 2 * skip_window + 1
    total_words = len(word_indexes)

    # A bounded deque drops its oldest element on every append once full,
    # which is what makes the window slide.
    window = collections.deque(maxlen=span)
    for _ in range(span):
        window.append(word_indexes[global_index])
        global_index = (global_index + 1) % total_words

    row = 0
    for _ in range(batch_size // num_skips):
        centre_word = window[skip_window]
        # Window positions that may no longer be picked: the centre itself
        # plus every context position already used for this input word.
        used_positions = [skip_window]

        for _ in range(num_skips):
            # Rejection sampling: draw until an unused position comes up.
            while True:
                pick = random.randint(0, span - 1)
                if pick not in used_positions:
                    break
            used_positions.append(pick)

            batch[row] = centre_word
            labels[row, 0] = window[pick]
            row += 1

        # Slide the window one word forward.
        window.append(word_indexes[global_index])
        global_index = (global_index + 1) % total_words

    # Backtrack by one window so the words at the end of this batch are
    # included in the next batch as well.
    global_index = (global_index + total_words - span) % total_words

    return batch, labels
        

In [80]:
# Small demo batch. Arguments: word_indexes, batch_size=10, num_skips=2, skip_window=5,
# i.e. 5 input words, each paired with 2 context words from up to 5 positions away.
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [62]:
# Input word ids; each id is repeated num_skips times, once per context word.
batch

array([  20,   20,   59,   59, 2732, 2732,  363,  363,    7,    7])

In [63]:
# Context word ids, one per row, aligned with the entries of batch.
labels

array([[   7],
       [ 363],
       [3673],
       [ 105],
       [   7],
       [ 363],
       [  59],
       [ 105],
       [ 372],
       [ 363]])

In [66]:
# Decode the (input word, context word) pairs back to text.
# NOTE(review): range(9) prints only the first 9 of the 10 pairs in the demo
# batch — confirm whether the last pair was meant to be skipped.
for i in range(9):
    print(reversed_dictionary[batch[i]], ": ", reversed_dictionary[labels[i][0]] )

that :  to
that :  means
used :  destroy
used :  any
violent :  to
violent :  means
means :  used
means :  any
to :  society


In [81]:
# During training we look at the nearest neighbours of a few sample words to
# check that similar words end up with embeddings that are close together.
valid_size = 16  # number of sample words to inspect
valid_window = 100  # sample ids are drawn from [0, valid_window), i.e. from the lowest (most frequent) ids
# Draw valid_size distinct ids; not seeded, so the selection changes from run to run.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)


In [89]:
batch_size = 128 # number of (input, context) pairs per training step; free to experiment with
embedding_size = 50 # number of dimensions of each word embedding vector
skip_window = 2 # number of words considered on each side of the input word
num_skips = 2 # number of context words picked per input word

In [90]:
# Start from an empty default graph so that re-running the cells below does not
# add duplicate ops to an existing graph.
tf.reset_default_graph()

# Placeholders for one training batch; their shapes match the batch and labels
# arrays returned by generate_batch.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Ids of the sample words whose nearest neighbours are inspected during training.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [91]:
# Trainable embedding matrix: one row of embedding_size values per vocabulary id,
# initialised uniformly at random between -1.0 and 1.0.
embeddings = tf.Variable(
            tf.random_uniform([VOCABULARY_SIZE, embedding_size], -1.0, 1.0))

# Select the embedding rows that belong to the input word ids of the current batch.
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [92]:
# Inspect the variable: expected shape is (VOCABULARY_SIZE, embedding_size).
embeddings

<tf.Variable 'Variable:0' shape=(5000, 50) dtype=float32_ref>

In [93]:
# Inspect the lookup result: expected shape is (batch_size, embedding_size).
embed

<tf.Tensor 'embedding_lookup/Identity:0' shape=(128, 50) dtype=float32>

In [94]:
# Set up the linear output layer, y = x * W^T + b: it maps each input embedding
# to one score (logit) per vocabulary word.
weights = tf.Variable(tf.truncated_normal([VOCABULARY_SIZE, embedding_size], stddev=1.0 /math.sqrt(embedding_size)))

biases = tf.Variable(tf.zeros([VOCABULARY_SIZE]))
# weights is stored as [VOCABULARY_SIZE, embedding_size], hence the transpose.
hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases


In [95]:
# Inspect the logits: expected shape is (batch_size, VOCABULARY_SIZE).
hidden_out

<tf.Tensor 'add:0' shape=(128, 5000) dtype=float32>

In [98]:
# Convert the context-word ids to one-hot rows for the softmax prediction layer.
# train_labels has shape [batch_size, 1]; the trailing axis is squeezed away
# first so the one-hot tensor is [batch_size, VOCABULARY_SIZE], exactly the
# shape of the logits, instead of relying on the loss op to flatten a rank-3
# label tensor.
train_one_hot = tf.one_hot(tf.squeeze(train_labels, axis=1), VOCABULARY_SIZE)

# Softmax prediction layer with cross entropy as the loss, averaged over the batch.
# The _v2 op replaces tf.nn.softmax_cross_entropy_with_logits, which is
# deprecated in TF 1.x; the labels come from integer ids, so no gradient
# flows into them either way.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=train_one_hot, logits=hidden_out))

In [99]:
# Training op: one plain gradient-descent step on the loss with learning rate 0.1.
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

In [102]:
# L2 norm of every embedding row. keepdims (the non-deprecated spelling of
# keep_dims) keeps the result at shape [VOCABULARY_SIZE, 1], so the division
# below broadcasts row by row and every embedding ends up with unit length.
l2_norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
normalized_embeddings = embeddings / l2_norm

In [103]:
# Unit-length embeddings of the sample words selected in valid_dataset.
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)

In [104]:
valid_embeddings # one row per sample word; valid_dataset holds valid_size = 16 of them

<tf.Tensor 'embedding_lookup_1/Identity:0' shape=(16, 50) dtype=float32>

In [105]:
# Inspect the normalized matrix: same shape as embeddings.
normalized_embeddings

<tf.Tensor 'truediv:0' shape=(5000, 50) dtype=float32>

In [107]:
# Cosine similarity between every sample word and every vocabulary word: both
# operands hold unit-length rows, so their dot products are the cosines.
# The top-k most similar words are picked from this matrix during training.
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [108]:
# Inspect the similarity matrix: expected shape is (valid_size, VOCABULARY_SIZE).
similarity

<tf.Tensor 'MatMul_1:0' shape=(16, 5000) dtype=float32>

In [110]:
# Op that initialises all global variables of the graph; it is run once at the
# start of the training session.
init = tf.global_variables_initializer()

In [111]:
# Number of training steps; the extra 1 makes step 20000 itself run, so the
# final loss report and neighbour listing are produced.
num_steps = 20001

In [114]:
# Train the model: feed one skip-gram batch per step, report the mean loss
# every 2000 steps and list the nearest neighbours of the sample words every
# 10000 steps.
with tf.Session() as session:
    # Give every variable its initial value.
    init.run()

    # Sum of the losses since the last report.
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(word_indexes, batch_size, num_skips, skip_window)

        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # Run one optimisation step and fetch the loss of this batch.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            # At step 0 only a single loss has been accumulated, so it is
            # printed as is; afterwards the sum covers the last 2000 steps.
            if step > 0:
                    average_loss /= 2000

            print ('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Evaluating the full similarity matrix is expensive, so it is only done
        # every 10000 steps (the original note estimated a ~20% slowdown if it
        # were computed every 500 steps).
        if step % 10000 == 0:
            sim = similarity.eval()

            for i in xrange(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8 # Number of nearest neighbors

                # Sort by descending similarity. Position 0 is skipped —
                # presumably the word itself, whose cosine with itself is maximal.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word

                for k in xrange(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
            print("\n")
                    
                

Average loss at step  0 :  8.659011840820312
Nearest to zero: norse, sacred, alexandria, miller, imperial, publishers, channels, settled,
Nearest to no: knight, word, supported, philosophers, getting, newspaper, employed, superman,
Nearest to about: aimed, sacred, christianity, milk, doctors, colour, nfl, fly,
Nearest to on: nietzsche, add, turkish, a, join, she, euro, treated,
Nearest to to: only, subjects, tracks, keeping, roads, rainfall, agency, louisiana,
Nearest to that: fast, communion, caused, beginning, repeatedly, wish, networks, finalist,
Nearest to from: function, america, fall, kick, caesar, promoted, united, based,
Nearest to his: diagram, propaganda, cult, criminal, sets, average, protocol, operate,
Nearest to known: rolling, citizens, miles, enabled, roy, germans, distinguished, romance,
Nearest to during: outer, sexual, foundation, cities, dramatic, details, website, merely,
Nearest to often: sphere, decline, tank, court, available, protocols, gallery, dj,
Nearest to s