In [1]:
import collections
import math
import os
import random
import zipfile

In [2]:
from six.moves import urllib
from six.moves import xrange

In [3]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


In [4]:
# Local path (relative to the working directory) where the downloaded archive
# is stored and later read back from.
DOWNLOADED_FILENAME = 'SampleText.zip'

def maybe_download(url_path, expected_bytes):
    """Download the archive unless it is already on disk, then verify its size.

    Args:
        url_path: URL the archive is retrieved from when DOWNLOADED_FILENAME
            does not exist yet.
        expected_bytes: Size in bytes the local file must have.

    Raises:
        Exception: If the size of the local file differs from expected_bytes
            (the actual size is printed first).
    """
    already_cached = os.path.exists(DOWNLOADED_FILENAME)
    if not already_cached:
        urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)

    # The size check runs whether the file was just fetched or already cached.
    actual_bytes = os.stat(DOWNLOADED_FILENAME).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception(
            'Failed to verify file from : ' + url_path + '. Can you get to it with browser?')

    print('Found and verified file from this path : ', url_path)
    print('Downloaded File : ', DOWNLOADED_FILENAME)

In [5]:
def read_words():
    """Return the whitespace-separated words of the first file in the archive.

    Only the first member listed in the zip file at DOWNLOADED_FILENAME is
    read; any other members are ignored.
    """
    with zipfile.ZipFile(DOWNLOADED_FILENAME) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)

    # Convert the raw bytes to str, then split on any run of whitespace.
    return tf.compat.as_str(raw_bytes).split()

In [6]:
# Source of the text corpus archive.
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
# Expected size of the archive in bytes; maybe_download compares the local file against it.
FILESIZE = 31344016

# Fetches the archive only if it is not already on disk, then checks its size.
maybe_download(URL_PATH, FILESIZE)

Found and verified file from this path :  http://mattmahoney.net/dc/text8.zip
Downloaded File :  SampleText.zip


In [7]:
# Load every word of the corpus, in order, into one in-memory list.
vocabulary = read_words()

In [8]:
# Total number of words (tokens, not unique words) in the corpus.
len(vocabulary)

17005207

In [9]:
# Peek at the first ten words of the corpus.
vocabulary[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [10]:
def build_dataset(words, n_words):
    """Encode a word sequence as integer indexes over a frequency-ranked vocabulary.

    Index 0 is reserved for the 'UNKNOWN' entry; the n_words - 1 most common
    words follow in descending order of frequency.

    Args:
        words: Sequence of word strings (the corpus, in order).
        n_words: Vocabulary size, counting the 'UNKNOWN' entry.

    Returns:
        A tuple (word_counts, word_indexes, dictionary, reversed_dictionary):
            word_counts: list whose first item is ['UNKNOWN', count of
                out-of-vocabulary words], followed by (word, count) tuples.
            word_indexes: list with one vocabulary index per entry of words;
                out-of-vocabulary words map to 0.
            dictionary: dict mapping word -> index.
            reversed_dictionary: dict mapping index -> word.
    """
    # Placeholder count for the UNKNOWN slot; the real count is filled in below.
    word_counts = [['UNKNOWN', -1]]
    word_counts.extend(collections.Counter(words).most_common(n_words - 1))

    # Each entry receives the next free index, i.e. the current dictionary size.
    dictionary = {}
    for entry in word_counts:
        dictionary[entry[0]] = len(dictionary)

    word_indexes = []
    unknown_count = 0
    for word in words:
        index = dictionary.get(word)
        if index is None:
            # Out-of-vocabulary word: encode as UNKNOWN and count it.
            index = 0
            unknown_count += 1
        word_indexes.append(index)

    word_counts[0][1] = unknown_count

    reversed_dictionary = {index: word for word, index in dictionary.items()}

    return word_counts, word_indexes, dictionary, reversed_dictionary

In [11]:
# Number of vocabulary entries to keep, including the UNKNOWN slot at index 0.
VOCABULARY_SIZE = 5000

# Encode the corpus: words outside the VOCABULARY_SIZE - 1 most common ones map to index 0.
word_counts, word_indexes, dictionary, reversed_dictionary = build_dataset(
                    vocabulary, VOCABULARY_SIZE)

In [12]:
# UNKNOWN entry first, then the most frequent words with their counts.
word_counts[:10]

[['UNKNOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [13]:
# The first ten corpus words encoded as vocabulary indexes (0 means UNKNOWN).
word_indexes[:10]

[0, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [14]:
# Print the first ten word -> index entries of the vocabulary.
i = 0
for key in dictionary:
    if i >= 10:
        break
    print(key, ' : ', dictionary[key])
    i += 1

UNKNOWN  :  0
the  :  1
of  :  2
and  :  3
one  :  4
in  :  5
a  :  6
to  :  7
zero  :  8
nine  :  9


In [15]:
# Print the first ten index -> word entries of the reverse lookup.
i = 0
for key in reversed_dictionary:
    if i >= 10:
        break
    print(key, ' : ', reversed_dictionary[key])
    i += 1

0  :  UNKNOWN
1  :  the
2  :  of
3  :  and
4  :  one
5  :  in
6  :  a
7  :  to
8  :  zero
9  :  nine


In [16]:
# Drop the raw word list; the cells below work from word_indexes instead.
del vocabulary

In [17]:
# Position in word_indexes of the next word to read. generate_batch updates it,
# so the position persists across successive batches.
global_index = 0

**Generating Batch of Data**
    1. num_skips : The number of context words to predict from the surrounding window for each input word.
    2. skip_window : The number of words considered to the left and to the right of the input word.

In [18]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    global global_index
    
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    
    batch = np.ndarray(shape = (batch_size), dtype = np.int32)
    labels = np.ndarray(shape = (batch_size, 1), dtype = np.int32)
    
    span = 2 * skip_window + 1    #[skip_window input_word skip_window]
    
    buffer = collections.deque(maxlen = span)
    
    for _ in range(span):
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
        
    for i in range(batch_size // num_skips):
        target = skip_window       # input word at the center of the buffer
        targets_to_avoid = [skip_window]
        
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
                
            targets_to_avoid.append(target)
            
            batch[i * num_skips + j] = buffer[skip_window]   # this is the input word
            labels[i * num_skips + j] = buffer[target]       # these are the context words
            
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
    
    global_index = (global_index + len(word_indexes) - span) % len(word_indexes)
    
    return batch, labels

In [19]:
# Sample call: 10 pairs, 2 context words per input word, 5 words on each side.
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [20]:
# Input (centre) word indexes; each one appears num_skips times in a row.
batch[:10]

array([   2,    2, 3134, 3134,   46,   46,   59,   59,  156,  156],
      dtype=int32)

In [21]:
# Context word index paired with the entry of batch at the same position.
labels[:10]

array([[  46],
       [ 195],
       [3081],
       [ 128],
       [ 742],
       [ 477],
       [   2],
       [   0],
       [   2],
       [3134]], dtype=int32)

In [22]:
# Decode each (input word | context word) pair of the batch back to text.
for i in range(10):
    print(reversed_dictionary[batch[i]], ' | ', reversed_dictionary[labels[i, 0]])

of  |  first
of  |  term
abuse  |  originated
abuse  |  early
first  |  working
first  |  class
used  |  of
used  |  UNKNOWN
against  |  of
against  |  abuse
