# Note for Production Implementation.

In [None]:
# If the embeddings, dictionary and reverse_dictionary files already exist, there is no need to train again.
# This is typically useful when running the code a second time.
# Otherwise, run the training first and then return the predictions.

import os.path
# Report whether every artifact written by a previous run is already on disk.
print(all(os.path.isfile(name) for name in ("embed.npy", "dictionary.npy", "reverse_dictionary.npy")))

# Libraries

In [None]:
# Run these imports first. Further libraries are imported in the sections where they are used, for ease of readability.
from __future__ import print_function
import os
import random
from six.moves import range

# Download the Corpus to train.

    The data is placed in your Google Drive and shared. You can get the file id from the sharing URL itself.

In [None]:
import requests

def download_file_from_google_drive(id, destination):
    """Download a shared Google Drive file and store it at ``destination``.

    The file is requested from the ``uc?export=download`` endpoint. If the
    first response carries a confirmation token (see ``get_confirm_token``),
    the request is repeated with that token passed as the ``confirm``
    parameter, and the second response is the one written to disk.

    Args:
        id: Google Drive file id taken from the sharing URL.
        destination: Local path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: If either request returns a 4xx/5xx status, so
            that an error page is never silently saved as the file.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The context manager releases the session's connections when we are done.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        response.raise_for_status()
        token = get_confirm_token(response)

        if token:
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
            response.raise_for_status()

        # The body is streamed, so it must be consumed while the session is open.
        save_response_content(response, destination)

def get_confirm_token(response):
    """Return the value of the first 'download_warning*' cookie, or None.

    ``response`` only needs a ``cookies`` attribute exposing ``items()``.
    """
    warning_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(warning_values, None)

def save_response_content(response, destination):
    """Stream the body of ``response`` into the binary file ``destination``.

    The body is read through ``response.iter_content`` in 32768-byte chunks;
    empty chunks are skipped.
    """
    chunk_bytes = 32768

    with open(destination, "wb") as out:
        out.writelines(
            piece for piece in response.iter_content(chunk_bytes) if piece
        )

In [None]:
# Fetch the training corpus from Google Drive.
# Share link: https://drive.google.com/file/d/1rI79RgJx2ZajOGtYsw-kxCjrr2VUYH1e/view?usp=sharing
# The file id is the path segment between "/d/" and "/view" in that link.
id = "1rI79RgJx2ZajOGtYsw-kxCjrr2VUYH1e"
destination = "corpus.txt"
download_file_from_google_drive(id=id, destination=destination)

## Corpus Reading and Cleaning Process.

In [None]:
# Read the whole corpus into memory. The `with` block closes the handle even
# if read() raises, which the manual open()/close() pair did not guarantee.
with open('corpus.txt', 'r') as file:
    text = file.read()

In [None]:
# Peek at the beginning of the raw corpus.
# NOTE(review): the slice starts at index 1, so the very first character is not shown.
text[1:100]

In [None]:
#NLTK for text processing
import nltk

In [None]:
# Drop alphabetic tokens that are not in NLTK's English word list.
# Tokens that are not purely alphabetic pass through untouched.
eng_words = set(nltk.corpus.words.words())

text_english = " ".join(
    token
    for token in nltk.wordpunct_tokenize(text)
    if not token.isalpha() or token.lower() in eng_words
)

In [None]:
# Peek at the beginning of the filtered text.
# NOTE(review): the slice starts at index 1, so the very first character is not shown.
text_english[1:100]

In [None]:
# Text cleansing, step 1: split the filtered text into tokens
# using NLTK's word_tokenize.
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text_english)

In [None]:
# Lower-case every token.
tokens = list(map(str.lower, tokens))

In [None]:
# Delete every punctuation character from every token.
import string
# Mapping each punctuation character to None makes str.translate drop it.
table = str.maketrans(dict.fromkeys(string.punctuation))
stripped = [token.translate(table) for token in tokens]

In [None]:
# Keep only tokens made up entirely of letters; this also discards tokens
# that became empty once their punctuation was stripped.
words = list(filter(str.isalpha, stripped))

In [None]:
# Remove English stop words, then show the first few surviving words.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = [token for token in words if token not in stop_words]
print(words[:10])

In [None]:
# Remove stray single-letter tokens ('a' and 'i' are not part of this list).
single_letters = tuple('bcdefghjklmnopqrstuvwxyz')
words = [token for token in words if token not in single_letters]

In [None]:
# Total number of words left after cleaning.
len(words)

In [None]:
# Number of distinct words, i.e. the size of the vocabulary.
len(set(words))

# Create Indexed Dictionary, Reverse Dictionary, Analysis of Frequently Used Word Counts.

In [None]:
# Word dictionary and analysis of the words.
# vocabulary_size is the number of distinct cleaned words; build_dataset reads it as a global.
vocabulary_size = len(set(words))
import collections

def build_dataset(words):
    """Index a word sequence against the most frequent vocabulary entries.

    Reads the module-level ``vocabulary_size``: index 0 is reserved for the
    'UNK' placeholder and the ``vocabulary_size - 1`` most common words get
    the following indices in descending frequency order.

    Returns a 4-tuple ``(data, count, dictionary, reverse_dictionary)``:
      * data: one integer index per input word (0 when the word is not in
        the dictionary);
      * count: ``['UNK', unk_count]`` followed by ``(word, frequency)`` pairs;
      * dictionary: word -> index;
      * reverse_dictionary: index -> word.
    """
    frequent = collections.Counter(words).most_common(vocabulary_size - 1)
    count = [['UNK', -1]] + frequent

    dictionary = {}
    for entry in count:
        # len(dictionary) is the next unused index.
        dictionary[entry[0]] = len(dictionary)

    data = [dictionary.get(word, 0) for word in words]
    unk_count = sum(1 for word in words if word not in dictionary)
    count[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reverse_dictionary

# Build the index structures from the cleaned words and print a small sample.
data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
# Optional: uncomment the next line to free the memory held by the word list.
#del words  # Will help to reduce memory.

# Saving Dictionary & Reverse Dictionary in External File

In [None]:
import numpy as np
# Persist both lookup tables so that a later run can reuse them.
# Both values are Python dicts; the test section reads them back with np.load(...).item().
np.save('dictionary.npy', dictionary)
np.save('reverse_dictionary.npy', reverse_dictionary)

# Batch Generation
    
    A batch of data is fed to TensorFlow on each training iteration. Every iteration gets a new batch, selected by the global data_index variable, which is advanced on each call to the generate_batch function.

In [None]:
import numpy as np

# data_index is the global read cursor into `data`; generate_batch advances it on every call.
# After a call it marks where the next batch starts, so successive training
# iterations read fresh windows until the cursor wraps around at the end of the data.
data_index = 0
# generate_batch builds one batch of batch_size rows for mini-batch learning.
# It is called with the batch size from inside the TF training session.

# It reads its input from the module-level `data` list populated earlier.

def generate_batch(batch_size, skip_window):
    """Build one CBOW batch from the module-level ``data`` list.

    A window of ``2 * skip_window + 1`` consecutive ids slides over ``data``
    starting at the global ``data_index`` and wrapping around at the end.
    For every window position the centre id becomes the label and the
    remaining ids, in order, become the context row.

    Args:
        batch_size: Number of rows (window positions) to produce.
        skip_window: Number of context words taken on each side of the centre.

    Returns:
        ``(batch, labels)`` as int32 arrays of shape
        ``(batch_size, 2 * skip_window)`` and ``(batch_size, 1)``.

    Side effects:
        Advances the global ``data_index`` by ``2 * skip_window + 1 +
        batch_size`` positions (modulo ``len(data)``).
    """
    global data_index

    window = 2 * skip_window + 1  # [ skip_window | centre | skip_window ]
    context_width = window - 1
    total = len(data)

    batch = np.empty((batch_size, context_width), dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)

    # Bounded deque: appending on the right drops the oldest id on the left.
    frame = collections.deque(maxlen=window)
    for _ in range(window):
        frame.append(data[data_index])
        data_index = (data_index + 1) % total

    for row in range(batch_size):
        snapshot = list(frame)
        # Context = everything in the window except the centre element.
        batch[row, :] = snapshot[:skip_window] + snapshot[skip_window + 1:]
        labels[row, 0] = snapshot[skip_window]

        # Slide the window one position to the right.
        frame.append(data[data_index])
        data_index = (data_index + 1) % total

    assert batch.shape[0] == batch_size and batch.shape[1] == context_width
    return batch, labels


# Smoke test: generate one tiny batch and print it back as words.
# num_skips is not used when generating CBOW batches; it is left over from the skip-gram implementation.

for num_skips, skip_window in [(1, 1)]:
    data_index = 0
    batch, labels = generate_batch(batch_size=4, skip_window=skip_window)
    print('\nwith skip_window = %d:' % skip_window)

    print('    batch:', [[reverse_dictionary[word_id] for word_id in row] for row in batch])
    print('    labels:', [reverse_dictionary[word_id] for word_id in labels[:, 0]])


In [None]:
# Training hyper-parameters: batch size, embedding size (word-vector length),
# skip window and number of negative samples.
batch_size = 150  # Rows per batch; also fixes the placeholder shapes in the graph.
embedding_size = 150 # Dimension of the embedding vector.
skip_window = 1 # words to consider left and right.
num_sampled = 50 # Number of negative examples to sample.

# Define Tensorflow Computational Graph

In [None]:
import tensorflow as tf
import math
# Build the CBOW training graph. Everything below is created inside `graph`
# and pinned to the CPU device.
graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

        # Input data: every row of train_dataset holds the 2*skip_window context word ids
        # and the matching row of train_labels holds the id of the centre word.
        train_dataset = tf.placeholder(tf.int32, shape=[batch_size,2*skip_window])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

        # Variables.
        # embeddings: one vector of length embedding_size per vocabulary entry.
        # vocabulary_size is the module-level value computed from the cleaned words
        # (the number of distinct words), not a fixed constant.
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

        # Output-layer weights and biases consumed by the loss below.
        softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))

        softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Look up the embedding of each context position, then average across
        # the positions so that every row is represented by a single vector.
        context_embeddings = []

        for i in range(2*skip_window):

            context_embeddings.append(tf.nn.embedding_lookup(embeddings, train_dataset[:,i]))

        # NOTE(review): `keep_dims` was renamed `keepdims` in later TensorFlow 1.x
        # releases -- confirm against the installed version.
        avg_embed =  tf.reduce_mean(tf.stack(axis=0,values=context_embeddings),0,keep_dims=False)

        # Loss: mean tf.nn.nce_loss over the batch, drawing num_sampled negative
        # classes each time. The inputs are the averaged context embeddings.
        # Minimising this loss updates the weights, the biases and the embeddings.

        # Alternative kept for reference: the same setup with sampled softmax.
        #loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=avg_embed,
         #                      labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=softmax_weights,
                 biases=softmax_biases,
                 labels=train_labels,
                 inputs=avg_embed,
                 num_sampled=num_sampled,
                 num_classes=vocabulary_size))

        # Optimizer: Adagrad with a learning rate of 1.0.
        optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

        # Divide every embedding row by its Euclidean norm; the training cell
        # evaluates and saves this normalised matrix.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm

# Train using Tensorflow Session on Batches of Data

In [None]:
# Train the model on successive batches and save the final embeddings.
# Number of training iterations.
num_steps = 100000
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Session Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, skip_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        # The optimizer op is run for its side effect; only the loss value is kept.
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % 2000 == 0:
            if step > 0:
                # The average loss is an estimate of the loss over the last 2000 batches.
                average_loss = average_loss / 2000
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0

    # Evaluate and persist the normalised embeddings once, after training has
    # finished. These two statements used to sit inside the loop, which
    # re-evaluated the whole embedding matrix and rewrote embed.npy on every
    # single step. They must stay inside the `with` block because eval()
    # needs the default session.
    final_embeddings = normalized_embeddings.eval()
    np.save("embed.npy", final_embeddings)

# Prediction Function

In [None]:
def _nearest_words(query_embedding, candidate_embeddings, index_offset, reverse_dictionary, top_n):
    """Return up to ``top_n`` words nearest to ``query_embedding``, best first.

    Similarity is the dot product between the query and every candidate row.
    The single best-scoring candidate is skipped: it is expected to be the
    query vector itself, which holds when the rows are unit-normalised.
    ``index_offset`` converts a row of ``candidate_embeddings`` back into a
    key of ``reverse_dictionary``.
    """
    similarity = np.matmul(query_embedding, np.transpose(candidate_embeddings))
    ascending = np.argsort(similarity)
    # Everything before the last (best) entry, limited to top_n rows.
    top_rows = ascending[-(top_n + 1):-1]
    return [reverse_dictionary[int(row) + index_offset] for row in top_rows[::-1]]


def predicted_words(target_words, final_embeddings, dictionary, reverse_dictionary, top_nearest_words_needed):
    """Print the nearest vocabulary words for one or several target words.

    Args:
        target_words: A single word, or a tuple/list of words.
        final_embeddings: 2-D array with one embedding row per vocabulary
            index; row 0 is the unknown-word ('UNK') vector.
        dictionary: Mapping word -> row index.
        reverse_dictionary: Mapping row index -> word.
        top_nearest_words_needed: How many neighbours to report per word.

    Returns:
        None. One line is printed per target word, listing its neighbours
        with the nearest first.

    A word missing from ``dictionary`` does not raise: the unknown-word
    vector is used instead and the printed message says so. For known words
    the unknown-word row is excluded from the candidates.
    """
    # Accept a lone word as well as a tuple or list of words.
    if isinstance(target_words, (tuple, list)):
        target_list = list(target_words)
    else:
        target_list = [target_words]

    for target in target_list:
        word_index = dictionary.get(target)

        if word_index is not None:
            # Compare against rows 1.. only, so row j maps to index j + 1.
            nearest = _nearest_words(
                final_embeddings[int(word_index), :],
                final_embeddings[1:final_embeddings.shape[0], :],
                1,
                reverse_dictionary,
                top_nearest_words_needed,
            )
            print("Nearest words for word: %s" % target, ": ordered by nearest word predicted first is %s" % nearest)
        else:
            # Unknown word: fall back to the 'UNK' vector and compare with all rows.
            nearest = _nearest_words(
                final_embeddings[0, :],
                final_embeddings,
                0,
                reverse_dictionary,
                top_nearest_words_needed,
            )
            print("The word is unknown as of now. Still, based on cosine distance for unknown word vector, Nearest for words for unknown word: %s" % target, ": ordered by nearest word predicted first is %s" % nearest)

# Test

In [None]:
# Enter the target words as a tuple of comma-separated strings, as below
# (predicted_words also accepts a single bare string).
target_words=("small","employee")

In [None]:
top_nearest_words_needed=10
# Load the embeddings, dictionary and reverse dictionary saved earlier.
# When these files already exist, the training steps can be skipped, which
# cuts the run time.
final_embeddings=np.load("embed.npy")
# The two dictionaries were saved with np.save as pickled object arrays.
# NumPy >= 1.16.3 refuses to load those unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code -- only load .npy files that
# this notebook produced itself.
dictionary = np.load('dictionary.npy', allow_pickle=True).item()
reverse_dictionary = np.load('reverse_dictionary.npy', allow_pickle=True).item()

In [None]:
# Print the nearest words for every entry in target_words.
predicted_words(target_words,final_embeddings,dictionary,reverse_dictionary,top_nearest_words_needed)