In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the cells below
# read and write their pickled data under '/content/drive/My Drive/...'.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle

# Notebook-wide state shared by the cells below
vocabulary = []               # distinct tokens, appended in order of first appearance
co_occurence_matrix = {}      # (token, token) tuple -> co-occurrence count
local_context_window = 6      # total window width; halved when token pairs are collected
training_data = []            # not referenced by the cells visible in this notebook
no_of_sentences_so_far = 0    # running count of corpus entries processed so far
print("Global variables initialized...!!")

# Read the pickled training corpus from Google Drive.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open("/content/drive/My Drive/ISI-Project/train_data_bow", "rb") as f:
  corpus = pickle.load(f)

# For all poems in the corpus, building the co-occurence matrix.
#
# A set mirrors `vocabulary` so the "seen before?" check is a constant-time hash
# lookup instead of a linear scan of the list for every token; the list is still
# what gets extended, so its first-seen ordering is unchanged.
vocabulary_lookup = set(vocabulary)

# Each token is paired with the tokens at forward offsets 1 .. half_window - 1.
# NOTE(review): with local_context_window = 6 this pairs offsets 1 and 2 only.
# If three neighbours on each side were intended, the upper bound below should
# be half_window + 1 - confirm before changing, since it alters every count.
half_window = int(local_context_window / 2)

for poem in corpus:
  # presumably the token list is the second-to-last field of each corpus record
  # - TODO confirm against the code that produced train_data_bow
  poem = poem[-2]

  no_of_sentences_so_far += 1
  if no_of_sentences_so_far % 5000 == 0:
    print("Co occurence matrix build in progress... " + str(no_of_sentences_so_far) + " poems read.")

  poem_length = len(poem)
  for position, token in enumerate(poem):
    if token not in vocabulary_lookup:
      vocabulary_lookup.add(token)
      vocabulary.append(token)

    # Pair the token with the neighbours that follow it, never reading past the
    # end of the poem (equivalent to breaking out once position + offset
    # reaches poem_length).
    for offset in range(1, min(half_window, poem_length - position)):
      neighbour = poem[position + offset]
      # Count the pair in both directions so the matrix stays symmetric.
      # dict.get(..., 0) + 1 treats the first and later sightings identically;
      # as a result a token paired with itself advances by 2 on every sighting
      # (previously 1 on the first sighting and 2 on each later one).
      for pair in ((token, neighbour), (neighbour, token)):
        co_occurence_matrix[pair] = co_occurence_matrix.get(pair, 0) + 1

print( "Co occurence matrix built successfully..!!")
print( "Number of words in vocabulary: "  + str(len(vocabulary)))
print( "Number of entries in co-occurence matrix: "  + str(len(co_occurence_matrix)))
print( "Saving the co occurence matrix and vocabulary for future reference.")

# Write both artefacts through context managers so each file is flushed and
# closed as soon as it has been written, rather than whenever the garbage
# collector happens to release the handle.
# NOTE(review): the corpus is read from the 'ISI-Project' folder while these
# files are written to 'ISI_PROJECT' - confirm that two folders are intended.
with open("/content/drive/My Drive/ISI_PROJECT/vocabulary", "wb") as vocabulary_file:
  pickle.dump(vocabulary, vocabulary_file)

with open("/content/drive/My Drive/ISI_PROJECT/co_occurence_matrix", "wb") as matrix_file:
  pickle.dump(co_occurence_matrix, matrix_file)

print("Executed Successfully")

Global variables initialized...!!
Co occurence matrix build in progress... 5000 poems read.
Co occurence matrix build in progress... 10000 poems read.
Co occurence matrix build in progress... 15000 poems read.
Co occurence matrix build in progress... 20000 poems read.
Co occurence matrix build in progress... 25000 poems read.
Co occurence matrix build in progress... 30000 poems read.
Co occurence matrix build in progress... 35000 poems read.
Co occurence matrix build in progress... 40000 poems read.
Co occurence matrix build in progress... 45000 poems read.
Co occurence matrix built successfully..!!
Number of words in vocabulary: 24281
Number of entries in co-occurence matrix: 6447730
Saving the co occurence matrix and vocabulary for future reference.
Executed Successfully


In [None]:
import random
import numpy as np
# Per-token model parameters, keyed by token and filled in by
# initilize_word_vectors_and_biases()
vectors_main_word, vectors_context_word = {}, {}
biases_main_word, biases_context_word = {}, {}

# Settings of the weighting function used by find_weight()
x_max_glove_model = 100     # counts at or above this value get weight 1
alpha_glove_model = 0.75    # exponent applied to count / x_max below that value

# Optimisation settings
# NOTE: the training cell further down reassigns learning_rate before it runs,
# and its loop bound is hard-coded rather than read from number_of_iterations.
learning_rate = 0.001
number_of_iterations = 100

# Weighting function applied to a co-occurrence count, as implemented in the research paper
def find_weight( main_token, context_token ):
  """Return the weight of the (context_token, main_token) entry of the matrix.

  The count is read from the global ``co_occurence_matrix`` under the key
  ``(context_token, main_token)``.

  Returns:
    0 when the pair has no entry in the matrix,
    1 when its count is at least ``x_max_glove_model``,
    otherwise ``(count / x_max_glove_model) ** alpha_glove_model``.
  """
  pair_count = co_occurence_matrix.get((context_token, main_token))
  if pair_count is None:
    return 0
  if pair_count >= x_max_glove_model:
    return 1
  return (pair_count / x_max_glove_model) ** alpha_glove_model

# Randomly initializing the global vectors for all tokens in the vocabulary
def initilize_word_vectors_and_biases(vector_size=100):
  """Give every vocabulary token fresh random vectors and biases.

  For each token in the global ``vocabulary`` this (re)assigns its entry in
  ``vectors_main_word``, ``vectors_context_word``, ``biases_main_word`` and
  ``biases_context_word``. Existing entries for those tokens are overwritten.

  Args:
    vector_size: number of components in each word vector. Defaults to 100,
      the dimension that was previously hard-coded.

  Vector components come from ``np.random.random`` and biases from
  ``random.random``, both of which draw from [0.0, 1.0). No seed is set here,
  so results differ between runs unless the caller seeds both generators.
  """
  for token in vocabulary:
    # Draw order (main vector, context vector, main bias, context bias) is kept
    # as before so that seeded runs produce the same values.
    vectors_main_word[token] = np.random.random(vector_size)
    vectors_context_word[token] = np.random.random(vector_size)
    biases_main_word[token] = random.random()
    biases_context_word[token] = random.random()

# Fill the four parameter dictionaries, then report how many tokens were covered
initilize_word_vectors_and_biases()

print("Initialization of the word vectors and biases for the {} tokens in our vocabulary complete.".format(len(vocabulary)))


Initialization of the word vectors and biases for the 24281 tokens in our vocabulary complete.


In [None]:
# Weighting-function settings, using the alpha and x_max values used by the authors.
# These repeat the values already assigned next to the vector dictionaries
# earlier in the notebook.
x_max_glove_model, alpha_glove_model = 100, 0.75

import numpy as np
import math

# Method to carry out a single pass of gradient descent over the co-occurence matrix.
# For every stored pair we compute the prediction error, add its weighted squared
# value to the running cost, and move the pair's vectors and biases against the
# gradient, scaled by the learning rate.
# Implementation NOTE: parameters are updated immediately after each pair is
# visited, so pairs later in the same pass already see the updated values; the
# returned cost is the sum of the per-pair costs measured just before each update.
def run_single_iteration():
  """Run one pass over ``co_occurence_matrix`` and return the accumulated cost.

  Reads and updates the globals ``vectors_main_word``, ``vectors_context_word``,
  ``biases_main_word`` and ``biases_context_word`` in place, using the global
  ``learning_rate`` and the weight returned by ``find_weight``.

  Pairs whose two tokens are equal, and pairs whose weight is 0, are skipped.

  Returns:
    The sum over processed pairs of ``0.5 * weight * error ** 2`` where
    ``error = dot(main, context) + bias_main + bias_context - log(count)``.
  """
  accumulated_cost = 0
  for pair, pair_count in co_occurence_matrix.items():
    context_token, main_token = pair
    if main_token == context_token:
      continue

    weight = find_weight( main_token, context_token )
    if weight == 0:
      continue

    main_vector = vectors_main_word[main_token]
    context_vector = vectors_context_word[context_token]

    # Difference between the model's prediction and the log of the observed count
    error = (np.dot(main_vector, context_vector) + biases_main_word[main_token] + biases_context_word[context_token] - math.log(pair_count))
    weighted_error = weight * error
    accumulated_cost += 0.5 * weight * error ** 2

    # All steps are computed from the pre-update parameters before any of them
    # is applied, so the context step does not see the already-moved main vector.
    step_for_main_vector = learning_rate * (weighted_error * context_vector)
    step_for_context_vector = learning_rate * (weighted_error * main_vector)
    step_for_bias = learning_rate * weighted_error

    vectors_main_word[ main_token ] -= step_for_main_vector
    vectors_context_word[ context_token ] -= step_for_context_vector
    biases_main_word[ main_token ] -= step_for_bias
    biases_context_word[ context_token ] -= step_for_bias
  return accumulated_cost

# Confirmation printed once this cell, including the function definition above, has run
print("Function to run single iteration of gradient descent compiled successfully..!!")

Function to run single iteration of gradient descent compiled successfully..!!


In [None]:
import pickle

# Overrides the learning_rate assigned in the initialization cell
learning_rate = 0.01
print("Applying gradient descent to find the appropriate word vectors to carry out unsupervised learning...")

# Driver loop that calls run_single_iteration() 50 times and reports the cost of each pass.
# NOTE(review): the printed label is offset by 50, which presumably reflects an
# earlier 50-pass run of this cell in the same kernel; on a fresh kernel these
# are passes 1-50. The loop bound is hard-coded and does not read
# number_of_iterations.
for iteration in range(1,51):
  cost = run_single_iteration()
  print("Iteration " + str(50+iteration) + " successfull. Returned cost value is: " + str(cost))

print("All iterations fot gradient descent completed successfully..!!")
print("Saving word vectors in file 'word_vectors' ")

# Only the main-word vectors are persisted; context vectors and biases are not saved.
# The context manager guarantees the file is flushed and closed once the dump
# finishes instead of leaving the handle open.
with open("/content/drive/My Drive/ISI_PROJECT/word_vectors_test_set_100_iterations", "wb") as word_vectors_file:
  pickle.dump(vectors_main_word, word_vectors_file)


Applying gradient descent to find the appropriate word vectors to carry out unsupervised learning...
Iteration 51 successfull. Returned cost value is: 99354.78822190886
Iteration 52 successfull. Returned cost value is: 98307.36998444368
Iteration 53 successfull. Returned cost value is: 97297.22479402274
Iteration 54 successfull. Returned cost value is: 96322.37942349452
Iteration 55 successfull. Returned cost value is: 95380.99771180993
Iteration 56 successfull. Returned cost value is: 94471.36881476142
Iteration 57 successfull. Returned cost value is: 93591.89664912703
Iteration 58 successfull. Returned cost value is: 92741.09039127913
Iteration 59 successfull. Returned cost value is: 91917.5559096207
Iteration 60 successfull. Returned cost value is: 91119.98802499166
Iteration 61 successfull. Returned cost value is: 90347.16350714616
Iteration 62 successfull. Returned cost value is: 89597.93472639588
Iteration 63 successfull. Returned cost value is: 88871.22389008348
Iteration 64 suc