In [25]:
from annoy import AnnoyIndex
import numpy as np

In [42]:
class Pretrained_vectors(object):
    def __init__(self,word_to_index,word_vectors):
        
        self.word_to_index = word_to_index
        self.index_to_word = {v:k for k,v in word_to_index.items()}
        print(self.index_to_word)
        
        self.word_vectors = word_vectors
        
        self.index = AnnoyIndex(len(word_vectors[0]),metric = 'euclidean')
        print("Building Index!")
        for _, i in self.word_to_index.items():
            try:
                self.index.add_item(i, self.word_vectors[i])
            except Exception as e:
                print(e)
        self.index.build(50)
        print("Finished!")
        
    @classmethod    
    def create_from_embedding_file(cls,file_path):
        
        word_vectors = []
        word_to_index = {}
        
        with open(file_path) as fp:
            for line in fp.readlines():
                word = line.split(" ")[0]
                vector = np.array([float(i) for i in line.split(" ")[1:]])
                
                word_to_index[word] = len(word_vectors)
                
                word_vectors.append(vector)
        
        return cls(word_to_index,word_vectors)
                
    def find_closest_words(self,word,n=1):
        
        vector = self.word_vectors[self.word_to_index[word]]
        
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def find_closest_vectors(self,vector,n=1):
        
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]
    
    
    def compute_and_print_analogy(self, word1, word2, word3):
        """Prints the solutions to analogies using word embeddings

        Analogies are word1 is to word2 as word3 is to __
        This method will print: word1 : word2 :: word3 : word4
        
        Args:
            word1 (str)
            word2 (str)
            word3 (str)
        """
        vec1 = self.word_vectors[self.word_to_index[word1]]
        vec2 = self.word_vectors[self.word_to_index[word2]]
        vec3 = self.word_vectors[self.word_to_index[word3]]

        # now compute the fourth word's embedding!
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        closest_words = self.find_closest_vectors(vec4, n=4)
        existing_words = set([word1, word2, word3])
        closest_words = [word for word in closest_words 
                             if word not in existing_words] 

        if len(closest_words) == 0:
            print("Could not find nearest neighbors for the computed vector!")
            return
        
        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3, word4))

In [43]:
glove_vectors = Pretrained_vectors.create_from_embedding_file("/home/raghavan/Downloads/glove.6B.50d.txt")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Finished!


In [44]:
glove_vectors.find_closest_words("the",6)

['the', 'which', 'part', 'of', 'in', 'on']

In [45]:
glove_vectors.compute_and_print_analogy('man', 'he', 'woman')

man : he :: woman : she
man : he :: woman : her
man : he :: woman : having
