# Using PreTrained Word Embeddings

In [2]:
# install annoy package
!pip install annoy

Collecting annoy
[?25l  Downloading https://files.pythonhosted.org/packages/a1/5b/1c22129f608b3f438713b91cd880dc681d747a860afe3e8e0af86e921942/annoy-1.17.0.tar.gz (646kB)
[K     |▌                               | 10kB 11.7MB/s eta 0:00:01[K     |█                               | 20kB 1.7MB/s eta 0:00:01[K     |█▌                              | 30kB 2.1MB/s eta 0:00:01[K     |██                              | 40kB 2.4MB/s eta 0:00:01[K     |██▌                             | 51kB 2.0MB/s eta 0:00:01[K     |███                             | 61kB 2.2MB/s eta 0:00:01[K     |███▌                            | 71kB 2.5MB/s eta 0:00:01[K     |████                            | 81kB 2.7MB/s eta 0:00:01[K     |████▋                           | 92kB 2.9MB/s eta 0:00:01[K     |█████                           | 102kB 2.8MB/s eta 0:00:01[K     |█████▋                          | 112kB 2.8MB/s eta 0:00:01[K     |██████                          | 122kB 2.8MB/s eta 0:00:01[K    

In [20]:
import numpy as np
from annoy import AnnoyIndex

class PreTrainedEmbeddings(object):
    """
    Wrapper around pre-trained word vectors with nearest-neighbour lookup.

    Holds a word -> row-index mapping, the vectors themselves, and an
    Annoy index (euclidean metric) built over those vectors so that the
    words closest to an arbitrary vector can be retrieved.
    """

    def __init__(self, word_to_index, word_vectors):
        """
        Store the vocabulary and build the Annoy index over its vectors.
        Args:
            word_to_index (dict): mapping from word (str) to the position
                of its vector in `word_vectors`
            word_vectors (list of np.ndarray): embedding vectors; all are
                expected to share the length of the first one
        Raises:
            ValueError: if `word_vectors` is empty (the vector length
                cannot be determined)
        """
        if len(word_vectors) == 0:
            raise ValueError("word_vectors must contain at least one vector")

        self.word_to_index = word_to_index
        self.word_vectors = word_vectors

        # reverse mapping, used to turn Annoy item ids back into words
        self.index_to_word = {
            idx: word for word, idx in self.word_to_index.items()
        }

        # the index is sized from the length of the first vector
        self.index = AnnoyIndex(len(self.word_vectors[0]),
                                metric='euclidean')

        # the Annoy item id of a word is its position in word_vectors
        for i in self.word_to_index.values():
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(50)

    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """
        Build an instance from a text embedding file
        Each non-blank line is expected to hold a word followed by its
        vector components, separated by single spaces.
        Args:
            embedding_file (str): path to the embedding file (read as UTF-8)
        Returns:
            instance of PreTrainedEmbeddings
        """
        word_to_index = {}
        word_vectors = []

        with open(embedding_file, encoding="utf-8") as f:
            # stream the file line by line instead of loading it whole
            for line in f:
                # skip blank lines (e.g. a trailing newline at end of file)
                if not line.strip():
                    continue

                fields = line.rstrip().split(" ")
                word = fields[0]

                # keep only the first occurrence of a word, so that
                # indices and vectors can never get out of step
                if word in word_to_index:
                    continue

                vec = np.array([float(val) for val in fields[1:]])

                # the index is the position the vector is about to take
                word_to_index[word] = len(word_vectors)
                word_vectors.append(vec)
        return cls(word_to_index, word_vectors)

    def get_embedding(self, word):
        """
        Get embedding values given a word
        Args:
            word(str): word to find
        Returns:
            embedding vector (np.ndarray)
        Raises:
            KeyError: if the word is not in the vocabulary
        """
        return self.word_vectors[self.word_to_index[word]]

    def get_closest_vector(self, vector, neighbors=1):
        """
        Get neighboring words given a vector
        Args:
            vector (np.ndarray): embedding vector
            neighbors (int): number of neighbors to return
        Returns:
            List of neighboring words list(str), in the order returned
            by the Annoy index
        """
        neighbors_indices = self.index.get_nns_by_vector(vector,
                                                        neighbors)
        return [self.index_to_word[idx] for idx in neighbors_indices]

    def compute_analogy(self, word1, word2, word3):
        """
        Print completions of the analogy word1 : word2 :: word3 : ?
        Args:
            word1 - word3 (str): words to find
        Returns:
            None; prints one line per candidate, in the order the
            neighbors were returned, or a message if there is none
        Raises:
            KeyError: if any of the words is not in the vocabulary
        """
        vector1 = self.get_embedding(word1)
        vector2 = self.get_embedding(word2)
        vector3 = self.get_embedding(word3)

        # compute the spatial relationship
        # from vector1 and vector2
        # the relationship of vector3 is
        # vector3 + spatial_relationship
        spatial_relationship = vector2 - vector1
        vector4 = vector3 + spatial_relationship

        # from the vector get its neighbors; 4 are requested so that at
        # least one can remain after dropping the three input words
        neighbors = self.get_closest_vector(vector4, 4)
        existing_words = {word1, word2, word3}

        # filter the list directly (not via a set) so the
        # order of the neighbors is preserved in the output
        closest_words = [word for word in neighbors
                         if word not in existing_words]

        if len(closest_words) == 0:
            print("No closest words found")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1,
                                              word2,
                                              word3,
                                              word4))

In [21]:
# Location of the GloVe vectors (the file name suggests the 6B, 100-dimensional set).
# NOTE(review): the path presumably points into a Google Drive mounted in Colab --
# adjust it when running elsewhere.
embedding_file = (
    "/content/drive/My Drive/Colab Notebooks/Data/Glove/glove.6B.100d.txt"
)
# Parse the file and construct the embeddings wrapper from it.
embeddings = PreTrainedEmbeddings.from_embeddings_file(
    embedding_file=embedding_file
)

In [23]:
# Each triple (word1, word2, word3) asks: word1 is to word2 as word3 is to ...?
analogy_queries = [
    ('man', 'he', 'woman'),
    ('fly', 'plane', 'sail'),
    ('blue', 'color', 'dog'),
    ('fast', 'fastest', 'small'),
    # gender bias
    ('man', 'doctor', 'woman'),
]
for query in analogy_queries:
    embeddings.compute_analogy(*query)

man : he :: woman : she
man : he :: woman : her
fly : plane :: sail : ship
fly : plane :: sail : vessel
fly : plane :: sail : boat
blue : color :: dog : touch
blue : color :: dog : animal
blue : color :: dog : taste
blue : color :: dog : pet
fast : fastest :: small : quarters
fast : fastest :: small : among
fast : fastest :: small : smallest
fast : fastest :: small : largest
man : doctor :: woman : physician
man : doctor :: woman : doctors
man : doctor :: woman : nurse
