# Global Vectors for Word Representation

The GloVe word embeddings were due to Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 
https://nlp.stanford.edu/projects/glove/

## 1. Model overview

In [1]:
import os
import sys
import numpy as np

from sklearn.neighbors import NearestNeighbors    

In [59]:
class WordEmbedding:
    """Class that handles word embeddings.
    """
    
    def __init__(self):
        self.words = []
        self.word_to_vec = {}
        self.matrix = None
        return
    
    def __len__(self):
        return len(self.words)
        
    def load_glove(self, filepath, filename):
        """Load the representation.
        
        :param filepath: path to the file.
        :type filepath: str.
        :param filename: name of the file.
        :type filename: str.
        """
        self.words = []
        self.matrix = []
        self.word_to_vec = {}
        
        f = open(os.path.join(filepath, filename))
        
        for line in f:
            values = line.split()
            word = values[0]
            coef = np.asarray(values[1:], dtype="float32")
            self.word_to_vec[word] = coef
            self.words.append(word)
            self.matrix.append(coef)
        f.close()
        self.matrix = np.array(self.matrix)
        return

    def cosine_similarity(self, u, v):
        """Mesure the cosine similarity between two arrays.
        
        :return: the cosine similarity.
        :rtype: float.        
        """
        norm_u = np.sqrt(np.sum(u * u))
        norm_v = np.sqrt(np.sum(v * v))
        return np.sum(u * v) / ( norm_u * norm_v )

    def similar(self, word_a, word_b):
        """Mesure the similarity between word a and b.
        
        :param word_a: the first word.
        :type word_a: str.
        :param word_b: the second word.
        :type word_b: str.
        
        :return: the cosine similarity.
        :rtype: float.
        """
        a = self.word_to_vec[word_a.lower()]
        b = self.word_to_vec[word_b.lower()]
        return self.cosine_similarity(a, b)
    
    def analogy(self, word_a, word_b, word_c):
        """Performs the word analogy.
        
        :param word_a: the first word.
        :type word_a: str.
        :param word_b: the second word.
        :type word_b: str.
        :param word_c: the third word.
        :type word_c: str.          
        """
        a = self.word_to_vec[word_a.lower()]
        b = self.word_to_vec[word_b.lower()]
        c = self.word_to_vec[word_c.lower()]
        
        max_sim = -1.0
        best_word = None        
        
        for w, x in self.word_to_vec.items():
            if w in [word_a, word_b, word_c]:
                pass
            else:
                sim = self.cosine_similarity(a - b, c - x)
                if sim > max_sim:
                    max_sim = sim
                    best_word = w 
                    
        return best_word
    
    def neighbor(self, word, k=1):
        """Performs the task of finding the k nearest neighbors to the word.
        
        :param word: the reference word.
        :type word: str.
        :param k: the number of neighbors to find.
        :type k: int.
        
        :return: the k-nearest neighbors.
        :rtype: list<str>.
        """
        if isinstance(word, str):
            vec = self.word_to_vec[word.lower()]
        else:
            vec = word
    
        nbrs = NearestNeighbors(n_neighbors=k+1, algorithm="ball_tree").fit(self.matrix)
        distances, indices = nbrs.kneighbors(vec.reshape(1,-1))

        best_words = []
        for i in indices.tolist()[0][1:]:
            best_words.append(self.words[i])

        return best_words

## 2. Test

We define here the parameters required to run the test. 
Feel free to change the embedding dimension. 

In [60]:
# Can be changed to 50, 100 or 200.
EMBEDDING_DIM = 50

In [61]:
# File details.
GLOVE_DIR = "dataset/glove"
GLOVE_FILE = "glove.6B.{}d.txt".format(EMBEDDING_DIM)

# Load the word embedding.
glove = WordEmbedding()
glove.load_glove(GLOVE_DIR, GLOVE_FILE)

# Display.
print("Total number of words = {}".format(len(glove)))

Total number of words = 400000


### A. Similarity.

You can mesure the similary between two words. For instance man and boy will be similar.

In [62]:
# Define the test to run.
tests =[
    ["man", "boy"],
    ["woman", "girl"],
    ["London", "zurich"],
    ["table", "bird"],
    ["sister", "brother"]
]

# Perform the tests.
for test in tests:
    sim = glove.similar(test[0], test[1])
    print("{} & {} = {}".format(test[0], test[1], sim))

man & boy = 0.8564431071281433
woman & girl = 0.906528115272522
London & zurich = 0.5148636102676392
table & bird = 0.2420566827058792
sister & brother = 0.7250392436981201


### B. Analogy

You can also evaluate the analogy between woman to queen, man to ??

In [63]:
# Define the test to run.
tests =[
    ["small", "smaller", "large"],
    ["france", "paris", "sweden"],    
    ["india", "delhi", "japan"],
    ["man", "woman", "boy"],
]

# Perform the tests.
for test in tests:
    a, b, c = test
    d = glove.analogy(a, b, c)
    print("{} -> {} & {} -> {}".format(a, b, c, d))

small -> smaller & large -> larger
france -> paris & sweden -> stockholm
india -> delhi & japan -> tokyo
man -> woman & boy -> girl


### C. Nearest neighbors.

In [64]:
# Obtain the nearest neighbors.
words = glove.neighbor("beer", k=10)

# Display the nearest neighbors.
for w in words:
    print("{}".format(w))

UnboundLocalError: local variable 'vec' referenced before assignment