# Global Vectors for Word Representation

The GloVe word embeddings were due to Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 
https://nlp.stanford.edu/projects/glove/

## 1. Model overview

In [75]:
import os
import sys
import numpy as np

In [257]:
class WordEmbedding:
    
    def __init__(self):
        """Constructor.
        """
        self.words = []
        self.word_to_vec = {}
        return
    
    def __len__(self):
        return len(self.words)
    
    def load_glove(self, filepath, filename):
        """Load the representation.
        
        :param filepath: path to the file.
        :type filepath: str.
        :param filename: name of the file.
        :type filename: str.
        """
        self.word_to_vec = {}
        f = open(os.path.join(filepath, filename))
        for line in f:
            values = line.split()
            word = values[0]
            coef = np.asarray(values[1:], dtype="float32")
            self.word_to_vec[word] = coef
        f.close()
        self.words = list(self.word_to_vec.keys())
        return

    def cosine_similarity(self, u, v):
        """Mesure the cosine similarity between two arrays.
        """
        norm_u = np.sqrt(np.sum(u * u))
        norm_v = np.sqrt(np.sum(v * v))
        return np.sum(u * v) / ( norm_u * norm_v )

    def similar(self, word_a, word_b):
        """Mesure the similarity between word a and b.
        """
        a = self.word_to_vec[word_a.lower()]
        b = self.word_to_vec[word_b.lower()]
        return self.cosine_similarity(a, b)
    
    def analogy(self, word_a, word_b, word_c):
        """Performs the word analogy task as explained above: a is to b as c is to ____.
        """
        a = self.word_to_vec[word_a.lower()]
        b = self.word_to_vec[word_b.lower()]
        c = self.word_to_vec[word_c.lower()]
        
        max_sim = -1.0
        best_word = None        
        
        for w, x in self.word_to_vec.items():
            if w in [word_a, word_b, word_c]:
                pass
            else:
                sim = self.cosine_similarity(a - b, c - x)
                if sim > max_sim:
                    max_sim = sim
                    best_word = w 
                    
        return best_word
    
    def neighbor(self, word, k=1):
        """Performs the task of finding the k nearest neighbors to the word.
        """
        vec = self.word_to_vec[word.lower()]
        
        max_sims = []
        best_words = []
        for i in range(k):
            max_sims.append(-1.1)
            best_words.append(None)
        
        for w, x in self.word_to_vec.items():
            if w in [word]:
                pass
            else:
                sim = self.cosine_similarity(vec, x)
                
                if sim >= min(max_sims):
                    for i in range(k):
                        if sim > max_sims[i]:
                            max_sims[i] = sim
                            best_words[i] = w
                            break

        return best_words, max_sims

## 2. Test

We define here the parameters required to run the test. 
Feel free to change the embedding dimension. 

In [296]:
# Can be changed to 50, 100 or 200.
EMBEDDING_DIM = 50

In [297]:
# File details.
GLOVE_DIR = "dataset/glove"
GLOVE_FILE = "glove.6B.{}d.txt".format(EMBEDDING_DIM)

# Load the word embedding.
word2vec = WordEmbedding()
word2vec.load_glove(GLOVE_DIR, GLOVE_FILE)

# Display.
print("Total number of words = {}".format(len(word2vec)))

Total number of words = 400000


### A. Similarity.

You can mesure the similary between two words. For instance man and boy will be similar.

In [298]:
# Define the test to run.
tests =[
    ["man", "boy"],
    ["woman", "girl"],
    ["London", "zurich"],
    ["table", "bird"],
    ["sister", "brother"]
]

# Perform the tests.
for test in tests:
    sim = word2vec.similar(test[0], test[1])
    print("{} & {} = {}".format(test[0], test[1], sim))

man & boy = 0.8564431071281433
woman & girl = 0.906528115272522
London & zurich = 0.5148636102676392
table & bird = 0.2420566827058792
sister & brother = 0.7250392436981201


### B. Analogy

You can also evaluate the analogy between woman to queen, man to ??

In [305]:
# Define the test to run.
tests =[
    ["small", "smaller", "large"],
    ["france", "paris", "sweden"],    
    ["india", "delhi", "japan"],
    ["man", "woman", "boy"],
]

# Perform the tests.
for test in tests:
    a, b, c = test
    d = word2vec.analogy(a, b, c)
    print("{} -> {} & {} -> {}".format(a, b, c, d))

small -> smaller & large -> larger
france -> paris & sweden -> stockholm
india -> delhi & japan -> tokyo
man -> woman & boy -> girl


### C. Nearest neighbors.

In [306]:
# Obtain the nearest neighbors.
words, cos_sim = word2vec.neighbor("table", k=10)

# Display the nearest neighbors.
for w, sim in zip(words, cos_sim):
    dotes = "." * ( 23 - len(w) )
    print("{} {} = {}".format(w, dotes, sim))

tables ................. = 0.8177436590194702
sit .................... = 0.7571902871131897
hold ................... = 0.7306693196296692
bottom ................. = 0.7166719436645508
sitting ................ = 0.6948686838150024
pool ................... = 0.6913972496986389
wrap ................... = 0.6869691610336304
draw ................... = 0.6701829433441162
standing ............... = 0.6688733696937561
placing ................ = 0.6672315001487732
