In [1]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

In [10]:
def find_analogies(w1,w2,w3):
    
    """
    ---------------------------------------
    Description :
    Solve the word analogy  w1 - w2 = ? - w3  and print the answer.
    The query vector is  vec(w1) - vec(w2) + vec(w3); the answer is the
    vocabulary word closest to it (cosine distance) that is not one of
    the three input words.
    Word Analogies like : king - man = queen - woman
                          india - indian = america - american
                          
    Relies on the notebook-level names `word2vec` (word -> vector),
    `embedding` (matrix of all vectors) and `idx2word` (row index -> word).
                          
    Input :
    w1 : a string value (word 1)
    w2 : a string value (word 2)
    w3 : a string value (word 3)
    
    Return :
    None (the analogy, or a diagnostic message, is printed)
    -------------------------------------------
    
    """
    
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("Word not present in dictionary...")
            return
    
    v_result = word2vec[w1] - word2vec[w2] + word2vec[w3]
    
    # Shapes are derived from the arrays themselves (-1 / ravel) so the
    # function does not depend on separately maintained dimension globals.
    distance = pairwise_distances(v_result.reshape(1, -1), embedding, metric = 'cosine').ravel()
    
    # Inspect the four closest rows: the three inputs can take up at most
    # three of them (assuming idx2word has no duplicate entries), so the
    # fourth is the fallback answer.
    best_word = None
    for i in distance.argsort()[:4]:
        word = idx2word[i]
        if word not in (w1, w2, w3):
            best_word = word
            break
    
    # Reached only when every inspected candidate was an input word
    # (e.g. a vocabulary of fewer than four words).
    if best_word is None:
        print("No analogy found...")
        return
    
    print("{} - {} = {} - {}".format(w1, w2, best_word, w3))
    

In [5]:
def nearest_neighbour(word, n=5):
    """
    --------------------------------------
    Description : 
    Print the n vocabulary words closest to a given word, ordered from
    nearest to farthest by cosine distance between their vectors.
    e.g king = emperor, head, man, male etc.
    
    Relies on the notebook-level names `word2vec` (word -> vector),
    `embedding` (matrix of all vectors) and `idx2word` (row index -> word).
    
    Input :
    word : a string value
    n : number of neighbouring words to be printed; nothing is printed
        when n <= 0
    
    Return : 
    None (the neighbours, or a diagnostic message, are printed)
    ---------------------------------------
    
    """
    if word not in word2vec:
        print("Word not present in dictionary... ")
        return
    
    # A non-positive n would turn the slice below into a negative-index
    # slice and select almost the whole vocabulary.
    if n <= 0:
        return
    
    vector = word2vec[word]
    # Shapes are derived from the arrays themselves (-1 / ravel) so the
    # function does not depend on separately maintained dimension globals.
    distances = pairwise_distances(vector.reshape(1, -1), embedding, metric='cosine').ravel()
    
    # Take one extra candidate and drop the query word by name rather than
    # by position: with tied distances the query is not guaranteed to be
    # the first entry of the sorted order.
    candidates = [idx2word[i] for i in distances.argsort()[:n + 1]]
    neighbours = [candidate for candidate in candidates if candidate != word][:n]
    for neighbour in neighbours:
        print("\t {}".format(neighbour))

In [12]:
if __name__ == '__main__':
    
    # Build the three lookup structures used by find_analogies() and
    # nearest_neighbour():
    #   word2vec  : word -> vector
    #   idx2word  : row index -> word (same order as the file)
    #   embedding : matrix with one row per word
    print("Loading Word vectors from glove")
    word2vec = {}
    idx2word = []
    embedding = []
    
    # Each line is "<word> <float> <float> ...". The file is opened with an
    # explicit encoding (the published GloVe vectors are UTF-8 text) so the
    # result does not depend on the platform default, and inside a `with`
    # block so the handle is closed even if parsing fails.
    with open('./GLOVE/glove.6B.50d.txt', encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            word = values[0]
            word2vec[word] = np.asarray(values[1:],dtype=np.float32)
            idx2word.append(word)
            embedding.append(word2vec[word])
    
    embedding = np.asarray(embedding, dtype=np.float32)
    
    print("Number of words found in GLove : {}".format(len(word2vec)))
    # V = vocabulary size, D = vector dimensionality
    V,D = embedding.shape
    print("Shape of Embedding Matrix : {}".format(embedding.shape))
    
    # Demo: analogies
    find_analogies('king','man','woman')
    find_analogies('japan','japanese','french')
    find_analogies('man','woman','she')
    
    # Demo: nearest neighbours
    print("Nearest Words for King : ")
    nearest_neighbour('king')
    print(">>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<")
    print("Nearest Words for Japan : ")
    nearest_neighbour('japan')
    print(">>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<")
    print("Nearest Words for Man : ")
    nearest_neighbour('man')
    
        

Loading Word vectors from glove
Number of words found in GLove : 400000
Shape of Embedding Matrix : (400000, 50)
king - man = queen - woman
japan - japanese = france - french
man - woman = he - she
Nearest Words for King : 
	 prince
	 queen
	 ii
	 emperor
	 son
>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<
Nearest Words for Japan : 
	 japanese
	 china
	 korea
	 tokyo
	 taiwan
>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<
Nearest Words for Man : 
	 woman
	 boy
	 another
	 old
	 one
