In [40]:
import numpy as np

Function that reads a GloVe file and builds the set of words it contains, together with a map from each word to its embedding vector.

In [41]:
def read_glove_vectors(glove_file):
    """Parse a GloVe-style text file into a vocabulary set and a word -> vector map.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated numeric components.

    Args:
        glove_file: Path of the text file to read.

    Returns:
        A tuple ``(words, word_to_vec_map)``: ``words`` is the set of tokens
        found in the file and ``word_to_vec_map`` maps each token to a 1-D
        ``np.float64`` array built from the remaining fields of its line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a component cannot be converted to a float.
    """
    words = set()
    word_to_vec_map = {}

    # Decode explicitly as UTF-8 instead of the platform default so that
    # non-ASCII tokens are read identically on every OS.
    # NOTE(review): assumes the embeddings file is UTF-8 text, as the published
    # GloVe downloads are -- confirm for custom files.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # A blank line (e.g. a stray trailing newline) carries no
                # token; skip it rather than fail with IndexError below.
                continue

            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec_map

Read the GloVe vectors file and print the vocabulary size, the number of stored word vectors, and the dimensionality of a sample word vector.

In [42]:
# Load the embeddings from disk, then report three figures: how many distinct
# words were read, how many vectors are stored, and the shape of one sample
# vector ('lion').
glove_file = "./glove.6B.50d.txt"
word, word_to_vec_map = read_glove_vectors(glove_file)  # `word` is the vocabulary set

print("Total words in vocabulary = ", len(word), sep="")
print("Size of word vectors = ", len(word_to_vec_map), sep="")
print("Dimensions of each word vector = ", word_to_vec_map.get('lion').shape, sep="")

Total words in vocabulary = 400000
Size of word vectors = 400000
Dimensions of each word vector = (50,)


In [43]:
# Calculates the cosine similarity of 2 vectors
def cosine_similarity(u, v):
    """Return the cosine of the angle between ``u`` and ``v``.

    Args:
        u: Array-like of numbers (NumPy array, list or tuple).
        v: Array-like of numbers with the same shape as ``u``.

    Returns:
        ``sum(u * v) / (||u|| * ||v||)``. When the product of the two norms is
        zero the cosine is mathematically undefined; ``0.0`` is returned in
        that case instead of ``nan`` (and the accompanying RuntimeWarning).
    """
    # Accept plain Python sequences as well as arrays; a no-op for ndarrays.
    u = np.asarray(u)
    v = np.asarray(v)

    dot_product = np.sum(u * v)  # Numerator of cosine similarity
    norm_u = np.sqrt(np.sum(u ** 2))  # Norm of vector u
    norm_v = np.sqrt(np.sum(v ** 2))  # Norm of vector v

    denominator = norm_u * norm_v
    if denominator == 0:
        # At least one vector has zero length: avoid 0/0.
        return 0.0

    return dot_product / denominator
    
    

In [44]:
# Given a word, find the most analogous word
def find_analogous_word(word):
    """Find the vocabulary word whose vector is most cosine-similar to ``word``.

    Reads the module-level ``word_to_vec_map`` and scores every other entry
    against ``word`` with ``cosine_similarity``.

    Args:
        word: The query word; must be a key of ``word_to_vec_map``.

    Returns:
        A tuple ``(best_word, max_similarity)``. If the map holds no word other
        than ``word`` the result is ``(None, -100)``.

    Raises:
        KeyError: If ``word`` is not present in ``word_to_vec_map``.
    """
    if word not in word_to_vec_map:
        # Fail early with a clear message instead of passing None into
        # cosine_similarity, which would surface as an obscure TypeError.
        raise KeyError("word not found in word_to_vec_map: {!r}".format(word))

    word_vector = word_to_vec_map[word]

    # Cosine similarity lies in [-1, 1], so -100 is below any real score.
    max_similarity = -100
    best_word = None

    for candidate, candidate_vector in word_to_vec_map.items():
        if candidate == word:
            # The query word would trivially match itself.
            continue

        similarity = cosine_similarity(word_vector, candidate_vector)

        if similarity > max_similarity:
            max_similarity = similarity
            best_word = candidate

    return best_word, max_similarity

In [45]:
# For each probe word, look up its nearest neighbour and print the pair
# together with the similarity score.
given_words = ['lion', 'airplane', 'boy', 'school', 'hotel', 'motel', "shirt", "jeans", "tiger"]

for given_word in given_words:
    similar_word, sim_score = find_analogous_word(given_word)
    print("".join([
        "Given word : ", given_word,
        ", Similar word : ", similar_word,
        ", Similarity score : ", str(sim_score),
    ]))

Given word : lion, Similar word : dragon, Similarity score : 0.7917536230532733
Given word : airplane, Similar word : plane, Similarity score : 0.8621733057090953
Given word : boy, Similar word : girl, Similarity score : 0.9327198629646993
Given word : school, Similar word : college, Similarity score : 0.9344996087241083
Given word : hotel, Similar word : hotels, Similarity score : 0.8347788319732117
Given word : motel, Similar word : hotel, Similarity score : 0.7792252941873062
Given word : shirt, Similar word : shirts, Similarity score : 0.900019510373655
Given word : jeans, Similar word : denim, Similarity score : 0.9055760847898471
Given word : tiger, Similar word : tigers, Similarity score : 0.7239238726304342


In [46]:
# Given 3 words word_a, word_b and word_c, find a word_d such that the word pair (word_c, word_d) closely
# matches the word pair (word_a, word_b)

def complete_analogy(word_a, word_b, word_c, exclude_inputs=False):
    """Complete the analogy ``word_a -> word_b :: word_c -> ?``.

    Reads the module-level ``word_to_vec_map`` and returns the word ``w`` that
    maximises ``cosine_similarity(e_b - e_a, e_w - e_c)``.

    Args:
        word_a: First word of the reference pair.
        word_b: Second word of the reference pair.
        word_c: First word of the pair to complete.
        exclude_inputs: When True, ``word_a`` and ``word_b`` are also removed
            from the candidates so the answer is never one of the inputs.
            Defaults to False, which keeps them eligible.

    Returns:
        The best-scoring candidate word, or ``None`` if no candidate scored
        above the initial sentinel.

    Raises:
        KeyError: If any of the three words is missing from ``word_to_vec_map``.
    """
    encoding_a = word_to_vec_map[word_a]
    encoding_b = word_to_vec_map[word_b]
    encoding_c = word_to_vec_map[word_c]

    # Loop-invariant: the offset that the (word_c, answer) pair should mimic.
    target_offset = encoding_b - encoding_a

    # word_c is always skipped: its difference vector is all zeros, so the
    # cosine is undefined (0/0) and it could never be a meaningful answer.
    skipped = {word_c}
    if exclude_inputs:
        skipped.update((word_a, word_b))

    # Cosine similarity lies in [-1, 1], so -100 is below any real score.
    best_sim_score = -100
    best_word = None

    for w, encoding_w in word_to_vec_map.items():
        if w in skipped:
            continue

        sim_score = cosine_similarity(target_offset, encoding_w - encoding_c)

        if sim_score > best_sim_score:
            best_sim_score = sim_score
            best_word = w

    return best_word

In [47]:
# Each triad is (word_a, word_b, word_c); print the analogy together with the
# word chosen to complete it.
word_triads = [('italy', 'rome', 'germany'), ('king', 'queen', 'man'), ('lion', 'animal', 'tiger')]

for word_triad in word_triads:
    similar_word = complete_analogy(*word_triad)
    print("".join([
        word_triad[0], " -> ", word_triad[1],
        " :: ",
        word_triad[2], " -> ", similar_word,
    ]))

  


italy -> rome :: germany -> berlin
king -> queen :: man -> woman
lion -> animal :: tiger -> animal
