# Operations on word vectors


In [7]:
import numpy as np
from w2v_utils import *

Let's load 50-dimensional GloVe vectors to represent words.

In [10]:
def read_glove_vecs(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is read as a word followed by the components of
    its vector. Blank or whitespace-only lines are ignored.

    Arguments:
        glove_file -- path to the text file; it is opened as UTF-8

    Returns:
        words -- set of all words found in the file
        word_to_vec_map -- dict mapping each word to a np.float64 array
                           built from the remaining fields on its line
    """
    words = set()
    word_to_vec_map = {}

    with open(glove_file, 'r', encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            # A blank line (for example an extra newline at the end of the
            # file) has no fields; skip it instead of failing on fields[0].
            if not fields:
                continue
            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec_map

In [11]:
# Read the pre-trained GloVe file once; later cells reuse both results.
glove_path = 'data/glove.6B.50d.txt'
words, word_to_vec_map = read_glove_vecs(glove_path)

We've loaded:
- `words`: set of words in the vocabulary.
- `word_to_vec_map`: dictionary mapping words to their GloVe vector representation.



## 1 - Cosine similarity

$$\text{CosineSimilarity(u, v)} = \frac {u . v} {||u||_2 ||v||_2} = cos(\theta) \tag{1}$$

where $u.v$ is the dot product (or inner product) of the two vectors, $||u||_2$ is the L2 norm (or length) of the vector $u$, and $\theta$ is the angle between $u$ and $v$. The similarity depends only on the angle between $u$ and $v$, not on their lengths. If $u$ and $v$ point in very similar directions, their cosine similarity will be close to 1; the more dissimilar they are, the smaller the value, down to -1 for vectors pointing in opposite directions.


In [47]:
def cosine_similarity(u, v):
    """Return the cosine similarity between the vectors u and v.

    Computes (u . v) / (||u||_2 * ||v||_2), i.e. the cosine of the angle
    between the two vectors.

    Arguments:
        u -- first vector (array-like; intended for 1-D vectors)
        v -- second vector, with the same length as u

    Returns:
        The similarity as a float. When either vector has zero L2 norm the
        division is by zero and the result is not a meaningful similarity
        (NumPy yields nan).
    """
    dot = np.dot(u, v)
    norm_u = np.linalg.norm(u, ord=2)
    norm_v = np.linalg.norm(v, ord=2)

    # Divide by each norm in turn (rather than by their product) so the
    # floating-point result matches the values already printed below.
    return dot / norm_u / norm_v

In [13]:
# Fetch the embeddings compared in the checks below.
father, mother, baby, lullaby, infant = (
    word_to_vec_map[word]
    for word in ("father", "mother", "baby", "lullaby", "infant")
)

# Print the similarity of each pair so the scores can be compared by eye.
print("cosine_similarity(father, mother) = ", cosine_similarity(father, mother))
print("cosine_similarity(baby, lullaby) = ", cosine_similarity(baby, lullaby))
print("cosine_similarity(baby, infant) = ", cosine_similarity(baby, infant))

cosine_similarity(father, mother) =  0.8909038442893616
cosine_similarity(baby, lullaby) =  0.3699459221388092
cosine_similarity(baby, infant) =  0.7405669236193029


## 2 - Word analogy

We complete the sentence <font color='brown'>"*a* is to *b* as *c* is to **____**"</font>. An example is <font color='brown'> '*man* is to *woman* as *king* is to *queen*' </font>. In detail, we are trying to find a word *d*, such that the associated word vectors $e_a, e_b, e_c, e_d$ are related in the following manner: $e_b - e_a \approx e_d - e_c$. We will measure the similarity between $e_b - e_a$ and $e_d - e_c$ using cosine similarity. 

In [14]:
def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """Complete the analogy "word_a is to word_b as word_c is to ____".

    Searches the whole vocabulary for the word w whose offset from word_c,
    (e_w - e_c), has the highest cosine similarity with (e_b - e_a).

    Arguments:
        word_a -- a word, string (lower-cased before lookup)
        word_b -- a word, string (lower-cased before lookup)
        word_c -- a word, string (lower-cased before lookup)
        word_to_vec_map -- dict mapping words to their vector representation

    Returns:
        best_word -- the best-scoring word that is not one of the three
                     inputs, or None if no candidate produced a score
                     greater than the initial value

    Raises:
        KeyError -- if a lower-cased input word is not in word_to_vec_map
    """
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]

    # Both values are the same for every candidate, so build them once
    # instead of on each pass through the vocabulary.
    target_offset = e_b - e_a
    input_words = {word_a, word_b, word_c}

    max_cosine_sim = float("-inf")     # any real similarity beats this
    best_word = None                   # best candidate seen so far

    for w, e_w in word_to_vec_map.items():
        # The answer must not be one of the input words.
        if w in input_words:
            continue

        cosine_sim = cosine_similarity(target_offset, e_w - e_c)

        # Strict comparison keeps the first word on ties and ignores nan.
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w

    return best_word

## 3 - Testing

In [15]:
# Analogies covering nationality, capital city, gender and comparative form.
triads_to_try = [
    ('italy', 'italian', 'spain'),
    ('india', 'delhi', 'japan'),
    ('man', 'woman', 'boy'),
    ('small', 'smaller', 'large'),
]
for word_a, word_b, word_c in triads_to_try:
    word_d = complete_analogy(word_a, word_b, word_c, word_to_vec_map)
    print('{} -> {} :: {} -> {}'.format(word_a, word_b, word_c, word_d))

italy -> italian :: spain -> spanish
india -> delhi :: japan -> tokyo
man -> woman :: boy -> girl
small -> smaller :: large -> larger


In [16]:
# What plays the role of "lullaby" for older age groups?
triads_to_try = [
    ('baby', 'lullaby', 'adult'),
    ('baby', 'lullaby', 'teenager'),
]
for word_a, word_b, word_c in triads_to_try:
    word_d = complete_analogy(word_a, word_b, word_c, word_to_vec_map)
    print('{} -> {} :: {} -> {}'.format(word_a, word_b, word_c, word_d))

baby -> lullaby :: adult -> easy-listening
baby -> lullaby :: teenager -> easy-listening


In [48]:
# Music-related analogies: genres and artists.
triads_to_try = [
    ('man', 'hip-hop', 'woman'),
    ('man', 'beatles', 'woman'),
    ('jazz', 'hip-hop', 'punk'),
]
for word_a, word_b, word_c in triads_to_try:
    word_d = complete_analogy(word_a, word_b, word_c, word_to_vec_map)
    print('{} -> {} :: {} -> {}'.format(word_a, word_b, word_c, word_d))

man -> hip-hop :: woman -> r&b
man -> beatles :: woman -> presley
jazz -> hip-hop :: punk -> terrorizer


In [42]:
# Company-to-company and company-to-country analogies.
triads_to_try = [
    ('spotify', 'apple', 'google'),
    ('spotify', 'sweden', 'google'),
]
for word_a, word_b, word_c in triads_to_try:
    word_d = complete_analogy(word_a, word_b, word_c, word_to_vec_map)
    print('{} -> {} :: {} -> {}'.format(word_a, word_b, word_c, word_d))

spotify -> apple :: google -> microsoft
spotify -> sweden :: google -> germany


In [23]:
# These embeddings have not been adjusted for gender bias, so the
# completion below reflects whatever bias is present in the vectors.
triads_to_try = [('man', 'doctor', 'woman')]
for word_a, word_b, word_c in triads_to_try:
    word_d = complete_analogy(word_a, word_b, word_c, word_to_vec_map)
    print('{} -> {} :: {} -> {}'.format(word_a, word_b, word_c, word_d))

man -> doctor :: woman -> nurse


In [54]:

# Genre-to-artist analogy.
triads_to_try = [('r&b', 'beyonce', 'hip-hop')]
for word_a, word_b, word_c in triads_to_try:
    word_d = complete_analogy(word_a, word_b, word_c, word_to_vec_map)
    print('{} -> {} :: {} -> {}'.format(word_a, word_b, word_c, word_d))

r&b -> beyonce :: hip-hop -> liwen


The model is pretty terrible at music-related analogies.