In [1]:
import numpy as np

In [2]:
def read_word2vec_matrix(word2vec_file):
    with open(word2vec_file, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        words = set()
        word_to_vec_map = {}

        line = f.readline().strip().split()
        m = line[0] # number of rows
        n = line[1] # number of cols

        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

    return words, word_to_vec_map

In [3]:
words, word_to_vec_map = read_word2vec_matrix('../../../nlp/model.txt')


words:  2469122


TypeError: 'set' object is not subscriptable

In [16]:
words = list(words)
print("words[18623]: ", words[18623])
print("word_to_vec_map[words[18623]:")
word_to_vec_map[words[18623]]

words[18623]:  proguanile
word_to_vec_map[words[18623]:


array([ 5.19170e-02, -2.16962e-01,  2.91658e-01, -5.59510e-02,
        3.82250e-02,  1.33154e-01, -3.43920e-01,  1.54411e-01,
        9.16760e-02,  7.03040e-02,  9.55600e-03,  4.36170e-02,
       -3.87940e-02, -1.35658e-01,  1.64581e-01, -1.01549e-01,
        1.24315e-01, -3.70790e-02,  1.50376e-01,  1.55852e-01,
       -1.54380e-01,  5.34490e-02,  8.69620e-02,  3.05327e-01,
        2.22060e-01, -1.27903e-01, -7.02760e-02,  2.80031e-01,
        9.70360e-02, -1.13833e-01, -2.17892e-01,  1.59282e-01,
       -4.14302e-01,  1.82430e-01,  3.77200e-02,  4.09520e-02,
        2.37330e-02, -1.61899e-01,  6.48000e-02,  6.82770e-02,
        3.05240e-02,  1.21533e-01, -1.89350e-02,  4.70600e-02,
       -2.34020e-02,  1.68790e-01,  9.46610e-02,  1.36476e-01,
       -2.09627e-01, -1.43936e-01,  2.51430e-01, -1.79624e-01,
       -1.23201e-01,  1.03637e-01, -5.14990e-02,  9.56310e-02,
        5.80380e-02,  8.55280e-02, -8.73440e-02, -1.75922e-01,
        1.93957e-01,  2.84770e-02, -1.14581e-01,  1.382

In [17]:
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similariy between u and v
        
    Arguments:
        u -- a word vector of shape (n,)          
        v -- a word vector of shape (n,)

    Returns:
        cosine_similarity -- the cosine similarity between u and v defined by the formula above.
    """
    
    distance = 0.0
   
    dot = np.dot(u, v)

    # Compute the L2 norm of u (≈1 line)
    norm_u = np.sqrt(np.sum(u**2))
    
    # Compute the L2 norm of v (≈1 line)
    norm_v = np.sqrt(np.sum(v**2))

    # Compute the cosine similarity defined by formula (1) (≈1 line)
    cosine_similarity = dot / np.dot(norm_u, norm_v)

   
    return cosine_similarity

In [18]:
father = word_to_vec_map["padre"]
mother = word_to_vec_map["madre"]
ball = word_to_vec_map["palla"]
crocodile = word_to_vec_map["coccodrillo"]
france = word_to_vec_map["francia"]
italy = word_to_vec_map["italia"]
paris = word_to_vec_map["parigi"]
rome = word_to_vec_map["rooma"]

print("cosine_similarity(padre, madre) = ", cosine_similarity(father, mother))
print("cosine_similarity(palla, coccodrillo) = ",cosine_similarity(ball, crocodile))
print("cosine_similarity(francia - parigi, roma - italia) = ",cosine_similarity(france - paris, rome - italy))

cosine_similarity(padre, madre) =  0.7659759663297602
cosine_similarity(palla, coccodrillo) =  0.3832868493247648
cosine_similarity(francia - parigi, roma - italia) =  -0.39235044171214056


In [19]:
def words_analogy(word_a, word_b, word_c, word_to_vec_map):
    """
    Performs the word analogy task as explained above: a is to b as c is to ____. 
    
    Arguments:
    word_a -- a word, string
    word_b -- a word, string
    word_c -- a word, string
    word_to_vec_map -- dictionary that maps words to their corresponding vectors. 
    
    Returns:
    best_word --  the word such that v_b - v_a is close to v_best_word - v_c, as measured by cosine similarity
    """
    
    # convert words to lower case
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    
    # Get the word embeddings v_a, v_b and v_c (≈1-3 lines)
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]
        
    words = word_to_vec_map.keys()
    max_cosine_sim = -100              # Initialize max_cosine_sim to a large negative number
    best_word = None                   # Initialize best_word with None, it will help keep track of the word to output

    # loop over the whole word vector set
    for w in words:        
        # to avoid best_word being one of the input words, pass on them.
        if w in [word_a, word_b, word_c] :
            continue
        
        # Compute cosine similarity between the vector (e_b - e_a) and the vector ((w's vector representation) - e_c)  (≈1 line)
        cosine_sim = cosine_similarity((e_b - e_a), (word_to_vec_map[w] - e_c))
        
        # If the cosine_sim is more than the max_cosine_sim seen so far,
            # then: set the new max_cosine_sim to the current cosine_sim and the best_word to the current word (≈3 lines)
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w
        
    return best_word

In [20]:
triads_to_try = [('italia', 'italiano', 'spagna'), ('india', 'delhi', 'giappone'), ('uomo', 'donna', 'ragazzo'), ('piccolo', 'minuscolo', 'grande')]
for triad in triads_to_try:
    print ('{} -> {} :: {} -> {}'.format( *triad, words_analogy(*triad,word_to_vec_map)))

italia -> italiano :: spagna -> spagnolo
india -> delhi :: giappone -> osaka
uomo -> donna :: ragazzo -> ragazza
piccolo -> minuscolo :: grande -> minuscola


In [22]:
triads_to_try = [('mela', 'frutta', 'carota')]
for triad in triads_to_try:
    print ('{} -> {} :: {} -> {}'.format( *triad, words_analogy(*triad,word_to_vec_map)))

mela -> frutta :: carota -> verdure
