# Loading word vectors

In [1]:
import torch
import torchtext.vocab as vocab

In [2]:
# Load the pre-trained GloVe embeddings: the '6B' set with 100-dimensional vectors.
# NOTE(review): torchtext presumably downloads and caches the vectors on first use — confirm.
glove = vocab.GloVe(name='6B', dim=100)

# glove.itos holds the vocabulary words, so its length is the number of loaded words.
print('Loaded {} words'.format(len(glove.itos)))


Loaded 400000 words


In [3]:
# Inspect which container type backs the vocabulary.
type(glove.itos)

list

In [4]:
# Peek at the first ten vocabulary entries.
glove.itos[:10]

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s"]

# Get "closest" words

In [5]:
def get_word(word):
    """Return the GloVe embedding vector for ``word``.

    The word is mapped to its vocabulary index through ``glove.stoi`` and that
    index selects the matching entry of ``glove.vectors``.

    Raises:
        KeyError: if ``word`` is not present in ``glove.stoi``.
    """
    index = glove.stoi[word]
    return glove.vectors[index]



In [14]:
# '<sos>' is not in the GloVe vocabulary, so this direct stoi lookup raises KeyError.
# NOTE(review): the cell fails by design, apparently to demonstrate the missing token.
glove.stoi['<sos>']

KeyError: '<sos>'

In [42]:
def closest(vec, n=10):
    """Return the ``n`` vocabulary words whose vectors are nearest to ``vec``.

    Distance is Euclidean (L2), the same metric ``torch.dist`` uses by default.

    Args:
        vec: Query vector with the same dimensionality as a row of
            ``glove.vectors``.
        n: Number of neighbours to keep; applied as a ``[:n]`` slice on the
            distance-sorted results.

    Returns:
        A list of ``(word, distance)`` tuples sorted by ascending distance,
        where each distance is a 0-dim tensor.
    """
    # One vectorized pass over the whole embedding matrix instead of a Python
    # loop doing a dict lookup and a torch.dist call per vocabulary word.
    distances = torch.norm(glove.vectors - vec, p=2, dim=1)
    sorted_distances, order = torch.sort(distances)

    # Assumes row i of glove.vectors belongs to glove.itos[i], i.e. glove.stoi
    # is the inverse mapping of glove.itos — confirm for non-torchtext vocabularies.
    return [
        (glove.itos[index], distance)
        for index, distance in zip(order[:n].tolist(), sorted_distances[:n])
    ]

In [43]:
# Ten nearest neighbours of 'google' in the embedding space.
closest(get_word('google'), 10)

[('google', tensor(0.)),
 ('yahoo', tensor(3.0772)),
 ('microsoft', tensor(3.8836)),
 ('web', tensor(4.1048)),
 ('aol', tensor(4.1082)),
 ('facebook', tensor(4.1165)),
 ('ebay', tensor(4.3917)),
 ('msn', tensor(4.4122)),
 ('internet', tensor(4.4540)),
 ('netscape', tensor(4.4651))]

# Word Analogies

In [46]:
def print_tuples(tuples):
    """Print each ``(word, distance)`` entry as ``(distance) word``, one per line.

    Only the first two items of every entry are read; the distance is
    formatted with four decimal places.
    """
    for entry in tuples:
        word, distance = entry[0], entry[1]
        print('(%.4f) %s' % (distance, word))

In [47]:

def analogy(w1, w2, w3, n=5, filter_given=True):
    """Print the ``n`` best completions of the analogy ``w1 : w2 :: w3 : ?``.

    The target vector is ``vec(w2) - vec(w1) + vec(w3)``; its nearest
    vocabulary words are printed together with their distances.

    Args:
        w1: First word of the reference pair.
        w2: Second word of the reference pair.
        w3: Word whose counterpart is being searched for.
        n: Number of results to print.
        filter_given: When true, drop ``w1``, ``w2`` and ``w3`` from the results.

    Raises:
        KeyError: propagated from ``get_word`` when a word is not in the vocabulary.
    """
    print('\n[%s : %s vs %s : ?]' % (w1, w2, w3))

    given = (w1, w2, w3)

    # w2 - w1 + w3 = w4
    target = get_word(w2) - get_word(w1) + get_word(w3)

    # Request enough candidates that n results remain even if every given
    # word is among the nearest neighbours and gets filtered out. Previously
    # the default of 10 candidates was always used, capping large n.
    n_candidates = n + len(given) if filter_given else n
    closest_words = closest(target, n_candidates)

    # Optionally filter out given words
    if filter_given:
        closest_words = [t for t in closest_words if t[0] not in given]

    print_tuples(closest_words[:n])


In [48]:
# Analogy query: king is to man as queen is to ?
analogy('king', 'man', 'queen')



[king : man vs queen : ?]
(4.0811) woman
(4.6916) girl
(5.2703) she
(5.2788) teenager
(5.3084) boy


In [49]:
# Analogy query: jordan is to basketball as woods is to ?
analogy('jordan', 'basketball', 'woods')




[jordan : basketball vs woods : ?]
(5.8607) golf
(6.4110) golfers
(6.4418) tournament
(6.4592) tennis
(6.6560) collegiate


# Here comes the funny one :)

In [54]:
# Analogy query: physics is to newton as china is to ?
analogy('physics','newton','china' )


[physics : newton vs china : ?]
(6.6914) hong
(6.8442) kong
(6.9104) mainland
(6.9944) south
(7.0044) beijing
