In [2]:
import csv
import pandas as pd
import numpy as np
import operator

In [3]:
wordframe = pd.read_table("glove.42B.300d.txt", sep=" ", header=None, quoting=csv.QUOTE_NONE)
wordvecs = wordframe.drop(columns=0).values.T

word_index = {}
for idx, w in enumerate(wordframe[0]):
    word_index[w] = idx

In [7]:
def word2vec(word):
    idx = word_index[word]
    return wordvecs[:,idx]

def cos_sim(a, b):
    return (np.dot(a, b) / (np.sqrt(np.dot(a, a)) * np.sqrt(np.dot(b, b))))

In [153]:
print(cos_sim(word2vec('the'), word2vec('the')))
print(cos_sim(word2vec('a'), word2vec('the')))

1.0000000000000002
0.7759782144883963


In [5]:
def find_closest(wv, n):
    sims = [(idx, cos_sim(wv, wordvecs[:,idx])) for idx in range(wordvecs.shape[1])]
    scores = sorted(sims, key=operator.itemgetter(1, 0), reverse=True)
    top = scores[1:n+1]
    top_words = [(wordframe[0][i], score) for i, score in top]
    return top_words

def find_synonyms(word, n):
    wv = word2vec(word)
    return find_closest(wv, n)

In [8]:
find_synonyms('sad', 10)

[('depressing', 0.6871760913711346),
 ('sorry', 0.6800469815345019),
 ('sadly', 0.6717799399716502),
 ('pathetic', 0.6704121421696385),
 ('tragic', 0.6591917754988509),
 ('funny', 0.6548493765323571),
 ('kinda', 0.6547635219788633),
 ('unhappy', 0.6436547720082669),
 ('happy', 0.64359088270462),
 ('awful', 0.6426853220313836)]

In [176]:
def solve_analogy(a, b, c, n):
    av = word2vec(a)
    bv = word2vec(b)
    cv = word2vec(c)
    guess = bv - av + cv
    return find_closest(guess, n)

In [175]:
solve_analogy('man', 'king', 'woman', 5)

[('queen', 0.7852211802171629),
 ('prince', 0.6025958343386526),
 ('princess', 0.5830840998108826),
 ('elizabeth', 0.5545557845397115),
 ('woman', 0.5479353384064566)]

In [177]:
solve_analogy('tree', 'leaf', 'flower', 5)

[('leaf', 0.7213918136564571),
 ('flowers', 0.6268049978086533),
 ('petal', 0.6077496442464622),
 ('floral', 0.5882048928142201),
 ('petals', 0.5837074814311084)]

In [178]:
solve_analogy('dog', 'puppy', 'cat', 5)

[('kitten', 0.7830060073888362),
 ('puppy', 0.7784001658859766),
 ('kittens', 0.6804946836129994),
 ('cats', 0.6548770614342793),
 ('puppies', 0.6275406757725545)]