In [1]:
import csv
import pandas as pd
import numpy as np
import operator

In [35]:
def load_wordvecs(path):
    """Load a space-separated word-vector text file.

    Each line of the file is expected to hold a token followed by the
    components of its vector, separated by single spaces, with no header row.

    Parameters
    ----------
    path : str or path-like
        Location of the vector file.

    Returns
    -------
    words : pandas.Series
        The tokens, in file order (column 0 of the file).
    wordvecs : numpy.ndarray
        Array with one *column* per token, i.e. shape (dimensions, vocab).
    word_index : dict
        Maps each token to its column in ``wordvecs``. If a token occurs more
        than once, the last occurrence wins.
    word_norms : numpy.ndarray
        Euclidean norm of every column of ``wordvecs``.
    """
    wordframe = pd.read_table(
        path,
        sep=" ",
        header=None,
        quoting=csv.QUOTE_NONE,
        # Vocabulary entries such as "null", "NA" or "nan" are real tokens,
        # not missing values; without this pandas turns them into float NaN.
        na_filter=False,
        # Keep purely numeric tokens (e.g. "1") as strings as well.
        dtype={0: str},
    )
    words = wordframe[0]
    wordvecs = wordframe.drop(columns=0).values.T

    word_index = {w: idx for idx, w in enumerate(words)}

    # Column-wise sum of squares without materialising a full-size temporary
    # (the embedding matrix can be several gigabytes).
    word_norms = np.sqrt(np.einsum("ij,ij->j", wordvecs, wordvecs))

    return words, wordvecs, word_index, word_norms

In [23]:
# Load the embedding file once; the helper functions defined in later cells
# read these four names as module-level globals.
words, wordvecs, word_index, word_norms = load_wordvecs("glove.840B.300d.txt")

In [24]:
# wordvecs holds one column per token, so axis 1 is the vocabulary size.
print("vocab size is", wordvecs.shape[1])

vocab size is 2196017


In [58]:
def word2vec(word):
    """Return the embedding column for ``word``.

    Looks the token up in the global ``word_index`` and slices the matching
    column out of the global ``wordvecs``. Raises ``KeyError`` if the token
    is not in the index.
    """
    return wordvecs[:, word_index[word]]

def cos_sim(a, b):
    """Cosine similarity of two vectors, recomputing both norms on each call."""
    numerator = np.dot(a, b)
    len_a = np.sqrt(np.dot(a, a))
    len_b = np.sqrt(np.dot(b, b))
    return numerator / (len_a * len_b)

def find_closest(wv, n):
    """Rank the whole vocabulary by cosine similarity to ``wv``.

    Parameters
    ----------
    wv : array-like
        Query vector with the same dimensionality as the columns of the
        global ``wordvecs``.
    n : int
        Number of neighbours to return.

    Returns
    -------
    list of (word, similarity)
        Sorted by descending similarity (ties: higher column index first).
        The single best-scoring entry is skipped, presumably because for a
        vocabulary word it is the query itself; for a synthetic vector this
        drops whatever token happens to rank first.
    """
    sims = [(idx, cos_sim(wv, wordvecs[:, idx])) for idx in range(wordvecs.shape[1])]
    scores = sorted(sims, key=operator.itemgetter(1, 0), reverse=True)
    top = scores[1:n + 1]
    # Use the module-level `words` series: `wordframe` is a local variable of
    # load_wordvecs and is not defined here (NameError on a fresh kernel).
    return [(words[i], score) for i, score in top]

def find_synonyms(word, n):
    """Return the ``n`` nearest neighbours of ``word`` via ``find_closest``."""
    return find_closest(word2vec(word), n)

class Toplist:
    """Bounded collection that retains the ``size`` items with the largest keys.

    Parameters
    ----------
    size : int
        Maximum number of items kept. A value of zero or less keeps nothing.
    keyfn : callable, optional
        Maps an item to the value it is ranked by. Defaults to the identity.

    Attributes
    ----------
    items : list
        The retained items, in insertion order (not sorted).
    min, max :
        The retained items with the smallest / largest key, or ``None`` while
        the collection is empty.
    """

    def __init__(self, size, keyfn=None):
        self.min = None
        self.max = None
        self.size = size
        self.keyfn = keyfn or (lambda x: x)
        self.items = []

    def _calc_min_max(self):
        """Recompute ``min`` and ``max`` from the current items."""
        if not self.items:
            self.min = None
            self.max = None
            return
        # min()/max() return the first extreme element on ties, matching a
        # strict-comparison linear scan.
        self.min = min(self.items, key=self.keyfn)
        self.max = max(self.items, key=self.keyfn)

    def push(self, item):
        """Offer ``item``; it is kept only if it ranks among the top ``size``.

        Once the collection is full, an item whose key is less than or equal
        to the current minimum key is discarded; otherwise it replaces the
        current minimum.
        """
        if self.size <= 0:
            # Nothing can be retained; avoid comparing against a missing min.
            return
        if len(self.items) < self.size:
            self.items.append(item)
            self._calc_min_max()
            return
        if self.keyfn(item) <= self.keyfn(self.min):
            return
        self.items.remove(self.min)
        self.items.append(item)
        self._calc_min_max()

def find_word(word):
    """Bundle a token's column index, vector and precomputed norm.

    Returns a ``(index, vector, norm)`` tuple read from the global
    ``word_index``, ``wordvecs`` and ``word_norms``.
    """
    column = word_index[word]
    return (column, wordvecs[:, column], word_norms[column])

def cos_sim2(a, b):
    """Cosine similarity of two ``(index, vector, norm)`` tuples.

    Uses the norm stored at position 2 of each tuple instead of recomputing it.
    """
    vec_a, norm_a = a[1], a[2]
    vec_b, norm_b = b[1], b[2]
    return np.dot(vec_a, vec_b) / (norm_a * norm_b)

def find_synonyms2(word, n):
    """Return the ``n`` nearest neighbours of ``word`` using cached norms.

    Scans every column of the global ``wordvecs``, keeping the ``n + 1`` best
    ``(column, similarity)`` pairs in a ``Toplist``. The best-scoring pair is
    then dropped and the rest are returned as ``(word, similarity)`` in
    descending order of similarity.
    """
    query = find_word(word)
    best = Toplist(n + 1, lambda pair: pair[1])
    vocab_size = wordvecs.shape[1]
    for col in range(vocab_size):
        score = cos_sim2(query, (col, wordvecs[:, col], word_norms[col]))
        best.push((col, score))
    ranked = sorted(best.items, key=lambda pair: -pair[1])
    return [(words[col], score) for (col, score) in ranked[1:]]

In [30]:
import time

class benchmark:
    """Context manager that prints the average wall-clock time per repetition.

    Parameters
    ----------
    name : str
        Label printed in front of the timing.
    reps : int, optional
        Number of repetitions performed inside the ``with`` body; the total
        elapsed time is divided by this value. Defaults to 1.

    After the block exits, ``elapsed`` holds the per-repetition time in
    seconds.
    """

    def __init__(self, name, reps=1):
        self.name = name
        self.reps = reps

    def __enter__(self):
        self.start = time.perf_counter()
        # Return the instance so `with benchmark(...) as b:` binds something
        # useful instead of None.
        return self

    def __exit__(self, *args):
        self.elapsed = (time.perf_counter() - self.start) / self.reps
        print("{}: {:0.2f}s elapsed".format(self.name, self.elapsed))

# Query words for the timing cells below, which slice off the first two.
test_words = ['simple', 'sad', 'happy', 'frog', 'window', 'celerity', 'table', 'acquire', 'quiet', 'spark']

In [31]:
# Time the exhaustive-sort lookup on the first two test words; the total is
# averaged over the number of words queried.
with benchmark("slow way", reps=len(test_words[:2])):
    for w in test_words[:2]:
        find_synonyms(w, n=15)

slow way: 23.57s elapsed


In [50]:
# Time the Toplist / cached-norm lookup on the same two words for comparison.
with benchmark("fast way", reps=len(test_words[:2])):
    for w in test_words[:2]:
        find_synonyms2(w, n=15)

fast way: 7.76s elapsed


In [11]:
# Ten nearest neighbours of 'simple' using the exhaustive-sort implementation.
find_synonyms('simple', 10)

[('straightforward', 0.7598401789000422),
 ('easy', 0.7527235589305404),
 ('simplest', 0.698272448362614),
 ('basic', 0.684735295654891),
 ('simpler', 0.655042516253308),
 ('quick', 0.6546937970492188),
 ('simply', 0.6456051957769308),
 ('Simple', 0.6379334979408948),
 ('make', 0.6294584219950385),
 ('neat', 0.6270022632851301)]

In [62]:
# Fifteen nearest neighbours of 'wonderful' using the Toplist implementation.
find_synonyms2('wonderful', 15)

[('fantastic', 0.859159579181614),
 ('lovely', 0.8388477050475734),
 ('fabulous', 0.8359376678974119),
 ('amazing', 0.8156014577622632),
 ('great', 0.8059450072940093),
 ('marvelous', 0.8023499439044454),
 ('beautiful', 0.789621965745745),
 ('delightful', 0.7839891580590513),
 ('terrific', 0.7751862496614813),
 ('gorgeous', 0.7233308378389622),
 ('wonderfully', 0.7219120388363572),
 ('awesome', 0.7182067151596502),
 ('incredible', 0.7168842503985587),
 ('nice', 0.7063791247499048),
 ('inspiring', 0.7040061156673132)]

In [7]:
def solve_analogy(a, b, c, n):
    """Answer "``a`` is to ``b`` as ``c`` is to ?" by vector arithmetic.

    Builds the query vector ``vec(b) - vec(a) + vec(c)`` and returns the
    ``n`` results that ``find_closest`` reports for it.
    """
    target = word2vec(b) - word2vec(a) + word2vec(c)
    return find_closest(target, n)

In [8]:
# man : king :: woman : ?
solve_analogy('man', 'king', 'woman', 5)

[('queen', 0.788084429184129),
 ('prince', 0.6401077949675448),
 ('kings', 0.6208543934897165),
 ('princess', 0.6125636524073937),
 ('royal', 0.5800970791610942)]

In [9]:
# tree : leaf :: flower : ?
solve_analogy('tree', 'leaf', 'flower', 5)

[('leaf', 0.7391099662079963),
 ('flowers', 0.681038924905471),
 ('petals', 0.6773535497107525),
 ('petal', 0.6751669154136665),
 ('floral', 0.6213623994091386)]

In [10]:
# dog : puppy :: cat : ?
solve_analogy('dog', 'puppy', 'cat', 5)

[('cat', 0.8533778446069687),
 ('puppy', 0.8248203189011135),
 ('kittens', 0.7661139728453403),
 ('pup', 0.7530981677990902),
 ('kitty', 0.7515497639056457)]