In [1]:
import csv
import pandas as pd
import numpy as np
import operator

In [2]:
def load_wordvecs(path):
    wordframe = pd.read_table(path, sep=" ", header=None, quoting=csv.QUOTE_NONE)
    words = wordframe[0]
    wordvecs = wordframe.drop(columns=0).values.T

    word_index = {}
    for idx, w in enumerate(wordframe[0]):
        word_index[w] = idx
        
    word_norms = np.zeros((wordvecs.shape[1],))
    for i in range(wordvecs.shape[1]):
        word_norms[i] = np.linalg.norm(wordvecs[:,i])
        
    return words, wordvecs, word_index, word_norms

In [3]:
words, wordvecs, word_index, word_norms = load_wordvecs("glove.840B.300d.txt")

In [4]:
print("vocab size is", wordvecs.shape[1])

vocab size is 2196017


In [14]:
class Toplist:
    def __init__(self, size, keyfn):
        self.min = None
        self.max = None
        self.size = size
        self.keyfn = keyfn or (lambda x: x)
        self.items = []
        
    def _calc_min_max(self):
        min_ = None
        max_ = None
        for i in self.items:
            if min_ is None or self.keyfn(min_) > self.keyfn(i):
                min_ = i
            if max_ is None or self.keyfn(max_) < self.keyfn(i):
                max_ = i
        self.min = min_
        self.max = max_
        
    def push(self, item):
        if len(self.items) < self.size:
            self.items.append(item)
            self._calc_min_max()
        else:
            key = self.keyfn(item)
            if key <= self.keyfn(self.min):
                return
            self.items.remove(self.min)
            self.items.append(item)
            self._calc_min_max()

def find_word(word):
    wi = word_index[word]
    wv = wordvecs[:,wi]
    wn = word_norms[wi]
    return (wi, wv, wn)

def make_target(wv):
    return (None, wv, np.linalg.norm(wv))

def cos_sim(a, b):
    return np.dot(a[1], b[1]) / (a[2] * b[2])

def find_closest(target, n):
    topwords = Toplist(n + 1, lambda x: x[1])
    for i in range(wordvecs.shape[1]):
        cand = (i, wordvecs[:,i], word_norms[i])
        sim = cos_sim(target, cand)
        topwords.push((i, sim))
    return [(words[i], s) for (i, s) in sorted(topwords.items, key=lambda x: -x[1])[1:]]        

def find_synonyms(word, n):
    target = find_word(word)
    return find_closest(target, n)

def solve_analogy(a, b, c, n):
    at = find_word(a)
    bt = find_word(b)
    ct = find_word(c)
    guess = bt[1] - at[1] + ct[1]
    return find_closest(make_target(guess), n)

In [9]:
find_synonyms('simple', 15)

[('straightforward', 0.7598401789000424),
 ('easy', 0.7527235589305407),
 ('simplest', 0.698272448362614),
 ('basic', 0.684735295654891),
 ('simpler', 0.6550425162533078),
 ('quick', 0.6546937970492189),
 ('simply', 0.6456051957769307),
 ('Simple', 0.6379334979408947),
 ('make', 0.6294584219950387),
 ('neat', 0.62700226328513),
 ('way', 0.625465131086621),
 ('kind', 0.6221105383923256),
 ('complicated', 0.6185878422461943),
 ('easier', 0.6119539330710158),
 ('simplistic', 0.6081463102432625)]

In [7]:
find_synonyms('wonderful', 15)

[('fantastic', 0.8591595791816141),
 ('lovely', 0.8388477050475734),
 ('fabulous', 0.8359376678974116),
 ('amazing', 0.8156014577622631),
 ('great', 0.8059450072940098),
 ('marvelous', 0.8023499439044458),
 ('beautiful', 0.7896219657457448),
 ('delightful', 0.7839891580590513),
 ('terrific', 0.775186249661481),
 ('gorgeous', 0.7233308378389621),
 ('wonderfully', 0.7219120388363572),
 ('awesome', 0.7182067151596504),
 ('incredible', 0.7168842503985587),
 ('nice', 0.7063791247499048),
 ('inspiring', 0.7040061156673134)]

In [13]:
solve_analogy('man', 'king', 'woman', 5)

[('queen', 0.7880844291841294),
 ('prince', 0.6401077949675448),
 ('kings', 0.6208543934897167),
 ('princess', 0.6125636524073936),
 ('royal', 0.5800970791610943)]

In [15]:
solve_analogy('tree', 'leaf', 'flower', 5)

[('leaf', 0.7391099662079962),
 ('flowers', 0.6810389249054707),
 ('petals', 0.677353549710753),
 ('petal', 0.6751669154136666),
 ('floral', 0.6213623994091388)]

In [16]:
solve_analogy('dog', 'puppy', 'cat', 5)

[('cat', 0.8533778446069686),
 ('puppy', 0.8248203189011132),
 ('kittens', 0.7661139728453404),
 ('pup', 0.7530981677990902),
 ('kitty', 0.7515497639056455)]

In [17]:
solve_analogy('white', 'black', 'up', 5)

[('out', 0.6714371583257094),
 ('down', 0.6469337262680082),
 ('get', 0.6134338430239189),
 ('going', 0.5983556643455921),
 ('put', 0.5968824404289338)]

In [19]:
solve_analogy('bees', 'hive', 'bears', 10)

[('bear', 0.5394098004124965),
 ('itself', 0.4416594674163141),
 ('lion', 0.4076591831253801),
 ('wolf', 0.40610565815910354),
 ('grizzly', 0.3957266725509989),
 ('teddy', 0.3858165815967754),
 ('beast', 0.37727804992840114),
 ('lair', 0.37326781836920003),
 ('lions', 0.3720534335594756),
 ('cubs', 0.371431507794096)]