In [5]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors, Word2Vec
import numpy as np
from sklearn.manifold import TSNE
from collections import defaultdict

In [6]:
vec_file = datapath('/home/markg/workspace/vectors/word2vec-google-news-300')
model = KeyedVectors.load_word2vec_format(vec_file, binary=True)

In [7]:
def sample_spherical(npoints, ndim=3):
    vec = np.random.randn(ndim, npoints)
    vec /= np.linalg.norm(vec, axis=0)
    return vec

In [8]:
test = model.get_vector('test')

In [9]:
plane_vec = sample_spherical(1, ndim=300).flatten()

In [10]:
np.dot(plane_vec, test)

-0.09007873171485564

In [None]:
model.similar_by_vector(plane_vec, topn=100, restrict_vocab=5000)

In [65]:
import pandas as pd

In [79]:
counts = pd.read_csv('../vectors/coca-counts.csv')

In [86]:
nouns = [w.strip().lower() for w in list(counts[counts['pos'] == 'n'].word[:2500])]

In [91]:
d = defaultdict(float)
for w in nouns:
    vec = model.get_vector(w)
    d[w] = np.dot(vec, plane_vec)

In [92]:
for w in sorted(d, key=d.get, reverse=True):
    print(w, d[w])

flavor 1.1573978359465353
talent 0.8922420835199435
consumption 0.8351865624830461
processor 0.8070030442895461
network 0.7859910291018418
poet 0.7713211061612743
hint 0.7663932141293424
satellite 0.746965849448074
warmth 0.7456164674431034
athlete 0.7146090992672754
correspondent 0.6922487657396255
astronomer 0.6836836390194614
receiver 0.6484460372701537
entrepreneur 0.6331993116660497
columnist 0.6194158827260297
beer 0.6114340215531842
programming 0.6061493683774901
personality 0.6048192859091663
station 0.6034303141000732
wine 0.6019621883590366
drinking 0.596094264551862
scent 0.5933224577248742
hardware 0.5926979876560701
vessel 0.5899873841318252
taste 0.5889363937946825
computer 0.5853438237026257
practitioner 0.5780841682458949
storage 0.5773767292159894
depth 0.5704754883685081
radio 0.5564201390471796
recipient 0.5550955096153819
sibling 0.5514194218849119
device 0.5494393844338089
memory 0.5399764678082266
romance 0.5371956845360281
sensation 0.5306446354741479
counsel 0.5

constraint -0.1425878049233299
approach -0.1436808518507385
presence -0.14386679553200976
glance -0.14389132456776077
provider -0.14420558003329106
mention -0.14421019541210978
booth -0.14421999772910785
contractor -0.14462315151414468
ass -0.14512560138265157
hip -0.1451640605642786
tomato -0.14546048529965927
proposal -0.14711512961924395
outfit -0.14723195329306776
security -0.14758582330327233
friendship -0.14841036587029138
rage -0.1490058410733346
existence -0.14927296861334233
dialogue -0.14980227285724265
mountain -0.14996662019589446
sovereignty -0.15014957652267943
container -0.15016138061595918
lane -0.15020493229750267
reform -0.15024888922867555
gender -0.15030353666494783
gas -0.15107642013355838
photograph -0.15123177227880033
effectiveness -0.15125480896066953
job -0.15302803685351632
appreciation -0.153056415368974
strain -0.15335503457123092
speculation -0.15375759443940698
background -0.15412966716325036
liquid -0.15459012940553
surface -0.1545971859157012
minimum -0

twist -0.504161098545562
length -0.5044108874178977
white -0.5050888611986495
cut -0.5051613873062158
weight -0.5052452035974068
contact -0.5055011574592757
ruling -0.5074130619515921
chain -0.5074228422362007
damage -0.5077495747291382
temple -0.5086711196446597
design -0.5097271201171467
revolution -0.512362724632472
republican -0.5139789128983615
drive -0.5144484489200845
piece -0.5146342417044194
move -0.5146393911858443
effect -0.5147657767116587
hope -0.5153671783340232
particle -0.5158220776740364
hat -0.5158896928289703
chaos -0.5170204020644605
murder -0.5181117685563601
action -0.5183463206990424
license -0.518972453333439
sock -0.5190531939991975
face -0.519264977803944
worth -0.5198017302559506
side -0.5199020164159756
jew -0.52002672324479
charge -0.5211062157891649
mess -0.5214468512741834
hockey -0.5223699752619159
collapse -0.5225572535712321
manufacturing -0.522895852542394
soil -0.5236159782145331
abortion -0.5238908617876792
clothes -0.5253023977969337
inspector -0.5

In [None]:
X = model[model.vocab]

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.show()