In [1]:
from nlpia.loaders import get_data
# Fetch the 'word2vec' dataset through nlpia's loader and keep it as `wv`.
wv = get_data('word2vec')  # <1>
# 100%|############################| 402111/402111 [01:02<00:00, 6455.57it/s]
# Vocabulary size and the length of one word's vector. This tuple is evaluated
# but not echoed, because only a cell's final expression is displayed.
len(wv.vocab), len(wv[next(iter(wv.vocab))])
# (3000000, 300)
# Shape of the full vector matrix; this is the value the cell displays.
wv.vectors.shape
# (3000000, 300)

(3000000, 300)

In [2]:
from annoy import AnnoyIndex
# One row per word, one column per vector dimension.
num_words, num_dimensions = wv.vectors.shape  # <1>
# Name the metric explicitly: newer Annoy releases emit a FutureWarning when it
# is omitted. 'angular' is the library default, so the index built is unchanged.
index = AnnoyIndex(num_dimensions, metric='angular')
# Fixed seed so the randomized tree construction is repeatable between runs.
index.set_seed(1983)

In [3]:
from tqdm import tqdm
# Register every word vector with Annoy, using the word's position in
# wv.index2word as its integer item id; tqdm draws the progress bar.
for i, word in enumerate(tqdm(wv.index2word)):
    vector = wv[word]
    index.add_item(i, vector)

100%|██████████| 3000000/3000000 [01:38<00:00, 30410.31it/s]


In [4]:
import numpy as np
# Tree count heuristic: natural log of the vocabulary size, rounded to the
# nearest integer.
log_num_words = np.log(num_words)
num_trees = int(np.round(log_num_words))  # <1>
index.build(num_trees)  # <2>
# save() is the cell's final expression, so its return value is what is echoed.
index.save('Word2vec_index.ann')  # <3>

True

In [5]:
# Display the tree count computed from log(num_words) in the previous cell.
num_trees

15

In [6]:
# Integer position stored on this token's vocabulary entry.
wv.vocab['Harry_Potter'].index  # <1>

9494

In [7]:
# NOTE(review): the value shown (2990506) is exactly 3000000 minus the index
# 9494 displayed above, so it looks like a rank-derived placeholder rather than
# a true corpus frequency -- confirm against the loader that built wv.vocab.
wv.vocab['Harry_Potter'].count  # <2>

2990506

In [8]:
# Map each word to its Annoy item id. The ids were handed out by enumerating
# wv.index2word when the index was filled, so build the lookup from that same
# sequence instead of trusting the iteration order of the wv.vocab dict to match.
w2id = {word: i for i, word in enumerate(wv.index2word)}  # <3>

In [9]:
# Sanity check: should agree with wv.vocab['Harry_Potter'].index shown earlier.
w2id['Harry_Potter']

9494

In [10]:
# Ask for 11 items: the query word itself comes back as the first hit (see the
# first id in the output), which leaves 10 neighbours after it.
query_id = w2id['Harry_Potter']
ids = index.get_nns_by_item(query_id, 11)  # <4>
ids

[9494,
 32643,
 407349,
 39034,
 14728,
 1752224,
 51081,
 43101,
 22364,
 113955,
 155169]

In [11]:
# Translate the Annoy item ids back into their vocabulary words.
[wv.index2word[item_id] for item_id in ids]

['Harry_Potter',
 'Narnia',
 'Harry_Potters',
 'Sherlock_Holmes',
 'Star_Wars',
 'Hallows_Part',
 'Sith',
 'Toy_Story_3',
 'Shrek',
 'Sorcerer_Apprentice',
 'LOTR']

In [12]:
# Reference answer from wv's own most_similar() search over the same vectors;
# keep only the words and drop the similarity scores.
[neighbor for neighbor, _score in wv.most_similar('Harry_Potter', topn=10)]

  if np.issubdtype(vec.dtype, np.int):


['JK_Rowling_Harry_Potter',
 'JK_Rowling',
 'boy_wizard',
 'Deathly_Hallows',
 'Half_Blood_Prince',
 'Rowling',
 'Actor_Rupert_Grint',
 'HARRY_Potter',
 'wizard_Harry_Potter',
 'HARRY_POTTER']

In [13]:
# Second index over the same vector size, with the angular metric spelled out.
# NOTE(review): no set_seed() call here, unlike the first index -- confirm
# whether that is intentional.
index_cos = AnnoyIndex(num_dimensions, metric='angular')

In [14]:
# Same vectors and item ids as the first index. Print a checkpoint line every
# 100,000 words so progress is visible without a progress bar.
for i, word in enumerate(wv.index2word):
    if i % 100000 == 0:
        print('{}: {}'.format(i, word))
    index_cos.add_item(i, wv[word])

0: </s>
100000: distinctiveness
200000: barbiturate
300000: Sony_PS3
400000: Infiniti_FX
500000: Attorney_Bud_Cummins
600000: Giske
700000: f_***_er
800000: Shaw_Stockbroking_Ltd.
900000: HKSTP
1000000: Starwood_Hotels_HOT
1100000: McGrath_RentCorp_NASDAQ_MGRC
1200000: Piveteau
1300000: Rob_Pavey
1400000: Giant_Octopus
1500000: eur_UPM_Kymmene
1600000: CSSL
1700000: Lubina
1800000: Ndian
1900000: Cape_Solander
2000000: Iordanis
2100000: Allegiance_recitation
2200000: brandy_soaked
2300000: Coach_Kurt_Budke
2400000: backcountry_hikers
2500000: Brawn_BMW_Sauber
2600000: cedar_juniper
2700000: Wendy_Liberatore
2800000: Management_GDCM
2900000: BOARDED_UP


In [15]:
# Build with 30 trees (the first index used num_trees, displayed as 15 above)
# and persist the result to its own file; save()'s return value is echoed.
index_cos.build(30)
index_cos.save('Word2vec_cos_index.ann')

True

In [16]:
# NOTE(review): only 10 items are requested here versus 11 for the first index,
# and the first hit is the query word itself (9494 in the output), so this
# returns one fewer neighbour than the earlier query.
query_id_cos = w2id['Harry_Potter']
ids_cos = index_cos.get_nns_by_item(query_id_cos, 10)
ids_cos

[9494, 71557, 41526, 340510, 337152, 420722, 148450, 852429, 2339857, 2149220]

In [25]:
# Translate the second index's item ids back into their vocabulary words.
[wv.index2word[item_id] for item_id in ids_cos]

['Harry_Potter',
 'boy_wizard',
 'Half_Blood_Prince',
 'wizard_Harry_Potter',
 'Stephenie_Meyer_Twilight',
 'Potter_mania',
 'wizarding',
 'Stephenie_Meyers',
 'Philosophers_Stone',
 'Fenrir_Greyback']

In [26]:
# Pair the words found by the 15-tree index with those found by the 30-tree
# index, row by row. Materialize the pairs as a list: a bare zip() is a one-shot
# iterator, so re-running the DataFrame cell would otherwise see it exhausted,
# and older pandas versions refuse an iterator as DataFrame data.
# zip() stops at the shorter input, so the 11th entry of `ids` is dropped.
annoy_top10 = list(zip(
    [wv.index2word[i] for i in ids],
    [wv.index2word[i] for i in ids_cos],
))

In [27]:
import pandas as pd
# Side-by-side table of the two result lists, one labelled column per index.
pd.DataFrame(annoy_top10, columns=['annoy_15trees', 'annoy_30trees'])

Unnamed: 0,annoy_15trees,annoy_30trees
0,Harry_Potter,Harry_Potter
1,Narnia,boy_wizard
2,Harry_Potters,Half_Blood_Prince
3,Sherlock_Holmes,wizard_Harry_Potter
4,Star_Wars,Stephenie_Meyer_Twilight
5,Hallows_Part,Potter_mania
6,Sith,wizarding
7,Toy_Story_3,Stephenie_Meyers
8,Shrek,Philosophers_Stone
9,Sorcerer_Apprentice,Fenrir_Greyback


In [29]:
from sklearn.preprocessing import MinMaxScaler
# Scaler created with default settings; it is fitted to data in a later cell.
scaler = MinMaxScaler()

In [38]:
# Five sample values laid out as a single-feature column, shape (5, 1), which
# is the 2-D array the scaler is fitted on.
real_values = np.array([[-1.2], [3.4], [5.6], [-7.8], [9.0]])
scaler.fit(real_values)
real_values

array([[-1.2],
       [ 3.4],
       [ 5.6],
       [-7.8],
       [ 9. ]])

In [37]:
# Express each scaled value as a truncated integer percentage of the fitted
# range. transform() returns a (5, 1) array, so flatten it first: iterating the
# 2-D result yields 1-element arrays, and calling int() on those is deprecated
# since NumPy 1.25. Flattening gives plain scalars and identical numbers.
[int(x * 100.) for x in scaler.transform(real_values).ravel()]

[39, 66, 79, 0, 100]