In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Dataset is glove 6B 50T
# udemy nlp with python

In [17]:
# euclidean distance
def dist1(a,b):
    return np.linalg.norm(a-b)

In [18]:
# cosine distance
def dist2(a,b):
    return 1-a.dot(b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [19]:
a = pd.Series([1,1])
b = pd.Series([1,2])
print(a)
print(b)
print(f'euclidian distance {dist1(a,b)}')
print(f'cosine distance {dist2(a,b)}')

0    1
1    1
dtype: int64
0    1
1    2
dtype: int64
euclidian distance 1.0
cosine distance 0.05131670194948623


In [24]:
# the cosine distance does not change when doubeling
# cosine distance only takes into account angel (the smaller the higher the similarity. )
# euclidian looks at absolute length
a_double = a*2
b_double = b*2
print(a_double)
print(b_double)
print(f'euclidian distance {dist1(a_double,b_double)}')
print(f'cosine distance {dist2(a_double,b_double)}')

0    2
1    2
dtype: int64
0    2
1    4
dtype: int64
euclidian distance 2.0
cosine distance 0.05131670194948623


In [25]:
dist, metric = dist2, 'cosine'
#alternative: dist, metric = dist1, 'euclidean'

In [26]:
# load pretrained word vectors
def load_word2vec(path = '../large_files/glove.6B/glove.6B.50d.txt'):
    print(f'loading word embeddings word to vec from path {path}')
    
    word2vec = {}
    embedding = []
    index_to_word = []
    
    with open(path) as file:
        num = 0
        for line in file:
            values = line.split()
            
            word = values[0]
            vec = np.asarray(values[1:], dtype='float32')
            
            word2vec[word] = vec
            
            embedding.append(vec)
            index_to_word.append(word)
            
            num +=1
            if num % 200000 == 0:
                print(f'line number {num}')
                print(line)
    
    embedding = pd.DataFrame(embedding, index=word2vec.keys())
    num_words, num_dims = embedding.shape
    
    print(f'total number of entries found:  {num_words}. Dimension: {num_dims}')
    return(word2vec, embedding, index_to_word)

In [86]:
# scribble
embedding_test = embedding.iloc[:8, :4]
print(embedding_test)

test_array = [0.1, 0.2, 0.3, 0.1]

distances = embedding_test.apply(lambda x: dist1(x, test_array), axis=1)
print(type(distances))

print(distances)
max_value = np.max(distances)
print(np.where(distances == max_value))

print(np.argmax(distances))
print(type(np.argmax(distances)))
print(np.argsort(distances)[:2])

            0         1        2         3
the  0.418000  0.249680 -0.41242  0.121700
,    0.013441  0.236820 -0.16899  0.409510
.    0.151640  0.301770 -0.16763  0.176840
of   0.708530  0.570880 -0.47160  0.180480
to   0.680470 -0.039263  0.30186 -0.177920
and  0.268180  0.143460 -0.27877  0.016257
in   0.330420  0.249950 -0.60874  0.109230
a    0.217050  0.465150 -0.46757  0.100820
<class 'pandas.core.series.Series'>
the    0.782052
,      0.569733
.      0.487448
of     1.053425
to     0.686611
and    0.611121
in     0.938873
a      0.820469
dtype: float64
(array([3]),)
of
<class 'str'>
the    2
,      1
dtype: int64


In [108]:
# king - man = X - woman
# <=> X = woman - man + king
# I wanted to solve this on my own, but it does not converge
def find_analogies_ma(word1, word2, word3, embedding):
    
    for word in [word1, word2, word3]:
        if word.lower() not in embedding.index:
            print(f'sorry, I do not have an embedding for {word}')
            return ''
        
    king = embedding.loc[word1.lower()]
    man = embedding.loc[word2.lower()]
    woman = embedding.loc[word3.lower()]
    searched_vector = woman - man + king
    
    print('computing distances')   
    
    distances = embedding.apply(lambda x: dist1(x, searched_vector), axis=1)
    print(f'nearest words: {all_distances.sort_values()[:3]}')
    
    return all_distances.sort_values()[:1]

In [107]:
find_analogies_ma('man', 'woman', 'king', embedding)

computing distances
nearest words: king     2.602624
ruler    4.369554
ii       4.400222
dtype: float64


king    2.602624
dtype: float64

In [109]:
def find_analogies(w1, w2, w3, D, index_to_word):
  for w in (w1, w2, w3):
    if w not in word2vec:
      print("%s not in dictionary" % w)
      return

  king = word2vec[w1]
  man = word2vec[w2]
  woman = word2vec[w3]
  v0 = king - man + woman

  distances = pairwise_distances(v0.reshape(1, D), embedding, metric=metric).reshape(V)
  idxs = distances.argsort()[:4]
  for idx in idxs:
    word = index_to_word[idx]
    if word not in (w1, w2, w3): 
      best_word = word
      break

  print(w1, "-", w2, "=", best_word, "-", w3)

In [110]:
def nearest_neighbors(w,index_to_word, word2vec, D,  n=5 ):
  if w not in word2vec:
    print("%s not in dictionary:" % w)
    return

  v = word2vec[w]
  distances = pairwise_distances(v.reshape(1, D), embedding, metric=metric).reshape(V)
  idxs = distances.argsort()[1:n+1]
  print("neighbors of: %s" % w)
  for idx in idxs:
    print("\t%s" % index_to_word[idx])

# 'Main':

In [111]:
glove_path = '../large_files/glove.6B/glove.6B.50d.txt'

#takes about 10 sec
word2vec, embedding, index_to_word = load_word2vec(glove_path)

loading word embeddings word to vec from path ../large_files/glove.6B/glove.6B.50d.txt
line number 200000
soroca 0.049805 -0.92157 -0.78814 0.40001 0.73747 0.23841 1.0884 0.74811 0.35275 -0.092519 0.068657 -1.5069 0.71963 -0.62382 0.42556 -0.12594 0.12767 0.48246 0.747 0.95415 -0.61376 -0.41512 -0.04666 0.91667 -0.70671 0.71371 -0.09069 0.63914 -0.22234 -0.51031 -1.009 -0.93208 0.53891 -0.32442 0.060975 0.12915 -0.17024 0.29168 0.14596 0.18409 -0.16361 0.21103 -0.092295 -0.49671 -1.3887 0.29717 0.070581 -0.19783 -0.62638 0.25806

line number 400000
sandberger 0.072617 -0.51393 0.4728 -0.52202 -0.35534 0.34629 0.23211 0.23096 0.26694 0.41028 0.28031 0.14107 -0.30212 -0.21095 -0.10875 -0.33659 -0.46313 -0.40999 0.32764 0.47401 -0.43449 0.19959 -0.55808 -0.34077 0.078477 0.62823 0.17161 -0.34454 -0.2066 0.1323 -1.8076 -0.38851 0.37654 -0.50422 -0.012446 0.046182 0.70028 -0.010573 -0.83629 -0.24698 0.6888 -0.17986 -0.066569 -0.48044 -0.55946 -0.27594 0.056072 -0.18907 -0.59021 0.55559

tot

In [29]:
V, D = embedding.shape
print(embedding.shape)
embedding.head(1)

(400000, 50)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
the,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,...,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581


In [122]:
word1 = 'motivation'
word2 = 'man'
word3 = 'woman'
#find_analogies_Lena(embedding, word1, word2, word3) # this is slow AF
find_analogies(word1, word2, word3, D, index_to_word)
find_analogies('success', 'man', 'woman',  D, index_to_word)
find_analogies('fun', 'man', 'woman',  D, index_to_word)
find_analogies('love', 'woman', 'man',  D, index_to_word)
find_analogies('love', 'man', 'woman',  D, index_to_word)

find_analogies('king', 'man', 'woman',  D, index_to_word)
find_analogies('money', 'man', 'woman',  D, index_to_word)
find_analogies('sex', 'man', 'woman',  D, index_to_word)
find_analogies('france', 'paris', 'london',  D, index_to_word)
find_analogies('france', 'paris', 'rome',  D, index_to_word)
find_analogies('paris', 'france', 'italy',  D, index_to_word)
find_analogies('december', 'november', 'june',  D, index_to_word)
find_analogies('miami', 'florida', 'texas',  D, index_to_word)
find_analogies('einstein', 'scientist', 'painter',  D, index_to_word)
find_analogies('rice', 'china', 'germany',  D, index_to_word)
find_analogies('beer', 'germany', 'japan',  D, index_to_word)

find_analogies('tree', 'forest', 'flock',  D, index_to_word)

find_analogies('man', 'woman', 'she',  D, index_to_word)
find_analogies('man', 'woman', 'aunt',  D, index_to_word)
find_analogies('man', 'woman', 'sister',  D, index_to_word)
find_analogies('man', 'woman', 'wife',  D, index_to_word)
find_analogies('man', 'woman', 'friend',  D, index_to_word)
find_analogies('man', 'woman', 'actress',  D, index_to_word)
find_analogies('man', 'woman', 'mother',  D, index_to_word)
find_analogies('heir', 'heiress', 'princess',  D, index_to_word)

motivation - man = satisfaction - woman
success - man = recognition - woman
fun - man = kids - woman
love - woman = me - man
love - man = mother - woman
king - man = queen - woman
money - man = paying - woman
sex - man = sexual - woman
france - paris = britain - london
france - paris = italy - rome
paris - france = rome - italy
december - november = july - june
miami - florida = houston - texas
einstein - scientist = matisse - painter
rice - china = bacon - germany
beer - germany = drink - japan
tree - forest = devotees - flock
man - woman = he - she
man - woman = uncle - aunt
man - woman = brother - sister
man - woman = friend - wife
man - woman = brother - friend
man - woman = actor - actress
man - woman = father - mother
heir - heiress = queen - princess


In [44]:
nearest_neighbors('king',index_to_word, word2vec, D)
nearest_neighbors('france',index_to_word, word2vec, D)
nearest_neighbors('japan',index_to_word, word2vec, D)
nearest_neighbors('einstein',index_to_word, word2vec, D)
nearest_neighbors('woman',index_to_word, word2vec, D)
nearest_neighbors('man',index_to_word, word2vec, D)
nearest_neighbors('nephew',index_to_word, word2vec, D)
nearest_neighbors('february',index_to_word, word2vec, D)
nearest_neighbors('success',index_to_word, word2vec, D)
nearest_neighbors('money',index_to_word, word2vec, D)
nearest_neighbors('love',index_to_word, word2vec, D)

neighbors of: king
	prince
	queen
	ii
	emperor
	son
neighbors of: france
	french
	belgium
	paris
	spain
	netherlands
neighbors of: japan
	japanese
	china
	korea
	tokyo
	taiwan
neighbors of: einstein
	relativity
	bohr
	physics
	heisenberg
	freud
neighbors of: woman
	girl
	man
	mother
	her
	boy
neighbors of: man
	woman
	boy
	another
	old
	one
neighbors of: nephew
	cousin
	brother
	grandson
	son
	uncle
neighbors of: february
	october
	december
	january
	august
	september
neighbors of: success
	achieved
	successful
	ever
	thanks
	best
neighbors of: money
	cash
	paying
	funds
	pay
	raise
neighbors of: love
	dream
	life
	dreams
	loves
	me
