In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Dataset: GloVe 6B, 50-dimensional vectors (glove.6B.50d.txt)
# Based on the Udemy course "NLP with Python"

In [17]:
# euclidean distance
def dist1(a,b):
    """Return the Euclidean distance between ``a`` and ``b``.

    The distance is the norm (as computed by ``np.linalg.norm``) of the
    element-wise difference ``a - b``.
    """
    difference = a - b
    return np.linalg.norm(difference)

In [18]:
# cosine distance
def dist2(a,b):
    """Return the cosine distance between ``a`` and ``b``.

    Computed as one minus the dot product divided by the product of the two
    vector norms. If either vector has norm zero the denominator is zero and
    the result is not a meaningful distance.
    """
    norm_product = np.linalg.norm(a)*np.linalg.norm(b)
    cosine_similarity = a.dot(b)/norm_product
    return 1-cosine_similarity

In [19]:
# Two small 2-D vectors used to compare the two distance functions.
a = pd.Series([1,1])
b = pd.Series([1,2])
print(a, b, sep='\n')
print(
    f'euclidian distance {dist1(a,b)}',
    f'cosine distance {dist2(a,b)}',
    sep='\n',
)

0    1
1    1
dtype: int64
0    1
1    2
dtype: int64
euclidian distance 1.0
cosine distance 0.05131670194948623


In [24]:
# Doubling both vectors leaves the cosine distance unchanged: it only depends on
# the angle between the vectors (the smaller the angle, the higher the similarity).
# The Euclidean distance depends on the absolute lengths, so it changes.
a_double, b_double = a*2, b*2
print(a_double, b_double, sep='\n')
print(
    f'euclidian distance {dist1(a_double,b_double)}',
    f'cosine distance {dist2(a_double,b_double)}',
    sep='\n',
)

0    2
1    2
dtype: int64
0    2
1    4
dtype: int64
euclidian distance 2.0
cosine distance 0.05131670194948623


In [25]:
# Distance function and the metric name handed to pairwise_distances.
# To switch to the Euclidean variant use: dist, metric = dist1, 'euclidean'
dist = dist2
metric = 'cosine'

In [26]:
# load pretrained word vectors
def load_word2vec(path = '../large_files/glove.6B/glove.6B.50d.txt'):
    """Load pretrained word vectors from a whitespace-separated text file.

    Every non-blank line is expected to hold a word followed by the
    components of its vector.

    Parameters
    ----------
    path : str
        Location of the vector file.

    Returns
    -------
    tuple
        ``(word2vec, embedding, index_to_word)`` where

        * ``word2vec`` maps each word to its float32 vector (if a word occurs
          on several lines, the last occurrence wins),
        * ``embedding`` is a DataFrame with one row per parsed line, indexed
          by the word of that line,
        * ``index_to_word`` lists the words in file order, so row ``i`` of
          ``embedding`` belongs to ``index_to_word[i]``.
    """
    print(f'loading word embeddings word to vec from path {path}')

    word2vec = {}
    embedding = []
    index_to_word = []

    # Explicit encoding so the result does not depend on the platform default.
    with open(path, encoding='utf-8') as file:
        num = 0
        for line in file:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue

            word = values[0]
            vec = np.asarray(values[1:], dtype='float32')

            word2vec[word] = vec

            embedding.append(vec)
            index_to_word.append(word)

            num += 1
            if num % 200000 == 0:
                print(f'line number {num}')
                print(line)

    # Index by index_to_word rather than word2vec.keys(): the dict collapses
    # repeated words, which would leave the index shorter than the row list.
    embedding = pd.DataFrame(embedding, index=index_to_word)
    num_words, num_dims = embedding.shape

    print(f'total number of entries found:  {num_words}. Dimension: {num_dims}')
    return word2vec, embedding, index_to_word

In [30]:
# king - man = X - woman
# <=> X = king - man + woman
# Hand-written variant: X is approximated by the word whose embedding row is
# closest (Euclidean distance) to king - man + woman.
def find_analogies_lena(word1, word2, word3, embedding):
    """Solve the analogy ``word1 - word2 = X - word3`` by Euclidean distance.

    Parameters
    ----------
    word1, word2, word3 : str
        Query words; they are lower-cased before the lookup.
    embedding : pandas.DataFrame
        One row per word, indexed by the word. It is only read, never modified.

    Returns
    -------
    str
        The index label of the row closest to ``word1 - word2 + word3`` that
        is not one of the three query words. An empty string is returned when
        a query word has no embedding or when no other word is available.
    """
    words = (word1, word2, word3)
    keys = [word.lower() for word in words]
    for word, key in zip(words, keys):
        if key not in embedding.index:
            print(f'sorry, I do not have an embedding for {word}')
            return ''

    # .loc[[key]] always yields a frame, so a repeated index label still gives
    # a single vector (its first row) instead of a 2-D result.
    king, man, woman = (embedding.loc[[key]].to_numpy()[0] for key in keys)
    searched_vector = king - man + woman

    # A single vectorised norm over all rows replaces the row-by-row apply,
    # and the distances stay in a local array instead of a new column.
    distances = np.linalg.norm(embedding.to_numpy() - searched_vector, axis=1)

    for position in np.argsort(distances):
        candidate = embedding.index[position]
        if candidate not in keys:
            print(word1, "-", word2, "=", candidate, "-", word3)
            return candidate

    return ''

In [31]:
def find_analogies(w1, w2, w3, D, index_to_word):
    """Print the analogy ``w1 - w2 = X - w3``.

    X is the word whose embedding row is closest to ``w1 - w2 + w3`` and that
    is not one of the three query words. The function reads the notebook-level
    names ``word2vec``, ``embedding`` and ``metric``.

    Parameters
    ----------
    w1, w2, w3 : str
        Query words; each must be a key of ``word2vec``.
    D : int
        Length of a word vector, used to shape the query as a single row.
    index_to_word : list
        Word for each row position of ``embedding``.
    """
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in dictionary" % w)
            return

    king = word2vec[w1]
    man = word2vec[w2]
    woman = word2vec[w3]
    v0 = king - man + woman

    # ravel() flattens the (1, number_of_rows) result without needing the
    # global row count V.
    distances = pairwise_distances(v0.reshape(1, D), embedding, metric=metric).ravel()

    # Walk the candidates from nearest to farthest until one is not a query
    # word; a fixed top-4 window can be exhausted by repeated words.
    best_word = None
    for idx in distances.argsort():
        word = index_to_word[idx]
        if word not in (w1, w2, w3):
            best_word = word
            break

    if best_word is None:
        print("no analogy found for %s, %s, %s" % (w1, w2, w3))
        return

    print(w1, "-", w2, "=", best_word, "-", w3)

In [32]:
def nearest_neighbors(w,index_to_word, word2vec, D,  n=5 ):
    """Print the ``n`` words closest to ``w``.

    Distances are computed against the notebook-level ``embedding`` using the
    notebook-level ``metric``.

    Parameters
    ----------
    w : str
        Query word; must be a key of ``word2vec``.
    index_to_word : list
        Word for each row position of ``embedding``.
    word2vec : dict
        Mapping from word to vector.
    D : int
        Length of a word vector, used to shape the query as a single row.
    n : int, optional
        Number of neighbours to print (default 5).
    """
    if w not in word2vec:
        print("%s not in dictionary:" % w)
        return

    v = word2vec[w]
    # ravel() flattens the (1, number_of_rows) result without needing the
    # global row count V.
    distances = pairwise_distances(v.reshape(1, D), embedding, metric=metric).ravel()

    # Skip the query word explicitly instead of assuming it is always ranked first.
    neighbors = []
    for idx in distances.argsort():
        if len(neighbors) >= n:
            break
        candidate = index_to_word[idx]
        if candidate != w:
            neighbors.append(candidate)

    print("neighbors of: %s" % w)
    for neighbor in neighbors:
        print("\t%s" % neighbor)

# Main:

In [None]:
# Relative path of the GloVe vector file.
glove_path = '../large_files/glove.6B/glove.6B.50d.txt'

# Loading takes about 10 seconds.
word2vec, embedding, index_to_word = load_word2vec(path=glove_path)

In [29]:
# V = number of rows (words), D = number of columns (vector dimensions).
V, D = embedding.shape
print((V, D))
embedding.head(n=1)

(400000, 50)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
the,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,...,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581


In [40]:
word1 = 'motivation'
word2 = 'man'
word3 = 'woman'

# Each triple (first, second, third) asks: first - second = ? - third.
# The hand-written find_analogies_lena(word1, word2, word3, embedding) variant
# is not called here.
analogy_queries = [
    (word1, word2, word3),
    ('king', 'man', 'woman'),
    ('france', 'paris', 'london'),
    ('france', 'paris', 'rome'),
    ('paris', 'france', 'italy'),
    ('december', 'november', 'june'),
    ('miami', 'florida', 'texas'),
    ('einstein', 'scientist', 'painter'),
    ('china', 'rice', 'bread'),
    ('man', 'woman', 'she'),
    ('man', 'woman', 'aunt'),
    ('man', 'woman', 'sister'),
    ('man', 'woman', 'wife'),
    ('man', 'woman', 'friend'),
    ('man', 'woman', 'actress'),
    ('man', 'woman', 'mother'),
    ('heir', 'heiress', 'princess'),
    ('february', 'january', 'november'),
]
for first, second, third in analogy_queries:
    find_analogies(first, second, third, D, index_to_word)

motivation - man = satisfaction - woman
king - man = queen - woman
france - paris = britain - london
france - paris = italy - rome
paris - france = rome - italy
december - november = july - june
miami - florida = houston - texas
einstein - scientist = matisse - painter
china - rice = chinese - bread
man - woman = he - she
man - woman = uncle - aunt
man - woman = brother - sister
man - woman = friend - wife
man - woman = brother - friend
man - woman = actor - actress
man - woman = father - mother
heir - heiress = queen - princess
february - january = october - november


In [44]:
# Print the nearest neighbours for a handful of example words.
neighbor_queries = [
    'king',
    'france',
    'japan',
    'einstein',
    'woman',
    'man',
    'nephew',
    'february',
    'success',
    'money',
    'love',
]
for query_word in neighbor_queries:
    nearest_neighbors(query_word, index_to_word, word2vec, D)

neighbors of: king
	prince
	queen
	ii
	emperor
	son
neighbors of: france
	french
	belgium
	paris
	spain
	netherlands
neighbors of: japan
	japanese
	china
	korea
	tokyo
	taiwan
neighbors of: einstein
	relativity
	bohr
	physics
	heisenberg
	freud
neighbors of: woman
	girl
	man
	mother
	her
	boy
neighbors of: man
	woman
	boy
	another
	old
	one
neighbors of: nephew
	cousin
	brother
	grandson
	son
	uncle
neighbors of: february
	october
	december
	january
	august
	september
neighbors of: success
	achieved
	successful
	ever
	thanks
	best
neighbors of: money
	cash
	paying
	funds
	pay
	raise
neighbors of: love
	dream
	life
	dreams
	loves
	me
