In [1]:
import io, sys
import numpy as np
from heapq import *

In [2]:
def load_vectors(filename):
    """Load word vectors from a whitespace-separated text file.

    The first line is a header that must contain exactly two integers
    (presumably the vocabulary size and the vector dimension; they are
    parsed but not otherwise used). Every following line holds a token
    followed by its space-separated float components.

    Args:
        filename: Path of the UTF-8 encoded vector file.

    Returns:
        dict mapping each token (str) to a 1-D numpy array of floats.

    Raises:
        ValueError: if the header is not exactly two integers, or a vector
            component cannot be parsed as a float.
    """
    data = {}
    # The context manager guarantees the file is closed, even when parsing
    # fails part-way through a large file.
    with io.open(filename, 'r', encoding='utf-8', newline='\n') as fin:
        # The header is parsed only to validate it and to skip past it.
        _n, _d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.asarray(list(map(float, tokens[1:])))
    return data

In [3]:
# Loading word vectors
# Prints a small banner, reads the vectors from 'wiki.en.vec' into the
# `word_vectors` dict that the later cells use, then prints the vector
# stored under the key "cat".

print('')
print(' ** Word vectors ** ')
print('')

# The path is relative, so the file is looked up in the current working
# directory.
word_vectors = load_vectors('wiki.en.vec')
print(word_vectors["cat"])


 ** Word vectors ** 

[-0.13819    0.14029   -0.32621    0.11624   -0.19806    0.45526
  0.21282   -0.51256    0.033657   0.15429    0.15162   -0.0029573
  0.19644   -0.17596    0.28147   -0.091412   0.07636   -0.43859
  0.19801    0.28139    0.0098646  0.51562   -0.41693    0.10776
  0.35227    0.024383  -0.074379   0.26591    0.33723    0.47339
  0.26984    0.23394    0.11666   -0.22181    0.18746   -0.10135
 -0.064922  -0.042677   0.063772  -0.027752  -0.11039   -0.26154
 -0.22353   -0.036962   0.12765    0.51871   -0.081972  -0.39103
  0.16349   -0.29408    0.092915  -0.059598  -0.092276  -0.34925
  0.31541    0.37776    0.0094893 -0.42358    0.075348   0.19263
  0.20816   -0.47312    0.093752   0.21432   -0.061307  -0.3775
  0.12458   -0.028288  -0.12738    0.047164  -0.051377  -0.34661
  0.24864   -0.41215   -0.39386    0.026905  -0.16849    0.34931
 -0.37351    0.11903   -0.068579   0.012468   0.1888     0.3691
  0.35854    0.11405   -0.16632    0.047209   0.2411     0.063058
 

In [4]:
## This function computes the cosine similarity between vectors u and v

def cosine(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Args:
        u: numpy array (must provide ``.dot``).
        v: array-like with the same length as ``u``.

    Returns:
        ``u . v / (||u|| * ||v||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned in that case instead of
        NaN, so that the result can always be ordered against other scores.
    """
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    # Guard against 0/0: a NaN score would compare False with everything and
    # silently break max-searches and heap ordering in the callers.
    if denom == 0:
        return 0.0
    return u.dot(v) / denom

## The nearest_neighbor function returns the word whose vector
## is the nearest neighbor of the vector x.
## The list exclude_words can be used to exclude some
## words from the nearest-neighbor search.

In [5]:
# compute similarity between words
# Each line prints the cosine similarity between the vector of 'apple' and
# the vector of another word, formatted with three decimals.

print('similarity(apple, apples) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['apples']))
print('similarity(apple, banana) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['banana']))
print('similarity(apple, tiger) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['tiger']))

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [6]:
## Functions for nearest neighbors

def nearest_neighbor(x, word_vectors, exclude_words=None):
    """Return the word whose vector has the highest cosine similarity to ``x``.

    Args:
        x: Query vector.
        word_vectors: dict mapping words to their vectors.
        exclude_words: Optional iterable of words to skip during the search
            (``None`` or empty means nothing is excluded).

    Returns:
        The best-matching word, or ``''`` when no candidate is left (empty
        vocabulary or every word excluded). On ties the first word
        encountered in ``word_vectors`` wins.
    """
    # A set gives O(1) membership tests; the vocabulary loop below would
    # otherwise rescan the exclusion list for every single word.
    excluded = set(exclude_words) if exclude_words else set()

    # Start below any reachable cosine value so that a candidate whose
    # similarity is exactly -1.0 can still be selected.
    best_score = float('-inf')
    best_word = ''

    for word, vec in word_vectors.items():
        if word in excluded:
            continue
        sim_score = cosine(vec, x)
        if sim_score > best_score:
            best_score = sim_score
            best_word = word
    return best_word

## This function returns the words corresponding to the
## K nearest neighbors of vector x.
## You can use the functions heappush and heappop.

def knn(x, vectors, k, exclude_words=None):
    """Return the ``k`` words whose vectors are most cosine-similar to ``x``.

    Args:
        x: Query vector.
        vectors: dict mapping words to their vectors; this is the vocabulary
            that is searched.
        k: Maximum number of neighbors to return.
        exclude_words: Optional iterable of words to skip during the search
            (``None`` or empty means nothing is excluded).

    Returns:
        A list of ``(similarity, word)`` tuples sorted from most to least
        similar. It holds fewer than ``k`` entries when fewer candidates
        exist, and is empty when ``k <= 0``.
    """
    # A set gives O(1) membership tests inside the vocabulary loop.
    excluded = set(exclude_words) if exclude_words else set()

    heap = []
    # Iterate the `vectors` argument, not a notebook-level global, so the
    # function searches the vocabulary it was actually given.
    for word, vec in vectors.items():
        if word in excluded:
            continue
        heappush(heap, (cosine(x, vec), word))
        # Bounded min-heap: the root is the weakest of the candidates kept so
        # far, so popping it whenever the size exceeds k leaves the k best.
        if len(heap) > k:
            heappop(heap)
    return sorted(heap, reverse=True)

In [7]:
# looking at nearest neighbors of a word

print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, ["cat", "cats"]))

# Run the k-NN search once and reuse the result below: every call scans the
# whole vocabulary, so repeating it just for the loop doubles the work.
knn_cat = knn(word_vectors['cat'], word_vectors, 5, ["cat", "cats"])
print('')
print('cat')
print('--------------')
for score, word in knn_cat:
    print(word + '\t%.3f' % score)

The nearest neighbor of cat is: dog

cat
--------------
dog	0.638
pet	0.573
rabbit	0.549
dogs	0.538
pig	0.458


In [26]:
## This function returns the word d such that a:b and c:d
## verify the same relation

def analogy(a, b, c, word_vectors):
    """Answer the analogy ``a : b :: c : ?`` with vector arithmetic.

    The query point is ``vec(b) - vec(a) + vec(c)``; the result is the
    nearest neighbor of that point, with the three input words themselves
    excluded from the search.

    Args:
        a, b, c: Words that must be keys of ``word_vectors``.
        word_vectors: dict mapping words to their vectors.

    Returns:
        The word returned by ``nearest_neighbor`` for the query point.
    """
    offset = word_vectors[b] - word_vectors[a]
    target = offset + word_vectors[c]
    return nearest_neighbor(target, word_vectors, exclude_words=[a, b, c])

In [27]:
# Word analogies
# analogy(a, b, c, ...) searches around vec(b) - vec(a) + vec(c), which is
# why the arguments appear in a different order than in the printed labels.

print('')
print('france - paris + rome = ' + analogy('paris', 'france', 'rome', word_vectors))
print('king - man + woman = ' + analogy('man', 'king', 'woman', word_vectors))


france - paris + rome = italy
king - man + woman = queen


In [28]:
## A word about biases in word vectors:
## Prints the cosine similarity of 'genius' with 'man' and with 'woman'
## so the two values can be compared side by side.

print('')
print('similarity(genius, man) = %.3f' %
      cosine(word_vectors['man'], word_vectors['genius']))
print('similarity(genius, woman) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['genius']))


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [30]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B

def association_strength(w, A, B, vectors):
    """Measure how much more word ``w`` associates with set A than with set B.

    Args:
        w: Word whose association is measured (key of ``vectors``).
        A: Non-empty collection of attribute words (keys of ``vectors``).
        B: Non-empty collection of attribute words (keys of ``vectors``).
        vectors: dict mapping words to their vectors.

    Returns:
        Mean cosine similarity between ``w`` and the words of ``A`` minus the
        mean cosine similarity between ``w`` and the words of ``B``.

    Raises:
        ZeroDivisionError: if ``A`` or ``B`` is empty.
    """
    def mean_similarity(attributes):
        # Average cosine similarity of w against one attribute set.
        total = sum(cosine(vectors[w], vectors[attr]) for attr in attributes)
        return total / len(attributes)

    return mean_similarity(A) - mean_similarity(B)

## Perform the word embedding association test between:
##   - two sets of words X and Y
##   - two sets of attributes A and B

def weat(X, Y, A, B, vectors):
    """Compute the word embedding association test score.

    Args:
        X: First set of target words.
        Y: Second set of target words.
        A: First set of attribute words.
        B: Second set of attribute words.
        vectors: dict mapping words to their vectors.

    Returns:
        Sum of ``association_strength(x, A, B, vectors)`` over ``X`` minus
        the same sum over ``Y``.
    """
    def total_association(targets):
        # Summed association strength of one target set with respect to A and B.
        return sum(association_strength(t, A, B, vectors) for t in targets)

    return total_association(X) - total_association(Y)

In [31]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)
##
## `career` and `family` are passed to weat as the two target word sets
## (X, Y); `male` and `female` are lists of first names passed as the two
## attribute sets (A, B).

career = ['executive', 'management', 'professional', 'corporation', 
          'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
          'cousins', 'marriage', 'wedding', 'relatives']
male = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
female = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']

# All names are lowercase and are looked up as keys of word_vectors inside
# weat / association_strength.
print('')
print('Word embedding association test: %.3f' %
      weat(career, family, male, female, word_vectors))


Word embedding association test: 0.847
