In [2]:
## Juho Peltomäki
## 
## Code for finding the relation between X and Y and searching for the same relation between Z and its best match.
## X is to Y as Z is to _ ?
##
## Here X is the first input word, Y is the second, and Z is the word we want to find an analogy for.


###
import random
import numpy as np
import os 



## How many (n) best matches do you want to search:
n = 3


vocabulary_file='word_embeddings.txt'

# Read words and word vectors in a single pass over the file.
# Each non-blank line is expected to be "<word> <float> <float> ...".
# The file is always decoded as UTF-8 so that the keys of `words` and
# `vectors` agree regardless of the platform's default encoding.
# Both progress messages are kept so the printed output is unchanged.
print('Read words...')
print('Read word vectors...')
words = []
vectors = {}
with open(vocabulary_file, 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.rstrip()
        if not stripped:
            # Skip blank lines: they carry neither a word nor a vector.
            continue
        vals = stripped.split(' ')
        words.append(vals[0])
        vectors[vals[0]] = [float(x) for x in vals[1:]]

vocab_size = len(words)
# word -> row index, and row index -> word
vocab = {w: idx for idx, w in enumerate(words)}
ivocab = {idx: w for idx, w in enumerate(words)}

# For exploring the dict sizes:
# Vocabulary and inverse vocabulary (dict objects)
# NOTE: vocab['man'] raises KeyError if 'man' is not in the file, and
# ivocab[10] raises KeyError if the file has fewer than 11 words.
print('Vocabulary size')
print(len(vocab))
print(vocab['man'])
print(len(ivocab))
print(ivocab[10])


# W contains one row per vocabulary word: row vocab[word] holds that word's vector.
print('Vocabulary word vectors')
vector_dim = len(vectors[ivocab[0]])
W = np.zeros((vocab_size, vector_dim))
for word, v in vectors.items():
    if word == '<unk>':
        # The '<unk>' row is deliberately left as all zeros.
        continue
    W[vocab[word], :] = v
print(W.shape)
print()
    




def cos_similarity(W, sim_vector):
    """Return the cosine similarity between every row of W and sim_vector.

    Parameters
    ----------
    W : array-like, 2-D
        Matrix whose rows are the vectors to compare against.
    sim_vector : array-like, 1-D
        Query vector; its length must equal the number of columns of W.

    Returns
    -------
    numpy.ndarray
        1-D float array with one similarity per row of W. Rows whose norm
        is zero (or every row, if sim_vector has zero norm) get 0.0 instead
        of NaN/inf, because the cosine is undefined there.
    """
    W = np.asarray(W)
    sim_vector = np.asarray(sim_vector)

    # Numerator: dot product of each row with the query vector.
    dot_product = np.dot(W, sim_vector)

    # Denominator: product of the norms (axis=1 -> one norm per row).
    norms = np.linalg.norm(W, axis=1) * np.linalg.norm(sim_vector)

    # Divide only where the denominator is positive; elsewhere keep 0.0 so
    # that zero vectors do not produce NaN or divide-by-zero warnings.
    similarities = np.zeros(dot_product.shape, dtype=float)
    np.divide(dot_product, norms, out=similarities, where=norms > 0)
    return similarities
    



def analogy(x, y, z, words, top_n=None):
    """Print and return the best matches for the analogy "x is to y as z is to _".

    The target vector is ``z + y - x``; candidates are the rows of the
    module-level matrix ``W`` ranked by cosine similarity to that target.
    The input words themselves are not filtered out, so they may appear
    among the results.

    Parameters
    ----------
    x, y, z : array-like
        Word vectors of the three input words.
    words : list
        The input words; only used in the printed header.
    top_n : int, optional
        Number of matches to report. Defaults to the module-level ``n``.

    Returns
    -------
    list of (str, float)
        ``(word, cosine similarity)`` pairs, best match first.
    """
    if top_n is None:
        top_n = n

    # new = z + y - x
    analogy_vector = np.asarray(z) + np.asarray(y) - np.asarray(x)

    simi = cos_similarity(W, analogy_vector)

    # Negate so that argsort yields the highest similarities first.
    closest = np.argsort(-simi)[:top_n]
    results = [(ivocab[int(idx)], simi[idx]) for idx in closest]

    print(f'Top {top_n} closest words for the analogy for words: {words}:')
    for word, score in results:
        print(f"Word: {word}, 'Distance' as cosine similarity: {score:.3f}")

    return results


# Interactive prompt: read three words, look up their vectors and print the
# best analogy matches, until the user types 'EXIT' or stdin is closed.
while True:

    try:
        input_term = input("\nEnter three words separated with space ('EXIT' to quit):  ")
    except EOFError:
        # stdin was closed (e.g. piped input ran out): stop instead of crashing.
        break

    # Ignore surrounding whitespace so that e.g. 'EXIT ' also quits.
    if input_term.strip() == 'EXIT':
        break

    words = input_term.split()

    if len(words) != 3:
        print("Please enter exactly three words!")
        continue

    a, b, c = words

    # Only the vocabulary lookups are guarded, so that a KeyError raised
    # anywhere else is not misreported as an unknown word.
    try:
        a_vec = W[vocab[a]]
        b_vec = W[vocab[b]]
        c_vec = W[vocab[c]]
    except KeyError as e:
        print(f"Word not in vocabulary {e}")
        continue

    # The last result stays bound at module level for later inspection.
    jou = analogy(a_vec, b_vec, c_vec, words)

    print("____")
        
    
        
    

    

Read words...
Read word vectors...
Vocabulary size
400000
300
400000
for
Vocabulary word vectors
(400000, 50)

Top 3 closest words for the analogy for words: ['finland', 'helsinki', 'germany']:
Word: berlin, 'Distance' as cosine similarity: 0.893
Word: vienna, 'Distance' as cosine similarity: 0.823
Word: moscow, 'Distance' as cosine similarity: 0.802
____
Top 3 closest words for the analogy for words: ['man', 'father', 'woman']:
Word: mother, 'Distance' as cosine similarity: 0.937
Word: daughter, 'Distance' as cosine similarity: 0.931
Word: wife, 'Distance' as cosine similarity: 0.908
____
Top 3 closest words for the analogy for words: ['king', 'queen', 'prince']:
Word: princess, 'Distance' as cosine similarity: 0.849
Word: queen, 'Distance' as cosine similarity: 0.847
Word: prince, 'Distance' as cosine similarity: 0.807
____
