Juho Peltomäki
k-NN algorithm implementation using cosine similarity over the word vectors in the dataset "word_embeddings.txt",
where each word is represented by a 50-dimensional vector.

The code asks for a word and then finds the n nearest words in the 50-dimensional space.

In [6]:

import random
import numpy as np
import os 


## Number of nearest words (n) to report for each query word:
n = 3

## Location of the embeddings file -- update this to match your setup.
## A bare file name is resolved against the current working directory.
vocabulary_file = "word_embeddings.txt"


# Read the vocabulary and the word vectors in a single pass over the file.
# Each non-blank line is split on single spaces: the first field is the word,
# the remaining fields are parsed as the float components of its vector.
print('Read words...')
print('Read word vectors...')
words = []
vectors = {}
# Decode explicitly as UTF-8 so that `words` and `vectors` always agree on how
# non-ASCII words are spelled, whatever the platform's default encoding is.
with open(vocabulary_file, 'r', encoding='utf-8') as f:
    for line in f:
        stripped = line.rstrip()
        if not stripped:
            # A blank line carries no word and no vector; skip it.
            continue
        vals = stripped.split(' ')
        words.append(vals[0])
        vectors[vals[0]] = [float(x) for x in vals[1:]]

# vocab maps word -> row index, ivocab maps row index -> word.
# If a word occurs more than once, vocab keeps its last index.
vocab_size = len(words)
vocab = {w: idx for idx, w in enumerate(words)}
ivocab = dict(enumerate(words))


# for inspecting the dict sizes:
# vocabulary and inverse vocabulary (dict objects)
# print('Vocabulary size')
# print(len(vocab))
# print(vocab['man'])
# print(len(ivocab))
# print(ivocab[10])

# W is the embedding matrix: row vocab[word] holds the vector of `word`.
# The '<unk>' entry is not copied, so its row (if any) stays all zeros.
print('Vocabulary word vectors')
vector_dim = len(vectors[ivocab[0]])
W = np.zeros((vocab_size, vector_dim))
for word, v in vectors.items():
    if word != '<unk>':
        W[vocab[word], :] = v
print(W.shape)
    


def cos_similarity(W, sim_vector):
    """Return the cosine similarity between every row of W and sim_vector.

    Parameters:
        W: 2-D array with one vector per row.
        sim_vector: 1-D query vector with as many components as W has columns.

    Returns:
        1-D float array with one entry per row of W, holding
        dot(row, sim_vector) / (|row| * |sim_vector|). Where that denominator
        is zero (an all-zero row, or an all-zero query) the similarity is
        undefined and 0.0 is returned instead of NaN.
    """
    # Numerator: dot product of every row with the query vector.
    dot_product = np.dot(W, sim_vector)

    # Denominator: product of the vector lengths
    # (axis=1 computes the norm row-wise).
    norms = np.linalg.norm(W, axis=1) * np.linalg.norm(sim_vector)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial 0.0, which avoids 0/0 warnings and NaN results.
    similarities = np.zeros(np.shape(dot_product), dtype=float)
    np.divide(dot_product, norms, out=similarities, where=norms > 0)
    return similarities
    
    
    

def similarity(sim, W, vocab, ivocab, top_n=None):
    """Print and return the words whose vectors are closest to that of `sim`.

    Closeness is the value returned by cos_similarity(), so larger is closer.
    The query word itself takes part in the ranking.

    Parameters:
        sim: the query word.
        W: 2-D array of word vectors, one row per word.
        vocab: dict mapping word -> row index in W.
        ivocab: dict mapping row index in W -> word.
        top_n: how many words to report; when None, the module-level
            setting `n` is used.

    Returns:
        A list of (word, cosine similarity) tuples ordered from most to
        least similar, or an empty list when `sim` is not in `vocab`
        (a message is printed in that case).
    """
    if sim not in vocab:
        print(f'Word {sim} not found')
        return []

    if top_n is None:
        top_n = n

    # Look up the query word's row and score it against every row of W.
    cos = cos_similarity(W, W[vocab[sim]])

    # argsort is ascending, so sort on the negated scores to get the
    # highest similarities first.
    closest = np.argsort(-cos)[:top_n]
    results = [(ivocab[idx], cos[idx]) for idx in closest]

    # NOTE: the label says "angle", but the printed value is the cosine
    # similarity itself.
    for word, score in results:
        print(f'The angle for word {word}:  {score:.3f}')
    print()
    return results



    
# Interactive query loop: ask for a word and show its n nearest words
# until the user types EXIT or the input stream ends.
while True:
    try:
        input_term = input("\nEnter a  word (EXIT to break): ")
    except EOFError:
        # stdin was closed (end of piped input, Ctrl-D): stop cleanly
        # instead of terminating with a traceback.
        print()
        break

    # Vocabulary words never contain a space (lines are split on ' '),
    # so surrounding spaces can only be typing noise.
    sim_word = input_term.strip(' ')
    if sim_word == 'EXIT':
        break
    if not sim_word:
        # Nothing was typed; ask again.
        continue

    # Report the configured count rather than a hard-coded "three".
    print(f"Most similar {n} words to {sim_word}: ")
    print()

    # similarity() prints each neighbour and returns (word, score) pairs.
    three_closest = similarity(sim_word, W, vocab, ivocab)

Read words...
Read word vectors...
Vocabulary word vectors
(400000, 50)
Most similar three words to queen: 

The angle for word queen:  1.000
The angle for word princess:  0.852
The angle for word lady:  0.805

Most similar three words to helsinki: 

The angle for word helsinki:  1.000
The angle for word stockholm:  0.784
The angle for word gothenburg:  0.774

Most similar three words to waltari: 

The angle for word waltari:  1.000
The angle for word mika:  0.707
The angle for word boorem:  0.699

Most similar three words to kafka: 

The angle for word kafka:  1.000
The angle for word tolstoy:  0.728
The angle for word goethe:  0.674

