In [1]:
# Get the interactive Tools for Matplotlib
import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib inline

import plotly.graph_objects as go
import plotly.express as px



# Get tools to download files and load them 
import pickle
import urllib.request
from os.path import exists as check_path
from os import makedirs

# Get tools to perform analysis
import numpy as np
from heapq import heappushpop
from sklearn.decomposition import PCA

In [3]:
def download_files_from_github(file_target_dir):
    """Download ``file1.p`` ... ``file8.p`` from GitHub into ``file_target_dir``.

    A file whose target path already exists is not downloaded again, so the
    function can be re-run without re-fetching anything.

    Parameters
    ----------
    file_target_dir : str
        Directory to store the files in, with or without a trailing path
        separator. It is created when missing. An empty string means the
        current working directory.

    Returns
    -------
    None
        Progress is reported with ``print``.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urllib.request.urlretrieve`` when a download fails.
    """
    # Local import: the notebook's import cell only exposes ``exists`` and
    # ``makedirs``; ``join`` is needed to build paths portably.
    from os.path import join as join_path

    main_url = 'https://raw.githubusercontent.com/Goussha/word-vector-visualization/master/'

    # exist_ok=True replaces the racy "check, then create" sequence.
    if file_target_dir:
        makedirs(file_target_dir, exist_ok=True)

    for file_num in range(1, 9):
        base_name = 'file{}.p'.format(file_num)
        # join() inserts the separator when the directory lacks a trailing one
        # (plain concatenation produced e.g. "./tmpfile1.p").
        file_name = join_path(file_target_dir, base_name)
        if not check_path(file_name):
            print ("Downloading file: ",file_name)
            urllib.request.urlretrieve(main_url + base_name, filename=file_name)
        else:
            print('Allready exists: {}'.format(file_name))

In [4]:
def load_word2vecfiles(file_target_dir):
    """Load ``file1.p`` ... ``file8.p`` from ``file_target_dir`` into one dict.

    Each file is unpickled and merged with ``dict.update`` in numeric order,
    so on a key collision the value from the later file wins.

    Parameters
    ----------
    file_target_dir : str
        Directory containing the pickle files, with or without a trailing
        path separator.

    Returns
    -------
    dict
        The union of the mappings stored in the eight files.

    Raises
    ------
    FileNotFoundError
        If one of the eight files is missing.
    """
    # Local import: the notebook's import cell does not expose ``join``.
    from os.path import join as join_path

    word_dict_loded = {}
    for file_num in range(1,9):
        # join() adds the separator when the directory has no trailing one
        # (plain concatenation produced e.g. "./tmpfile1.p").
        full_file_name = join_path(file_target_dir, 'file{}.p'.format(file_num))
        print('Loading file: {}'.format(full_file_name))
        with open(full_file_name, 'rb') as fp:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file. Only load files obtained from a source you trust.
            data = pickle.load(fp)
        word_dict_loded.update(data)
    return word_dict_loded

In [5]:
# Relative path of the local directory that holds the downloaded pickle files.
file_target_dir = "./tmp/"

# Download the pickle files into that directory.
download_files_from_github(file_target_dir)
# Load the files and merge their contents into a single dict.
word_dict = load_word2vecfiles(file_target_dir)

Downloading file:  ./tmp/file1.p
Downloading file:  ./tmp/file2.p
Downloading file:  ./tmp/file3.p
Downloading file:  ./tmp/file4.p
Downloading file:  ./tmp/file5.p
Downloading file:  ./tmp/file6.p
Downloading file:  ./tmp/file7.p
Downloading file:  ./tmp/file8.p
Loading file: ./tmp/file1.p
Loading file: ./tmp/file2.p
Loading file: ./tmp/file3.p
Loading file: ./tmp/file4.p
Loading file: ./tmp/file5.p
Loading file: ./tmp/file6.p
Loading file: ./tmp/file7.p
Loading file: ./tmp/file8.p


## cosine similarity
- reflects the degree of similarity between two vectors

In [2]:
def cosine_similarity(u, v):
    """Return the cosine similarity of ``u`` and ``v``.

    Computed as ``dot(u.T, v) / (||u|| * ||v|| + 1e-10)``, where each norm is
    the square root of the sum of squared elements. The small constant keeps
    the division defined when either input is all zeros (the result is then 0).

    Parameters
    ----------
    u, v : numpy.ndarray
        Inputs must support ``.T`` and ``**``.
        NOTE(review): presumably 1-D vectors of equal length -- confirm.

    Returns
    -------
    The ratio described above (a NumPy scalar for 1-D inputs).
    """
    def l2_norm(vec):
        # Square root of the sum of squared elements.
        return np.sqrt(np.sum(vec**2))

    numerator = np.dot(u.T, v)
    # 1e-10 guards against division by zero.
    denominator = (l2_norm(u) * l2_norm(v)) + 1e-10
    return numerator / denominator

## most k similar
- Finds the words most similar to the input word: computes the cosine similarity between the input word's vector and every other word vector, and returns the K most similar words.

In [3]:
def most_k_similar(word_in, word_dict,k=1):
    """Return the ``k`` words most similar to ``word_in``, best match first.

    Every other key of ``word_dict`` is scored with ``cosine_similarity``
    against the vector of ``word_in``; the ``k`` highest-scoring words are
    kept in a min-heap while scanning.

    Parameters
    ----------
    word_in : hashable
        Query word; must be a key of ``word_dict``.
    word_dict : dict
        Maps each word to its vector.
    k : int, default 1
        Number of neighbours to return.

    Returns
    -------
    tuple
        Up to ``k`` words ordered by decreasing similarity. Shorter than
        ``k`` when the vocabulary has fewer than ``k`` other words, and
        empty when ``k <= 0``.

    Raises
    ------
    KeyError
        If ``word_in`` is not in ``word_dict``.
    """
    word_vec = word_dict[word_in]
    if k <= 0:
        return ()

    # Min-heap pre-filled with k placeholders; -inf is below any real score,
    # so placeholders are the first entries to be evicted.
    placeholder = float('-inf')
    most_similars_heap = [(placeholder, '') for _ in range(k)]
    for w, candidate_vec in word_dict.items():
        if w==word_in:
            continue
        cosine_sim = cosine_similarity(word_vec, candidate_vec)
        # Push the candidate, then drop whichever entry is now the smallest.
        heappushpop(most_similars_heap, (cosine_sim, w))

    # Discard unused placeholders and rank: a heap is only partially ordered.
    ranked = sorted(
        (entry for entry in most_similars_heap if entry[0] != placeholder),
        key=lambda entry: entry[0],
        reverse=True,
    )
    return tuple(word for _, word in ranked)

## doesn't match
- Takes a list of words and returns the one that doesn't match: for each word, sums its cosine similarities to all the other words in the list, and returns the word with the lowest total.

In [4]:
def doesnt_match(words, word_dict):
    """Return the word in ``words`` that is least similar to the others.

    For each word, the cosine similarities to all the other words in the list
    are summed; the word with the smallest total is returned. On a tie the
    first such word wins.

    Parameters
    ----------
    words : sequence
        Candidate words; each must be a key of ``word_dict``.
    word_dict : dict
        Maps each word to its vector.

    Returns
    -------
    The element of ``words`` with the lowest summed similarity.

    Raises
    ------
    ValueError
        If ``words`` is empty.
    KeyError
        If a word is missing from ``word_dict``.
    """
    if len(words) == 0:
        raise ValueError("words must contain at least one word")

    dots_tot = []
    for w in words:
        v = word_dict[w]  # loop-invariant for the inner loop
        dots = 0
        for w2 in words:
            if w2==w:
                continue
            dots = dots + cosine_similarity(v, word_dict[w2])
        # Fixed: this line was `dots.tot.append(dots)`, which raised
        # AttributeError on every call.
        dots_tot.append(dots)
    return (words[np.argmin(dots_tot)])

## complete_analogy
- To find the analogy between words, this function subtracts one word vector from another and then adds the difference to the vector of a third word.
- The difference between two word vectors represents the difference between their meaning, or the relationship between them, also known as their analogy.
- By adding this difference to a different word vector, you can find a fourth word that has the same relationship with word 3 as word 2 has with word 1.

In [None]:
def complete_analogy(word_a, word_b, word_c, word_dict):
    """Complete the analogy "word_a is to word_b as word_c is to ___".

    The three input words are lower-cased and looked up in ``word_dict``.
    Every other word ``w`` is then scored by the cosine similarity between
    ``e_b - e_a`` and ``word_dict[w] - e_c``; the word with the highest score
    is returned (the first one encountered wins a tie).

    Parameters
    ----------
    word_a, word_b, word_c : str
        The analogy's known words. Their lower-cased forms must be keys of
        ``word_dict``.
    word_dict : dict
        Maps each word to its vector.

    Returns
    -------
    The best-scoring word, or ``None`` when ``word_dict`` contains no word
    other than the three inputs.

    Raises
    ------
    KeyError
        If a lower-cased input word is not in ``word_dict``.
    """
    # Normalise case before any lookup.
    word_a = word_a.lower()
    word_b = word_b.lower()
    word_c = word_c.lower()

    e_a = word_dict[word_a]
    e_b = word_dict[word_b]
    e_c = word_dict[word_c]

    # The input words themselves are never valid answers; a set keeps the
    # membership test cheap inside the loop.
    excluded = {word_a, word_b, word_c}

    best_word = None
    best_score = -100  # starting score, a large negative number

    for candidate in word_dict.keys():
        if candidate not in excluded:
            # Compare the a->b offset with the c->candidate offset.
            score = cosine_similarity(e_b - e_a, word_dict[candidate] - e_c)
            if score > best_score:
                best_word, best_score = candidate, score

    return best_word