In [1]:
import operator
import os

import gensim
import numpy as np
from gensim.models import KeyedVectors
from scipy import spatial

# Directory that holds the pre-trained word-vector files.  Set the
# WORDVEC_MODEL_DIR environment variable to point somewhere else; without it
# the previous hard-coded location is used.
# os.path.join(..., '') guarantees a trailing path separator, which matters
# because load_wordvec_model builds file names by plain string concatenation.
model_path = os.path.join(
    os.environ.get('WORDVEC_MODEL_DIR', '/Users/javidbeck/projects/data/model/'), '')

In [2]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load a word2vec-format vector file and return the resulting model.

    modelName -- label that appears in the two progress messages only
    modelFile -- file name; it is appended directly to the module-level
                 ``model_path`` string to form the full path
    flagBin   -- forwarded as ``binary=`` to
                 ``KeyedVectors.load_word2vec_format``
    """
    print('Loading ' + modelName + ' model...')
    vector_file = model_path + modelFile
    loaded_vectors = KeyedVectors.load_word2vec_format(vector_file, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return loaded_vectors

# Load the Word2Vec vectors once; the cells below reuse this object.
model_word2vec = load_wordvec_model(
    modelName='Word2Vec',
    modelFile='GoogleNews-vectors-negative300.bin',
    flagBin=True,
)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [3]:
def vec_similarity(input1, input2, vectors):
    """Cosine similarity of two phrases, each embedded as the sum of its word vectors.

    Args:
        input1: First phrase; split on whitespace into tokens.
        input2: Second phrase; split on whitespace into tokens.
        vectors: Word-vector lookup supporting ``vectors[token]``.  Tokens for
            which the lookup raises ``KeyError`` are skipped.  If the object
            has a ``vector_size`` attribute it sets the vector length;
            otherwise 300 is used.

    Returns:
        ``1 - cosine distance`` between the two summed vectors, or ``0.0``
        when either phrase contributes no non-zero vector (the cosine is
        undefined in that case).
    """
    dim = getattr(vectors, 'vector_size', 300)

    term_vectors = []
    for phrase in (input1, input2):
        total = np.zeros(dim)
        for token in phrase.split():
            try:
                total += vectors[token]
            except KeyError:
                # Out-of-vocabulary token: it simply contributes nothing.
                continue
        term_vectors.append(total)

    # Cosine similarity is undefined for an all-zero vector; answer 0 directly
    # instead of relying on how scipy handles the 0/0 division.
    if not all(np.any(vec) for vec in term_vectors):
        return 0.0

    result = 1 - spatial.distance.cosine(term_vectors[0], term_vectors[1])
    # A real NaN check: the old `result is 'nan'` compared identity with a
    # string literal and therefore never matched.
    if np.isnan(result):
        result = 0.0

    return result

In [4]:
# Filters the input words, returning only those present in the model's vocabulary.
def vocab_check(vectors, words):
    """Return the words that are present in the model's vocabulary.

    Args:
        vectors: Word-vector model.  The vocabulary is taken from its
            ``key_to_index`` attribute when present, otherwise from ``vocab``,
            otherwise membership is tested on ``vectors`` itself.
        words: Iterable of candidate word strings.

    Returns:
        A list, in input order, of the whitespace-stripped words found in the
        vocabulary.
    """
    # Prefer ``key_to_index``; fall back to ``vocab`` so that models exposing
    # only the older attribute keep working.
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = getattr(vectors, 'vocab', vectors)

    output = []
    for word in words:
        # Strip first so the token that is tested is the token that is returned.
        token = word.strip()
        if token in vocabulary:
            output.append(token)

    return output

In [5]:
# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    """Similarity of two strings according to a word-vector model.

    Each string is split on whitespace, reduced to its distinct
    in-vocabulary words via ``vocab_check``, and the two word lists are
    compared with ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``n_similarity``.

    Returns:
        The value returned by ``vectors.n_similarity``, or ``0.0`` when either
        string has no in-vocabulary word (there is nothing to compare, and
        ``n_similarity`` is not meant to be called with an empty word list).
    """
    # Sorted lists rather than sets: a stable word order keeps the result
    # reproducible between runs.
    s1words = sorted(set(vocab_check(vectors, input1.split())))
    s2words = sorted(set(vocab_check(vectors, input2.split())))

    if not s1words or not s2words:
        return 0.0

    return vectors.n_similarity(s1words, s2words)

In [6]:
if __name__ == '__main__':
    # Example using Word2Vec: score car brands against country-of-origin phrases.
    str1 = 'Mercedes Benz'
    str2 = 'Alfa Romeo'
    str3 = 'German car'
    str4 = 'Italian car'
    # The pair order fixes the order of the printed lines.
    for brand, description in ((str1, str3), (str1, str4), (str2, str4), (str2, str3)):
        output1 = calc_similarity(brand, description, model_word2vec)
        print(brand, description, output1)

Mercedes Benz German car 0.521342848055
Mercedes Benz Italian car 0.422309532927
Alfa Romeo Italian car 0.342467760695
Alfa Romeo German car 0.259916218455


In [7]:
    # Score one news headline against two candidate event labels.
    headline = 'White House Has Plan To Force Tillerson Out, Replace With CIA Chief'
    for event_label in ("Executive Resignation", "Business Expansion"):
        output2 = calc_similarity(headline, event_label, model_word2vec)
        print(output2)


0.427572193957
0.289258259851
