In [3]:
import operator
import os

import gensim
import numpy as np
from gensim.models import KeyedVectors
from scipy import spatial

# Location of the pre-trained model files on disk. load_wordvec_model combines
# this value with each model's file name, so replace the placeholder with a
# real directory and end it with a path separator.
model_path = 'PATH_TO_MODEL_FILES_ON_YOUR_COMPUTER'

In [4]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load pre-trained word vectors stored in the word2vec file format.

    Args:
        modelName: Human-readable model name; used only in the progress messages.
        modelFile: File name of the model, resolved against the module-level
            ``model_path``.
        flagBin: Forwarded as ``binary=`` to
            ``KeyedVectors.load_word2vec_format``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    print('Loading ' + modelName + ' model...')
    # os.path.join adds a separator only when model_path does not already end
    # with one, so a directory given without a trailing slash still resolves
    # to the right file (plain string concatenation would glue the names).
    model_file_path = os.path.join(model_path, modelFile)
    model = KeyedVectors.load_word2vec_format(model_file_path, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

# Load both pre-trained embedding sets once; the example calls at the end of
# the notebook reuse these two objects.
model_word2vec = load_wordvec_model(
    modelName='Word2Vec',
    modelFile='GoogleNews-vectors-negative300.bin',
    flagBin=True,
)
model_fasttext = load_wordvec_model(
    modelName='FastText',
    modelFile='fastText_wiki_en.vec',
    flagBin=False,
)

Loading Word2Vec model...
Finished loading Word2Vec model...
Loading FastText model...
Finished loading FastText model...


In [6]:
# Topic taxonomy -- replace the placeholders below with your own categories.
# Each outer key is a category; it maps topic labels to a string of
# space-separated keywords. classify_topics reads this dict and prepends the
# category key to the keywords when scoring a label.
topic_taxonomy = {
    "category_1_keywords": {
        "Topic Label A": "keyword1 keyword2 keyword3",
        "Topic Label B": "keyword4 keyword5 keyword6",
        "Topic Label C": "keyword7 keyword8 keyword9",
    },
    "category_2_keywords": {
        "Topic Label D": "keyword1 keyword2 keyword3",
        "Topic Label E": "keyword4 keyword5 keyword6",
        "Topic Label F": "keyword7 keyword8 keyword9",
    },
}

In [7]:
def vec_similarity(input1, input2, vectors):
    """Cosine similarity between the summed word vectors of two strings.

    Each input is split on single spaces. Every token found in ``vectors`` is
    added to that input's sum; tokens missing from ``vectors`` are skipped.
    The vector dimension is taken from the looked-up vectors themselves.

    Args:
        input1: First string of space-separated terms.
        input2: Second string of space-separated terms.
        vectors: Mapping-like object supporting ``vectors[token]`` and raising
            ``KeyError`` for unknown tokens.

    Returns:
        ``1 - cosine distance`` of the two summed vectors as a float, or 0.0
        when the similarity is undefined (an input has no known token, a
        summed vector is all zeros, or the computed value is NaN).
    """
    def _summed_vector(text):
        # Sum the vectors of all known tokens; None means no token was found.
        total = None
        for token in text.split(' '):
            try:
                vec = np.asarray(vectors[token], dtype=float)
            except KeyError:
                # Out-of-vocabulary token: contributes nothing to the sum.
                continue
            total = vec.copy() if total is None else total + vec
        return total

    first = _summed_vector(input1)
    second = _summed_vector(input2)

    # Cosine similarity is undefined for a zero-length vector, so report
    # "no similarity" instead of handing zeros to scipy.
    if first is None or second is None or not first.any() or not second.any():
        return 0.0

    result = 1 - spatial.distance.cosine(first, second)
    # NaN must be detected with isnan(); an identity check against the
    # string 'nan' can never be true for a float.
    if np.isnan(result):
        return 0.0

    return float(result)

In [8]:
def vocab_check(vectors, words):
    """Return the input words that are present in the model's vocabulary.

    Args:
        vectors: Word-vector model exposing its vocabulary as ``key_to_index``
            (gensim >= 4.0) or as ``vocab`` (older gensim releases).
        words: Iterable of candidate word strings.

    Returns:
        List of the whitespace-stripped words found in the vocabulary, in the
        order they were given.
    """
    # gensim 4.0 removed KeyedVectors.vocab (reading it raises AttributeError),
    # so look for key_to_index first and fall back to vocab on older versions.
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = vectors.vocab

    output = []
    for word in words:
        # Strip before the lookup so the word that is tested is the same
        # word that is returned.
        cleaned = word.strip()
        if cleaned in vocabulary:
            output.append(cleaned)

    return output

In [9]:
def calc_similarity(input1, input2, vectors):
    """Similarity between two strings under a particular word-vector model.

    Both strings are split on whitespace, reduced to the unique words that
    ``vocab_check`` reports as in-vocabulary, and compared with the model's
    ``n_similarity`` method.

    Args:
        input1: First input string.
        input2: Second input string.
        vectors: Word-vector model providing ``n_similarity`` and accepted by
            ``vocab_check``.

    Returns:
        The value returned by ``vectors.n_similarity``, or 0.0 when either
        string contains no in-vocabulary word.
    """
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
    # the words reach n_similarity in a deterministic order on every run.
    s1words = list(dict.fromkeys(vocab_check(vectors, input1.split())))
    s2words = list(dict.fromkeys(vocab_check(vectors, input2.split())))

    # gensim's n_similarity raises when either word list is empty; treat
    # "nothing to compare" as zero similarity instead of crashing the caller.
    if not s1words or not s2words:
        return 0.0

    return vectors.n_similarity(s1words, s2words)

In [10]:
def classify_topics(input, vectors, top_n=3, taxonomy=None):
    """Score an input string against a topic taxonomy and return the best topics.

    For every category, each label is scored with ``calc_similarity`` against
    the category key joined to the label's keywords; only the best-scoring
    label of each category is kept. The kept labels are then ranked by score.

    Args:
        input: Text to classify.
        vectors: Word-vector model passed through to ``calc_similarity``.
        top_n: Maximum number of (label, score) pairs to return. Defaults to 3.
        taxonomy: Mapping of category -> {label: keywords}. Defaults to the
            module-level ``topic_taxonomy``.

    Returns:
        List of ``(label, score)`` tuples sorted by descending score, at most
        ``top_n`` long. Categories without labels contribute nothing.
    """
    if taxonomy is None:
        taxonomy = topic_taxonomy

    feed_score = dict()
    for key, value in taxonomy.items():
        # An empty category has no best label; skip it rather than fail.
        if not value:
            continue

        label_scores = {
            label: float(calc_similarity(input, (key + ' ' + keywords).strip(), vectors))
            for label, keywords in value.items()
        }
        # max() returns the first label among ties, matching a stable
        # descending sort followed by taking the first element.
        best_label, best_score = max(label_scores.items(), key=operator.itemgetter(1))
        feed_score[best_label] = best_score

    return sorted(feed_score.items(), key=operator.itemgetter(1), reverse=True)[:top_n]

In [None]:
if __name__ == '__main__':
    # Word2Vec example: the headline is passed with its original capitalisation.
    output1 = classify_topics(
        input='White House Has Plan To Force Tillerson Out, Replace With CIA Chief',
        vectors=model_word2vec,
    )
    print(output1)

    # FastText example: per the author's note, the string needs to be
    # non-capitalized for this model, so it is lower-cased before the call.
    output2 = classify_topics(
        input='White House Has Plan To Force Tillerson Out Replace With CIA Chief'.lower(),
        vectors=model_fasttext,
    )
    print(output2)
    