In [1]:
# Enable IPython's autoreload extension so edited modules are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import json
from sklearn.metrics import pairwise_distances

In [4]:
import gensim.downloader as api

In [5]:
def read_json(file_path):
    """
    Read json file.
    Arguments:
     file_path -- string, path to file.
    Returns:
     d -- decoded json contents (a dictionary when the top-level json
          value is an object).
    Tips:
     The file is always decoded as UTF-8, independent of the platform's
     locale encoding.
    """

    # JSON text is UTF-8 by specification; without an explicit encoding
    # open() falls back to the locale default, which mangles or rejects
    # non-ASCII content on systems whose default is not UTF-8.
    with open(file_path, encoding="utf-8") as json_data:
        d = json.load(json_data)

    return d

def write_json(data_dict, file_path):
    """
    Serialise a dictionary to a json file.
    Arguments:
     data_dict -- dictionary, must be json-serialisable.
     file_path -- string, path of the file to create or overwrite.
    Returns:
     None.
    Tips:
     Output is pretty-printed with a 4-space indent.
    """

    with open(file_path, mode="w") as out_file:
        json.dump(data_dict, out_file, indent=4)

In [11]:
def tf_vocab(vectorizer, doc_string, quantile):
    """
    Fit a vectorizer on the documents and keep the highest-weighted terms.
    Arguments:
     vectorizer -- scikit-learn style vectorizer exposing fit_transform and
                   get_feature_names_out (or the legacy get_feature_names).
     doc_string -- iterable of strings, one entry per document.
     quantile -- float; a term is kept when its weight in any document is
                 strictly greater than max_weight * (1 - quantile).
    Returns:
     vocab -- list of selected terms, unique, in order of first hit
              (row by row, then by column).
     tfidf_array -- dense numpy array of the fitted document-term matrix.
     tfidf_features -- list of all feature names, indexed by column.
    Tips:
     Despite the tfidf_* names the weights are whatever the supplied
     vectorizer produces. `quantile` is a fraction of the maximum weight,
     not a statistical quantile.
    """
    tfidf_vect = vectorizer.fit_transform(doc_string)
    tfidf_array = tfidf_vect.toarray()

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old versions.
    # list() keeps the return type a plain list in both cases.
    if hasattr(vectorizer, "get_feature_names_out"):
        tfidf_features = list(vectorizer.get_feature_names_out())
    else:
        tfidf_features = list(vectorizer.get_feature_names())

    threshold = tfidf_array.max() * (1 - quantile)
    hits = np.where(tfidf_array > threshold)

    # hits[1] holds the column index of every hit; dict.fromkeys removes
    # duplicates while preserving first-seen order.
    vocab = list(dict.fromkeys(tfidf_features[idx] for idx in hits[1]))

    return vocab, tfidf_array, tfidf_features

def vocab_dict(combined_vocabs):
    """
    Assign a consecutive integer index to every distinct word.
    Arguments:
     combined_vocabs -- iterable of words, duplicates allowed.
    Returns:
     vocab -- dictionary, word -> index; indices start at 0 and follow
              the order in which each word first appears.
    Tips:
     None.
    """
    word_to_index = {}
    for token in combined_vocabs:
        # The current size is the next free index; setdefault leaves
        # words that were already seen untouched.
        word_to_index.setdefault(token, len(word_to_index))

    return word_to_index

def doc_to_vector(document, vocabulary):
    """
    Encode a document as a binary bag-of-words row vector.
    Arguments:
     document -- string, tokens separated by whitespace.
     vocabulary -- dictionary, word -> column index (e.g. the output of
                   vocab_dict).
    Returns:
     vector -- numpy array of shape (1, len(vocabulary)); 1.0 in the column
               of every vocabulary word present in the document, else 0.0.
    Tips:
     Presence only: repeated words still yield 1. Words missing from the
     vocabulary are ignored.
    """
    vector = np.zeros((1, len(vocabulary)))

    # split() with no argument breaks on any run of whitespace, so tokens
    # separated by newlines, tabs or repeated spaces are still recognised
    # (split(' ') would leave them glued together or yield empty strings).
    for word in document.split():
        if word in vocabulary:
            vector[0, int(vocabulary[word])] = 1

    return vector
        

In [10]:
# Term-count vectorizer configured with stop_words='english'.
# The same instance is re-fitted by every tf_vocab call below.
vectorizer = CountVectorizer(stop_words='english')

In [7]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader.
# NOTE(review): presumably a large download on first use — confirm before Run All.
embeddings = api.load('word2vec-google-news-300')

In [8]:
# Load the three document collections from json files under ../data.
# Only the 'nlp_string' entry of each is used further down.
positive_docs = read_json("../data/positive_docs.json")
negative_docs = read_json("../data/negative_docs.json")
nvidia_docs = read_json("../data/nvidia_docs.json")

In [54]:
# Vocabulary of the positive documents, stored under the tech_* names.
# The 'nlp_string' entry is passed as a single document; with quantile=0.85,
# tf_vocab keeps terms whose count exceeds 15% of the largest count.
tech_vocab, tech_tf, tech_feats = tf_vocab(vectorizer, [positive_docs['nlp_string']], 0.85)
print(len(tech_vocab))

18


In [59]:
# Display the selected terms (they appear stemmed, e.g. 'centuri', 'scienc').
tech_vocab

['centuri',
 'develop',
 'field',
 'gener',
 'human',
 'includ',
 'mani',
 'mathemat',
 'natur',
 'physic',
 'research',
 'scienc',
 'scientif',
 'social',
 'studi',
 'technolog',
 'theori',
 'use']

In [55]:
# Vocabulary of the negative documents, stored under the soc_* names.
# Same selection rule: terms whose count exceeds 15% of the largest count.
soc_vocab, soc_tf, soc_feats = tf_vocab(vectorizer, [negative_docs['nlp_string']], 0.85)
print(len(soc_vocab))

24


In [60]:
# Display the selected terms; several ('develop', 'human', 'use', ...) also occur in tech_vocab.
soc_vocab

['art',
 'belief',
 'concept',
 'cultur',
 'develop',
 'differ',
 'exampl',
 'form',
 'group',
 'human',
 'includ',
 'peopl',
 'philosoph',
 'philosophi',
 'religi',
 'religion',
 'social',
 'societi',
 'studi',
 'term',
 'theori',
 'thought',
 'tradit',
 'use']

In [56]:
# Vocabulary of the Nvidia documents, selected with the same 0.85 setting.
nvi_vocab, nvi_tf, nvi_feats = tf_vocab(vectorizer, [nvidia_docs['nlp_string']], 0.85)

In [61]:
# Display the selected terms for the Nvidia documents.
nvi_vocab

['announc',
 'compani',
 'comput',
 'driver',
 'gpu',
 'graphic',
 'hardwar',
 'nvidia']

In [57]:
# Word Mover's Distance between the tech vocabulary and the Nvidia vocabulary.
# NOTE(review): the vocabularies hold stemmed tokens ('centuri', 'compani', ...);
# presumably many are missing from the word2vec vocabulary, which would limit
# which words actually contribute to the distance — confirm against gensim docs.
distance = embeddings.wmdistance(tech_vocab, nvi_vocab)
distance

3.072546479593212

In [58]:
# Word Mover's Distance between the social vocabulary and the Nvidia vocabulary.
# Reuses the name `distance`, so the tech-vs-Nvidia value is overwritten here.
distance = embeddings.wmdistance(soc_vocab, nvi_vocab)
distance

3.2755192793140884