In [2]:
%load_ext autoreload
%autoreload 2

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import json
from sklearn.metrics import pairwise_distances

In [5]:
def read_json(file_path):
    """
    Read json file.
    Arguments:
     file_path -- string, path to file.
    Returns:
     d -- dictionary, with json contents.
    Tips:
     None.
    """

    with open(file_path) as json_data:
        d = json.load(json_data)

    return d

def write_json(data_dict, file_path):
    """
    Write dictionary to json.
    Arguments:
     data_dict -- dictionary.
     file_path -- string, path to file.
    Returns:
     None.
    Tips:
     None.
    """

    with open(file_path, "w") as fp:
        json.dump(data_dict, fp, indent=4)

In [21]:
def tf_vocab(vectorizer, documents_list, quantile):
    tfidf_vect = vectorizer.fit_transform(documents_list)
    tfidf_array = tfidf_vect.toarray()
    tfidf_features = vectorizer.get_feature_names()
    hits = np.where(tfidf_array > (tfidf_array.max() * (1 - quantile)))
    vocab = []
    for idx in hits[1]:
        word = tfidf_features[idx]
        if word not in vocab:
            vocab.append(word)
            
    return vocab, tfidf_array, tfidf_features

def vocab_dict(combined_vocabs):
    idx = 0
    vocab = {}
    for word in combined_vocabs:
        if word not in vocab.keys():
            vocab.update({word: idx})
            idx += 1
            
    return vocab

def doc_to_vector(document, vocabulary):
    vector = np.zeros((1,len(vocabulary.keys())))
    for word in document.split(' '):
        if word in vocabulary.keys():
            vector[0, int(vocabulary[word])] = 1
    
    return vector
        

In [1]:
positive_docs = read_json("../data/positive_docs.json")
negative_docs = read_json("../data/negative_docs.json")
nvidia_docs = read_json("../data/nvidia_docs.json")

NameError: name 'read_json' is not defined

In [8]:
pos = [positive_docs[k] for k in positive_docs.keys()]

In [9]:
neg = [negative_docs[k] for k in negative_docs.keys()]

In [10]:
nvi = [nvidia_docs[k] for k in nvidia_docs.keys()]

In [11]:
vectorizer = CountVectorizer(stop_words='english')

In [82]:
tech_vocab, tech_tf, tech_feats = tf_vocab(vectorizer, pos, 0.90)

In [83]:
tech_vocab

['human',
 'humans',
 'new',
 'science',
 'technological',
 'technology',
 'tools',
 'use',
 'used',
 'applied',
 'research',
 'analysis',
 'anthropology',
 'communication',
 'disciplines',
 'economics',
 'field',
 'fields',
 'geography',
 'history',
 'law',
 'methods',
 'natural',
 'political',
 'psychology',
 'sciences',
 'social',
 'sociology',
 'studies',
 'study',
 'theory',
 'century',
 'development',
 'formal',
 'greek',
 'knowledge',
 'mathematics',
 'medicine',
 'modern',
 'nature',
 'philosophy',
 'physics',
 'public',
 'scientific',
 'scientists',
 'world',
 'aristotle',
 'astronomy',
 'earth',
 'including',
 'animals',
 'life',
 'million',
 'ocean',
 'species',
 'surface',
 'water',
 'years',
 'geometry',
 'mathematical',
 'mathematicians',
 'number',
 'numbers',
 'problems']

In [84]:
soc_vocab, soc_tf, soc_feats = tf_vocab(vectorizer, neg, 0.90)

In [85]:
soc_vocab

['food',
 'social',
 'societies',
 'society',
 'cultural',
 'culture',
 'human',
 'studies',
 'art',
 'arts',
 'music',
 'century',
 'developed',
 'knowledge',
 'philosophical',
 'philosophy',
 'schools',
 'thought',
 'traditions',
 'mind',
 'belief',
 'beliefs',
 'believe',
 'different',
 'example',
 'like',
 'mental',
 'people',
 'religion',
 'religious',
 'sense',
 'terms',
 'true']

In [66]:
nvi_vocab, nvi_tf, nvi_feats = tf_vocab(vectorizer, nvi, 0.90)

In [67]:
nvi_vocab

['announced',
 'based',
 'company',
 'geforce',
 'gpu',
 'gpus',
 'graphics',
 'hardware',
 'nvidia']

In [68]:
tech_str = " ".join(tech_vocab)

In [69]:
soc_str = " ".join(soc_vocab)

In [70]:
nvi_str = " ".join(nvi_vocab)

In [71]:
nvi_str

'announced based company geforce gpu gpus graphics hardware nvidia'

In [73]:
aggregate_vocab = tech_vocab + soc_vocab

In [74]:
vocab = vocab_dict(aggregate_vocab)

In [75]:
vocab.keys()

dict_keys(['human', 'humans', 'new', 'science', 'technological', 'technology', 'tools', 'use', 'used', 'applied', 'research', 'analysis', 'anthropology', 'communication', 'disciplines', 'economics', 'field', 'fields', 'geography', 'history', 'law', 'methods', 'natural', 'political', 'psychology', 'sciences', 'social', 'sociology', 'studies', 'study', 'theory', 'century', 'development', 'formal', 'greek', 'knowledge', 'mathematics', 'medicine', 'modern', 'nature', 'philosophy', 'physics', 'public', 'scientific', 'scientists', 'world', 'aristotle', 'astronomy', 'earth', 'including', 'animals', 'life', 'million', 'ocean', 'species', 'surface', 'water', 'years', 'geometry', 'mathematical', 'mathematicians', 'number', 'numbers', 'problems', 'food', 'societies', 'society', 'cultural', 'culture', 'art', 'arts', 'music', 'developed', 'philosophical', 'schools', 'thought', 'traditions', 'mind', 'belief', 'beliefs', 'believe', 'different', 'example', 'like', 'mental', 'people', 'religion', 'reli

In [76]:
print(len(tech_vocab))
print(len(soc_vocab))
print(len(vocab))
print(len(tech_vocab) - len(soc_vocab))

64
33
91
31


In [77]:
tech_vect = doc_to_vector(tech_str, vocab)
np.sum(tech_vect)

64.0

In [78]:
soc_vect = doc_to_vector(soc_str, vocab)
np.sum(soc_vect)

33.0

In [79]:
nvi_vect = doc_to_vector(nvi_str, vocab)
np.sum(nvi_vect)

0.0

In [None]:
nvi_vect

In [None]:
distances = pairwise_distances(nvi_vect, Y=soc_vect, metric='correlation')
distances

In [None]:
distances = pairwise_distances(nvi_vect, Y=tech_vect, metric='correlation')
distances

In [None]:
nvi_vocab

In [None]:
nvi_str

In [88]:
news_headlines = ['i like cats', 'one two three', 'i like dogs']
headline_tokens = [['i', 'like', 'cats'], ['one', 'two', 'three'], ['i', 'like', 'dogs']]


In [89]:
for token, headline in zip(headline_tokens, news_headlines):
    print(headline)
    print("    " + token)

i like cats


TypeError: can only concatenate str (not "list") to str