In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
def read_json(file_path):
    """
    Read json file.
    Arguments:
     file_path -- string, path to file.
    Returns:
     d -- dictionary, with json contents.
    Tips:
     None.
    """

    with open(file_path) as json_data:
        d = json.load(json_data)

    return d

def write_json(data_dict, file_path):
    """
    Write dictionary to json.
    Arguments:
     data_dict -- dictionary.
     file_path -- string, path to file.
    Returns:
     None.
    Tips:
     None.
    """

    with open(file_path, "w") as fp:
        json.dump(data_dict, fp, indent=4)

In [6]:
def nlp_preprocess(doc_string):
    download('punkt')
    stopwords_english = stopwords.words('english')
    stemmer = PorterStemmer() 
    
    tokens = word_tokenize(doc_string)
    remove_punct = [word.lower() for word in tokens if word.isalnum()]
    remove_stops = [word for word in remove_punct if word not in stopwords_english]
    stemmed = [stemmer.stem(word) for word in remove_stops]

    return stemmed

def preprocess_content(content_dict):
    clean_list = []
    for k in content_dict:
        tokens = nlp_preprocess(content_dict[k])
        for t in tokens:
            clean_list.append(t)
    clean_str = " ".join(clean_list)
    content_dict.update({"nlp_string": clean_str})
    
    return content_dict
    

In [7]:
positive_docs = read_json("../data/positive_docs.json")
negative_docs = read_json("../data/negative_docs.json")
nvidia_docs = read_json("../data/nvidia_docs.json")

In [8]:
positive_docs = preprocess_content(positive_docs)

[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
negative_docs = preprocess_content(negative_docs)

[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
nvidia_docs = preprocess_content(nvidia_docs)

[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
write_json(positive_docs, "../data/positive_docs.json")
write_json(negative_docs, "../data/negative_docs.json")
write_json(nvidia_docs, "../data/nvidia_docs.json")