In [28]:
import json
import string
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import numpy as np

[nltk_data] Downloading package punkt to /Users/yhkuo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yhkuo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yhkuo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Preprocessing functions

In [3]:
def tokenize(line):
    """Split ``line`` into word tokens and strip punctuation characters.

    The text is tokenized with NLTK's ``word_tokenize``; every character
    listed in ``string.punctuation`` is then removed from each token.

    Args:
        line: The text to tokenize.

    Returns:
        A list with one string per token produced by ``word_tokenize``.
        A token made up only of punctuation becomes the empty string ``''``
        (it is not dropped here).
    """
    punctuation_chars = frozenset(string.punctuation)
    stripped_tokens = []
    for raw_token in word_tokenize(line):
        kept_chars = [ch for ch in raw_token if ch not in punctuation_chars]
        stripped_tokens.append(''.join(kept_chars))
    return stripped_tokens

def removeStopword(tokens):
    """Lowercase ``tokens`` and drop the English stopwords.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A list of the lowercased tokens whose lowercase form is not in
        NLTK's English stopword list, in their original order.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for raw_token in tokens:
        lowered = raw_token.lower()
        if lowered in english_stopwords:
            continue
        kept.append(lowered)
    return kept

def lemmatize(tokens):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A list holding the lemma of each token, in the same order.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def preprocess(line):
    """Run the full text pipeline on one line of text.

    The line is tokenized, stopwords are removed, the remaining tokens are
    lemmatized, and empty strings are discarded.

    Args:
        line: The text to process.

    Returns:
        A list of non-empty lemma strings.
    """
    without_stopwords = removeStopword(tokenize(line))
    # Filter out the empty strings that can remain after punctuation stripping.
    return [lemma for lemma in lemmatize(without_stopwords) if lemma != '']

In [4]:
def createFrequencyMatrix(lemmas):
    """Count how often each word occurs across all sentences.

    Args:
        lemmas: A list of sentences, each sentence being a list of words.

    Returns:
        A dict mapping each word to its total count, ordered from the most
        to the least frequent. Words with equal counts keep the order in
        which they were first seen.
    """
    counts = {}
    for sentence in lemmas:
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
    # sorted() is stable, so ties stay in first-seen order.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [14]:
# Load the news stories that the word clouds are built from.
dataPath = '../final-project/public/data/news.json'
# Read as UTF-8 explicitly so decoding does not depend on the platform's
# default locale encoding.
with open(dataPath, encoding='utf-8') as file:
    data = json.load(file)
# Sanity check: number of top-level entries loaded.
print(len(data))

2871


In [33]:
# A lemma must occur MORE than `threshold` times in a story to be kept in
# that story's word cloud.
threshold = 1
for story in data:
    content = story['content']
    # Preprocess each entry of the story's content separately, then count
    # lemma occurrences over the whole story.
    lemma = [preprocess(line) for line in content]
    fq = createFrequencyMatrix(lemma)

    # [word, count] pairs above the threshold, in the order fq yields them.
    wordcloud = [[word, count] for word, count in fq.items() if count > threshold]
    story['wordcloud'] = wordcloud

    # Counts of the kept words; their quartiles are stored with the story.
    scope = [count for _, count in wordcloud]
    story['wordcloud_scope'] = {}
    if scope:
        # np.quantile cannot handle an empty list, so stories with no word
        # above the threshold keep an empty 'wordcloud_scope'.
        for label, level in (('Q3', .75), ('Q2', .50), ('Q1', .25)):
            story['wordcloud_scope'][label] = int(np.quantile(scope, level))
    

In [34]:
# Persist the stories, now carrying their 'wordcloud' fields, as JSON.
with open('../final-project/public/data/wordcloud.json', 'w') as out_file:
    serialized = json.dumps(data)
    out_file.write(serialized)

In [25]:
# Inspect the counts above 1 in `fq`, i.e. the frequency table left over
# from the last story processed by the loop.
scope = [count for count in fq.values() if count > 1]
print(scope)

[140, 138, 126, 126, 124, 90, 81, 67, 64, 64, 62, 60, 59, 54, 52, 49, 47, 46, 43, 39, 38, 35, 35, 32, 32, 32, 31, 30, 30, 29, 27, 27, 26, 25, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5

In [29]:
import numpy as np
# Try out a few candidate quantile cut-offs on the inspected counts.
q90, q60, q40 = (np.quantile(scope, level) for level in (.90, .60, .40))
print(int(q90))
print(q60)
print(q40)

11
4.0
3.0
