In [89]:
import json
import string
import nltk
from itertools import islice


nltk.download('punkt')
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import numpy as np

[nltk_data] Downloading package punkt to /Users/yhkuo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yhkuo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yhkuo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Preprocessing functions

In [55]:
def tokenize(line):
    """Split ``line`` into word tokens and strip punctuation from each token.

    The text is tokenized with ``word_tokenize``; every character that is in
    ``string.punctuation`` or is one of the typographic marks ’ ” “ — is then
    removed from each token. A token made up only of such characters comes
    back as an empty string (it is not dropped here).
    """
    strip_chars = set(string.punctuation) | {'’', '”', '“', '—'}
    cleaned = []
    for raw in word_tokenize(line):
        cleaned.append(''.join(ch for ch in raw if ch not in strip_chars))
    return cleaned

def removeStopword(tokens):
    """Lower-case ``tokens`` and drop those in the English stop-word list.

    Returns a new list of the lower-cased tokens that are not stop words,
    in their original order.
    """
    stop_words = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return kept

def lemmatize(tokens):
    """Return the WordNet lemma of every token, preserving order.

    Args:
        tokens: iterable of word strings.

    Returns:
        A list with one lemma per input token, produced by
        ``WordNetLemmatizer.lemmatize`` called with its default arguments.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

def preprocess(line):
    """Turn one line of text into a list of non-empty, lemmatized words.

    Pipeline: tokenize -> removeStopword -> lemmatize -> drop empty strings.
    """
    lemmas = lemmatize(removeStopword(tokenize(line)))
    return [lemma for lemma in lemmas if lemma != '']

In [56]:
def createFrequencyMatrix(lemmas):
    """Count how often each word occurs across a collection of sentences.

    Args:
        lemmas: iterable of sentences, each itself an iterable of hashable
            words. Any iterable works (list, tuple, generator, ...); it does
            not need to support ``len()`` or indexing.

    Returns:
        A dict mapping word -> total count, ordered by descending count.
        Words with equal counts keep the order in which they were first seen
        (the sort is stable).
    """
    counts = {}
    for sentence in lemmas:
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

In [104]:
# Load the news stories; the rest of the notebook reads each story's
# 'content' and 'time-stamp' entries.
dataPath = '../final-project/public/data/news.json'
# Name the encoding explicitly so reading does not depend on the platform's
# locale default (the article text contains non-ASCII punctuation).
with open(dataPath, encoding='utf-8') as file:
    data = json.load(file)
print(len(data))

2871


In [58]:
# A word is kept for a story's word cloud only if it occurs MORE than
# `threshold` times in that story.
threshold = 1
for story in data:
    content = story['content']
    lemma = [preprocess(line) for line in content]
    fq = createFrequencyMatrix(lemma)

    # [word, count] pairs in the order fq yields them, cutoff applied.
    wordcloud = [[word, count] for word, count in fq.items() if count > threshold]
    story['wordcloud'] = wordcloud

    # Quartiles of the kept counts, truncated to int. The scope dict stays
    # empty when no word passes the cutoff.
    scope = [count for word, count in wordcloud]
    story['wordcloud_scope'] = {}
    if scope:
        q3, q2, q1 = np.quantile(scope, [.75, .50, .25])
        story['wordcloud_scope']['Q3'] = int(q3)
        story['wordcloud_scope']['Q2'] = int(q2)
        story['wordcloud_scope']['Q1'] = int(q1)
    

In [59]:
# Save every story, now carrying its 'wordcloud' and 'wordcloud_scope'
# entries, as a single JSON document.
with open('../final-project/public/data/wordcloud.json', 'w') as file:
    file.write(json.dumps(data))

## Date subset: word cloud for a single day

In [35]:
#"03/07/2020, 04:50:35"
# strptime/strftime pattern for timestamps such as "03/07/2020, 04:50:35".
# The time part is spelled out as %H:%M:%S because %X is the locale's time
# format and is not guaranteed to be 24-hour HH:MM:SS.
date_encoding="%m/%d/%Y, %H:%M:%S"

In [68]:
# Day to extract, written the way the date appears inside each story's
# 'time-stamp' string (MM/DD/YYYY); it is matched as a substring.
date = "03/07/2020"

In [69]:
# Stories whose 'time-stamp' string contains the chosen date.
specific = [each for each in data if date in each['time-stamp']]

In [96]:
def mergeFrequencyMatrix(fq, wordcloud):
    """Add the counts from ``wordcloud`` into ``fq`` and return ``fq``.

    Args:
        fq: dict of word -> count; it is updated in place.
        wordcloud: iterable of ``[word, count]`` pairs.

    Returns:
        The same ``fq`` object, after the update.
    """
    for entry in wordcloud:
        word, count = entry
        if word not in fq:
            fq[word] = count
            continue
        fq[word] += count
    return fq
def take(n, iterable):
    """Build a dict from the first ``n`` (key, value) pairs of ``iterable``."""
    head = list(islice(iterable, n))
    return {key: value for key, value in head}

In [97]:
# Sum the per-story word counts of the selected stories, then keep the 50
# words with the highest totals (ordered by descending total).
fq = {}
for each in specific:
    fq = mergeFrequencyMatrix(fq, each['wordcloud'])
fq = take(50, sorted(fq.items(), key=lambda pair: pair[1], reverse=True))
print(fq)

{'said': 2158, 'coronavirus': 1554, 'health': 1068, 'case': 982, 'state': 900, 'virus': 855, 'people': 851, 'friday': 761, 'official': 692, 'new': 647, 'outbreak': 637, 'week': 476, 'tested': 464, 'one': 452, 'positive': 422, 'would': 400, 'u': 397, 'patient': 393, 'two': 391, 'trump': 388, 'country': 377, 'spread': 374, 'public': 369, 'test': 368, 'home': 363, 'also': 346, 'according': 344, 'county': 331, 'thursday': 330, 'confirmed': 322, 'newsletter': 320, 'president': 313, 'ship': 309, 'china': 308, 'covid19': 306, 'employee': 303, 'reported': 302, 'resident': 296, 'news': 293, 'day': 291, 'first': 284, 'cruise': 276, 'work': 265, 'disease': 261, 'company': 258, 'travel': 253, 'told': 249, 'announced': 246, 'united': 244, 'worker': 241}


In [101]:
def quantile(fq, min_count=30):
    """Return the (Q3, Q2, Q1) quartiles of the counts in ``fq`` above a cutoff.

    Args:
        fq: mapping of word -> count.
        min_count: only counts strictly greater than this value are used.
            Defaults to 30.

    Returns:
        Tuple ``(Q3, Q2, Q1)``: the 0.75, 0.50 and 0.25 quantiles of the
        selected counts, each truncated to ``int``.

    Raises:
        ValueError: if no count in ``fq`` is greater than ``min_count``.
    """
    scope = [count for count in fq.values() if count > min_count]
    if not scope:
        # Fail with a clear message instead of passing an empty list to numpy.
        raise ValueError(
            "no count in fq is greater than min_count=%r" % (min_count,)
        )
    q3, q2, q1 = np.quantile(scope, [.75, .50, .25])
    return int(q3), int(q2), int(q1)
def DictToList(fq):
    """Convert a dict into a list of ``[key, value]`` pairs, in dict order."""
    return [[word, count] for word, count in fq.items()]

In [102]:
# Payload for the single-day word cloud: the [word, count] pairs plus the
# quartiles of the counts, keyed 'Q3', 'Q2', 'Q1'.
df = {
    'wordcloud': DictToList(fq),
    'wordcloud_scope': dict(zip(('Q3', 'Q2', 'Q1'), quantile(fq))),
}

In [103]:
# Save the single-day word-cloud payload as JSON.
with open('../final-project/public/data/wordcloud_day.json', 'w') as file:
    file.write(json.dumps(df))

## TextRank

In [107]:
# Scratch cell: iterate over the content lines of the first story.
test = data[0]['content']
content = []
for line in test:
    # FIXME(review): the statement below is unfinished (it ends in a bare `.`),
    # so this cell is a SyntaxError as written. Nothing is appended to
    # `content` yet. Complete the expression or delete the cell.
    line.encode().

['\nA virus epidemic that started in China about two months ago is now tightening its grip around the globe.\n', "\nToday's biggest developments:\n", "\nHere's the latest from Tuesday. All times are Eastern.\n", "\nOfficials at Amazon, the giant online retailer, said that one of the company's Seattle employees has contracted the coronavirus.\n", "\nIt was not immediately known what the employee's role was, but a company spokesperson said the infected person is now in quarantine.\n", '\nAmazon has more than 50,000 employees in the Seattle region.\n', '\nThe number of coroavirus cases in South Korea jumped by 516 Tuesday, according to the Yonhap news agency. The country now has 5,328 cases in total.\n', "\nThirty-two people have died from the virus in South Korea, according to the country's Centers for Disease Control.\n", '\nVice President Mike Pence said all passengers coming into the U.S. from South Korea on direct flights would be subject to screening at South Korean airports.\n', '\