In [8]:
import json
from pprint import pprint
import pandas as pd

import text_processing

In [9]:
# Read in the scraped articles.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere
# and can garble or fail on non-ASCII characters in article text).
with open('data/scraped_articles.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [10]:
# Explore the structure of the data: start with the top-level keys of the
# object loaded from the JSON file.
data.keys()

dict_keys(['newspapers'])

In [11]:
# Alphabetical list of the source names stored under 'newspapers'
# (iterating a dict yields its keys).
print(sorted(data['newspapers']))

['cnbc_investing', 'cnbc_top_news', 'cnnmoney', 'economist_business', 'economist_science_and_tech', 'entrepreneur_latest', 'entrepreneur_marketing', 'findmarketresearch', 'fivethirtyeight_economics', 'fivethirtyeight_science', 'fox_business_latest_headlines', 'fox_business_opinion', 'heraldanalyst', 'highlandmirror', 'insidertradings', 'journalismday', 'marketwatch', 'marketwatch_newsletters', 'marketwatch_stories', 'nbpostgazette', 'newyorktimes_business', 'newyorktimes_science', 'newyorktimes_technology', 'reportagestuff', 'reuters_money', 'reuters_science', 'satprnews', 'sciencedaily', 'thefinancialanalyst', 'tokenfolks', 'truthfulreporter']


In [12]:
# Fields stored for a single source, using 'cnbc_investing' as the example.
print(data['newspapers']['cnbc_investing'].keys())

dict_keys(['rss', 'link', 'articles'])


In [13]:
# For the example source, show its RSS feed URL, its landing-page link and the
# fields available on one of its articles (one value per output line).
print(
    data['newspapers']['cnbc_investing']['rss'],
    data['newspapers']['cnbc_investing']['link'],
    data['newspapers']['cnbc_investing']['articles'][0].keys(),
    sep='\n',
)

https://www.cnbc.com/id/15839069/device/rss/rss.html
https://www.cnbc.com/investing/
dict_keys(['link', 'published', 'title', 'text', 'author'])


In [14]:
# Pretty-print the full record of the first 'cnbc_investing' article to see
# what each field looks like.
pprint(data['newspapers']['cnbc_investing']['articles'][0])

{'author': ['Elizabeth Gurdus',
            'Andrew Harrer',
            'Bloomberg',
            'Getty Images',
            'Scott Mlyn'],
 'link': 'https://www.cnbc.com/2018/03/23/cramer-remix-the-one-bright-spot-after-a-tough-week.html',
 'published': '2018-03-24T00:00:00',
 'text': "CNBC's Jim Cramer welcomed Dropbox's successful initial public "
         'offering on Friday as a reminder that even in a sell-off, good '
         'things can still happen.\n'
         '\n'
         'Shares of the cloud-based data storage company closed up over 35 '
         'percent after its first day of trading, at $28.48 a share, well '
         "above the company's stated price of $21.\n"
         '\n'
         '"If you were paying attention over the course of the session, you '
         'know that this was a white-hot deal," the "Mad Money" host said. '
         '"This is exactly the kind of stock that Wall Street wants right '
         'now."\n'
         '\n'
         "Cramer's main point of c

In [15]:
# Raw, uncleaned text of that same article, kept as the baseline for comparing
# against the cleaned version.
data['newspapers']['cnbc_investing']['articles'][0]['text']

'CNBC\'s Jim Cramer welcomed Dropbox\'s successful initial public offering on Friday as a reminder that even in a sell-off, good things can still happen.\n\nShares of the cloud-based data storage company closed up over 35 percent after its first day of trading, at $28.48 a share, well above the company\'s stated price of $21.\n\n"If you were paying attention over the course of the session, you know that this was a white-hot deal," the "Mad Money" host said. "This is exactly the kind of stock that Wall Street wants right now."\n\nCramer\'s main point of concern was whether Dropbox would be able to keep turning its non-paying subscribers into paying customers. To maintain its growth rate, it needs to switch between two million and four million free users into paying customers each year.\n\nBut that\'s less than 1 percent of its 500 million free subscribers, Cramer noted, adding that the company will now have millions of dollars to invest in and bolster the business.\n\n"Put it all togeth

In [16]:
# Use the 'clean_text' function to clean one article and compare the result to
# the original version shown above.
# NOTE(review): judging only from the displayed output, the cleaner appears to
# lowercase the text and drop punctuation and digits; confirm the exact rules
# in the text_processing module.
text_processing.clean_text(data['newspapers']['cnbc_investing']['articles'][0]['text'])

'cnbc jim cramer welcomed dropbox successful initial public offering friday reminder that even sell off  good things can still happen   shares the cloud based data storage company closed over   percent after its first day trading share  well above the company stated price you were paying attention over the course the session  you know that this was white hot deal   the  mad money  host said   this exactly the kind stock that wall street wants right now    cramer main point concern was whether dropbox would able keep turning its non paying subscribers into paying customers maintain its growth rate needs switch between two million and four million free users into paying customers each year   but that less than   percent its   million free subscribers  cramer noted  adding that the company will now have millions dollars invest and bolster the business    put all together and big fan dropbox the company and its ceo  drew houston said  but for dropbox the stock asked investors little more c

In [17]:
# Walk every source in file order and every article within it, building two
# parallel lists: the cleaned article text and the article's link.
# Both comprehensions traverse the data in the same order, so position i in one
# list corresponds to position i in the other.
article_text_clean = [
    text_processing.clean_text(article['text'])
    for source in data['newspapers'].values()
    for article in source['articles']
]
article_links = [
    article['link']
    for source in data['newspapers'].values()
    for article in source['articles']
]

In [18]:
# Inspect the first collected entry: its link, then its cleaned text.
# The lists follow the key order of the loaded JSON rather than alphabetical
# order, so the first entry need not belong to the alphabetically first source.
print(article_links[0])
article_text_clean[0]

https://www.nytimes.com/2018/03/24/technology/google-facebook-data-privacy.html?partner=rss&emc=rss


'there are other avenues still  said jascha kaykas wolff  the chief marketing officer mozilla  the nonprofit organization behind the popular firefox browser  including advertisers and large tech platforms collecting vastly less user data and still effectively customizing ads consumers    they are just collecting all the data try find magic growth algorithms  kaykas wolff said online marketers  this past week  mozilla halted its ads facebook  saying the social network default privacy settings allowed access too much data   the greatest likelihood that the internet companies  frightened the tumult  will accept few more rules and work little harder for transparency  and there will hearings capitol hill   the next chapter also set play out not washington but europe  where regulators have already cracked down privacy violations and are examining the role data online advertising   photo  the cambridge analytica case  said vera jourova  the european union commissioner for justice  consumers a

In [19]:
# Combine the two parallel lists into one DataFrame with a 'text' column
# (cleaned article text) and a 'link' column (article URL), then preview it.
article_columns = {'text': article_text_clean, 'link': article_links}
articles = pd.DataFrame(article_columns)
articles.head()

Unnamed: 0,text,link
0,there are other avenues still said jascha kay...,https://www.nytimes.com/2018/03/24/technology/...
1,and tran who his mid and speaks english wi...,https://www.nytimes.com/2018/03/24/business/we...
2,that analogy guo will essentially become the ...,https://www.nytimes.com/2018/03/25/business/ch...
3,news williams likely selection was first repo...,https://www.nytimes.com/2018/03/24/business/jo...
4,what increasingly clear many capitol hill and ...,https://www.nytimes.com/2018/03/24/business/tr...


In [20]:
# (rows, columns): one row per collected article.
articles.shape

(631, 2)

In [21]:
# Load the blacklist of spam sites.
# Data fix applied beforehand: a comma after the last item in
# data/blacklist.json was removed by hand (json.load rejects trailing commas).
# The encoding is stated explicitly because JSON text is UTF-8 by
# specification and the platform default is not UTF-8 on every system.
with open('data/blacklist.json', encoding='utf-8') as f:
    blacklist = json.load(f)

In [22]:
# Show the blacklisted sites in alphabetical order.
# NOTE(review): the printed list contains repeated entries (for example
# 'theanalystfinancial.com'); consider de-duplicating the source file.
print(sorted(blacklist))

['assetsstock.com', 'chiefobserver.com', 'dailycommercenews.com', 'dailytelescope.com', 'factsweek.com', 'financialportal24.com', 'firstnewshawk.com', 'firstnewsservice.com', 'freenewsman.com', 'heraldanalyst.com', 'ibnservice.com', 'incrediblenews24.com', 'islandpostgazette.com', 'ittechnology24.com', 'journalismday.com', 'marketingcorrect.com', 'marketnewsaccess.com', 'monotonecritic.com', 'nbherard.com', 'nbpostgazette.com', 'newmexicocourierexpress.com', 'newschemicals.com', 'nwctrail.com', 'opinioninvestor.com', 'peopletoday24.com', 'perfectanalyst.com', 'positivenewspaper.com', 'publicistreport.com', 'publicsector24.com', 'reportagestuff.com', 'silverpinyon.com', 'talkdailynews.com', 'talkherald.com', 'theanalystfinancial.com', 'theanalystfinancial.com', 'thefinancialconsulting.com', 'thefuturetechnologies.com', 'thefuturetechnologies.com', 'therealfact24.com', 'thetricountypress.com', 'timegazette.com', 'tokenfolks.com', 'truthfulreporter.com', 'www.findmarketresearch.org', 'www

In [23]:
# Add a boolean 'spam' column: an article is flagged when at least one
# blacklisted site occurs anywhere in its link.
# This is a plain substring test against the whole URL (not only the host
# part), so a blacklisted name appearing in a path or query string also counts.
spam = [
    any(site in link for site in blacklist)
    for link in articles.link
]
articles['spam'] = spam
articles.tail()

Unnamed: 0,text,link,spam
626,the worldwide perfume market newly published ...,https://heraldanalyst.com/global-perfume-marke...,True
627,the worldwide pecialty printing consumables m...,https://heraldanalyst.com/global-pecialty-prin...,True
628,the worldwide pipes market newly published res...,https://heraldanalyst.com/global-pe-pipes-mark...,True
629,the worldwide patient warming devices market ...,https://heraldanalyst.com/global-patient-warmi...,True
630,the worldwide pancreatitis drugs market newly...,https://heraldanalyst.com/global-pancreatitis-...,True


In [24]:
# Build two bag-of-words document-term matrices from the cleaned text, with
# articles as rows and individual words as columns:
#   'CV'    -> Count Vectorizer (word frequency per article)
#   'TFIDF' -> Term Frequency - Inverse Document Frequency
# The two calls differ only in the vectorizer keyword, so they are generated
# from the pair of names and unpacked in that order.
articles_cv, articles_tfidf = (
    text_processing.doc_term_matrix(articles['text'], vectorizer=kind)
    for kind in ('CV', 'TFIDF')
)

In [25]:
# Preview the TF-IDF matrix: one row per article, one column per word.
articles_tfidf.head()

Unnamed: 0,aadr,aapl,aaron,aarp,abacus,abandon,abandoned,abandoning,abandonment,abb,...,zoology,zoomlion,zoos,zscaler,zte,zuck,zuckerberg,zurawik,zwenkau,zynga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02568,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Sanity check: both matrices are built from the same articles, so their
# shapes are expected to match (TF-IDF shape first, then count-vectorizer shape).
print(articles_tfidf.shape, articles_cv.shape)

(631, 20946) (631, 20946)


In [27]:
# Export the article table and both document-term matrices as pickle files in
# the current working directory.
# Pickle files should only ever be re-loaded from trusted locations.
for frame, filename in (
    (articles, 'articles_df.pickle'),
    (articles_cv, 'articles_doc_term_CV.pickle'),
    (articles_tfidf, 'articles_doc_term_TFIDF.pickle'),
):
    frame.to_pickle(filename)