In [115]:
import datetime
import os
from urllib.parse import urlparse

import newspaper
import numpy as np
import pandas as pd
from newspaper import Article

In [86]:
def scrape_news_website(website_url):
    """Scrape the articles discovered on a news site into a DataFrame.

    Builds a source with ``newspaper.build(website_url)``, then downloads and
    parses each entry of its ``articles`` list. Articles that cannot be
    constructed, downloaded or parsed are skipped.

    Parameters
    ----------
    website_url : str
        URL handed to ``newspaper.build``.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed article, with columns ``headline``,
        ``publish_date``, ``content``, ``authors``, ``keywords`` and
        ``summary``. ``keywords`` and ``summary`` are ``np.nan`` for articles
        whose ``nlp()`` step raised.
    """
    paper = newspaper.build(website_url)
    num_articles_found = len(paper.articles)
    article_dict = {
        'headline': [],
        'publish_date': [],
        'content': [],
        'authors': [],
        'keywords': [],
        'summary': []
    }
    for i, paper_article in enumerate(paper.articles):
        # Progress counter rewritten in place on a single output line.
        print(f"{i+1}/{num_articles_found}", end="\r")
        try:
            article = Article(paper_article.url)
        except AttributeError:
            continue

        # A failed download or parse means there is nothing usable to record.
        try:
            article.download()
            article.parse()
        except Exception:
            continue

        # NLP is best-effort: keep the article even when it fails.
        # NOTE(review): presumably fails when NLP resources are missing - confirm.
        try:
            article.nlp()
            keywords, summary = article.keywords, article.summary
        except Exception:
            keywords, summary = np.nan, np.nan

        # Append every field together so all column lists stay the same length.
        article_dict['headline'].append(article.title)
        # Key must match the dict above; appending to 'date' raised KeyError.
        article_dict['publish_date'].append(article.publish_date)
        article_dict['content'].append(article.text)
        article_dict['authors'].append(article.authors)
        article_dict['keywords'].append(keywords)
        article_dict['summary'].append(summary)

    return pd.DataFrame(article_dict)


In [94]:
# Landing pages / news sections to crawl, one entry per outlet.
news_urls = [
    "https://news.google.com/home?hl=en-US&gl=US&ceid=US:en",
    "https://www.cnn.com/",
    "https://www.nbcnews.com/",
    "https://www.foxnews.com/",
    "https://apnews.com/",
    "https://www.vox.com/",
    "https://time.com/",
    "https://www.washingtonpost.com/",
    "https://www.usatoday.com/",
    "https://abcnews.go.com/",
    "https://www.wsj.com/news/world?gclid=Cj0KCQiA6rCgBhDVARIsAK1kGPJMbSU0Bd_RBeUkLv3YVhPk6G0yc9cZiVGv7FFrXKyH2-jFcIjZbq4aAumbEALw_wcB&mod=djmc_DGWorld&gclsrc=aw.ds&ef_id=YNEdOwAAABkp53MD:20230312011148:s",
    "https://www.npr.org/sections/news/",
    "https://www.usnews.com/",
    "https://www.nytimes.com/",
]

In [112]:
def extract_names(news_urls):
    """Derive a short source name from each URL's host.

    The name is the host with a leading ``www.`` removed and the final
    dot-separated label (the top-level domain) dropped, e.g.
    ``https://www.cnn.com/`` -> ``cnn`` and
    ``https://abcnews.go.com/`` -> ``abcnews.go``. Only the host is inspected,
    so dots or ``www.`` in the path or query string cannot affect the result,
    and hosts that do not end in ``.com`` are handled the same way.

    Parameters
    ----------
    news_urls : iterable of str
        URLs, with or without a scheme.

    Returns
    -------
    list of str
        One name per input URL, in input order. The host is lower-cased by
        ``urlparse``; a URL with no recognisable host yields ``""``.
    """
    news_names = []
    for url in news_urls:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Without a "//" authority marker urlparse treats the host as a path.
            parsed = urlparse("//" + url)
        host = parsed.hostname or ""
        if host.startswith("www."):
            host = host[len("www."):]
        labels = host.split(".")
        # Drop the top-level domain, but keep single-label hosts intact.
        if len(labels) > 1:
            labels = labels[:-1]
        news_names.append(".".join(labels))
    return news_names

In [113]:
news_names = extract_names(news_urls)

In [123]:
# Map each short source name to the URL it was derived from.
lookup = dict(zip(news_names, news_urls))

In [214]:
def scrape_websites(news_names, news_urls, save_file="data/news_data/news_data.pkl"):
    """Scrape several news sites and merge the results into one pickle file.

    Each ``(name, url)`` pair is scraped with ``scrape_news_website``; the
    resulting frame is tagged with ``source``, ``url`` and ``date_pulled``
    columns. If ``save_file`` already exists its rows are placed first, so on
    de-duplication the previously stored copy of an article is the one kept.

    Parameters
    ----------
    news_names : sequence of str
        Short source names, parallel to ``news_urls``.
    news_urls : sequence of str
        URLs to scrape.
    save_file : str
        Pickle path that is read (if present) and overwritten with the
        combined, de-duplicated frame. Missing parent directories are created.

    Returns
    -------
    pandas.DataFrame
        The combined frame that was written, or an empty frame (nothing is
        written) when there was nothing to scrape and no existing file.
    """
    frames = []
    # Computed once so every row from this run carries the same pull date.
    date_pulled = datetime.date.today().strftime("%m-%d-%y")

    for i, (name, url) in enumerate(zip(news_names, news_urls)):
        print()
        print(f"Scraping data for {name} ({i+1}/{len(news_names)})")
        news_df = scrape_news_website(url)
        news_df['source'] = name
        news_df['url'] = url
        news_df['date_pulled'] = date_pulled
        print(f"Found {news_df.shape[0]} usable articles.")
        frames.append(news_df)

    if os.path.exists(save_file):
        # NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
        frames.insert(0, pd.read_pickle(save_file))

    if not frames:
        # pd.concat raises on an empty list; there is nothing to save either.
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)
    # De-duplicate first, then renumber, so the saved index has no gaps.
    df = df.drop_duplicates(
        subset=['headline', 'publish_date', 'content'], keep='first'
    ).reset_index(drop=True)

    save_dir = os.path.dirname(save_file)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    df.to_pickle(save_file)
    return df

In [216]:
scrape_websites(news_names, news_urls, save_file="data/news_data/news_data.pkl")


Scraping data for news.google (1/14)
Found 0 usable articles.

Scraping data for cnn (2/14)
Found 0 usable articles.

Scraping data for nbcnews (3/14)
Found 0 usable articles.

Scraping data for foxnews (4/14)
Found 0 usable articles.

Scraping data for apnews (5/14)
Found 0 usable articles.

Scraping data for vox (6/14)
Found 0 usable articles.

Scraping data for time (7/14)
Found 0 usable articles.

Scraping data for washingtonpost (8/14)
Found 0 usable articles.

Scraping data for usatoday (9/14)
Found 0 usable articles.

Scraping data for abcnews.go (10/14)
Found 12 usable articles.

Scraping data for wsj (11/14)
Found 0 usable articles.

Scraping data for npr (12/14)
Found 0 usable articles.

Scraping data for usnews (13/14)
Found 0 usable articles.

Scraping data for nytimes (14/14)
Found 0 usable articles.


In [217]:
df = pd.read_pickle("data/news_data/news_data.pkl")

In [218]:
df.shape

(1678, 10)

In [219]:
df.head()

Unnamed: 0,headline,publish_date,content,authors,keywords,summary,source,url,date_pulled,date
0,Watch ABC News Network Online,NaT,18+ only. Any free trials valid for new and el...,[],,,abcnews.go,https://abcnews.go.com/,03-11-23,
1,3 women missing for 2 weeks after traveling fr...,NaT,The women left on Feb. 24 to go to a flea mark...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23,
2,"California's Pajaro River breaches overnight, ...",NaT,Dozens of water rescues have been conducted af...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23,
3,Paul Flores sentenced to 25 years to life for ...,NaT,Paul Flores was sentenced Friday to 25 years t...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23,
4,One of Silicon Valley's top banks fails; asset...,NaT,Regulators have seized the assets of one of Si...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23,


In [212]:
df.shape

(1671, 10)

In [186]:
df.duplicated(subset=['headline']).sum()

91

In [191]:
len(frames)

14

In [193]:
test_df = pd.concat(frames)

In [194]:
test_df.head()

Unnamed: 0,headline,date,content,authors,keywords,summary,source,url,date_pulled
0,Billionaire Bill Ackman on SVB collapse: Gover...,,Billionaire investor Bill Ackman wrote a lengt...,[Andrea Vacchiano],,,foxnews,https://www.foxnews.com/,03-11-23
1,Billionaire Bill Ackman on SVB collapse: Gover...,,Billionaire investor Bill Ackman wrote a lengt...,[Andrea Vacchiano],,,foxnews,https://www.foxnews.com/,03-11-23


In [202]:
test_df.iloc[0]['headline'] in test_df['headline'].values

True

In [199]:
test_df['headline']

0    Billionaire Bill Ackman on SVB collapse: Gover...
1    Billionaire Bill Ackman on SVB collapse: Gover...
Name: headline, dtype: object

In [158]:
old_df = pd.read_pickle("data/news_data/news_data.pkl")

In [159]:
old_df.shape

(1729, 9)

In [163]:
old_df.head()

Unnamed: 0,headline,publish_date,content,authors,keywords,summary,source,url,date_pulled
0,Watch ABC News Network Online,NaT,18+ only. Any free trials valid for new and el...,[],,,abcnews.go,https://abcnews.go.com/,03-11-23
1,3 women missing for 2 weeks after traveling fr...,NaT,The women left on Feb. 24 to go to a flea mark...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
2,"California's Pajaro River breaches overnight, ...",NaT,Dozens of water rescues have been conducted af...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
3,Paul Flores sentenced to 25 years to life for ...,NaT,Paul Flores was sentenced Friday to 25 years t...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
4,One of Silicon Valley's top banks fails; asset...,NaT,Regulators have seized the assets of one of Si...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23


In [177]:
type(old_df[~old_df['publish_date'].isnull()]['publish_date'][412])

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
old_df

In [178]:
old_df.columns

Index(['headline', 'publish_date', 'content', 'authors', 'keywords', 'summary',
       'source', 'url', 'date_pulled'],
      dtype='object')

In [209]:
# Keep only the first occurrence of each (headline, publish_date, content) triple.
old_df = old_df.drop_duplicates(subset=['headline', 'publish_date', 'content'], keep='first')

In [210]:
old_df.shape

(1667, 9)

In [103]:

print(df.shape)
df.head()

71/132

Building prefix dict from c:\Users\Mike's PC\Desktop\Masters_DU\MSDS_capstone\capenv\lib\site-packages\jieba\dict.txt ...
Dumping model to file cache C:\Users\MIKE'S~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.3987021446228027 seconds.
Prefix dict has been built succesfully.


(132, 6)


Unnamed: 0,headline,date,content,authors,keywords,summary
0,Silicon Valley Bank Collapse Sets Off Blame Ga...,2023-03-11 00:00:00,"SAN FRANCISCO — For once, the crisis didn’t se...","[David Yaffe-Bellany, Erin Griffith, Mike Isaac]",,
1,3 Lessons From Silicon Valley Bank’s Failure,2023-03-11 00:00:00,What can the collapse of Silicon Valley Bank t...,[Kevin Roose],,
2,The Second-Biggest Bank Failure,2023-03-10 00:00:00,"A bar chart of U.S. bank failures since 2001, ...","[Karl Russell, Christine Zhang]",,
3,Chinese-Brokered Deal Upends Mideast Diplomacy...,2023-03-11 00:00:00,"WASHINGTON — Finally, there is a peace deal of...",[Peter Baker],,
4,Inside Ron DeSantis’s Politicized Removal of a...,2023-03-11 00:00:00,"Months before suspending Mr. Warren, Mr. DeSan...","[Alexandra Berzon, Ken Bensinger]",,


In [104]:
df.to_pickle("data/news_data/nytimes_3-11-23.pkl")

In [152]:
df = pd.read_pickle("data/news_data/news_data.pkl")

In [154]:
df.head()

Unnamed: 0,headline,publish_date,content,authors,keywords,summary,source,url,date_pulled
0,Watch ABC News Network Online,NaT,18+ only. Any free trials valid for new and el...,[],,,abcnews.go,https://abcnews.go.com/,03-11-23
1,3 women missing for 2 weeks after traveling fr...,NaT,The women left on Feb. 24 to go to a flea mark...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
2,"California's Pajaro River breaches overnight, ...",NaT,Dozens of water rescues have been conducted af...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
3,Paul Flores sentenced to 25 years to life for ...,NaT,Paul Flores was sentenced Friday to 25 years t...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
4,One of Silicon Valley's top banks fails; asset...,NaT,Regulators have seized the assets of one of Si...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23


In [155]:
df.shape

(1729, 9)

In [151]:
# Collect the per-site pickles saved earlier, tagging each with its source
# name (the file-name prefix before the first "_") and the matching URL.
frames = []
data_dir = "data/news_data/"
# os.listdir order is arbitrary; sort so the combined frame is reproducible.
for fname in sorted(os.listdir(data_dir)):
    # Skip the combined output file and anything that is not a pickle.
    if fname == "news_data.pkl" or not fname.endswith(".pkl"):
        continue
    df = pd.read_pickle(os.path.join(data_dir, fname))
    news_name = fname.split("_")[0]
    df['source'] = news_name
    df['url'] = lookup[news_name]
    # Number of articles in this file with no publish date.
    print(df['date'].isnull().sum())
    frames.append(df)

708
13
12
35
202


In [127]:
len(frames)

5

In [129]:
df = pd.concat(frames)

In [144]:
df.to_pickle("data/news_data/news_data.pkl")

In [133]:
df = df.rename(columns={'date': 'publish_date'})

In [134]:
df['date_pulled'] = datetime.date.today().strftime("%m-%d-%y")

In [140]:
df.head(22)

Unnamed: 0,headline,publish_date,content,authors,keywords,summary,source,url,date_pulled
0,Watch ABC News Network Online,NaT,18+ only. Any free trials valid for new and el...,[],,,abcnews.go,https://abcnews.go.com/,03-11-23
1,3 women missing for 2 weeks after traveling fr...,NaT,The women left on Feb. 24 to go to a flea mark...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
2,"California's Pajaro River breaches overnight, ...",NaT,Dozens of water rescues have been conducted af...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
3,Paul Flores sentenced to 25 years to life for ...,NaT,Paul Flores was sentenced Friday to 25 years t...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
4,One of Silicon Valley's top banks fails; asset...,NaT,Regulators have seized the assets of one of Si...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
5,Oscars 2023: How to watch and what to know ahe...,NaT,"The 2023 Oscars will air live on Sunday, March...",[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
6,FDA approves Pfizer's new nasal spray treatmen...,NaT,The U.S. Food and Drug Administration has appr...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
7,Democrats propose countermeasure to GOP's Pare...,NaT,"Democrats say their bill would advance ""inclus...",[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
8,Santos denies new allegation he organized 2017...,NaT,Rep. George Santos on Friday denied a new alle...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23
9,Michigan man allegedly threatened Gov. Gretche...,NaT,The FBI is investigating threats allegedly mad...,[Abc News],,,abcnews.go,https://abcnews.go.com/,03-11-23


In [143]:
df.reset_index(inplace=True, drop=True)

In [149]:
df['publish_date'].isnull().sum()

970

In [None]:
pd._libs.tslibs.nattype.NaTType

In [107]:
datetime.date.today().strftime("%m-%d-%y")

'03-11-23'

In [None]:
from GoogleNews import GoogleNews
googlenews = GoogleNews()

In [None]:
print(googlenews.getVersion())

In [None]:
googlenews.enableException(True)
googlenews.set_lang('en')
googlenews.set_time_range('02/01/2020','02/28/2020')
googlenews.set_encode('utf-8')

In [None]:
googlenews.get_news('APPLE')

In [None]:
googlenews.search('APPLE')

In [None]:
googlenews.get_page(2)

In [None]:
googlenews.total_count()

In [None]:
googlenews.results()

In [None]:
googlenews.get_texts()

In [None]:
googlenews.clear()

In [None]:
googlenews.get_texts()

In [None]:
# NOTE(review): this rebinds the name `GoogleNews`, which an earlier cell
# imported from the separate `GoogleNews` package; `gn` is the pygooglenews client.
from pygooglenews import GoogleNews

gn = GoogleNews()


In [None]:
# Search for articles matching the query 'APPL', restricted by when='1m'
# (presumably the past month - confirm against the pygooglenews docs).
# NOTE(review): 'APPL' may be a typo for the ticker 'AAPL' - confirm the intended query.
search = gn.search('APPL', when = '1m')

In [None]:
type(search)

In [None]:
search.keys()

In [None]:
type(search['entries'])

In [None]:
len(search['entries']) #number of entries

In [None]:
search['entries'][0]