In [6]:
import newspaper
import feedparser
import numpy as np
import pandas as pd
import requests
import datetime 
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import joblib
from newspaper import Article
import pickle
import smtplib
from email.message import EmailMessage
import pwd_google

In [7]:
def send_email(login
               , password
               , subject
               , content = None
               , attachment = None
               , to_list = "lorenazhang@gmail.com,jingyao.zhang@wellsfargo.com"
               ):
    """Send an email through smtp.gmail.com (port 587, STARTTLS), optionally with one file attached.

    Parameters
    ----------
    login : str
        Account used to authenticate with the SMTP server; also used as the From address.
    password : str
        Password for ``login``.
    subject : str
        Subject line of the message.
    content : str, optional
        Plain-text body. When None, no text body is set.
    attachment : str, optional
        Path of a file to attach. The file is read as bytes and attached as
        ``application/pickle`` under the same name. When None, nothing is attached.
    to_list : str
        Comma-separated recipient addresses placed in the To header.

    Raises
    ------
    OSError
        If ``attachment`` is given but cannot be opened.
    smtplib.SMTPException
        If the SMTP conversation (TLS, login, send) fails.
    """
    msg = EmailMessage()
    # Both content and attachment default to None, so each must be optional here;
    # calling set_content(None) or open(None) would raise before anything is sent.
    if content is not None:
        msg.set_content(content)
    msg['From'] = login
    msg['Subject'] = subject
    msg['To'] = to_list

    if attachment is not None:
        with open(attachment, 'rb') as attachment_file:
            payload = attachment_file.read()
        msg.add_attachment(payload, maintype='application', subtype='pickle', filename = attachment)

    # The context manager issues QUIT and closes the socket even when login or send raises.
    with smtplib.SMTP('smtp.gmail.com', 587) as server:
        server.set_debuglevel(1)
        server.ehlo()
        server.starttls()
        server.login(login, password)
        server.send_message(msg)
    print('Email successfully sent!')

In [8]:
# Label this run with the Sunday that closes the current Monday-to-Sunday week
# (dayofweek is 0 for Monday, so Sunday is always 6 - dayofweek days ahead).
today = pd.Timestamp.today()
days_until_sunday = 6 - today.dayofweek
sunday = (today + datetime.timedelta(days = days_until_sunday)).strftime('%Y%m%d')

# Gmail account passed to send_email as `login`, where it is used both to
# authenticate with the SMTP server and as the From address.
LOGIN    = "lorenazhang@gmail.com"
# Password is read from the `pwd` attribute of the imported pwd_google module
# instead of being written into the notebook.
PASSWORD = pwd_google.pwd

In [9]:
# Empty frame that the feed entries are appended to; the column names are the
# fields identified later in the parsed feed entries.
detail_fields = ['type', 'language', 'base', 'value']
rss_columns = (
    ['title', 'summary', 'links', 'link', 'id', 'guidislink', 'published', 'published_parsed']
    + [f'title_detail.{field}' for field in detail_fields]
    + [f'summary_detail.{field}' for field in detail_fields]
    + ['media_content', 'feedburner_origlink']
)
rss_feeds = pd.DataFrame(columns = rss_columns)

# Feeds to harvest (currently a single Google News search feed).
rss_urls = [
    'https://news.google.com/rss/search?q=%3Cdigital+inequality%3E&hl=en-US&gl=US&ceid=US:en',
]

In [10]:
# Collect the entries of every feed into rss_feeds. Each entry carries only a
# summary line rather than the article body; the full text is fetched from the
# entry URL with the newspaper library in a later cell.

for feed_url in tqdm(rss_urls):
    parsed_feed = feedparser.parse(feed_url)
    entries_frame = pd.json_normalize(parsed_feed.entries)
    rss_feeds = pd.concat([rss_feeds, entries_frame], axis=0)
print(len(rss_feeds), 'items in rss_feed dataframe')

100%|██████████| 1/1 [00:01<00:00,  1.52s/it]

100 items in rss_feed dataframe





In [11]:
# Keep each article URL once, in order of first appearance
urllist = rss_feeds['link'].unique()

In [12]:
# Get full text using scraping from the newspaper library.
#
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas 2.x, and inside a catch-all
# handler its AttributeError would be reported as a scraping failure for
# every single URL, leaving df empty.

article_columns = ["date", "URL", "authors", "keywords", "summary", "text"]
article_records = []

for url in tqdm(urllist):
    article = Article(url)
    try:
        article.download()
        article.parse()
        article.nlp()
    except Exception as err:
        # Best effort: skip articles that cannot be fetched or parsed, but say why.
        print('Something wrong with', url, f'({type(err).__name__}: {err})')
        continue
    article_records.append({"date": article.publish_date,
                            "URL": url,
                            "authors": article.authors,
                            "keywords": article.keywords,
                            "summary": article.summary,
                            "text": article.text})

# Passing columns keeps the expected schema even when no article could be scraped.
df = pd.DataFrame(article_records, columns = article_columns)

print(len(df),'stories in dataframe df')



  1%|          | 1/100 [00:00<00:13,  7.43it/s]

Something wrong with https://www.makeuseof.com/what-is-digital-inequality/


  8%|▊         | 8/100 [00:04<01:02,  1.48it/s]

Something wrong with https://www.prospectmagazine.co.uk/science-and-technology/inequality-just-went-stratospheric-can-we-bring-it-down-to-earth


 32%|███▏      | 32/100 [00:18<00:29,  2.34it/s]

Something wrong with https://telanganatoday.com/impact-of-covid-19-on-children


 46%|████▌     | 46/100 [00:29<00:24,  2.17it/s]

Something wrong with https://www.forbes.com/sites/ryancraig/2021/07/23/cybersecuritys-sputnik-moment/


 77%|███████▋  | 77/100 [01:12<00:14,  1.61it/s]

Something wrong with https://www.forbes.com/sites/anniebrown/2021/07/23/fixing-ais-diversity-crisis-through-edtech-an-in-depth-interview-with-tony-effik-co-founder-of-the-black-and-brilliant-advocacy-network/


100%|██████████| 100/100 [01:27<00:00,  1.14it/s]

95 stories in dataframe df





In [13]:
# Attach the feed metadata to every scraped article. The right join keeps
# exactly the rows of df (articles whose full text was retrieved), matched on
# the feed's "link" against the scraped "URL".

df_final = pd.merge(rss_feeds, df, how="right", left_on="link", right_on="URL")
print(len(df_final),'unique articles in file.')

95 unique articles in file.


In [14]:
# Just keep the columns that are useful. After the merge, summary_x is the feed
# summary and summary_y is the summary produced by the newspaper library.
# The explicit .copy() makes df_final an independent frame, so the column
# assignments and in-place drop performed on it afterwards do not trigger
# SettingWithCopyWarning.
useful_columns = ['id','title', 'summary_x', 'URL', 'published', 'keywords', 'summary_y', 'text' ]
df_final = df_final[useful_columns].copy()

In [15]:
def no_timezone(time_string):
    """Format a timestamp as 'YYYY-MM-DD HH:MM:SS', dropping any timezone suffix.

    Parameters
    ----------
    time_string
        Value handed to ``pd.Timestamp`` (for example a feed's 'published' string).

    Returns
    -------
    str or int
        The formatted timestamp, or the sentinel ``0`` when the value cannot be
        converted or formatted.
    """
    try:
        no_tz = datetime.datetime.strftime(pd.Timestamp(time_string), "%Y-%m-%d %H:%M:%S")
    except Exception:
        # Conversion problems fall back to the sentinel; unlike a bare except this
        # still lets KeyboardInterrupt / SystemExit propagate.
        no_tz = 0
    return no_tz

In [16]:
# Store the publication time as a timezone-free string, discard the raw
# 'published' field, and flatten each keyword list into one comma-separated string.
published_clean = df_final['published'].apply(no_timezone)
df_final['published_date'] = published_clean
del df_final['published']
keyword_lists = df_final['keywords'].copy(deep=True)
df_final['keywords'] = list(map(','.join, keyword_lists))

In [17]:
# Write the weekly frame to disk. The with-block guarantees the file is flushed
# and closed before a later cell reads it back as an email attachment.
with open(f'df_googlenews_{sunday}.p', 'wb') as pickle_file:
    pickle.dump(df_final, pickle_file)

In [18]:
# Send the weekly pickle as an email attachment; if that fails, report why and
# try to send a plain failure notice instead.
# NOTE(review): the fallback call passes no attachment, so it only succeeds if
# send_email tolerates attachment=None.
try:
    send_email(login = LOGIN
              , password = PASSWORD
              , subject = f'Google News for {sunday}'
              , content = f'Please find attached the google news for week {sunday}'
              , attachment = f'df_googlenews_{sunday}.p')
except Exception as err:
    # Surface the reason instead of silently swallowing it.
    print(f'Could not send the weekly email: {err!r}')
    send_email(login = LOGIN
              , password = PASSWORD
              , subject = 'Web scrapping Google news failed'
              , content = f'Job failed for Web scrapping Google news for week {sunday}'
              )

send: 'ehlo jingyaos-mbp.fios-router.home\r\n'
reply: b'250-smtp.gmail.com at your service, [72.76.47.69]\r\n'
reply: b'250-SIZE 35882577\r\n'
reply: b'250-8BITMIME\r\n'
reply: b'250-STARTTLS\r\n'
reply: b'250-ENHANCEDSTATUSCODES\r\n'
reply: b'250-PIPELINING\r\n'
reply: b'250-CHUNKING\r\n'
reply: b'250 SMTPUTF8\r\n'
reply: retcode (250); Msg: b'smtp.gmail.com at your service, [72.76.47.69]\nSIZE 35882577\n8BITMIME\nSTARTTLS\nENHANCEDSTATUSCODES\nPIPELINING\nCHUNKING\nSMTPUTF8'
send: 'STARTTLS\r\n'
reply: b'220 2.0.0 Ready to start TLS\r\n'
reply: retcode (220); Msg: b'2.0.0 Ready to start TLS'
send: 'ehlo jingyaos-mbp.fios-router.home\r\n'
reply: b'250-smtp.gmail.com at your service, [72.76.47.69]\r\n'
reply: b'250-SIZE 35882577\r\n'
reply: b'250-8BITMIME\r\n'
reply: b'250-AUTH LOGIN PLAIN XOAUTH2 PLAIN-CLIENTTOKEN OAUTHBEARER XOAUTH\r\n'
reply: b'250-ENHANCEDSTATUSCODES\r\n'
reply: b'250-PIPELINING\r\n'
reply: b'250-CHUNKING\r\n'
reply: b'250 SMTPUTF8\r\n'
reply: retcode (250); Msg: b

Email successfully sent!


reply: b'250 2.0.0 OK  1627064704 w26sm11800499qki.6 - gsmtp\r\n'
reply: retcode (250); Msg: b'2.0.0 OK  1627064704 w26sm11800499qki.6 - gsmtp'
data: (250, b'2.0.0 OK  1627064704 w26sm11800499qki.6 - gsmtp')
send: 'quit\r\n'
reply: b'221 2.0.0 closing connection w26sm11800499qki.6 - gsmtp\r\n'
reply: retcode (221); Msg: b'2.0.0 closing connection w26sm11800499qki.6 - gsmtp'
