# Gather news 

## Load packages

In [1]:
# data essentials
import pandas as pd
import numpy as np
import re

# custom helper functions
import custom_helper as ps

# API
from pynytimes import NYTAPI
import requests

# Construct search terms 
## Define topics

In [2]:
topics_negative = ['scandal', 'greenwashing', 'corruption', 'fraud', 
                   'bribe', 'tax', 'forced', 'harassment', 'violation', 
                   'illegal', 'conflict', 'weapons', 'pollution',
                   'inequality', 'discrimination', 'sexism', 'racist', 
                   'intransparent', 'nontransparent', 'breach', 'lawsuit', 
                   'unfair', 'bad', 'problem', 'hate', 'issue', 'controversial', 
                  'strike', 'scam', 'trouble', 'controversy', 'mismanagement', 
                  'crisis', 'turmoil', 'shock', 'whistleblow', 'dispute']

topics_positive =  ['green', 'sustainable', 'positive', 'best', 'good', 
                    'social', 'charity', 'ethical', 'renewable', 'carbon neutral', 
                   'equitable', 'ecological', 'efficient', 'improve', 'cooperative', 
                   'beneficial', 'collaborative', 'productive', 'leader', 
                   'donate', 'optimal', 'favorable', 'desirable', 'resilient', 
                   'robust', 'reasonable', 'strong', 'organic']

print("Defined {} negative and {} positive topics".format(len(topics_negative), len(topics_positive)))


# create df with topics and label
df_topics_neg = pd.DataFrame({'topic':topics_negative, 'positive': 0})
df_topics_pos = pd.DataFrame({'topic':topics_positive, 'positive': 1})
df_topics = pd.concat([df_topics_neg, df_topics_pos]).reset_index(drop=True)
df_topics.head()

Defined 37 negative and 28 positive topics


Unnamed: 0,topic,positive
0,scandal,0
1,greenwashing,0
2,corruption,0
3,fraud,0
4,bribe,0


## Get firm names

In [3]:
import re
def regex_strip_legalname(raw_names):
    """Removes legal entity, technical description or firm type from firm name
    
    Input
        raw_names: list of strings with firm names
        
    Return
        list of strings: firm names without legal description 
    
    """
    
    pattern = r"(\s|\.|\,|\&)*(\.com|Enterprise|Worldwide|Int\'l|N\.V\.|LLC|Co\b|Inc\b|Corp\w*|Group\sInc|Group|Company|Holdings\sInc|\WCo(\s|\.)|plc|Ltd|Int'l\.|Holdings|\(?Class\s\w+\)?)\.?\W?"
    stripped_names = [re.sub(pattern,'', n) for n in raw_names]
    
    return stripped_names

# get firm S&P500 from Wikipedia
keep_columns = ['Symbol','Security', 'GICS Sector']
df_sp500_wiki = ps.get_firms_sp500().loc[:,keep_columns]

# rename column, set ticker as index
df_sp500_wiki= df_sp500_wiki.rename(columns={'Symbol': 'ticker', 'Security': 'firm_name_raw', 'GICS Sector': 'sector'})\
    .set_index('ticker')

# process firm names (remove legal entity)
df_sp500_wiki['firm_name_processed'] = regex_strip_legalname(list(df_sp500_wiki.firm_name_raw))

# drop duplicate firm names (after processing)
df_sp500_wiki.drop_duplicates(subset='firm_name_processed', inplace=True)

df_sp500_wiki.head()

Unnamed: 0_level_0,firm_name_raw,sector,firm_name_processed
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,3M Company,Industrials,3M
ABT,Abbott Laboratories,Health Care,Abbott Laboratories
ABBV,AbbVie Inc.,Health Care,AbbVie
ABMD,ABIOMED Inc,Health Care,ABIOMED
ACN,Accenture plc,Information Technology,Accenture


## Construct query keywords
> **Desired table** where 1 row = 1 search term

> | topic | firm_name | ...| search_term | positive (dummy)
> | --- | --- | --- | ---| --- 

> *search_term* is a pairwise combination of *topic* and *firm_name*
> *positive* is a binary variable that indicates a positive topic

In [4]:
# expand firm names for each topic
df_sp500_expanded = df_sp500_wiki.iloc[np.repeat(np.arange(len(df_sp500_wiki)), len(df_topics))]
# expand topics for each firm 
df_topics_expanded = df_topics.iloc[list(np.arange(len(df_topics)))*len(df_sp500_wiki)]\
    .set_index(df_sp500_expanded.index)

# create search keywords as a combintation of firm name + topic
search_terms = pd.DataFrame({'search_term':[i+' '+j for j in df_sp500_wiki.firm_name_processed for i in df_topics.topic]})\
    .set_index(df_sp500_expanded.index)

# merge topics, firm names, and search terms into 1 df
df_query_input = pd.concat([df_topics_expanded, df_sp500_expanded, search_terms], axis=1)

df_query_input.head()

Unnamed: 0_level_0,topic,positive,firm_name_raw,sector,firm_name_processed,search_term
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MMM,scandal,0,3M Company,Industrials,3M,scandal 3M
MMM,greenwashing,0,3M Company,Industrials,3M,greenwashing 3M
MMM,corruption,0,3M Company,Industrials,3M,corruption 3M
MMM,fraud,0,3M Company,Industrials,3M,fraud 3M
MMM,bribe,0,3M Company,Industrials,3M,bribe 3M


# Google News

##  Query functions
### I. `get_news()` retrieves news for a keyword
### II. `query_news()` loops over list of keywords, handles potential errors and stores query results as csv

In [5]:
import sys

def sleep_countdown(duration, print_step=2):
    """Sleep for certain duration and print remaining time in steps of print_step

    Input
    duration: duration of timeout (int)
    print_step: steps to print countdown (int)

    Return 
    None
    """
    for i in range(duration,0,-print_step):
        sys.stdout.write(str(i)+' ')
        sys.stdout.flush()
        sleep(print_step)
    sys.stdout.write('cont.')

In [6]:
import pandas as pd
from GoogleNews import GoogleNews
import custom_helper

def get_news(keyword, until_page=10, keep_columns=['title', 'date', 'desc'], timeout=10):
    """Retrieve news for keyword for the first specified number of result pages
        within the period until 1 year ago
        
    Input
        keyword to look up news for
    
    Return
        dataframe
    """
    
    ## define 1 year timespan with datestrings 
    # today's date
    date_today = date.today().strftime("%m/%d/%Y")
    # date 1 year ago
    date_1year_ago = ps.date_add_year(date.today(), -1).strftime("%m/%d/%Y")

    ## Google news query
    # init googlenews object with US-en country-language setting
    googlenews=GoogleNews(lang='en-US&gl=US&ceid=US:en', start=date_1year_ago, end=date_today)    
    
    # retrieve search news for keyword
    googlenews.search(keyword)
    
    # get results for each page 
    for p in range(until_page):
        print("Page {} for {}".format(p, keyword))
        googlenews.getpage(p)
        sleep_countdown(timeout, print_step=2)
    
    # store results in df
    result = pd.DataFrame(googlenews.result())
    
    
    ## process result data
    # drop duplicates
    result.drop_duplicates(inplace=True)
    # keep specified columns
    result = result[keep_columns]
    # add column with keyword
    result['search_term'] = keyword
    
    # clear google news cache
    googlenews.clear()
    
    return result

In [7]:
from time import sleep

def query(lst, max_retries=1, idx_unsuccessful=list(), until_page=5, timeout=20) :
    """Handle failed query and handle raised exceptions
    
    Input
        lst: list with keywords for which to retrieve news
        max_retries: number of maximum retries
        until_page: maximum number of retrievd news page
        
    
    Return
        Inidces where max retries were reached
    """    
    for i in lst:
        # retry until max_retries reached
        for attempt in range(max_retries):        
            try:
                df_result = get_news(i, until_page=until_page, timeout=timeout)
        
            # handle query error
            except Exception as e:
                timeout += 5
                print("\t>>>EXCEPTION at {}: {}. Set timeout to {}\n".format(i, e, timeout))
                sleep_countdown(10)
                
            
            # query was successful: store results
            else:
                stamp = ps.timestamp_now()
                # merge news dataframes and export query results
                ps.make_csv(df_result, "news.csv", 'data/news', append=True)
                break

        # max_retries reached: store index of unsuccessful query
        else:
            ps.make_csv(pd.DataFrame([i]), "unsuccessful_queries.csv", 'data/news', append=True)
            print("i: {} appended to idx_unsuccessful\n".format(i))

## Run query for all search_terms
> **Issue**: Possibly rate limit exceeded, nothing is returned.

In [364]:
query(df_query_input.search_term[10:15], until_page=2, timeout=20)

10 8 6 4 2 cont.Page 0 for conflict 3M
10 8 6 4 2 cont.Page 1 for conflict 3M
	>>>EXCEPTION at conflict 3M: "None of [Index(['title', 'date', 'desc'], dtype='object')] are in the [columns]". Set timeout to 15

10 8 6 4 2 cont.Path created: data/news/unsuccessful_queries.csv
i: conflict 3M appended to idx_unsuccessful

15 13 11 9 7 5 3 1 cont.Page 0 for weapons 3M
15 13 11 9 7 5 3 1 cont.Page 1 for weapons 3M
	>>>EXCEPTION at weapons 3M: "None of [Index(['title', 'date', 'desc'], dtype='object')] are in the [columns]". Set timeout to 20

10 8 6 4 2 cont.Path created: data/news/unsuccessful_queries.csv
i: weapons 3M appended to idx_unsuccessful

20 

KeyboardInterrupt: 

In [317]:
## merge query input with query results
# import news data
df_news = pd.read_csv('./data/news/news.csv')
# merge with query input for ticker and firm name
df_query_input.merge(df_news, how='outer', on='search_term', right_index=True)

FileNotFoundError: [Errno 2] File ./data/news/news.csv does not exist: './data/news/news.csv'

# NYTimes API

* all nytimes article accessible

## Initialize API
https://pypi.org/project/pynytimes/

In [36]:
from pynytimes import NYTAPI

file_api_key = open('nytimes_api_key.txt', 'r') 
api_key = file_api_key.readlines()[0]
nyt = NYTAPI(api_key)

## Retrieve articles

### Article search

In [None]:
# Article search (beta)
from datetime import datetime

articles = nyt.article_search(
    query = "3M",
    results = 50,
    dates = {
        "begin": datetime(2019, 1, 31)
    },
    options = {
        "sort": "relevance",
        "sources": [
            "New York Times",
            "AP",
            "Reuters",
            "International Herald Tribune"
        ]
    }
)

In [51]:
articles = pd.DataFrame(articles)
articles.iloc[0,:][1]

'https://www.nytimes.com/video/us/elections/100000007293558/michelle-obama-speaks-dnc.html'

### Top stories

In [41]:
top_stories = nyt.top_stories()

# Get all the top stories from a specific category
top_science_stories = nyt.top_stories(section = "science")
pd.DataFrame(top_science_stories).head(2)

Unnamed: 0,section,subsection,title,abstract,url,uri,byline,item_type,updated_date,created_date,published_date,material_type_facet,kicker,des_facet,org_facet,per_facet,geo_facet,multimedia,short_url
0,health,,Pediatrics Group Offers ‘Long Overdue’ Apology...,The American Academy of Pediatrics recently jo...,https://www.nytimes.com/2020/08/20/health/pedi...,nyt://article/0e1e1c34-87f0-589c-b490-2451da15...,By Emma Goldberg,Article,2020-08-20T05:00:22-04:00,2020-08-20T05:00:22-04:00,2020-08-20T05:00:22-04:00,,,"[Discrimination, Black People, Race and Ethnic...","[American Academy of Pediatrics, American Medi...",[],[Southern States (US)],[{'url': 'https://static01.nyt.com/images/2020...,https://nyti.ms/3ghMlzo
1,health,,This Trawler’s Haul: Evidence That Antibodies ...,Three crew members aboard were spared when the...,https://www.nytimes.com/2020/08/19/health/coro...,nyt://article/de93d5ac-7ea4-543d-817d-ba63e0ea...,By Apoorva Mandavilli,Article,2020-08-19T23:45:36-04:00,2020-08-19T15:32:15-04:00,2020-08-19T15:32:15-04:00,,,"[Coronavirus (2019-nCoV), Antibodies, Boats an...","[Hutchinson, Fred, Cancer Research Center, Abb...",[],[Seattle (Wash)],[{'url': 'https://static01.nyt.com/images/2020...,https://nyti.ms/3aFM0W4


# News API

https://newsapi.org/docs/endpoints/everything

> **Observation:** Conduct a(n ESG) supervised sentiment analysis on news articles by defining positive or negative keywords to search for. For example, searching for "Microsoft scandal" through News api mainly yields negative news about the firm. On the other hand, searching for "Microsoft charity" likely returns positive news about the firm. The simple assumption that every search term we define as negative is indeed negative. This defines the labels for a supervised prediction task.

In [685]:
file_api_key = open('news_api_key.txt', 'r') 
api_key = file_api_key.readlines()[0]

url = ('http://newsapi.org/v2/top-headlines?'
       'country=us&'
       'apiKey={}'.format(api_key))

response = requests.get(url)
response.json()

{'status': 'ok',
 'totalResults': 38,
 'articles': [{'source': {'id': 'cnn', 'name': 'CNN'},
   'author': 'Mary Ilyushina, CNN',
   'title': 'Putin critic Navalny hospitalized in Russia after suspected poisoning: spokeswoman - CNN',
   'description': 'Russian opposition leader and outspoken Kremlin critic Alexei Navalny was placed on a ventilator and was unconscious in a hospital in Siberia Thursday after falling ill from suspected poisoning, his spokesperson said.',
   'url': 'https://www.cnn.com/2020/08/20/europe/russia-navalny-hospitalized-intl-hnk/index.html',
   'urlToImage': 'https://cdn.cnn.com/cnnnext/dam/assets/180123115306-alexey-navalny-january-2018-super-tease.jpg',
   'publishedAt': '2020-08-20T06:21:00Z',
   'content': 'Minsk, Belarus (CNN)Russian opposition leader and outspoken Kremlin critic Alexei Navalny was placed on a ventilator and was unconscious in a hospital in Siberia Thursday after falling ill from suspe… [+873 chars]'},
  {'source': {'id': 'cnn', 'name': 'CNN