### Assignment 1: Use the newsapi.org and IBM Watson API to extract entities that appear in the news

#### Author: Melody Shi
#### Written on: Sep 13, 2018

In [1]:
import requests
import pandas as pd
import json
import time

### Part 1: Use a Web API to get URLs for news articles

In [2]:
"""
"  Gets a list of urls returned by news API with specified news source
"  @param 'source' A news source to be used for requesting
"  @return A list of top 10 urls
"""

def getNews(source):
    endpoint = "https://newsapi.org/v2/top-headlines"
    url_list = []
    parameters = {
    'sources' : source.strip().replace(' ','-'), #takes care of the white space in a query
    'language' : 'en',
    'apiKey' : 'aea8a8a56de048ecb78948268e10f6bd'  
    }
    try:
        resp = requests.get(endpoint,params = parameters).json()    
        for article in resp['articles']:
            url_list.append(article['url'])
    except Exception as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        print(message)
        print("<----------Returning an empty list---------->")
    return url_list


In [3]:
getNews('bbc-news')
#getNews(' BBC News')  #works for conventional search query as well

['http://www.bbc.co.uk/news/world-australia-45517450',
 'http://www.bbc.co.uk/news/world-us-canada-45516937',
 'http://www.bbc.co.uk/news/world-us-canada-45517406',
 'http://www.bbc.co.uk/news/world-latin-america-45516234',
 'http://www.bbc.co.uk/news/uk-45515013',
 'http://www.bbc.co.uk/news/world-us-canada-45478934',
 'http://www.bbc.co.uk/news/world-asia-45506882',
 'http://www.bbc.co.uk/news/business-45514882',
 'http://www.bbc.co.uk/news/world-europe-45513912',
 'http://www.bbc.co.uk/news/world-us-canada-45511867']

### Part 2: Analyze a news article using IBM Watson Natural Language Understanding API

In [4]:
"""
"  Gets a list of dictionaries representing entities returned by IBM-Watson API with specified url
"  @param 'url' A link to be used for requesting
"  @param 'source' A news source to be used for requesting; defaults to None
"  @param 'limit' An int that specifies the maximum of entities allowed to return; defaults to 10
"  @return A list of dictionaries representing entities
"""

def extractEntities(url, source=None, limit=10):
    list_of_dict = []
    username = '1705c559-5471-464d-8d17-fd449cea6669'
    pwd = 'CifcuEyJ71S1'
    
    endpoint = "https://gateway.watsonplatform.net/natural-language-understanding/api/v1/analyze"
    parameters = {
        'version': '2017-02-27',
    }
    headers = { 
        'Content-Type': 'application/json',
    }
    watson_options = {
      "url": url,
      "features": {
        "entities": {
          "sentiment": True,
          "emotion": True,
          "limit": limit
        }
      }
    }
    resp = requests.post(endpoint, 
                         data=json.dumps(watson_options), 
                         headers=headers, 
                         params=parameters, 
                         auth=(username, pwd) 
                        )
    
    for entity in resp.json()['entities']:        
        entity_dict = {}
        try:
            entity_dict['entity'] = entity['disambiguation']['name']  #IBM API may use dbpedia for name disambiguation
        except:
            entity_dict['entity'] = entity['text'] #if no disambiguation is made, get entity name using 'text' key 
        entity_dict['url'] = url
        if source != None:
            entity_dict['source'] = source #adds a source key only if source is specified
        entity_dict['relevance'] = entity['relevance'] 
        entity_dict['sentiment'] = entity['sentiment']['score']
        list_of_dict.append(entity_dict)
        
    return list_of_dict

In [5]:
url = 'http://www.bbc.co.uk/news/world-australia-45517450'
#extractEntities(url)
extractEntities(url,source='bbc-news') #with source specified

[{'entity': 'Scott Morrison',
  'url': 'http://www.bbc.co.uk/news/world-australia-45517450',
  'source': 'bbc-news',
  'relevance': 0.810038,
  'sentiment': 0.698462},
 {'entity': 'Fatman Scoop',
  'url': 'http://www.bbc.co.uk/news/world-australia-45517450',
  'source': 'bbc-news',
  'relevance': 0.277295,
  'sentiment': 0.0},
 {'entity': 'prime minister',
  'url': 'http://www.bbc.co.uk/news/world-australia-45517450',
  'source': 'bbc-news',
  'relevance': 0.228644,
  'sentiment': 0.0},
 {'entity': 'United States',
  'url': 'http://www.bbc.co.uk/news/world-australia-45517450',
  'source': 'bbc-news',
  'relevance': 0.207513,
  'sentiment': 0.0},
 {'entity': 'hundred dollar',
  'url': 'http://www.bbc.co.uk/news/world-australia-45517450',
  'source': 'bbc-news',
  'relevance': 0.207513,
  'sentiment': 0.0},
 {'entity': 'three weeks',
  'url': 'http://www.bbc.co.uk/news/world-australia-45517450',
  'source': 'bbc-news',
  'relevance': 0.207513,
  'sentiment': 0.0},
 {'entity': 'four hours

### Part 3: Extract the entities that appear in the news and create a dashboard

In [6]:
"""
"  Builds a dataframe aggregating information returned by News API and IBM Watson API
"  @param 'source' A news source to be used for requesting
"  @return A dataframe with desired information
"""

def main(source):
    url_list = getNews(source)
    df = pd.DataFrame({'A' : []}) #creates an empty dataframe
    count = 0  #counts links processed
    for url in url_list:
        count += 1
        try:
            list_of_dict = extractEntities(url, source=source)
            df = pd.DataFrame(list_of_dict).append(df, ignore_index=True, sort=True)
            print("...{:.2f}% done, processing link {}".format((count/len(url))*100,count))
        except Exception as ex:
            template = "An exception of type {0} occurred for"+ url+". Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            print(message)
    return df.drop(columns=['A'])

In [7]:
df_wsj = main('the-wall-street-journal')

...0.93% done, processing link 1
...1.74% done, processing link 2
...2.73% done, processing link 3
...4.08% done, processing link 4
...4.90% done, processing link 5
...4.72% done, processing link 6
...5.88% done, processing link 7
...6.78% done, processing link 8
...7.56% done, processing link 9
...8.20% done, processing link 10


In [8]:
df_ny = main('the-new-york-times')

...1.19% done, processing link 1
...2.11% done, processing link 2
...3.66% done, processing link 3
...4.04% done, processing link 4
...5.38% done, processing link 5
...8.82% done, processing link 6
...10.61% done, processing link 7
...11.27% done, processing link 8
...12.50% done, processing link 9
...11.49% done, processing link 10


### Result

In [9]:
pd.concat([df_wsj,df_ny], ignore_index=True)

Unnamed: 0,entity,relevance,sentiment,source,url
0,Recep Tayyip Erdoğan,0.654717,-0.734879,the-wall-street-journal,https://www.wsj.com/articles/erdogan-moves-to-...
1,17.75%,0.448016,0.000000,the-wall-street-journal,https://www.wsj.com/articles/erdogan-moves-to-...
2,24%,0.448016,0.000000,the-wall-street-journal,https://www.wsj.com/articles/erdogan-moves-to-...
3,Amnesty International,0.814628,-0.426390,the-wall-street-journal,https://www.wsj.com/articles/smartphones-elect...
4,Volkswagen Group,0.792835,-0.426390,the-wall-street-journal,https://www.wsj.com/articles/smartphones-elect...
5,Democratic Republic of the Congo,0.644475,-0.496871,the-wall-street-journal,https://www.wsj.com/articles/smartphones-elect...
6,Congo—Apple Inc.,0.441636,0.000000,the-wall-street-journal,https://www.wsj.com/articles/smartphones-elect...
7,two years,0.441636,0.000000,the-wall-street-journal,https://www.wsj.com/articles/smartphones-elect...
8,General Motors Co.,0.752458,-0.512372,the-wall-street-journal,https://www.wsj.com/articles/gm-recalls-one-mi...
9,GM,0.643666,-0.535937,the-wall-street-journal,https://www.wsj.com/articles/gm-recalls-one-mi...
