In [1]:
from __future__ import print_function

import os
from datetime import datetime

import pandas as pd
import requests
from matplotlib import pyplot as plt
from tqdm import tqdm, tqdm_notebook

In [2]:
def getSources():
    """Return the ids of the English-language sources listed by the News API.

    Issues a GET request to the v1 ``sources`` endpoint, decodes the JSON
    body and collects the ``id`` field of every entry found under its
    ``sources`` key, preserving the order in which they were returned.

    Returns:
        list: one source id per entry in the response.
    """
    endpoint = 'https://newsapi.org/v1/sources?language=en'
    payload = requests.get(endpoint).json()
    return [entry['id'] for entry in payload['sources']]

In [3]:
# Fetch the source ids once, then print how many there are followed by the
# ids themselves as a comma-separated list.
sources = getSources()
print('number of sources :', len(sources))
print('sources :', ', '.join(sources))

number of sources : 52
sources : abc-news-au, al-jazeera-english, ars-technica, associated-press, bbc-news, bbc-sport, bloomberg, breitbart-news, business-insider, business-insider-uk, buzzfeed, cnbc, cnn, engadget, entertainment-weekly, espn, espn-cric-info, financial-times, football-italia, fortune, four-four-two, fox-sports, google-news, hacker-news, ign, independent, mashable, mtv-news, mtv-news-uk, national-geographic, new-scientist, newsweek, new-york-magazine, nfl-news, polygon, recode, reddit-r-all, reuters, talksport, techcrunch, techradar, the-hindu, the-huffington-post, the-lad-bible, the-next-web, the-sport-bible, the-times-of-india, the-verge, the-wall-street-journal, the-washington-post, time, usa-today


In [None]:
f0aa6c641c1e4be4bc589d4cce60a597

In [4]:
def mapping():
    """Build a lookup from source id to source category.

    Queries the v1 ``sources`` endpoint for English-language sources and
    pairs each entry's ``id`` with its ``category``. If an id appears more
    than once, the last occurrence wins.

    Returns:
        dict: ``{source id: category}`` for every entry in the response.
    """
    payload = requests.get('https://newsapi.org/v1/sources?language=en').json()
    return {entry['id']: entry['category'] for entry in payload['sources']}

In [5]:
# Build the source-id -> category lookup and spot-check two of its entries.
m = mapping()
print('category of reuters:', m['reuters'])
print('category of techcrunch:', m['techcrunch'])

category of reuters: general
category of techcrunch: technology


In [6]:
# Load news.csv (the file getDailyNews writes to) into a DataFrame.
news = pd.read_csv("news.csv")

In [10]:
# Distinct values present in the `category` column.
news['category'].unique()

array(['general', 'technology', 'sports', 'business', 'entertainment'],
      dtype=object)

In [8]:
# Preview the first rows of the loaded frame.
news.head()

Unnamed: 0,author,title,description,url,urlToImage,publishedAt,source,category,scraping_date
0,https://www.abc.net.au/news/matthew-doran/5511636,AFP finds no evidence document used by Angus T...,The Australian Federal Police finds no evidenc...,http://www.abc.net.au/news/2020-05-12/no-evide...,https://www.abc.net.au/cm/rimage/11640468-16x9...,2020-05-12T08:23:18Z,abc-news-au,general,2020-05-12 15:28:23.621677
1,https://www.abc.net.au/news/david-lipson/7849048,New York's coronavirus outbreak struck as Visa...,"Visaya Hoffie, battling for her life following...",http://www.abc.net.au/news/2020-05-12/coronavi...,https://www.abc.net.au/cm/rimage/12226050-16x9...,2020-05-12T02:11:52Z,abc-news-au,general,2020-05-12 15:28:23.621677
2,https://www.abc.net.au/news/kath-sullivan/4929...,Red-meat processors have beef sales to China s...,Australia's red meat industry could be the lat...,http://www.abc.net.au/news/rural/2020-05-12/ch...,https://www.abc.net.au/cm/rimage/6900764-16x9-...,2020-05-12T02:19:48Z,abc-news-au,general,2020-05-12 15:28:23.621677
3,https://www.abc.net.au/news/karen-percy/5358026,Bupa aged care ordered to pay $6 million for s...,Aged care provider Bupa is fined $6 million fo...,http://www.abc.net.au/news/2020-05-12/bupa-age...,https://www.abc.net.au/cm/rimage/4451712-16x9-...,2020-05-12T07:42:25Z,abc-news-au,general,2020-05-12 15:28:23.621677
4,https://www.abc.net.au/news/peter-ryan/167104,Alan Jones' retirement will spark mourning or ...,Conservative radio king Alan Jones' influence ...,http://www.abc.net.au/news/2020-05-12/alan-jon...,https://www.abc.net.au/cm/rimage/4290904-16x9-...,2020-05-12T03:33:04Z,abc-news-au,general,2020-05-12 15:28:23.621677


In [None]:
def category(source, m):
    """Return the category recorded for ``source`` in the lookup ``m``.

    Args:
        source: key to look up (a source id).
        m: mapping from source id to category, e.g. the result of
            ``mapping()``.

    Returns:
        ``m[source]`` when the lookup succeeds, otherwise the placeholder
        string ``'NC'``.
    """
    try:
        return m[source]
    # Only lookup failures fall back to 'NC': a missing key (KeyError) or a
    # key/container that cannot be indexed (TypeError, e.g. an unhashable
    # source or m being None). A bare `except:` would also have swallowed
    # KeyboardInterrupt and SystemExit.
    except (KeyError, TypeError):
        return 'NC'

def getDailyNews(api_key=None):
    """Fetch the top articles of every source and merge them into news.csv.

    For each id returned by ``getSources()`` the v1 ``articles`` endpoint is
    queried with ``sortBy=top``. Every article is tagged with its source id,
    rows containing any missing value and exact duplicate rows are dropped,
    and ``category`` / ``scraping_date`` columns are added. The result is
    appended to an existing ``news.csv`` (de-duplicated on ``url``) or
    written as a new file when none exists yet.

    Args:
        api_key: News API key. When omitted, the ``NEWSAPI_KEY`` environment
            variable is used, falling back to the key that was previously
            embedded in this function.

    Returns:
        None. Progress and failures are reported via ``print``; if a response
        has no usable ``articles`` entry the function prints the rate-limit
        message and returns early without touching ``news.csv``.
    """
    sources = getSources()
    # FIXME(security): the literal key below is kept only so existing callers
    # keep working. It is committed in plain text and should be rotated and
    # removed; supply the key via `api_key` or the NEWSAPI_KEY variable.
    key = api_key or os.environ.get('NEWSAPI_KEY') or 'c12a5c07a7bd42edbf54d59aca007a54'
    url = 'https://newsapi.org/v1/articles?source={0}&sortBy={1}&apiKey={2}'
    responses = []
    for source in tqdm_notebook(sources, total=len(sources)):
        # The original wrapped url.format(..., 'top', ...) in try/except with
        # a 'latest' fallback, but str.format cannot fail here, so that
        # branch was unreachable: every request has always used sortBy=top.
        response = requests.get(url.format(source, 'top', key))
        r = response.json()
        try:
            for article in r['articles']:
                article['source'] = source
            responses.append(r)
        except (KeyError, TypeError):
            # No 'articles' entry (or an unexpected payload shape): treated,
            # as before, as the API refusing the request.
            print('Rate limit exceeded ... please wait and retry in 6 hours')
            return None

    # Flatten the per-source article lists. Unlike functools.reduce this
    # needs no extra import and does not raise on an empty list.
    articles = [article for r in responses for article in r['articles']]
    if not articles:
        print('No articles returned; news.csv left unchanged')
        return None

    news = (
        pd.DataFrame(articles)
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )
    d = mapping()
    news['category'] = news['source'].map(lambda s: category(s, d))
    news['scraping_date'] = datetime.now()

    try:
        aux = pd.read_csv('news.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run (or an empty file): there is no history to merge with.
        combined = news
    else:
        # pd.concat replaces DataFrame.append, which no longer exists in
        # pandas >= 2.0. Under the old bare `except:` that AttributeError was
        # swallowed and the history file was overwritten with only the newly
        # scraped rows.
        combined = (
            pd.concat([aux, news])
            .drop_duplicates('url')
            .reset_index(drop=True)
        )
    combined.to_csv('news.csv', index=False, encoding='utf-8')

    print('Done')
    
# Run the scrape when this code is executed as the main module.
if __name__=='__main__':
    getDailyNews()

In [13]:
import functools