# Project to automate api access 
+ I added my api key as config.py file  PLease get your api key https://newsapi.org/docs/get-started 

In [1]:
import pandas as pd
import config
import math
from newsapi import NewsApiClient
import newspaper
import requests
from newspaper import fulltext

# Hit Api with credentials
newsapi = NewsApiClient(api_key=config.api_key)


### Grab all sources
+ read through available sources list and make df storing domain and source name
+ I hit a ton of sources below, we can clearly narrow it down 

In [2]:

sources = newsapi.get_sources()
new_orgs = sources["sources"]
my_sources = {}
for i, x in enumerate(new_orgs):
    my_sources[i] = (x['id'])
domains = sources["sources"]
my_domains = {}
for i, x in enumerate(domains):
    my_domains[i] = (x['url'])
sources = pd.Series(my_sources).to_frame("sources")
domains = pd.Series(my_domains).to_frame("domains")
query_keys_df = domains.join(sources)
print(query_keys_df)

                                        domains                      sources
0                        https://abcnews.go.com                     abc-news
1                    http://www.abc.net.au/news                  abc-news-au
2                    https://www.aftenposten.no                  aftenposten
3                      http://www.aljazeera.com           al-jazeera-english
4                            http://www.ansa.it                         ansa
5                         http://www.argaam.com                       argaam
6                        http://arstechnica.com                 ars-technica
7                        https://arynews.tv/ud/                     ary-news
8                           https://apnews.com/             associated-press
9                            http://www.afr.com  australian-financial-review
10                        https://www.axios.com                        axios
11                    http://www.bbc.co.uk/news                     bbc-news

### Choosing data sources
+ Lets attempt to grab some sources from different geographic locations as well as different idological perspectives

+ categorizing news sources
    + Traditional TV MSM
        +  http://us.cnn.com   
        +  http://www.cnbc.com 
        +  http://www.foxnews.com  
        +  http://www.msnbc.com  
        +  https://abcnews.go.com  
        +  http://www.nbcnews.com  
    + Traditional publications 
        +  http://www.nytimes.com  
        +  https://www.washingtonpost.com 
        
    + Internet Sources
        +  http://www.huffingtonpost.com 
        +  https://www.politico.com
        +  http://www.breitbart.com 
        +  https://news.google.com 
        +  https://www.buzzfeed.com 
        +  https://news.vice.com  
    + Financial publications
        +  http://www.economist.com
        +  http://www.bloomberg.com 
        +  http://www.businessinsider.com 
        +  http://www.wsj.com
        +  http://fortune.com  
        
    + News aggregators
        +  https://apnews.com/ 
        +  http://www.reuters.com 
    + foreign reporting
         + http://www.aljazeera.com  
         + http://www.bbc.co.uk/news   
         + https://www.jpost.com/  
         + http://timesofindia.indiatimes.com 
         + https://russian.rt.com 
         + https://www.theguardian.com/uk 
         + http://www.independent.co.uk  
         + http://www.telegraph.co.uk  


 
### Build Query String 

In [3]:

## Literally picking data sources from df i printed above 
a= query_keys_df.iloc[[0,3,8,11,16,17,18,20,22,23,24,39,41,44,62,82,83,93,98,99,111,114,117,119,121,124,127,128,132],[1]]
list_sources =a["sources"].tolist()

## build out string for query request 
myString = ",".join(list_sources)
myString

'abc-news,al-jazeera-english,associated-press,bbc-news,bloomberg,breitbart-news,business-insider,buzzfeed,cbs-news,cnbc,cnn,four-four-two,fox-sports,google-news-ar,infobae,nbc-news,news24,polygon,rt,rte,the-hill,the-irish-times,the-new-york-times,the-sport-bible,the-times-of-india,the-washington-post,usa-today,vice-news,xinhua-net'

## Now lets begin process of automating query calls
+ first lets build function to clean query returns

In [4]:
def clean_query(query):
    for x in query['articles']:
        try:
            x["source"] = x["source"]["name"]
        except:
            pass
        try:
            x['publishedAt'] = str.split(x['publishedAt'], "T")[0]
        except:
            pass
        try:
            del x['urlToImage']
        except KeyError:
            pass
    my_df = pd.DataFrame(query["articles"])
    return my_df

## Function to hit the api
+ Originally I had a loop here.  instead I figured I would just build a function that takes start data, end data, query term(candidate)
    + The original code kept giving me a query limit reached result, so I decided to change up strategy and search one day at a time
    + After we hit the papers at the start for past 30 days, we will only need 1 day at a time going forward.
    + I built in some print statements for error handeling, which you will see below in the block after this

In [22]:
import sys
import time
candidates_list=[]
total_count=[]

# Make first call
def hit_api(start,end,q,myString):
    
    ## catch bug with formatted strings for dates
    if end < 10:
        start_str = "0"+ str(start)
        end_str = "0"+ str(end)
    elif end==10:
        start_str = "0"+ str(start)
        end_str = str(end)
    else :
        start_str = str(start)
        end_str = str(end)
      
    ## API query
    all_articles = newsapi.get_everything(q=q,
                                          sources=myString,
                                          domains='https://apnews.com/,http://www.nytimes.com',
                                          language='en',
                                          from_param='2019-09-{}'.format(start_str),
                                          to='2019-09-{}'.format(end_str),
                                          sort_by='relevancy',
                                          page_size=100,
                                          page=1)
    
    ## get count
    total_pages = math.ceil(all_articles["totalResults"]/100)
    print("query will return: "+ str(all_articles["totalResults"]))
    
    ## store count to check versus dimension of df later
    total_count.append(all_articles["totalResults"])
    
    ## Clen query 
    all_articles = clean_query(all_articles)
    
    ## append to list
    candidates_list.append(all_articles)
    return(candidates_list)


## Built out a loop 
+ Simple, look at first day of September to last day incrementing start and end by 1 each time



In [23]:
for x in range(2,31):
    df=hit_api(x,x+1,'Bernie|Sanders',myString)
    
## collapse list on itself to build big df
Bernie_df = pd.concat(df)
Bernie_df= Bernie_df.reset_index(drop=True)



query will return: 48
query will return: 76
query will return: 94
query will return: 84
query will return: 56
query will return: 47
query will return: 60
query will return: 81
query will return: 116
query will return: 151
query will return: 268
query will return: 213
query will return: 64
query will return: 83
query will return: 106
query will return: 127
query will return: 152
query will return: 136
query will return: 86
query will return: 68
query will return: 76
query will return: 103
query will return: 95
query will return: 65
query will return: 57
query will return: 36
query will return: 29
query will return: 59
query will return: 131


## Data sanity checks

### Issues
+ The counts don't match up between expected and actual because the query results are limited to first 100 hits
    + Won't be an issue with less publications, but we need to be careful splitting candidate calls up.
+ Not all links are unique
    + This is expected given internet sources reposting articles from other sources

In [31]:
##validate that df shape[0] is equal to expected query count
print("we should expect: {} articles".format(sum(total_count)))
print("we have: {} articles".format(Bernie_df.shape[0]))


## test 100 results theory
over_100 = sum([x-100 for x in total_count if x>100 ])
print("{} results came into query that exceeded 100 hits in a day.  Thus we lose any hit over 100 in a day".format(over_100))


##unique links
unique_array = Bernie_df.url.unique()
print("we have {} unique links".format(unique_array.shape[0]))




we should expect: 2767 articles
we have: 2264 articles
503 results came into query that exceeded 100 hits in a day.  Thus we lose any hit over 100 in a day
we have 1234 unique links


In [16]:
duplicateRowsDF = Bernie_df[Bernie_df.duplicated("url")]
duplicateRowsDF.shape

(1030, 7)

## Display duplicates

In [17]:
Bernie_df.sort_values(by=['url'])[1:20]


Unnamed: 0,author,content,description,publishedAt,source,title,url
1792,,,Welcome to Majority.FM 's AM QUICKIE! Brought ...,2019-09-24,Google News,"AM QUICKIE: September 24th, 2019 w/ Lucie Stei...",http://feedproxy.google.com/~r/MajorityReport/...
628,,,Welcome to Majority.FM 's AM QUICKIE! Brought ...,2019-09-11,Google News,"AM QUICKIE: September 11th, 2019 w/ Lucie Stei...",http://feedproxy.google.com/~r/MajorityReport/...
1280,,,Welcome to Majority.FM 's AM QUICKIE! Brought ...,2019-09-18,Google News,"AM QUICKIE: September 18th, 2019 w/ Lucie Stei...",http://feedproxy.google.com/~r/MajorityReport/...
1543,,,Welcome to Majority.FM 's AM QUICKIE! Brought ...,2019-09-20,Google News,"AM QUICKIE: September 20th, 2019 w/ Sam Seder ...",http://feedproxy.google.com/~r/MajorityReport/...
1470,,,Welcome to Majority.FM 's AM QUICKIE! Brought ...,2019-09-20,Google News,"AM QUICKIE: September 20th, 2019 w/ Sam Seder ...",http://feedproxy.google.com/~r/MajorityReport/...
1169,,,Welcome to Majority.FM 's AM QUICKIE! Brought ...,2019-09-17,Google News,"AM QUICKIE: September 17th, 2019 w/ Lucie Stei...",http://feedproxy.google.com/~r/MajorityReport/...
1288,,,Welcome to Majority.FM 's AM QUICKIE! Brought ...,2019-09-17,Google News,"AM QUICKIE: September 17th, 2019 w/ Lucie Stei...",http://feedproxy.google.com/~r/MajorityReport/...
1675,"Tom Woods, Tom Woods",,Bernie Sanders is proposing a nationwide progr...,2019-09-24,Google News,Ep. 1498 Against Bernie's National Rent Control,http://feedproxy.google.com/~r/TheTomWoodsShow...
1881,"Tom Woods, Tom Woods",,Bernie Sanders is proposing a nationwide progr...,2019-09-24,Google News,Ep. 1498 Against Bernie's National Rent Control,http://feedproxy.google.com/~r/TheTomWoodsShow...
1775,"Tom Woods, Tom Woods",,Bernie Sanders is proposing a nationwide progr...,2019-09-24,Google News,Ep. 1498 Against Bernie's National Rent Control,http://feedproxy.google.com/~r/TheTomWoodsShow...


### Hitting newspaper 3k Api with links
+ We can deal with duplicates and streamlining the above later
+ to demonstrate a working product I feed our url into newspaper 3k 
+ Id say it takes 5-10 seconds per article to fetch complete text

In [10]:
list_full_text=[]
for link in Bernie_df['url'][0:10]:
    html = requests.get(link).text
    text = fulltext(html)
    list_full_text.append(text)

In [11]:
list_full_text[6]

'Ed O\'Keefe CBS News\n\nEd O\'Keefe is a CBS News political correspondent based in Washington, D.C., covering the 2020 presidential campaign for all CBS News broadcasts and platforms.\n\nIn the early months of the campaign, he has interviewed several Democratic contenders, including Steve Bullock, Pete Buttigieg, Julian Castro, Kirsten Gillibrand, Kamala Harris, Amy Klobuchar, Elizabeth Warren and Bernie Sanders and Republican William Weld.\n\nSince joining CBS News in April 2018, O\'Keefe has contributed to coverage of the 2018 midterm elections, the contentious confirmation hearing for Supreme Court Justice Brett Kavanaugh, the funerals of former president George H.W. Bush and former Arizona Senator John McCain, the record-long federal government shutdown and the blackface scandal and sexual misconduct allegations that rocked Virginia state government in early 2019.\n\nTrending News CBS News Internship Program\n\nBefore joining CBS News, O\'Keefe spent nearly 13 years at The Washing

## Conclusion
+ we need to functionize streamline and clean up query calls.
+ sorry my python is rusty.