# Project to automate api access 
+ I added my api key as config.py file  PLease get your api key https://newsapi.org/docs/get-started 

In [1]:
import pandas as pd
import config
import math
from newsapi import NewsApiClient
import newspaper
import requests
from newspaper import fulltext
import time
# Hit Api with credentials
newsapi = NewsApiClient(api_key=config.api_key)


### Grab all sources
+ read through available sources list and make df storing domain and source name
+ I hit a ton of sources below, we can clearly narrow it down 

In [2]:
import csv
sources = newsapi.get_sources()
new_orgs = sources["sources"]
my_sources = {}
for i, x in enumerate(new_orgs):
    my_sources[i] = (x['id'])
domains = sources["sources"]
my_domains = {}
for i, x in enumerate(domains):
    my_domains[i] = (x['url'])
sources = pd.Series(my_sources).to_frame("sources")
domains = pd.Series(my_domains).to_frame("domains")
query_keys_df = domains.join(sources)
(query_keys_df)
#query_keys_df.to_csv('lists.csv')
#3,16,17,24,40,77,124,123,18,117,81

Unnamed: 0,domains,sources
0,https://abcnews.go.com,abc-news
1,http://www.abc.net.au/news,abc-news-au
2,https://www.aftenposten.no,aftenposten
3,http://www.aljazeera.com,al-jazeera-english
4,http://www.ansa.it,ansa
5,http://www.argaam.com,argaam
6,http://arstechnica.com,ars-technica
7,https://arynews.tv/ud/,ary-news
8,https://apnews.com/,associated-press
9,http://www.afr.com,australian-financial-review


### Choosing data sources
+ Lets attempt to grab some sources from different geographic locations as well as different idological perspectives

+ categorizing news sources
    + Traditional TV MSM
        +  http://us.cnn.com   
        +  http://www.cnbc.com 
        +  http://www.foxnews.com  
        +  http://www.msnbc.com  
        +  https://abcnews.go.com  
        +  http://www.nbcnews.com  
    + Traditional publications 
        +  http://www.nytimes.com  
        +  https://www.washingtonpost.com 
        
    + Internet Sources
        +  http://www.huffingtonpost.com 
        +  https://www.politico.com
        +  http://www.breitbart.com 
        +  https://news.google.com 
        +  https://www.buzzfeed.com 
        +  https://news.vice.com  
    + Financial publications
        +  http://www.economist.com
        +  http://www.bloomberg.com 
        +  http://www.businessinsider.com 
        +  http://www.wsj.com
        +  http://fortune.com  
        
    + News aggregators
        +  https://apnews.com/ 
        +  http://www.reuters.com 
    + foreign reporting
         + http://www.aljazeera.com  
         + http://www.bbc.co.uk/news   
         + https://www.jpost.com/  
         + http://timesofindia.indiatimes.com 
         + https://russian.rt.com 
         + https://www.theguardian.com/uk 
         + http://www.independent.co.uk  
         + http://www.telegraph.co.uk  


 
### Build Query String 

In [3]:

## Literally picking data sources from df i printed above 
a= query_keys_df.iloc[[16,17,24],[1]]
b= query_keys_df.iloc[[40,43,77],[1]]
list_sources =a["sources"].tolist()

## build out string for query request 
myString = ",".join(list_sources)

list_sources2 = b["sources"].tolist()
myString2 = ",".join(list_sources2)
myString2

'fox-news,google-news,msnbc'

## Now lets begin process of automating query calls
+ first lets build function to clean query returns

In [4]:
def clean_query(query):
    for x in query['articles']:
        try:
            x["source"] = x["source"]["name"]
        except:
            pass
        try:
            x['publishedAt'] = str.split(x['publishedAt'], "T")[0]
        except:
            pass
        try:
            del x['urlToImage']
        except KeyError:
            pass
    my_df = pd.DataFrame(query["articles"])
    return my_df

## Function to hit the api
+ Originally I had a loop here.  instead I figured I would just build a function that takes start data, end data, query term(candidate)
    + The original code kept giving me a query limit reached result, so I decided to change up strategy and search one day at a time
    + After we hit the papers at the start for past 30 days, we will only need 1 day at a time going forward.
    + I built in some print statements for error handeling, which you will see below in the block after this

In [5]:
import sys
import time
candidates_list=[]
total_count=[]

# Make first call
def hit_api(start,end,q,myString2):
    
    ## catch bug with formatted strings for dates
    if end < 10:
        start_str = "0"+ str(start)
        end_str = "0"+ str(end)
    elif end==10:
        start_str = "0"+ str(start)
        end_str = str(end)
    else :
        start_str = str(start)
        end_str = str(end)
      
    ## API query
    all_articles = newsapi.get_everything(q=q,
                                          sources=myString2,
                                          language='en',
                                          from_param='2019-09-{}'.format(start_str),
                                          to='2019-09-{}'.format(end_str),
                                          sort_by='relevancy',
                                          page_size=100,
                                          page=1)
    
    ## get count
    total_pages = math.ceil(all_articles["totalResults"]/100)
    print("query will return: "+ str(all_articles["totalResults"]))
    
    ## store count to check versus dimension of df later
    total_count.append(all_articles["totalResults"])
    
    ## Clen query 
    all_articles = clean_query(all_articles)
    
    ## append to list
    candidates_list.append(all_articles)
    return(candidates_list)


## Built out a loop 
+ Simple, look at first day of September to last day incrementing start and end by 1 each time



In [6]:
for x in range(20,31):
    df=hit_api(x,x+1,'Biden OR Joe Biden',myString2)
    
## collapse list on itself to build big df
Bernie_df = pd.concat(df)
Bernie_df= Bernie_df.reset_index(drop=True)



query will return: 48
query will return: 36
query will return: 82
query will return: 130
query will return: 145
query will return: 149
query will return: 121
query will return: 72
query will return: 37
query will return: 53
query will return: 650


## Data sanity checks

### Issues
+ The counts don't match up between expected and actual because the query results are limited to first 100 hits
    + Won't be an issue with less publications, but we need to be careful splitting candidate calls up.
+ Not all links are unique
    + This is expected given internet sources reposting articles from other sources

In [7]:
##validate that df shape[0] is equal to expected query count
print("we should expect: {} articles".format(sum(total_count)))
print("we have: {} articles".format(Bernie_df.shape[0]))


## test 100 results theory
over_100 = sum([x-100 for x in total_count if x>100 ])
print("{} results came into query that exceeded 100 hits in a day.  Thus we lose any hit over 100 in a day".format(over_100))


##unique links
unique_array = Bernie_df.url.unique()
print("we have {} unique links".format(unique_array.shape[0]))




we should expect: 1523 articles
we have: 828 articles
695 results came into query that exceeded 100 hits in a day.  Thus we lose any hit over 100 in a day
we have 521 unique links


In [8]:
duplicateRowsDF = Bernie_df[Bernie_df.duplicated("url")]
duplicateRowsDF.shape

(307, 7)

## Display duplicates

In [9]:
Bernie_df.sort_values(by=['url'])
Bernie_df.shape

(828, 7)

### Hitting newspaper 3k Api with links
+ We can deal with duplicates and streamlining the above later
+ to demonstrate a working product I feed our url into newspaper 3k 
+ Id say it takes 5-10 seconds per article to fetch complete text

In [None]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}

counts=0
list_full_text=[]
for link in Bernie_df['url']:
    counts+=1
    print(counts)
    html = requests.get(link,headers=headers).text
    try: text = fulltext(html)
    except: 
        print("no words found")
        text="no words found"
        list_full_text.append(text)
        continue
    list_full_text.append(text)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
no words found
15
16
17
18
19
20
21
22
23
no words found
24
25
26
27
28
29
30
no words found
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
no words found
120
121
122
123
124
125
126
127
128
129
no words found
130
131
132
133
134
no words found
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
no words found
153
no words found
154
155
156
no words found
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
no words found
197
198
no words found
199
200
201
202
203
204
no words found
205
206
207
208
no words found
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
2

Bernie_df["url"]

## Add text to previous dataframe

In [None]:
Bernie_df['full_art']=pd.Series(list_full_text)

In [None]:
Bernie_df[:20]['full_art']

In [None]:
import time
Bernie_df.to_csv('data_'+str(time.time())+'.csv')

## Conclusion
+ we need to functionize streamline and clean up query calls.
+ sorry my python is rusty.

In [None]:
time.time()