# OUTLINE 


- Run the job three times per day
- Get the current date
- Check the folder for the current date
- Compare the news array to the files already written
- If the news array has items that are not in the written files:
- Regenerate the news array with only the missing items
- Call the news API and get news for the updated news array
- Save each response to JSON
- Also save all files to CSV (can overwrite)


In [142]:
# IMPORT STATEMENTS 
import json
import os
from datetime import date
from newsapi import NewsApiClient
import pandas as pd

# IMPORT API KEYS
# Read the API key from the key file. The context manager closes the handle
# even if the read fails, and strip() removes surrounding whitespace (such as
# a trailing newline) so that it does not become part of the key.
with open("../keys/api.txt", "r") as f:
    keys = f.read().strip()
ACCESS_TOKEN = keys


#-------------------------------------------------------------------------------------------
#   PARMS SECTION 
#-------------------------------------------------------------------------------------------

# STATE FILLED WHILE READING LOCAL FILES
data = []               # parsed contents of the local news JSON files
NewsFileNameArray = []  # names recovered from the local JSON file names

# STATE USED WHEN REQUESTING NEW DATA
news_array = []         # responses collected from the API during this run

# Identifiers requested from the API one at a time; each one is also used as
# the base name of the JSON file written for that source.
# NOTE(review): 'Theguardian.com' and 'Sky.com' do not follow the lowercase,
# hyphenated pattern of the other entries - confirm they are valid source ids.
news_keyname_array = [
    'bbc-news',
    'abc-news',
    'cnn',
    'fox-news',
    'independent',
    'mirror',
    'metro',
    'daily-mail',
    'Theguardian.com',
    'Sky.com',
    'the-new-york-times',
    'al-jazeera-english',
]

# Client object used for every API request below
newsapi = NewsApiClient(api_key=ACCESS_TOKEN)


#-------------------------------------------------------------------------------------------
#   READING LOCAL FILES 
#-------------------------------------------------------------------------------------------
# GET TODAYS DATE
today = date.today()
print("Today's date:", today)

# CREATE A NEW DIRECTORY FOR TODAY IF NOT EXIST
# makedirs (rather than mkdir) also creates the parent "data/" folder, so the
# first run in a fresh working directory does not fail with FileNotFoundError.
# exist_ok=True covers the folder appearing between the isdir() check and the
# creation call.
if os.path.isdir("data/" + str(today)):
    print('Dir Already exists')
else:
    os.makedirs("data/" + str(today), exist_ok=True)

print('')

# ITERATE THROUGH DATA FOLDER
# Load every JSON file found in today's folder. The file name without its
# ".json" suffix is recorded in NewsFileNameArray and the parsed content is
# appended to data.
directory = "data/" + str(today)
for filename in os.listdir(directory):
    if not filename.endswith(".json"):
        continue  # anything that is not a JSON file is ignored
    NewsFileNameArray.append(filename[:-len(".json")])
    with open(directory + "/" + filename) as json_file:
        data.append(json.load(json_file))


# Summary of what is already on disk for today
print('The news sources already obtained for this day are: ')
print(NewsFileNameArray)
print('')
print('the number of saved sources are: ')
print(len(data))



#-------------------------------------------------------------------------------------------
#   UPDATING ARRAY
#-------------------------------------------------------------------------------------------


#   ONLY MAKE CALLS FOR NEWS SOURCES WE DON'T ALREADY HAVE
#   (THIS MEANS UPDATING OUR ARRAY TO REMOVE ALREADY EXISTING SAVED NEWS SOURCES)
#
#   A JSON file in today's folder whose name is not in news_keyname_array
#   (for example a source that was dropped from the list) is skipped:
#   list.remove() raises ValueError for a value that is not present, which
#   would otherwise abort the whole run.

for item in NewsFileNameArray:
    if item not in news_keyname_array:
        print('skipping ' + str(item) + ' (not in the list of sources to request)')
        continue
    print('removing ' + str(item))
    news_keyname_array.remove(item)


if not news_keyname_array:
    print('All data obtained for today, exiting.')
    # exit() is a helper installed by the site module for interactive use;
    # raising SystemExit ends the run the same way without relying on it.
    raise SystemExit

print('The sources still required are ' + str(news_keyname_array))
print('')



#-------------------------------------------------------------------------------------------
#   REQUESTING MORE NEWS 
#-------------------------------------------------------------------------------------------

# Init API
# Request the top headlines for every source still in news_keyname_array.
# A non-empty response is appended to news_array and written to
# data/<today>/<source>.json; a response with zero results is skipped.
print('PULLING NEWS HEADLINES - PLEASE WAIT .... ')
print('')
for source_id in news_keyname_array:
    print('processing ' + source_id + ' headlines')
    headlines = newsapi.get_top_headlines(sources=source_id)
    if headlines['totalResults'] != 0:
        news_array.append(headlines)
        print('COMPLETE - writing to file .......')
        print('')
        target_path = "data/" + str(today) + "/" + source_id + ".json"
        with open(target_path, 'w') as out_file:
            json.dump(headlines, out_file)
    else:
        print("Request for the " + source_id + " news source is empty, skipping")
        print('')

        
#-------------------------------------------------------------------------------------------
#   BUILDING REPORT 
#-------------------------------------------------------------------------------------------

# BUILD A PANDAS DATA FRAME
# Empty frame that fixes the column order of the report.
df = pd.DataFrame(columns=['source','author','title','description','url', 'requested_date','publishedAt','content'])

# CLEAR OUT THE ARRAYS AND REPOPULATE THEM FROM THE (NOW UPDATED) JSON FILES
# Both lists are reset before the folder is read again. Appending to the
# existing NewsFileNameArray would list every previously saved source twice.

data = []
NewsFileNameArray = []

# ITERATE THROUGH DATA FOLDER
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# the loaded sources (and therefore of the report rows) reproducible.
directory = "data/" + str(today)
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".json"):
        continue  # skip non-JSON files such as a previously written output.csv
    NewsFileNameArray.append(filename[:-5])
    with open(directory + "/" + filename) as json_file:
        data.append(json.load(json_file))


# Iterate through data array and write to csv
#
# One row is produced per article of every loaded response. The rows are
# collected in a plain list and the frame is built once at the end:
# DataFrame.append copied the whole frame on every call and is no longer
# available in pandas 2.x. Building the frame in one go also gives the rows a
# running index (0..n-1) in the CSV instead of the label 0 on every row.

rows = []
for news_outlet in data:
    for article in news_outlet['articles']:
        rows.append({
            'source': article['source']['name'],
            'author': article['author'],
            'title': article['title'],
            'description': article['description'],
            'url': article['url'],
            'publishedAt': article['publishedAt'],
            'requested_date': today,  # the day the job ran, not the publication day
            'content': article['content'],
        })

# Number of articles processed
x = len(rows)

# Reuse the column order of the empty frame created earlier
df = pd.DataFrame(rows, columns=df.columns)

print('PROCESSING COMPLETE')
print('number of articles processed are : ' + str(x))


df.to_csv("data/" + str(today) + "/" + 'output.csv')

Today's date: 2019-12-31
Dir Already exists

The news sources already obtained for this day are: 
['cnn', 'the-new-york-times', 'al-jazeera-english', 'bbc-news', 'independent', 'fox-news', 'abc-news']

the number of saved sources are: 
7
removing cnn
removing the-new-york-times
removing al-jazeera-english
removing bbc-news
removing independent
removing fox-news
removing abc-news
The sources still required are ['mirror', 'metro', 'daily-mail', 'Theguardian.com', 'Sky.com']

PULLING NEWS HEADLINES - PLEASE WAIT .... 

processing mirror headlines
Request for the mirror news source is empty, skipping

processing metro headlines
Request for the metro news source is empty, skipping

processing daily-mail headlines
Request for the daily-mail news source is empty, skipping

processing Theguardian.com headlines
Request for the Theguardian.com news source is empty, skipping

processing Sky.com headlines
Request for the Sky.com news source is empty, skipping

PROCESSING COMPLETE
number of article

In [145]:
# IMPORT STATEMENTS 
import json
import os
from datetime import date
from newsapi import NewsApiClient
import pandas as pd

# IMPORT API KEYS
# Read the API key from the key file. The context manager closes the handle
# even if the read fails, and strip() removes surrounding whitespace (such as
# a trailing newline) so that it does not become part of the key.
with open("../keys/api.txt", "r") as f:
    keys = f.read().strip()
ACCESS_TOKEN = keys

# Create the client and fetch the sources listing it exposes
newsapi = NewsApiClient(api_key=ACCESS_TOKEN)
data = newsapi.get_sources()

In [159]:
# Print the id of every entry in the sources listing, one per line
for source in data['sources']:
    print(source['id'])

abc-news
abc-news-au
aftenposten
al-jazeera-english
ansa
argaam
ars-technica
ary-news
associated-press
australian-financial-review
axios
bbc-news
bbc-sport
bild
blasting-news-br
bleacher-report
bloomberg
breitbart-news
business-insider
business-insider-uk
buzzfeed
cbc-news
cbs-news
cnbc
cnn
cnn-es
crypto-coins-news
der-tagesspiegel
die-zeit
el-mundo
engadget
entertainment-weekly
espn
espn-cric-info
financial-post
focus
football-italia
fortune
four-four-two
fox-news
fox-sports
globo
google-news
google-news-ar
google-news-au
google-news-br
google-news-ca
google-news-fr
google-news-in
google-news-is
google-news-it
google-news-ru
google-news-sa
google-news-uk
goteborgs-posten
gruenderszene
hacker-news
handelsblatt
ign
il-sole-24-ore
independent
infobae
info-money
la-gaceta
la-nacion
la-repubblica
le-monde
lenta
lequipe
les-echos
liberation
marca
mashable
medical-news-today
msnbc
mtv-news
mtv-news-uk
national-geographic
national-review
nbc-news
news24
new-scientist
news-com-au
newsweek
new-

In [158]:
# Inspect the id of the first entry in the sources listing
data['sources'][0]['id']

'abc-news'