# OUTLINE 


- Run job three times per day 
- Gets current date 
- Checks current date folder 
- compares news array to written files
- If news array has items that are not in written files
- Regenerate news array with missing items
- call news api, get news for updated news array
- Save to json  
- Also save all files to csv (can overwrite)


# MODIFICATION 

- Compare dataframes
- no longer reducing news_array
- simply removing anything that is already on file

In [1]:
# IMPORT STATEMENTS 
import json
import os
from datetime import date
from newsapi import NewsApiClient
import pandas as pd

# IMPORT API KEYS
f = open("../keys/api.txt", "r")
keys = f.read()
f.close()
ACCESS_TOKEN = keys


#-------------------------------------------------------------------------------------------
#   PARMS SECTION 
#-------------------------------------------------------------------------------------------

# READING IN 
NewsFileNameArray = []                         # A LIST OF FILENAMES 
data = []                                      # ALL NEWS DATA FROM LOCAL NEWS JSON FILES

# REQUESTING NEW DATA
newsapi = NewsApiClient(api_key=ACCESS_TOKEN)  # INITIALISING newsapi object
news_keyname_array = ['bbc-news', 'abc-news','cnn','fox-news','independent','mirror','metro','daily-mail', 'Theguardian.com' , 'Sky.com', 'the-new-york-times', 'al-jazeera-english', 'reuters', 'the-hill' , 'breitbart-news', 'the-verge', 'the-huffington-post']
news_array = []


#-------------------------------------------------------------------------------------------
#   CREATE NEW FOLDER FOR TODAY IF NOT EXIST
#-------------------------------------------------------------------------------------------
# GET TODAYS DATE 
today = date.today()
print("Today's date:", today)

# CREATE A NEW DIRECTORY FOR TODAY IF NOT EXIST
if os.path.isdir("data/" + str(today)):
    print('Dir Already exists')
else:
    os.mkdir("data/" + str(today))

print('')

#-------------------------------------------------------------------------------------------
#   REQUESTING MORE NEWS 
#-------------------------------------------------------------------------------------------

# Init API and save to news_array
# WRITE TO JSON
print('PULLING NEWS HEADLINES - PLEASE WAIT .... ')
print('')
for item in news_keyname_array:
    print('processing ' + str(item + ' headlines'))
    news_key = item
    json_item = newsapi.get_top_headlines(sources=news_key)
    if json_item['totalResults'] == 0:
        print("Request for the " + str(item) + " news source is empty, skipping")
        print('')
        continue
    news_array.append(json_item)
    print('COMPLETE - appending to array .......')
    print('')


#-------------------------------------------------------------------------------------------
#   BUILDING REPORT 
#-------------------------------------------------------------------------------------------

# BUILD A PANDAS DATA FRAME 
df = pd.DataFrame(columns=['source','author','title','description','url', 'requested_date','publishedAt','content'])


# Iterate through DATA array and write to csv
print('iterating through ')
x = 0 
for news_outlet in range (0, len(news_array)):
    for article_number in range (0, len(news_array[news_outlet]['articles'])):
        source         = news_array[news_outlet]['articles'][article_number]['source']['name']
        author         = news_array[news_outlet]['articles'][article_number]['author']
        title          = news_array[news_outlet]['articles'][article_number]['title']
        description    = news_array[news_outlet]['articles'][article_number]['description']
        url            = news_array[news_outlet]['articles'][article_number]['url']
        publishedAt    = news_array[news_outlet]['articles'][article_number]['publishedAt']
        requested_date = today
        content        = news_array[news_outlet]['articles'][article_number]['content']
        df = df.append([{ 'source': source, 'author': author, 'title': title, 'description': description, 'url':url, 'publishedAt': publishedAt, 'requested_date': requested_date, 'content': content    }])
        x = x + 1 

print('PROCESSING COMPLETE')
print('number of articles processed are : ' + str(x))

# imported is old data
# df is the new data
# combined is merged 



Today's date: 2020-01-03
Dir Already exists

PULLING NEWS HEADLINES - PLEASE WAIT .... 

processing bbc-news headlines
COMPLETE - appending to array .......

processing abc-news headlines
COMPLETE - appending to array .......

processing cnn headlines
COMPLETE - appending to array .......

processing fox-news headlines
COMPLETE - appending to array .......

processing independent headlines
COMPLETE - appending to array .......

processing mirror headlines
Request for the mirror news source is empty, skipping

processing metro headlines
Request for the metro news source is empty, skipping

processing daily-mail headlines
Request for the daily-mail news source is empty, skipping

processing Theguardian.com headlines
Request for the Theguardian.com news source is empty, skipping

processing Sky.com headlines
Request for the Sky.com news source is empty, skipping

processing the-new-york-times headlines
COMPLETE - appending to array .......

processing al-jazeera-english headlines
COMPLETE

In [2]:
df

Unnamed: 0,source,author,title,description,url,requested_date,publishedAt,content
0,BBC News,BBC News,Nobel Prize-winning scientist retracts paper,A Nobel laureate is being praised for retracti...,http://www.bbc.co.uk/news/world-us-canada-5098...,2020-01-03,2020-01-03T21:04:31Z,Image copyrightMillennium PrizeImage caption\r...
0,BBC News,BBC News,Oil prices jump after top Iranian general killed,Analysts warn the move could escalate tensions...,http://www.bbc.co.uk/news/business-50982313,2020-01-03,2020-01-03T19:03:10Z,Image copyrightGetty Images\r\nOil prices have...
0,BBC News,BBC News,Private jet firm: Rogue employee helped Ghosn ...,"""We don't really look at people's faces,"" one ...",http://www.bbc.co.uk/news/world-asia-50987703,2020-01-03,2020-01-03T17:30:49Z,Image copyrightReutersImage caption\r\n Carlos...
0,BBC News,BBC News,'You need to leave': Australians warned over f...,People are told to flee as extreme heat and wi...,http://www.bbc.co.uk/news/world-australia-5098...,2020-01-03,2020-01-03T16:34:33Z,Image copyrightAFPImage caption\r\n More than ...
0,BBC News,BBC News,How strong is Iran's military?,Iran has vowed revenge after the killing of a ...,http://www.bbc.co.uk/news/world-middle-east-50...,2020-01-03,2020-01-03T15:32:18Z,Image copyrightGetty ImagesImage caption\r\n I...
0,BBC News,BBC News,Man 'neutralised' after stabbing in Paris suburb,"Passers-by stabbed in Paris suburb, police say...",http://www.bbc.co.uk/news/world-europe-50986028,2020-01-03,2020-01-03T14:05:17Z,French police have shot dead a man who stabbed...
0,BBC News,BBC News,Deceived women win GirlsDoPorn legal battle,Some models were driven to the verge of suicid...,http://www.bbc.co.uk/news/technology-50982051,2020-01-03,2020-01-03T13:16:20Z,Image copyrightGetty Images\r\nA US judge has ...
0,BBC News,BBC News,Cloud seeding aims to protect flood-hit Jakarta,Planes are firing salt into clouds in a bid to...,http://www.bbc.co.uk/news/world-asia-50981453,2020-01-03,2020-01-03T11:04:54Z,Image copyrightReutersImage caption\r\n The ra...
0,BBC News,BBC News,Tackling the Earth's orbiting space junk,How the Japanese firm Astroscale is set to sta...,http://www.bbc.co.uk/news/business-50976292,2020-01-03,2020-01-03T00:11:51Z,
0,BBC News,BBC News,"Man opens time capsule, 29 years later",Mitch Brogan wrote predictions with his grandf...,http://www.bbc.co.uk/news/world-us-canada-5085...,2020-01-03,2020-01-02T21:24:31Z,Image copyrightCourtesy Mitch Brogan Image cap...


In [3]:
df.duplicated(subset='description')

0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0     True
0     True
0     True
0     True
0     True
0     True
0     True
0     True
0     True
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
     ...  
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
0    False
Length: 109, dtype: bool

AttributeError: 'DataFrame' object has no attribute 'row'