## Data Collection
Pull data from the NY Times API for each day.

#### 1 Imports
Import the required libraries; `datetime` is used to define the date range to search.  

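Requires a local `secrets.py` file that defines `api_key`, the API key for the New York Times. Note that a local file with this name shadows Python's standard-library `secrets` module.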

In [None]:
import requests
import json
from datetime import timedelta
from datetime import datetime
import uuid
import time

from secrets import api_key

#### 2 Get Data From API
Functions used to request the data from the NY Times API and build the JSON-ready results.

In [None]:
def getArticlesForDate(searchDate):
    ''' Gets the articles for the date provided from the NY Times website
    
    Accepts : searchDate (datetime) date to query for records for
            
    Returns : (array) list of articles, in dictionary, found for that date
                title: (string) title of article; empty when the API omits it
                category: (string) news desk that the article is from; empty when omitted
                id: (string) unique identifier of the news article; created when downloaded
                sourceurl: (string) Url of the news article on the NY Times website
                content: (string) Lead paragraph of the news article; empty when omitted
                imageurl: (string) Url to image from the news article; could be empty
                publishdate: (int) The searched date, yyyyMMdd
                
    Raises  : Exception when the API answers with a non-OK status code;
              errors raised by requests itself (e.g. a timeout) propagate unchanged
    '''
    
    #- Create Parameters
    searchDateString = convertDateToString(searchDate)
    
    # Every article of this request shares the same date, so convert it once
    publishDate = convertDateToInt(searchDate)
    
    filterQuery = f'news_desk:("Business" "Business Day" "Technology" "Personal Tech" "Politics" ) ' \
                f'AND pub_date:({searchDateString})'

    parameters = {
        'fq' : filterQuery,
        'api-key' : api_key
        }
    
    
    #- Query API
    # NOTE(review): no `page` parameter is sent, so presumably only the first
    # page of results is returned for the date; confirm against the API docs
    print(f"Requesting data from API: {searchDateString}")
    
    baseNytUrl = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
    
    # Without a timeout a stalled connection would block the whole download forever
    response = requests.get(baseNytUrl, params=parameters, timeout=30)
    
    
    #- Check Response
    if response.status_code != requests.codes.ok:
        print(f'Unable to get data. Date: {searchDateString} Code: {response.status_code}')
    
        raise Exception("Unable to get data")
    
    
    #- Build Results
    responseJson = response.json()
    
    results = []
    
    for doc in responseJson['response']['docs']:
        
        # Optional fields can be missing or null; fall back to an empty string
        # instead of aborting the whole day with a KeyError
        headline = doc.get('headline') or {}
        
        results.append({
            "title" : headline.get('main') or "",
            "category": doc.get('news_desk') or "",
            "id": str(uuid.uuid4()),
            "sourceurl": doc['web_url'],
            "publishdate": publishDate,
            "imageurl": getArticleImageUrl(doc),
            "content": doc.get('lead_paragraph') or ""
        })
    
    
    return results

In [None]:
def convertDateToString(searchDate):
    ''' Formats the provided date as a "YYYY-MM-DD" string
    
    Accepts : searchDate (datetime) date to format
    
    Returns : (string) the date written as year-month-day, zero padded
    '''
    
    formattedDate = f'{searchDate:%Y-%m-%d}'
    
    return formattedDate

In [None]:
def getArticleImageUrl(doc):
    ''' Gets the url of the first image attached to the article
    
    Only media entries whose type is 'image' are considered; other media
    types are skipped.
    
    Accepts : doc (dictionary) article as returned by the API; its
              'multimedia' entry may be missing, null or empty
    
    Returns : (string) Url to the image; empty when the article has no image
    '''
    
    # `or []` covers both a missing key and an explicit null value
    for media in doc.get("multimedia") or []:
        
        if media.get('type') == 'image':
            # The API url is a path, so prefix it with the static host
            return f'https://static01.nyt.com/{media["url"]}'
            
    return ""

In [None]:
def convertDateToInt(searchDate):
    ''' Packs the provided date into a single int laid out as "YYYYMMDD"
    
    Accepts : searchDate (datetime) date to convert to int
    
    Returns : (int) year, month and day combined, e.g. 20180103
    '''

    yearPart = searchDate.year * 10000
    monthPart = searchDate.month * 100
    
    return yearPart + monthPart + searchDate.day

In [None]:
def getArticlesForDataRange(startSearchDate, endSearchDate, requestDelaySeconds=6):
    ''' Gets articles between the two dates provided, one API request per day
    
    Accepts : startSearchDate (datetime) date when to start the search
              endSearchDate (datetime) date when to stop the search; this date is included
              requestDelaySeconds (number) pause after each request, in seconds;
                  defaults to 6 because the API is limited to 10 calls per minute
    
    Returns : (array) list of articles, in dictionary, found for the date range
                title: (string) title of article
                category: (string) news desk that the article is from
                id: (string) unique identifier of the news article; created when downloaded
                sourceurl: (string) Url of the news article on the NY Times website
                content: (string) Text of the news article
                imageurl: (string) Url to image from the news article; could be empty
                publishdate: (int) Date when the article was published, yyyyMMdd
                
    Raises  : Exception when startSearchDate is after endSearchDate
    '''
    
    #- Verify Dates
    if startSearchDate > endSearchDate:
        raise Exception("Invalid dates")
    
    
    #- Get Articles for Each Date
    allResults = []
    processSearchDate = startSearchDate
    
    # The dates were verified above, so the first day is always processed
    while processSearchDate <= endSearchDate:
        
        # extend() appends in place; rebuilding the list with `+` would
        # recopy every article collected so far on each day
        allResults.extend(getArticlesForDate(processSearchDate))
        
        
        #- Sleep
        # Kept after every request (including the last) so that back-to-back
        # calls of this function still respect the API rate limit
        time.sleep(requestDelaySeconds)
        
        
        #- Prepare Next Date
        processSearchDate = processSearchDate + timedelta(days=1)
    
    
    return allResults

#### 3 Save Results
Saves the results to a JSON file on disk.

In [None]:
def saveResultsToJson(searchResults, filePath='articles.json'):
    ''' Stores the results to disk as a JSON file
    
    The articles are wrapped in a container that also records the date the
    file was created.
    
    Accepts : searchResults (array) list of articles, in dictionary, found for search period
                title: (string) title of article
                category: (string) news desk that the article is from
                id: (string) unique identifier of the news article; created when downloaded
                sourceurl: (string) Url of the news article on the NY Times website
                content: (string) Text of the news article
                imageurl: (string) Url to image from the news article; could be empty
                publishdate: (int) Date when the article was published, yyyyMMdd
              filePath (string) where to write the file; defaults to 'articles.json',
                which resolves against the current working directory
    
    Returns : None
    '''
    
    #- Create Container
    container = {
        "createdate" : convertDateToInt(datetime.now()),
        "articles" : searchResults
        }
    
    
    #- Json
    # Explicit encoding keeps the output independent of the platform default;
    # an existing file at filePath is overwritten
    with open(filePath, 'w', encoding='utf-8') as fp:
        json.dump(container, fp)
    
    
    

#### 4 Start Search
Provide the start and end dates of the range to search; both dates are included.

In [None]:
#- Set Date Range
# `datetime` is already the class here (from datetime import datetime), so it
# is called directly; `datetime.datetime(...)` would raise AttributeError
startSearchDate = datetime(2018, 1, 1)
endSearchDate = datetime(2018, 1, 3)


#- Get Articles
# One request per day; the end date is included in the search
searchResults = getArticlesForDataRange(startSearchDate, endSearchDate)

print(f"--> Complete getting articles. Total Articles: {len(searchResults)}")


#- Save To Disk
saveResultsToJson(searchResults)


print("--> Completed saving to disk")
