### Fetch Data from NYT API for 'Road Accidents' from 2012 to 2016

#### Step - 1: Import Modules

In [1]:
import requests
import os
import json
import time
from wordcloud import WordCloud, STOPWORDS
# Custom module for location names
import locations
import string

#### Step 2: Check API key

In [2]:
nyt_api_key = os.getenv('NYT_ACCESS_KEY')
articlesearch_url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
print('Value of the environment variable for api-key: ', nyt_api_key)

Value of the environment variable for api-key:  d49f3c0b89e942cb9f94f89a30e69e7f


#### Step 3: Prepare request parameters

In [3]:
# Request Map for Accidents
reqparams = {'api-key':nyt_api_key, 'q':'road accident','begin_date':'20120101', 'end_date':'20161231', 'page':3}

#### Step 4: Prepare function to save all documents in proper drectory structure for a given json

In [4]:
def save_docs(jsonObj):
    # Save the response documents (first 10) in the 'data/accidents/<locations>' folders
    for doc in jsonObj['response']['docs']:        
        doc_location = None
        # we are only interested in articles that state the location of accident
        for kw in doc['keywords']:
            if not doc_location:                            
                if kw.get('name') == 'glocations':
                    location_keyword = kw.get('value')
                    location_val = None
                    
                    # Check presence of parentheses. For example, Delhi (India). We are interested in Country name only.
                    if '(' in location_keyword:
                        location_val = location_keyword[location_keyword.find("(")+1:location_keyword.find(")")]
                    else:
                        location_val = location_keyword
                    
                    # Now check the value in list of countries, US states and NYC patterns                                   
                    if location_val in locations.countries:
                        doc_location = location_val
                    elif location_val in locations.us_states:
                        doc_location = 'US/' + location_val
                    elif location_val.strip() in ['NYC', 'New York City']:
                        doc_location = 'NYC'
        ## Save doc in doc_location
        if doc_location:
            doc_path = '../Data/Accidents/' + doc_location + '/' + doc['_id'] + '.json'
            # Create corresponding drectory
            dir_name = os.path.dirname(doc_path)
            if not os.path.exists(dir_name):
                os.makedirs(dir_name)
            # Dump the json into file   
            with open(doc_path, 'w') as doc_file:
                json.dump(doc, doc_file)
    # End of doc save

#### Step 5: Send HTTP GET requests to the Article Search API on NYT for the request map above

In [5]:
# Send HTTP request
r = requests.get(articlesearch_url, params = reqparams)
if r.status_code == 200:
    # Parse json response
    jsonObj = json.loads(r.text)
    # get total hits to prepare for dynamically getting remaining docs
    hits = jsonObj['response']['meta']['hits']
    print('Hits count:', hits)
    remaining_pages = int(hits / 10)
    if hits % 10 == 0:
        remaining_pages = remaining_pages - 1
    print('Remaining pages:', remaining_pages)
    
    save_docs(jsonObj)
    print('Done saving the docs for first request.')                        

else:
    print('Failed to get docs for first page')
    print(r.text)    
                        
                            

Hits count: 2579
Remaining pages: 257
Done saving the docs for first request.


#### Step 6: Send requets for remaining pages using 'hits' from 1st request

In [6]:
# Get remaining articles in a loop
for page_number in range(1, remaining_pages + 1):
    if page_number > 120:
        break
    if page_number % 5: 
        time.sleep(2)
    reqparams['page'] = page_number        
    r = requests.get(articlesearch_url, params = reqparams)
    if r.status_code == 200:
        jsonObj = json.loads(r.text)
        save_docs(jsonObj)
    else:
        print('Failed to get docs for page:', page_number)
        print(r.text)
print('Done saving documents for remaining pages')            

Failed to get docs for page: 5
{"message":"API rate limit exceeded"}

Failed to get docs for page: 16

Failed to get docs for page: 20
{"message":"API rate limit exceeded"}

Failed to get docs for page: 50
{"message":"API rate limit exceeded"}

Failed to get docs for page: 55
{"message":"API rate limit exceeded"}

Failed to get docs for page: 60
{"message":"API rate limit exceeded"}

Failed to get docs for page: 65
{"message":"API rate limit exceeded"}

Failed to get docs for page: 80
{"message":"API rate limit exceeded"}

Failed to get docs for page: 90
{"message":"API rate limit exceeded"}

Failed to get docs for page: 120
{"message":"API rate limit exceeded"}

Done saving documents for remaining pages
