Author: Authors: Salvatore Giorgi, Daniel Roy Sadek Habib, Douglas Bellew, Garrick Sherman, and Brenda Curtis (modified and commented by Naomi Baes) 

Source: https://osf.io/uya29/ 

Aim: This script is designed to download article metadata from the New York Times Archive API, extract article URLs for a specified range of years and save the URLs into text files organized by year. 
- It provides error handling for rate limiting (HTTP status code 429) and checks for the presence of required environment variables and file paths. It saves URLs after processing them for each year (in case the script is interrupted etc.)
- The script also checks if the output directory exists and creted it if it does not. If the directory already exists, it will simply use that directory

Note: To run this script successfully, you need to get your API key from the NYT Developer site and save it in a file called "nyt_api_key.txt"

In [1]:
# Setup logging configuration
import logging
import time
import os
import requests

logger = logging.getLogger('NYT downloader')
logging.basicConfig(level=logging.INFO)

# Constants
API_KEY_FILE_PATH = 'nyt_api_key.txt'
BASE_URL = 'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={key}'

# Function to read API key from file
def read_api_key_from_file(file_path):
    try:
        with open(file_path, 'r') as file:
            api_key = file.read().strip()
        return api_key
    except FileNotFoundError:
        logger.error(f"API key file '{file_path}' not found.")
        raise
    except Exception as e:
        logger.error(f"Error reading API key file: {e}")
        raise

# Read the API key from the file
try:
    NYT_KEY = read_api_key_from_file(API_KEY_FILE_PATH)
except Exception:
    logger.error("Unable to retrieve API key. Exiting.")
    exit(1)

# Set the API key as an environment variable
os.environ['NYT_KEY'] = NYT_KEY

# fetch_from_api Function: This function constructs the URL for making requests to the New York Times Archive API 
# based on the specified year, month, and API key. It then sends a GET request to the API and handles the case where 
# the API responds with HTTP status code 429 (indicating too many requests) by waiting for 60 seconds before retrying the request.
def fetch_from_api(year, month, key, retry_count=0):
    url = BASE_URL.format(year=year, month=month, key=key)
    logger.info('Fetching {}'.format(url))
    try:
        res = requests.get(url)
        res.raise_for_status()  # Raise an exception for any HTTP error status
        return res.json()
    except requests.HTTPError as e:
        if e.response.status_code == 429:
            if retry_count < MAX_RETRY:
                # Exponential backoff: Wait for an exponentially increasing amount of time before retrying.
                delay = (2 ** retry_count) * RETRY_DELAY
                logger.error('Got HTTP 429: Too many requests. Retrying in {} seconds.'.format(delay))
                time.sleep(delay)
                return fetch_from_api(year, month, key, retry_count + 1)
            else:
                logger.error('Max retry limit reached. Unable to fetch data.')
                raise
        else:
            logger.error('HTTP Error: {}'.format(e))
            raise

# extract_urls Function: This function takes the JSON response from the New York Times API and extracts the web URLs of 
# the articles from it.
def extract_urls(api_results):
    urls = []
    for doc in api_results['response']['docs']:
        url = doc['web_url']
        urls.append(url)
    logger.info('Extracted {} urls'.format(len(urls)))
    return urls

# Save urls to file
def save_urls(urls, save_location, year):
    url_file = os.path.join(save_location, f'urls_{year}.txt')
    logger.info('Saving urls to {}'.format(url_file))
    with open(url_file, 'w') as out:
        for url in urls:
            print(url, file=out)

# main Function: This function is the entry point of the script. It takes parameters for the start and end years, as well as 
# an optional parameter for the save location of the output file. Inside the function, it checks if the environment variable 
# containing the New York Times API key is set. If not, it prints a message prompting the user to set the API key and exits. 
# It also checks if the specified save location already exists, printing a message and exiting if it does.
# Inside the main function's loop over the range of years and months, it calls fetch_from_api to get article metadata for each 
# month and then calls extract_urls to extract the URLs from the metadata. The URLs are then appended to a list.
# After fetching all the URLs for a year, the script saves them to a file named 'urls_{year}.txt' in the specified save location.
def main(start_year, end_year, save_location='nyt'):
    key = os.environ.get('NYT_KEY')
    if key is None:
        logger.error('Please set the "{}" environment variable before running.'.format(NYT_KEY))
        return
    
    if not os.path.exists(save_location):
       os.mkdir(save_location)

    logger.info('Downloading article metadata for {}-{}'.format(start_year, end_year))
    urls = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            if year != start_year or month != 1:
                time.sleep(6)
            try:
                article_metadata = fetch_from_api(year, month, key)
                url_chunk = extract_urls(article_metadata)
                urls += url_chunk
            except Exception as e:
                logger.error('Error processing data for {}/{}: {}'.format(year, month, e))
                continue
        
        # Save urls after processing each year
        save_urls(urls, save_location, year)
        urls = []  # Clear the urls list for the next year

# Define arguments
start_year = 2011
end_year = 2023 # then go until 2023
RETRY_DELAY = 60
MAX_RETRY = 10  # Define the maximum number of retry attempts
save_location = "C:/Users/naomi/OneDrive/COMP80004_PhDResearch/RESEARCH/DATA/CORPORA/MEDIA/NYT/output"

# Call main function
main(start_year, end_year, save_location)

INFO:NYT downloader:Downloading article metadata for 2011-2023
INFO:NYT downloader:Fetching https://api.nytimes.com/svc/archive/v1/2011/1.json?api-key=yHXr5g15kLco2gm87B9GKpQ1USXUQZPO
INFO:NYT downloader:Extracted 8478 urls
INFO:NYT downloader:Fetching https://api.nytimes.com/svc/archive/v1/2011/2.json?api-key=yHXr5g15kLco2gm87B9GKpQ1USXUQZPO
INFO:NYT downloader:Extracted 7917 urls
INFO:NYT downloader:Fetching https://api.nytimes.com/svc/archive/v1/2011/3.json?api-key=yHXr5g15kLco2gm87B9GKpQ1USXUQZPO
INFO:NYT downloader:Extracted 9026 urls
INFO:NYT downloader:Fetching https://api.nytimes.com/svc/archive/v1/2011/4.json?api-key=yHXr5g15kLco2gm87B9GKpQ1USXUQZPO
INFO:NYT downloader:Extracted 8751 urls
INFO:NYT downloader:Fetching https://api.nytimes.com/svc/archive/v1/2011/5.json?api-key=yHXr5g15kLco2gm87B9GKpQ1USXUQZPO
INFO:NYT downloader:Extracted 8732 urls
INFO:NYT downloader:Fetching https://api.nytimes.com/svc/archive/v1/2011/6.json?api-key=yHXr5g15kLco2gm87B9GKpQ1USXUQZPO
INFO:NYT do