In [86]:

import os
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta
import configparser
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [87]:
end = datetime.date(1899, 12, 1) #datetime.date.today()
start = datetime.date(1871, 10, 1) #end - relativedelta(years=1)

In [88]:
months_in_range = [x.split(' ') for x in pd.date_range(start, end, freq='MS').strftime("%Y %-m").tolist()]

In [89]:
print(months_in_range)

[['1871', '10'], ['1871', '11'], ['1871', '12'], ['1872', '1'], ['1872', '2'], ['1872', '3'], ['1872', '4'], ['1872', '5'], ['1872', '6'], ['1872', '7'], ['1872', '8'], ['1872', '9'], ['1872', '10'], ['1872', '11'], ['1872', '12'], ['1873', '1'], ['1873', '2'], ['1873', '3'], ['1873', '4'], ['1873', '5'], ['1873', '6'], ['1873', '7'], ['1873', '8'], ['1873', '9'], ['1873', '10'], ['1873', '11'], ['1873', '12'], ['1874', '1'], ['1874', '2'], ['1874', '3'], ['1874', '4'], ['1874', '5'], ['1874', '6'], ['1874', '7'], ['1874', '8'], ['1874', '9'], ['1874', '10'], ['1874', '11'], ['1874', '12'], ['1875', '1'], ['1875', '2'], ['1875', '3'], ['1875', '4'], ['1875', '5'], ['1875', '6'], ['1875', '7'], ['1875', '8'], ['1875', '9'], ['1875', '10'], ['1875', '11'], ['1875', '12'], ['1876', '1'], ['1876', '2'], ['1876', '3'], ['1876', '4'], ['1876', '5'], ['1876', '6'], ['1876', '7'], ['1876', '8'], ['1876', '9'], ['1876', '10'], ['1876', '11'], ['1876', '12'], ['1877', '1'], ['1877', '2'], ['1877

In [90]:

def send_request(date):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + '/' + date[0] + '/' + date[1] + '.json?api-key=' + 'HejIZPA3o6sinrCI3LBrvGXYUKVviaY8'
    try:
        response = requests.get(url, verify=False).json()
    except Exception:
        return None
    time.sleep(6)
    return response


def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.'''
    is_in_range = date > start and date < end
    has_headline = type(article['headline']) == dict and 'main' in article['headline'].keys()
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.'''
    data = {'headline': [], 
        'abstract': [],
        'snippet': [], 
        'date': [], 
        'doc_type': [],
        'material_type': [],
        'news_desk': [],
        'section_name': [],
        'subsection_name': [],
        'word_count': [],
        'keywords': [],
        'web_url': []}
    
    articles = response['response']['docs'] 
    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if is_valid(article, date):
            data['date'].append(date)
            data['headline'].append(article['headline']['main']) 
            if 'section_name' in article:
                data['section_name'].append(article['section_name'])
            else:
                data['section_name'].append(None)
            if 'subsection_name' in article:
                data['subsection_name'].append(article['subsection_name'])
            else:
                data['subsection_name'].append(None)
            data['doc_type'].append(article['document_type'])
            if 'abstract' in article:
                data['abstract'].append(article['abstract'])
            else:
                data['abstract'].append(None)
            if 'snippet' in article:
                data['snippet'].append(article['snippet'])
            else:
                data['snippet'].append(None)
            if 'type_of_material' in article: 
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            if 'news_desk' in article:
                data['news_desk'].append(article['news_desk'])
            else:
                data['news_desk'].append(None)
            if 'word_count' in article:
                data['word_count'].append(article['word_count'])
            else:
                data['word_count'].append(None)
            if 'web_url' in article:
                data['web_url'].append(article['web_url'])
            else:
                data['web_url'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            data['keywords'].append(keywords)
    return pd.DataFrame(data) 


def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.'''
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('headlines'):
        os.mkdir('headlines')
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        df.to_csv('headlines/' + date[0] + '-' + date[1] + '.csv', index=False)
        print('Saving headlines/' + date[0] + '-' + date[1] + '.csv...')
    print('Number of articles collected: ' + str(total))


In [91]:
get_data(months_in_range)

Date range: ['1871', '10'] to ['1899', '12']
Saving headlines/1871-10.csv...
Saving headlines/1871-11.csv...
Saving headlines/1871-12.csv...
Saving headlines/1872-1.csv...
Saving headlines/1872-2.csv...
Saving headlines/1872-3.csv...
Saving headlines/1872-4.csv...
Saving headlines/1872-5.csv...
Saving headlines/1872-6.csv...
Saving headlines/1872-7.csv...
Saving headlines/1872-8.csv...
Saving headlines/1872-9.csv...
Saving headlines/1872-10.csv...
Saving headlines/1872-11.csv...
Saving headlines/1872-12.csv...
Saving headlines/1873-1.csv...
Saving headlines/1873-2.csv...
Saving headlines/1873-3.csv...
Saving headlines/1873-4.csv...
Saving headlines/1873-5.csv...
Saving headlines/1873-6.csv...
Saving headlines/1873-7.csv...
Saving headlines/1873-8.csv...
Saving headlines/1873-9.csv...
Saving headlines/1873-10.csv...
Saving headlines/1873-11.csv...
Saving headlines/1873-12.csv...
Saving headlines/1874-1.csv...
Saving headlines/1874-2.csv...
Saving headlines/1874-3.csv...
Saving headlines

In [None]:
response

In [None]:
import requests
your_key = 'HejIZPA3o6sinrCI3LBrvGXYUKVviaY8'
url = 'https://api.nytimes.com/svc/archive/v1/2022/11.json?&api-key=' + your_key
r = requests.get(url)
json_data = r.json()

In [None]:
json_data