In [14]:
import requests
import app
import config
import pandas as pd
import datetime as dt
import time
import os

### Goal:

My aim with this notebook is to visit the ["Most Popular" books](https://www.gutenberg.org/ebooks/search/?sort_order=downloads) on Project Gutenberg, page through the result listing, and ultimately retrieve the top 250 books.

I will first need to retrieve each book's unique identifier (e.g. Frankenstein's is 84). Then, I will use those identifiers to build a URL for each book that points to its full text as a plain-text file.

I will then use my punctuation tracker application to count the words and the punctuation marks in each book. I will add each of these data points as its own column and save everything as a final DataFrame that can then be plotted and explored.

In [8]:
# Search term interpolated into the article-search URL further down.
search_query = 'biology'
# One "YYYYMMDD" string per month start ('MS') between the two endpoints.
dates = [
    month_start.strftime("%Y%m%d")
    for month_start in pd.date_range(start='1922-04-21', end='2022-04-21', freq='MS')
]

In [10]:
def send_request(dates):
    '''Sends a request to the NYT Archive API for given date.

    Parameters
    ----------
    dates : sequence or str
        Either a ``(year, month)`` pair (strings or ints) or a single
        ``"YYYYMMDD"`` string such as ``strftime("%Y%m%d")`` produces.

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    ValueError
        If ``dates`` is a string that is not exactly eight digits.
    requests.HTTPError
        If the server answers with an error status code.
    '''
    if isinstance(dates, str):
        # Indexing a "YYYYMMDD" string with [0]/[1] would give single
        # characters, so slice the year and month out explicitly.
        if len(dates) != 8 or not dates.isdigit():
            raise ValueError('expected a (year, month) pair or a "YYYYMMDD" string, got ' + repr(dates))
        year, month = dates[:4], str(int(dates[4:6]))
    else:
        year, month = str(dates[0]), str(dates[1])

    # No trailing slash here, so the joined path does not contain "//".
    base_url = 'https://api.nytimes.com/svc/archive/v1'
    url = base_url + '/' + year + '/' + month + '.json?api-key=' + config.api_key
    response = requests.get(url, timeout=60)
    # Surface HTTP errors here instead of returning an error payload that
    # would only fail later when it is parsed.
    response.raise_for_status()
    payload = response.json()
    # Pause after every call; presumably to stay under the API's rate limit.
    time.sleep(6)
    return payload

In [15]:
def is_valid(article, date=None, start=None, end=None):
    '''An article is only worth checking if it is in range, and has a headline.

    Parameters
    ----------
    article : dict
        A single article record.
    date : optional
        Publication date of the article. Only consulted when ``start``
        and/or ``end`` is supplied.
    start, end : optional
        Inclusive lower/upper bounds for ``date``. A bound left as ``None``
        is not enforced, so calling ``is_valid(article)`` or
        ``is_valid(article, date)`` checks the headline only.

    Returns
    -------
    bool
        ``True`` when ``article['headline']`` is a dict containing ``'main'``
        and ``date`` lies within any bounds that were given.
    '''
    # .get() so a record without a 'headline' key is rejected, not a KeyError.
    headline = article.get('headline')
    if not (isinstance(headline, dict) and 'main' in headline):
        return False
    if date is None:
        return True
    if start is not None and date < start:
        return False
    if end is not None and date > end:
        return False
    return True

In [12]:
def parse_response(response):
    '''Parses and returns response as pandas data frame.

    Parameters
    ----------
    response : dict
        Decoded JSON; articles are read from ``response['response']['docs']``.

    Returns
    -------
    pandas.DataFrame
        One row per article accepted by ``is_valid``, with the columns
        ``headline``, ``date``, ``doc_type``, ``material_type``, ``section``
        and ``keywords`` (the values of keywords whose name is ``'subject'``).
        Optional fields that are absent from an article are stored as ``None``.

    Raises
    ------
    KeyError
        If ``response`` lacks ``['response']['docs']`` or an accepted article
        has no ``'pub_date'``.
    '''
    columns = ['headline', 'date', 'doc_type', 'material_type', 'section', 'keywords']
    rows = []
    for article in response['response']['docs']:
        # is_valid is defined with a single parameter, so pass the article only.
        if not is_valid(article):
            continue
        # pandas is already imported by the notebook; dateutil is not.
        published = pd.Timestamp(article['pub_date']).date()
        subjects = [
            keyword['value']
            for keyword in (article.get('keywords') or [])
            if keyword['name'] == 'subject'
        ]
        rows.append({
            'headline': article['headline']['main'],
            'date': published,
            'doc_type': article.get('document_type'),
            'material_type': article.get('type_of_material'),
            # Look up the same key that is read ('section_name'); testing for
            # a different key made this column None whenever it was absent.
            'section': article.get('section_name'),
            'keywords': subjects,
        })
    # Explicit columns keep the schema stable even when no article qualifies.
    return pd.DataFrame(rows, columns=columns)

In [13]:
def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.

    For every entry one request is sent via ``send_request``, the response is
    converted with ``parse_response`` and the resulting frame is written to
    ``headlines/<year>-<month>.csv``. Progress and the total number of rows
    are printed.

    Parameters
    ----------
    dates : sequence
        Each entry is either a ``(year, month)`` pair (strings or ints) or a
        ``"YYYYMMDD"`` string such as ``strftime("%Y%m%d")`` produces.

    Raises
    ------
    ValueError
        If a string entry is not exactly eight digits.
    '''
    if len(dates) == 0:
        # Nothing to fetch; avoid the IndexError from dates[0] below.
        print('Number of articles collected: 0')
        return
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    os.makedirs('headlines', exist_ok=True)
    for date in dates:
        if isinstance(date, str):
            # date[0]/date[1] on a "YYYYMMDD" string are single characters,
            # which would give every month of a century the same file name.
            if len(date) != 8 or not date.isdigit():
                raise ValueError('expected a (year, month) pair or a "YYYYMMDD" string, got ' + repr(date))
            year, month = date[:4], str(int(date[4:6]))
        else:
            year, month = str(date[0]), str(date[1])
        path = 'headlines/' + year + '-' + month + '.csv'
        # Always hand send_request a [year, month] pair of strings.
        df = parse_response(send_request([year, month]))
        total += len(df)
        print('Saving ' + path + '...')
        df.to_csv(path, index=False)
    print('Number of articles collected: ' + str(total))

In [None]:
# Article-search request URL built from the search term, a date window and the
# API key. HTTPS is used so the key in the query string is not sent in clear text.
# NOTE(review): begin_date and end_date are not assigned in any cell visible
# here; confirm they are defined before this cell runs.
url = f"https://api.nytimes.com/svc/search/v2/articlesearch.json?q={search_query}&begin_date={begin_date}&end_date={end_date}&api-key={config.api_key}"