# Data Loading
This notebook shows the process of using the Chronicling America API to search for a selection of newspapers and download the XML OCR data.

## Imports

In [24]:
import requests # for retrieving web data
from xml.etree import ElementTree as ET # for parsing XML
import os # for file handling

import time # for respecting rate limits

import time
from threading import Lock
from concurrent import futures 

## Globals and Config
This cell defines global variables and notebook-wide configurations.

In [25]:
verbose = True # set this flag for detailed output from cells.

DATA_DIR = './data/'   # path to data directory
TXT_PATH = 'pages.txt' # path to text file to hold page IDs
XML_DIR = 'xml/'       # path to directory within data directory to store XML files

DATA_URL   = 'https://chroniclingamerica.loc.gov/' # path for file downloads
SEARCH_URL = 'https://chroniclingamerica.loc.gov/search/pages/results/' # path for id search

## Finding Pages
This cell uses the [Chronicling America API](https://chroniclingamerica.loc.gov/about/api/) to find relevant pages from the collection. The example below retrieves a JSON summary of the first 1000 results for New York newspapers containing the word "California" between 1900 and 1914 and stores the ID of each page in a text file.

In [26]:
options = {
    "state": "New York",

    "dateFilterType" : "yearRange",
    "date1"          : "1900",
    "date2"          : "1914",

    "ortext"     : "",
    "andtext"    : "California",
    "phrasetext" : ""
}
n_pages = 50 # 1000 items at 20 items per response

base_qstr = '&'.join(f'{key}={value.replace(" ", "+")}' for key, value in options.items()) + '&format=json'

with open(DATA_DIR + TXT_PATH, 'w') as fp:
    for n in range(1, n_pages + 1):
        response = requests.get(f'{SEARCH_URL}?{base_qstr}&page={n}').json()
        if verbose:
            print(f'received response for page {n}, containing items {response["startIndex"]}-{response["startIndex"] + 20}')
        for item in response['items']:
            fp.write(item['id'] + '\n')

received response for page 1, containing items 1-21
received response for page 2, containing items 21-41
received response for page 3, containing items 41-61
received response for page 4, containing items 61-81
received response for page 5, containing items 81-101
received response for page 6, containing items 101-121
received response for page 7, containing items 121-141
received response for page 8, containing items 141-161
received response for page 9, containing items 161-181
received response for page 10, containing items 181-201
received response for page 11, containing items 201-221
received response for page 12, containing items 221-241
received response for page 13, containing items 241-261
received response for page 14, containing items 261-281
received response for page 15, containing items 281-301
received response for page 16, containing items 301-321
received response for page 17, containing items 321-341
received response for page 18, containing items 341-361
received re

## Collecting XML
This cell retrieves OCR XML files from list of page IDs generated above.

In [27]:
SUCCESS, SKIP, FAIL = 0, 1, 2

BURST_WINDOW, BURST_MAX = 60, 20
CRAWL_WINDOW, CRAWL_MAX = 10, 20

def download_many_xml(ids: list[str], max_workers=4) -> int:
    burst_times: list[float] = []
    crawl_times: list[float] = []
    burst_lock = Lock()
    crawl_lock = Lock()

    def record():
        """Record timestamp when request is made."""
        nonlocal burst_times, crawl_times
        with burst_lock: burst_times.append(time.time())
        with crawl_lock: crawl_times.append(time.time())
    
    def check_limits() -> tuple[float, float]:
        """Check rate limits and return required wait time if any."""
        nonlocal burst_times, crawl_times

        # remove old timestamps
        now = time.time()
        burst_times = [t for t in burst_times if now - t < BURST_WINDOW]
        crawl_times = [t for t in crawl_times if now - t < BURST_WINDOW]

        # check burst limit
        burst_wait = float(0)
        with burst_lock:
            now = time.time() # need to refresh since it might take time to get the lock
            if len(burst_times) > BURST_MAX:
                burst_wait = max(0, burst_times[0] + BURST_WINDOW - now)
        
        crawl_wait = float(0)
        with crawl_lock:
            now = time.time() # need to refresh since it might take time to get the lock
            if len(burst_times) > BURST_MAX:
                burst_wait = max(0, burst_times[0] + BURST_WINDOW - now)

        return (burst_wait, crawl_wait)
    
    start = time.time()
    if verbose:
        print(f'Downloading xml for {len(ids)} page IDs')

    def download_single(id: str):
        if os.path.exists(f'{DATA_DIR}{XML_DIR}{id.replace("/", "")}.xml'):
            return SKIP
    
        wait_time = max(check_limits())
        if wait_time > 0:
            if verbose:
                print(f'Rate limit reached; waiting {wait_time:.2f} seconds')
            time.sleep(wait_time)

        try:
            record()
            response = requests.get(f'{DATA_URL}{id}ocr.xml')
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            if response.status_code == 429:
                print(f'ERROR: Rate limit exceeded')
            else:
                print(f'bad response for {DATA_URL}{id}ocr.xml: HTTP code', response.status_code)
            return FAIL
        
        xml = ET.ElementTree(ET.fromstring(response.content))
        ET.indent(xml, space="\t", level=0)

        filename = id.replace('/', '')
        with open(f'{DATA_DIR}{XML_DIR}{filename}.xml', 'w'): pass
        xml.write(f'{DATA_DIR}{XML_DIR}{filename}.xml')

        return SUCCESS
    
    with futures.ThreadPoolExecutor(max_workers) as exec:
        outcomes = list(exec.map(download_single, ids))

    if verbose:
        print(
            f'Processed {len(ids)} ids in {time.time() - start:.2f} seconds:',
            f'{outcomes.count(SUCCESS)} downloaded,', 
            f'{outcomes.count(SKIP)} already present,', 
            f'{outcomes.count(FAIL)} failed'
        )
    return outcomes.count(SUCCESS) + outcomes.count(FAIL)

id_list: list[str] = []
with open(f'{DATA_DIR}{TXT_PATH}', 'r') as fp:
    while(id := fp.readline()):
        id_list.append(id[:-1])
        
n_success = download_many_xml(id_list)

Downloading xml for 1000 page IDs
Rate limit reached; waiting 32.83 seconds
Rate limit reached; waiting 30.97 seconds
Rate limit reached; waiting 30.59 seconds
Rate limit reached; waiting 30.44 seconds
Rate limit reached; waiting 0.92 seconds
Rate limit reached; waiting 0.74 seconds
Rate limit reached; waiting 0.70 seconds
Rate limit reached; waiting 0.02 seconds
Rate limit reached; waiting 1.30 seconds
Rate limit reached; waiting 1.24 seconds
Rate limit reached; waiting 0.43 seconds
Rate limit reached; waiting 2.21 seconds
Rate limit reached; waiting 2.15 seconds
Rate limit reached; waiting 1.85 seconds
Rate limit reached; waiting 0.01 seconds
Rate limit reached; waiting 4.42 seconds
Rate limit reached; waiting 4.09 seconds
Rate limit reached; waiting 4.07 seconds
Rate limit reached; waiting 3.68 seconds
Rate limit reached; waiting 34.91 seconds
Rate limit reached; waiting 34.89 seconds
Rate limit reached; waiting 34.87 seconds
Rate limit reached; waiting 32.53 seconds
Rate limit reac

In [1]:
import matplotlib
matplotlib.__version__

'3.7.2'