# Data Loading
This notebook shows the process of using the Chronicling America API to search for a selection of newspapers and download the XML OCR data.

## Imports

In [1]:
import requests # for retrieving web data
from xml.etree import ElementTree as ET # for parsing XML
import os # for file handling

import time # for respecting rate limits

import time
from threading import Lock
from concurrent import futures 

## Globals and Config
This cell defines global variables and notebook-wide configurations.

In [2]:
verbose = True # set this flag for detailed output from cells.

DATA_DIR = './data/'   # path to data directory
TXT_PATH = 'pages.txt' # path to text file to hold page IDs
XML_DIR = 'xml/'       # path to directory within data directory to store XML files

DATA_URL   = 'https://chroniclingamerica.loc.gov/' # path for file downloads
SEARCH_URL = 'https://chroniclingamerica.loc.gov/search/pages/results/' # path for id search

## Finding Pages
This cell uses the [Chronicling America API](https://chroniclingamerica.loc.gov/about/api/) to find relevant pages from the collection. The example below retrieves a JSON summary of the first 1000 results for New York newspapers containing the word "California" between 1900 and 1914 and stores the ID of each page in a text file.

In [3]:
options = {
    "state": "New York",

    "dateFilterType" : "yearRange",
    "date1"          : "1900",
    "date2"          : "1914",

    "ortext"     : "",
    "andtext"    : "California",
    "phrasetext" : ""
}
n_pages = 50 # 1000 items at 20 items per response

base_qstr = '&'.join(f'{key}={value.replace(" ", "+")}' for key, value in options.items()) + '&format=json'

with open(DATA_DIR + TXT_PATH, 'w') as fp:
    for n in range(1, n_pages + 1):
        response = requests.get(f'{SEARCH_URL}?{base_qstr}&page={n}').json()
        if verbose:
            print(f'received response for page {n}, containing items {response["startIndex"]}-{response["startIndex"] + 20}')
        for item in response['items']:
            fp.write(item["id"] + '\n')

received response for page 1, containing items 1-21
received response for page 2, containing items 21-41
received response for page 3, containing items 41-61
received response for page 4, containing items 61-81
received response for page 5, containing items 81-101
received response for page 6, containing items 101-121
received response for page 7, containing items 121-141
received response for page 8, containing items 141-161
received response for page 9, containing items 161-181
received response for page 10, containing items 181-201
received response for page 11, containing items 201-221
received response for page 12, containing items 221-241
received response for page 13, containing items 241-261
received response for page 14, containing items 261-281
received response for page 15, containing items 281-301
received response for page 16, containing items 301-321
received response for page 17, containing items 321-341
received response for page 18, containing items 341-361
received re

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

## Collecting XML
This cell retrieves OCR XML files from list of page IDs generated above.

In [4]:
SUCCESS, SKIP, FAIL = 0, 1, 2

BURST_WINDOW, BURST_MAX = 90, 20
CRAWL_WINDOW, CRAWL_MAX = 15, 20

def download_many_xml(ids: list[str], max_workers=4) -> int:
    burst_times: list[float] = []
    crawl_times: list[float] = []
    burst_lock = Lock()
    crawl_lock = Lock()

    def record():
        """Record timestamp when request is made."""
        nonlocal burst_times, crawl_times
        with burst_lock: burst_times.append(time.time())
        with crawl_lock: crawl_times.append(time.time())
    
    def check_limits() -> tuple[float, float]:
        """Check rate limits and return required wait time if any."""
        nonlocal burst_times, crawl_times

        # remove old timestamps
        now = time.time()
        burst_times = [t for t in burst_times if now - t < BURST_WINDOW]
        crawl_times = [t for t in crawl_times if now - t < BURST_WINDOW]

        # check burst limit
        burst_wait = float(0)
        with burst_lock:
            now = time.time() # need to refresh since it might take time to get the lock
            if len(burst_times) > BURST_MAX:
                burst_wait = max(0, burst_times[0] + BURST_WINDOW - now)
        
        crawl_wait = float(0)
        with crawl_lock:
            now = time.time() # need to refresh since it might take time to get the lock
            if len(burst_times) > BURST_MAX:
                burst_wait = max(0, burst_times[0] + BURST_WINDOW - now)

        return (burst_wait, crawl_wait)
    
    start = time.time()
    if verbose:
        print(f'Downloading xml for {len(ids)} page IDs')

    def download_single(id: str):
        if os.path.exists(f'{DATA_DIR}{XML_DIR}{id.replace("/", "")}.xml'):
            return SKIP
    
        wait_time = max(check_limits())
        if wait_time > 0:
            if verbose:
                print(f'Rate limit reached; waiting {wait_time:.2f} seconds')
            time.sleep(wait_time)

        try:
            record()
            response = requests.get(f'{DATA_URL}{id}ocr.xml')
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            if response.status_code == 429:
                print(f'ERROR: Rate limit exceeded for {DATA_URL}{id}ocr.xml')
            else:
                print(f'bad response for {DATA_URL}{id}ocr.xml: HTTP code', response.status_code)
            return FAIL
        
        xml = ET.ElementTree(ET.fromstring(response.content))
        ET.indent(xml, space="\t", level=0)

        filename = id.replace('/', '')
        with open(f'{DATA_DIR}{XML_DIR}{filename}.xml', 'w'): pass
        xml.write(f'{DATA_DIR}{XML_DIR}{filename}.xml')

        return SUCCESS
    
    with futures.ThreadPoolExecutor(max_workers) as exec:
        outcomes = list(exec.map(download_single, ids))

    if verbose:
        print(
            f'Processed {len(ids)} ids in {time.time() - start:.2f} seconds:',
            f'{outcomes.count(SUCCESS)} downloaded,', 
            f'{outcomes.count(SKIP)} already present,', 
            f'{outcomes.count(FAIL)} failed'
        )
    return outcomes.count(SUCCESS) + outcomes.count(FAIL)

id_list: list[str] = []
with open(f'{DATA_DIR}{TXT_PATH}', 'r') as fp:
    while(id := fp.readline()):
        id_list.append(id[:-1])
        
n_success = download_many_xml(id_list)

Downloading xml for 980 page IDs
Rate limit reached; waiting 80.19 seconds
Rate limit reached; waiting 79.92 seconds
Rate limit reached; waiting 79.73 seconds
Rate limit reached; waiting 79.57 seconds
Rate limit reached; waiting 1.92 seconds
Rate limit reached; waiting 1.85 seconds
Rate limit reached; waiting 0.47 seconds
Rate limit reached; waiting 0.23 seconds
Rate limit reached; waiting 76.55 seconds
Rate limit reached; waiting 76.52 seconds
Rate limit reached; waiting 76.02 seconds
Rate limit reached; waiting 1.76 seconds
Rate limit reached; waiting 0.60 seconds
Rate limit reached; waiting 0.44 seconds
bad response for https://chroniclingamerica.loc.gov//lccn/sn83030214/1910-12-25/ed-1/seq-30/ocr.xml: HTTP code 520
Rate limit reached; waiting 68.43 seconds
Rate limit reached; waiting 67.62 seconds
Rate limit reached; waiting 67.15 seconds
Rate limit reached; waiting 67.11 seconds
Rate limit reached; waiting 1.51 seconds
Rate limit reached; waiting 1.26 seconds
Rate limit reached; w

In [7]:
response = requests.get("https://chroniclingamerica.loc.gov//lccn/sn83030193/1914-11-19/ed-1/seq-4/ocr.xml")

In [8]:
for key, value in response.headers.items():
    print(key, value, sep=": ")

Date: Fri, 14 Feb 2025 02:41:13 GMT
Content-Type: text/xml
Transfer-Encoding: chunked
Connection: keep-alive
access-control-allow-origin: *
referrer-policy: no-referrer-when-downgrade
strict-transport-security: max-age=3600; preload
x-content-type-options: nosniff
last-modified: Fri, 20 May 2011 23:41:17 GMT
Cache-Control: max-age=31536000
expires: Sat, 14 Feb 2026 02:00:30 GMT
content-disposition: inline; filename=service-ndnp-nn-batch_nn_janet_ver01-data-sn83030193-00280765983-1914111901-0460.xml
x-app-cache-tag: storage-services, storage-services/service
CF-Cache-Status: HIT
Age: 2443
Vary: Accept-Encoding
Server: cloudflare
CF-RAY: 9119b2a8dfa27bf5-LAX
Content-Encoding: gzip
