# Data Loading
This notebook shows the process of using the Chronicling America API to search for a selection of newspapers and download the XML OCR data.

## Imports

In [4]:
import requests # for retrieving web data
from xml.etree import ElementTree as ET # for parsing XML
from time import sleep # for waiting to avoid rate limitation

## Globals and Config
This cell defines global variables and notebook-wide configurations.

In [5]:
verbose = True # set this flag for detailed output from cells.

DATA_DIR = './data/'   # path to data directory
TXT_PATH = 'pages.txt' # path to text file to hold page IDs

DATA_URL   = 'https://chroniclingamerica.loc.gov/' # path for file downloads
SEARCH_URL = 'https://chroniclingamerica.loc.gov/search/pages/results/' # path for id search

## Finding Pages
This cell uses the [Chronicling America API](https://chroniclingamerica.loc.gov/about/api/) to find relevant pages from the collection. The example below retrieves a JSON summary of the first 1000 results for New York newspapers containing the word "California" between 1900 and 1914 and stores the ID of each page in a text file.

In [26]:
options = {
    "state": "New York",

    "dateFilterType" : "yearRange",
    "date1"          : "1900",
    "date2"          : "1914",

    "ortext"     : "",
    "andtext"    : "California",
    "phrasetext" : ""
}
n_pages = 50 # 1000 items at 20 items per response

base_qstr = '&'.join(f'{key}={value.replace(" ", "+")}' for key, value in options.items()) + '&format=json'

with open(DATA_DIR + TXT_PATH, 'w') as fp:
    for n in range(1, n_pages + 1):
        response = requests.get(f'{SEARCH_URL}?{base_qstr}&page={n}').json()
        if verbose:
            print(f'received response for page {n}, containing items {response["startIndex"]}-{response["startIndex"] + 20}')
        for item in response['items']:
            fp.write(item['id'] + '\n')

received response for page 1, containing items 1-21
received response for page 2, containing items 21-41
received response for page 3, containing items 41-61
received response for page 4, containing items 61-81
received response for page 5, containing items 81-101
received response for page 6, containing items 101-121
received response for page 7, containing items 121-141
received response for page 8, containing items 141-161
received response for page 9, containing items 161-181
received response for page 10, containing items 181-201
received response for page 11, containing items 201-221
received response for page 12, containing items 221-241
received response for page 13, containing items 241-261
received response for page 14, containing items 261-281
received response for page 15, containing items 281-301
received response for page 16, containing items 301-321
received response for page 17, containing items 321-341
received response for page 18, containing items 341-361
received re

## Collecting XML
This cell retrieves OCR XML files from list of page IDs generated above.

In [None]:
XML_DIR = 'xml/'

def download_xml(id: str) -> str | None:
    try:
        response = requests.get(f'{DATA_URL}{id}ocr.xml')
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        if verbose:
            print(f'bad response for {DATA_URL}{id}ocr.xml')
            return None

    xml = ET.ElementTree(ET.fromstring(response.content))
    ET.indent(xml, space="\t", level=0)

    filename = id.replace('/', '')
    with open(f'{DATA_DIR}{XML_DIR}{filename}.xml', 'w'): pass
    xml.write(f'{DATA_DIR}{XML_DIR}{filename}.xml')

    return filename

id_list = []
with open(f'{DATA_DIR}{TXT_PATH}') as fp:
    while(id := fp.readline()):
        id_list.append(id[:-1])
        
for id in id_list:
    download_xml(id)
    sleep(1) ## see https://libraryofcongress.github.io/data-exploration/loc.gov%20JSON%20API/Chronicling_America/README.html#rate-limits-and-definitions

In [33]:
download_xml('/lccn/sn83030272/1914-10-11/ed-1/seq-52/')

bad response for https://chroniclingamerica.loc.gov//lccn/sn83030272/1914-10-11/ed-1/seq-52/ocr.xml
