This cell defines the base URL for *Chronicling* and downloads the list of 2871 batch names as strings, with examples shown as output.

In [11]:
from bs4 import BeautifulSoup
import requests

BASE_URL = "https://chroniclingamerica.loc.gov/data/batches/"

soup = BeautifulSoup(requests.get(BASE_URL).content, 'html.parser')
batches_all = [batch.find('a').get_text() for batch in soup.find_all('tr')[2:]]
print(f'total batches: {len(batches_all)}\nexamples:')
print(*batches_all[:5], sep='\n')

total batches: 2871
examples:
ak_albatross_ver01/
ak_arcticfox_ver02/
ak_arctictern_ver01/
ak_belugawhale_ver01/
ak_bluewhale_ver01/


This cell samples 7 random batches from the full list of batches for exploratory analysis (seeded for reproducibility).

In [37]:
import random
random.seed(2002)
batches =random.sample(batches_all, 7)
print('selected batches:\n   ', '\n    '.join(batches))

selected batches:
    ak_arcticfox_ver02/
    ohi_lima_ver01/
    me_bangor_ver02/
    vtu_eden_ver01/
    tu_carla_ver01/
    dlc_bravo_ver01/
    in_fairmount_ver02/


This cell creates directories and summary files for selected batches.

In [152]:
import os
import xml.etree.ElementTree as ET
issues = 0

for batch in batches:
    batch_xml = ET.fromstring(requests.get(f'{BASE_URL}{batch}data/batch.xml').content)

    dir = f'./data/{batch}/'
    if not os.path.exists(dir):
        os.mkdir(dir)

    with open(f'{dir}batch_info.csv', 'w') as fp:
        fp.write('lccn, issue_date, edition_order, path, url\n')
        for issue in batch_xml.findall('{http://www.loc.gov/ndnp}issue'):
            lccn, issueDate, editionOrder = issue.attrib["lccn"], issue.attrib["issueDate"], issue.attrib["editionOrder"]
            url = '/'.join((issue.text or "").replace('./', '').split('/')[0:3]) + '/'
            path = f'{lccn}{issueDate}{editionOrder}'
            fp.write(f'{lccn},{issueDate},{editionOrder},{path},{url}\n')

            issues += 1
        fp.write('{\n')

print(f'wrote info for {issues} issues')

wrote info for 8829 issues


This cell estimates the storage requirements of the selected batches.

In [156]:
import csv

def get_gb(size: str):
    number, unit = size.split(' ')
    if unit == "KB":
        return float(number) / 1e6
    elif unit=="MB":
        return float(number) / 1e3
    else:
        raise ValueError

jp2_sizes, pdf_sizes, total_pages = [], [], []
for i, batch in enumerate(batches):
    with open(f'./data/{batch}/batch_info.csv', 'r') as fp:
        issue_jp2_sizes, issue_pdf_sizes, issue_pages = [], [], []

        # sample 5 issues from batch
        issues = list(csv.reader(fp))
        sample_rows = random.sample(issues, 5)
        for row in sample_rows:
            sample_jp2_size, sample_pdf_size, sample_pages = 0, 0, 0

            soup = BeautifulSoup(requests.get(f'{BASE_URL}{batch}data/{row[4]}').content, 'html.parser')
            for row in soup.find_all('tr')[2:]:
                format, size = row.find_all('td')[0].find('a').text.split('.')[1], row.find_all('td')[2].text
                if format == 'jp2':
                    sample_jp2_size += get_gb(size)
                if format == 'pdf':
                    sample_pdf_size += get_gb(size)
                    sample_pages += 1
            
            issue_jp2_sizes.append(sample_jp2_size)
            issue_pdf_sizes.append(sample_pdf_size)
            issue_pages.append(sample_pages)
        
        # estimate total batch size, discarding lowest and highest samples
        avg_jp2_size = (sum(issue_jp2_sizes) - min(issue_jp2_sizes) - max(issue_jp2_sizes)) / 3
        avg_pdf_size = (sum(issue_pdf_sizes) - min(issue_pdf_sizes) - max(issue_pdf_sizes)) / 3
        avg_pages    = (sum(issue_pages) - min(issue_pages) - max(issue_pages)) / 3

        jp2_sizes.append(avg_jp2_size * len(issues))
        pdf_sizes.append(avg_pdf_size * len(issues))
        total_pages.append(avg_pages * len(issues))

print(f'jp2 sizes (GB):', jp2_sizes)
print(f'pdf sizes (GB):', pdf_sizes)
print(f'total pages   :', total_pages)

jp2 sizes (GB): [41.76291999999999, 8.65162, 36.166806666666666, 42.32690000000001, 56.06825, 31.964480000000002, 89.20483]
pdf sizes (GB): [9.88739228, 1.2857075233333333, 12.174497440000003, 6.997127316666667, 8.8028941, 12.832773333333334, 9.504823900000002]
total pages   : [11184.0, 1832.0, 9846.666666666668, 8922.666666666666, 12250.0, 4661.333333333333, 6996.0]
