# Build master JSON files

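The first code cell aggregates the per-item `metadata.json` and `ocr_text.json` files found under `output/items/*/` into two master files, `output/metadata.json` and `output/ocr_text.json`, each keyed by item directory name. Items that lack either file are skipped for that master file and reported in the cell output.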

In [1]:
from __future__ import annotations

import json
from pathlib import Path
from typing import Any


def _read_json(path: Path) -> Any:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


# Per-item folders are read from items_root; the aggregated master files go in output_root.
items_root = Path("output/items")
output_root = Path("output")

# One entry per immediate sub-directory of items_root, in sorted path order.
item_dirs = sorted(filter(Path.is_dir, items_root.glob("*/")))

# Master mappings: item directory name -> parsed JSON content of that item's file.
master_metadata: dict[str, Any] = {}
master_ocr: dict[str, Any] = {}

# Names of item directories that lack the corresponding per-item file.
missing_metadata: list[str] = []
missing_ocr: list[str] = []

for item_dir in item_dirs:
    item_id = item_dir.name
    metadata_path = item_dir / "metadata.json"
    ocr_path = item_dir / "ocr_text.json"

    # A missing file is recorded and skipped rather than treated as an error.
    if not metadata_path.exists():
        missing_metadata.append(item_id)
    else:
        master_metadata[item_id] = _read_json(metadata_path)

    if not ocr_path.exists():
        missing_ocr.append(item_id)
    else:
        master_ocr[item_id] = _read_json(ocr_path)

# Serialise each master mapping as indented UTF-8 JSON, leaving non-ASCII characters unescaped.
for master_name, master_payload in (
    ("metadata.json", master_metadata),
    ("ocr_text.json", master_ocr),
):
    (output_root / master_name).write_text(
        json.dumps(master_payload, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )

# Summary of what was found and written.
print(f"Found {len(item_dirs)} item directories under {items_root}")
print(f"Wrote output/metadata.json with {len(master_metadata)} items")
print(f"Wrote output/ocr_text.json with {len(master_ocr)} items")

# Only the first ten missing ids are listed to keep the cell output short.
if missing_metadata:
    print(f"Missing metadata.json for {len(missing_metadata)} items (first 10): {missing_metadata[:10]}")
if missing_ocr:
    print(f"Missing ocr_text.json for {len(missing_ocr)} items (first 10): {missing_ocr[:10]}")


Found 722 item directories under output/items
Wrote output/metadata.json with 722 items
Wrote output/ocr_text.json with 722 items


In [1]:
from __future__ import annotations

import json
from pathlib import Path

import pandas as pd

# Load the aggregated master metadata mapping (item id -> per-item payload).
metadata_path = Path("output/metadata.json")
with metadata_path.open("r", encoding="utf-8") as fh:
    metadata_raw = json.load(fh)

# One flat record per item: a dict payload is merged into the record (a payload key
# named "item_id" would take precedence over the mapping key); any other payload
# is kept under "value".
rows = [
    {"item_id": item_id, **payload}
    if isinstance(payload, dict)
    else {"item_id": item_id, "value": payload}
    for item_id, payload in metadata_raw.items()
]

# Nested dicts are expanded into dot-separated column names.
metadata_df = pd.json_normalize(rows, sep=".")

print(metadata_df.shape)
metadata_df.head()

(722, 17)


Unnamed: 0,item_id,item_url,title,fields.Long Island Black and White Aerial Photographs Collection,fields.Author,fields.Creator,fields.Date,fields.Coverage,fields.Source,fields.Format,fields.Description,links.native,links.medium,links.thumbnail,fields.Relation,fields.Subject,fields.Language
0,1,https://commons.library.stonybrook.edu/long-is...,Academic Commons,1_61 Author [unknown] Preview Creator [unknown...,[unknown],[unknown],1938,1938 Shinnecock aerial Route 32,Unknown compilation source | 1 | 480754 | 0.46 MB,image/tif,1938 aerial photo of West Shinnecock Bay and R...,https://commons.library.stonybrook.edu/context...,https://commons.library.stonybrook.edu/long-is...,https://commons.library.stonybrook.edu/long-is...,,,
1,2,https://commons.library.stonybrook.edu/long-is...,Academic Commons,1_63 Author [unknown] Preview Creator [unknown...,[unknown],[unknown],1938,1938 Shinnecock aerial Shinnecock Bay,Unknown compilation source | 1 | 480754 | 0.46 MB,image/tif,"1938 aerial photo of East Shinnecock Bay, Far ...",https://commons.library.stonybrook.edu/context...,https://commons.library.stonybrook.edu/long-is...,https://commons.library.stonybrook.edu/long-is...,,,
2,3,https://commons.library.stonybrook.edu/long-is...,Academic Commons,1_65 Author [unknown] Preview Creator [unknown...,[unknown],[unknown],1938,1938 Shinnecock aerial Taylor Creek,Unknown compilation source | 1 | 480754 | 0.46 MB,image/tif,1938 aerial photo of Taylor Creek,https://commons.library.stonybrook.edu/context...,https://commons.library.stonybrook.edu/long-is...,https://commons.library.stonybrook.edu/long-is...,,,
3,4,https://commons.library.stonybrook.edu/long-is...,Academic Commons,1_67 Author [unknown] Preview Creator [unknown...,[unknown],[unknown],1938,1938 Shinnecock aerial Wickapogue,Unknown compilation source | 1 | 480754 | 0.46 MB,image/tif,"1938 aerial photo of Agawam Lake, Old Town Pon...",https://commons.library.stonybrook.edu/context...,https://commons.library.stonybrook.edu/long-is...,https://commons.library.stonybrook.edu/long-is...,,,
4,5,https://commons.library.stonybrook.edu/long-is...,Academic Commons,59-61_66-67.60x Author [unknown] Preview Creat...,[unknown],[unknown],1947,1947 Aerial Nassau Bayville,Unknown compilation source | 1 | 13617520 | 12...,image/jpeg,"1947 aerial image of Bayville, Beaver Lake, an...",https://commons.library.stonybrook.edu/context...,https://commons.library.stonybrook.edu/long-is...,https://commons.library.stonybrook.edu/long-is...,,,


In [2]:
# Number of items per distinct value of the "fields.Date" column, most frequent first.
metadata_df["fields.Date"].value_counts()

fields.Date
1938-07-07    375
1930          157
1938-08-03     80
1970-05-30     50
1947-09-23     35
1947-09-01     17
1938            4
1947            2
1970            2
Name: count, dtype: int64

In [3]:
# Number of items per distinct value of the "fields.Coverage" column, most frequent first.
metadata_df["fields.Coverage"].value_counts()

fields.Coverage
Suffolk County, 1930                                                                        146
Nassau County, 1938 | AAA-1:20000                                                            80
Suffolk County, 1970 | SCS-1:50000                                                           50
Suffolk County, 1947 | PMA-1:20000                                                           35
Nassau County, 1947 | PMA-1:20000                                                            17
                                                                                           ... 
Shinnecock Inlet | Suffolk County, 1938 | AAA-1:20000                                         1
Mattituck / Mattituck Hill / Southampton / Southold | Suffolk County, 1938 | AAA-1:20000      1
Soutampton/ Sag Harbor | Suffolk County, 1938 | AAA-1:20000                                   1
Central Islip / Patchogue / Port Jefferson | Suffolk County, 1938 | AAA-1:20000               1
1938 Shinnecock aerial R

In [5]:
# Show the description and coverage columns side by side for comparison.
metadata_df[["fields.Description", "fields.Coverage"]]

Unnamed: 0,fields.Description,fields.Coverage
0,1938 aerial photo of West Shinnecock Bay and R...,1938 Shinnecock aerial Route 32
1,"1938 aerial photo of East Shinnecock Bay, Far ...",1938 Shinnecock aerial Shinnecock Bay
2,1938 aerial photo of Taylor Creek,1938 Shinnecock aerial Taylor Creek
3,"1938 aerial photo of Agawam Lake, Old Town Pon...",1938 Shinnecock aerial Wickapogue
4,"1947 aerial image of Bayville, Beaver Lake, an...",1947 Aerial Nassau Bayville
...,...,...
717,"Aerial(s) Local Government Zoning,24cm X 24cm","Suffolk County, 1947 | PMA-1:20000"
718,"Aerial(s) Local Government Zoning,24cm X 24cm","Suffolk County, 1947 | PMA-1:20000"
719,"Aerial(s) Local Government Zoning,24cm X 24cm","Suffolk County, 1947 | PMA-1:20000"
720,"Aerial(s) Local Government Zoning,24cm X 24cm","Suffolk County, 1947 | PMA-1:20000"
