In [1]:
#!/usr/bin/env python3
from __future__ import annotations

import json
from collections import Counter
from pathlib import Path
from typing import Counter as CounterType, Mapping


def collect_paragraph_types(root: Path) -> CounterType[str]:
    """Return counts of every paragraph 'type' in scraper JSON files under root.

    Every ``*.json`` file below ``root`` is decoded as UTF-8 JSON. A file whose
    top level is not an object, or whose ``"paragraphs"`` value is not a list,
    contributes nothing. A paragraph without a ``"type"`` key is tallied under
    ``"<missing>"``.

    Args:
        root: Directory that contains the scraped en-json tree.
    Returns:
        Counter mapping paragraph type -> occurrence count.
    Example:
        >>> collect_paragraph_types(Path("external/python-scripture-scraper/_output/en-json"))  # doctest: +SKIP
    """
    counts: CounterType[str] = Counter()
    for path in root.rglob("*.json"):
        # Decode explicitly as UTF-8 so the tally does not depend on the
        # platform's locale encoding.
        data = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            # Not a chapter object (e.g. a top-level JSON array): nothing to count.
            continue
        paragraphs = data.get("paragraphs", [])
        if not isinstance(paragraphs, list):
            continue
        counts.update(p.get("type", "<missing>") for p in paragraphs)
    return counts


if __name__ == "__main__":
    # Tally paragraph types across the scraped tree and list them from most
    # to least frequent, one "type count" row per line.
    scrape_root = Path("external/python-scripture-scraper/_output/en-json")
    tally = collect_paragraph_types(scrape_root)
    for kind, total in tally.most_common():
        print(f"{kind:20s} {total}")

verse                42513
study-paragraph      1907
chapter-title        1584
study-footnotes      1579
paragraph            244
section-title        166
book-title           92
book-subtitle        10
chapter-subtitle     8
image                3


In [2]:
#!/usr/bin/env python3
from __future__ import annotations

import json
from pathlib import Path
from typing import Iterable


# Paragraph types treated as verse content: they delimit the verse span of a
# chapter and are never reported as lacking a category.
ALLOWED_VERSE_TYPES = {"verse", "study-footnotes"}


def chapters_with_intermingled(root: Path) -> Iterable[tuple[Path, list[str]]]:
    """Yield chapters whose non-verse paragraphs still lack paragraphCategory.

    For each ``*.json`` file under ``root`` the verse span runs from the first
    to the last paragraph whose type is in ``ALLOWED_VERSE_TYPES``. Paragraphs
    inside that span that have another type (or no type) and a missing/empty
    ``paragraphCategory`` are offenders. Chapters with fewer than two verse
    paragraphs have no span and are skipped, as are files whose top level is
    not an object or whose ``"paragraphs"`` value is not a list.

    Args:
        root: Directory that contains the scraped en-json tree.

    Yields:
        ``(path, offender_types)`` per offending chapter, where
        ``offender_types`` is the sorted list of distinct offending paragraph
        types (``"<missing>"`` for paragraphs without a ``"type"`` key).
        Chapters are visited in sorted path order.
    """

    # Sort the paths so the report comes out in the same order on every run.
    for path in sorted(root.rglob("*.json")):
        # Decode explicitly as UTF-8 rather than the locale default.
        data = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            continue
        paragraphs = data.get("paragraphs", [])
        if not isinstance(paragraphs, list):
            continue
        verse_idxs = [
            i
            for i, p in enumerate(paragraphs)
            if p.get("type", "<missing>") in ALLOWED_VERSE_TYPES
        ]
        if len(verse_idxs) < 2:
            continue
        # verse_idxs is ascending, so its two ends bound the verse span.
        span_paragraphs = paragraphs[verse_idxs[0] : verse_idxs[-1] + 1]
        missing_categories = sorted(
            {
                p.get("type", "<missing>")
                for p in span_paragraphs
                if p.get("type") not in ALLOWED_VERSE_TYPES
                and not p.get("paragraphCategory")
            }
        )
        if missing_categories:
            yield path, missing_categories


if __name__ == "__main__":
    # Report each offending chapter as "<path relative to the tree>: <types>".
    scrape_root = Path("external/python-scripture-scraper/_output/en-json")
    for chapter_file, bad_types in chapters_with_intermingled(scrape_root):
        shown_path = chapter_file.relative_to(scrape_root)
        joined_types = ", ".join(bad_types)
        print(f"{shown_path}: {joined_types}")

In [None]:
#!/usr/bin/env python3
from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path
from typing import DefaultDict, Dict, Iterable, Set, Tuple

# Paragraph types that paragraph_missing_category never flags, whether or not
# they carry a paragraphCategory.
ALLOWED_TYPES: Set[str] = {
    "verse",
    "chapter-title",
    "study-paragraph",
    "study-footnotes",
    "book-title",
    "book-subtitle",
}


def paragraph_missing_category(paragraph: dict) -> bool:
    """Tell whether a paragraph should be flagged as lacking a category.

    Args:
        paragraph: One paragraph record; a falsy value (e.g. ``{}``) is never
            flagged.

    Returns:
        True when the paragraph's ``"type"`` (``"<missing>"`` if absent) is not
        in ``ALLOWED_TYPES`` and its ``"paragraphCategory"`` is absent or falsy;
        False otherwise.
    """

    if paragraph and paragraph.get("type", "<missing>") not in ALLOWED_TYPES:
        # Unclassified type: flag it only if no category has been assigned.
        return not paragraph.get("paragraphCategory")
    return False


def chapter_number(data: Dict, path: Path) -> str:
    """Pick the chapter label for a chapter file.

    Args:
        data: Parsed chapter JSON; its ``"number"`` value is used when truthy.
        path: Location of the chapter file, used as the fallback.

    Returns:
        ``str(data["number"])`` when that value is truthy; otherwise the part
        of the filename stem after its last ``"-"`` (the whole stem when it
        contains no ``"-"``).
    """

    explicit = data.get("number")
    # rpartition leaves the whole stem in slot 2 when there is no "-" at all.
    return str(explicit) if explicit else path.stem.rpartition("-")[2]


def find_outlier_chapters(root: Path) -> DefaultDict[Tuple[str, str], Set[str]]:
    """Map (para_type, book) -> set of chapter numbers lacking paragraphCategory.

    Every ``*.json`` file under ``root`` is decoded as UTF-8 JSON. The book is
    the name of the file's parent directory and the chapter label comes from
    ``chapter_number``. Files whose top level is not an object, or whose
    ``"paragraphs"`` value is not a list, are skipped.

    Args:
        root: Directory that contains the scraped en-json tree.

    Returns:
        defaultdict keyed by ``(paragraph type, book)`` whose values are the
        chapter labels containing at least one paragraph of that type for
        which ``paragraph_missing_category`` is true. Paragraphs without a
        ``"type"`` key are recorded under ``"<missing>"``.
    """

    outliers: DefaultDict[Tuple[str, str], Set[str]] = defaultdict(set)
    for path in root.rglob("*.json"):
        # Decode explicitly as UTF-8 rather than the locale default.
        data = json.loads(path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            # chapter_number() and .get() below both need a JSON object.
            continue
        paragraphs = data.get("paragraphs")
        if not isinstance(paragraphs, list):
            continue
        book = path.parent.name
        chap = chapter_number(data, path)
        for p in paragraphs:
            if paragraph_missing_category(p):
                # Set semantics collapse repeated types within one chapter.
                outliers[(p.get("type", "<missing>"), book)].add(chap)
    return outliers


def print_pivot(outliers: DefaultDict[Tuple[str, str], Set[str]]) -> None:
    """Write the outlier map to stdout as comma-separated rows.

    A header row is printed first, then one row per ``(para_type, book)`` key
    in sorted key order. Chapter labels are space-joined and ordered by length
    first, then lexicographically, so ``"9"`` sorts before ``"10"``.

    Args:
        outliers: Mapping of ``(para_type, book)`` to chapter labels.
    """

    print("para_type,book,chapters,n_chapters")
    for key in sorted(outliers):
        ptype, book = key
        ordered = sorted(outliers[key], key=lambda chap: (len(chap), chap))
        print(f"{ptype},{book},{' '.join(ordered)},{len(ordered)}")


def main() -> None:
    """Report which paragraph types still lack a category, per book.

    Scans the scraper's en-json output with ``find_outlier_chapters`` and
    prints the result with ``print_pivot``.
    """

    print_pivot(
        find_outlier_chapters(Path("external/python-scripture-scraper/_output/en-json"))
    )


# Run the report when this cell/script is executed directly.
if __name__ == "__main__":
    main()


para_type,book,chapters,n_chapters
chapter-subtitle,moses,1 2 3 4 5 6 7 8,8
image,abraham,fac-1 fac-2 fac-3,3
paragraph,1-nephi,1,1
paragraph,2-nephi,1,1
paragraph,3-nephi,1 11,2
paragraph,4-nephi,1,1
paragraph,abraham,fac-1 fac-2 fac-3,3
paragraph,alma,1 5 7 9 17 21 36 38 39 45,10
paragraph,helaman,1 7 13,3
paragraph,jacob,1,1
paragraph,moroni,9,1
paragraph,mosiah,9 23,2
paragraph,psalms,3 4 5 6 7 8 9 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 34 35 36 37 38 39 40 41 42 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 92 98 100 101 102 103 108 109 110 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 138 139 140 141 142 143 144 145,116
section-title,jst-1-chronicles,21,1
section-title,jst-1-corinthians,7 15,2
section-title,jst-1-john,2 3 4,3
section-title,jst-1-peter,3,1
section-title,jst-1-samuel,16,1
section-title,jst-1-thessalonians,4,1
section-title,jst-1-timot

In [4]:
"""Utilities to tally HTML tags and classes in scraped chapter content.

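Run directly to print counts for the default data directory:
    uv run python scripts/html_inventory.py

NOTE: ``main()`` is called with its defaults at the bottom of this cell and
nothing here parses the command line, so a ``--root`` flag has no effect; call
``main(root=...)`` to scan a different directory.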
"""

from __future__ import annotations

from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Tuple

import json
from bs4 import BeautifulSoup


# NOTE: dataclass(slots=True) requires Python 3.10 or newer.
@dataclass(slots=True)
class HtmlInventory:
    """Aggregated counts of HTML tags and CSS classes.

    This is the result container returned by ``collect_html_inventory``; the
    three counters are independent tallies over the same set of elements.

    Attributes:
        tag_counts: occurrences keyed by tag name.
        class_counts: occurrences keyed by individual class name; an element
            carrying several classes counts once for each of them.
        tag_class_counts: occurrences keyed by (tag, class) pairs.

    Example:
        >>> inv = HtmlInventory(Counter({"p": 2}), Counter({"verse": 1}), Counter({("p", "verse"): 1}))
        >>> inv.tag_counts["p"]
        2
    """

    # Field order is the positional constructor order; see the Example above.
    tag_counts: Counter[str]
    class_counts: Counter[str]
    tag_class_counts: Counter[Tuple[str, str]]


def collect_html_inventory(root: Path) -> HtmlInventory:
    """Tally HTML tags and classes found in chapter JSON files under ``root``.

    Each ``*.json`` file is read as UTF-8. For every paragraph with a truthy
    ``contentHtml`` value, that markup is parsed with BeautifulSoup's
    ``html.parser`` and every element in it is counted. Files whose top level
    is not an object contribute nothing.

    Args:
        root: Directory containing chapter JSON files (split by chapter).

    Returns:
        HtmlInventory with counters populated from all ``contentHtml`` values.

    Raises:
        AssertionError: If ``root`` does not exist.
    """

    assert root.exists(), f"Missing data directory: {root}"
    by_tag: Counter[str] = Counter()
    by_class: Counter[str] = Counter()
    by_pair: Counter[Tuple[str, str]] = Counter()

    for chapter_file in root.rglob("*.json"):
        with chapter_file.open("r", encoding="utf-8") as handle:
            chapter = json.load(handle)
        if not isinstance(chapter, dict):
            continue
        for entry in chapter.get("paragraphs", []):
            markup = entry.get("contentHtml")
            if markup:
                # find_all(True) matches every tag in the parsed fragment.
                for node in BeautifulSoup(markup, "html.parser").find_all(True):
                    tag_name = node.name
                    by_tag[tag_name] += 1
                    for css_class in node.get("class", []):
                        by_class[css_class] += 1
                        by_pair[(tag_name, css_class)] += 1

    return HtmlInventory(
        tag_counts=by_tag,
        class_counts=by_class,
        tag_class_counts=by_pair,
    )


def _top(counter: Counter, limit: int = 25) -> list[tuple[object, int]]:
    """Return the most common items up to ``limit`` entries.

    Args:
        counter: Counter to rank.
        limit: Maximum number of entries to return.

    Returns:
        The ``(key, count)`` pairs produced by ``counter.most_common(limit)``,
        highest count first.
    """

    return counter.most_common(limit)


def main(
    root: str = "external/python-scripture-scraper/_output/en-json",
    limit: int = 25,
    pair_limit: int = 100,
) -> None:
    """Print frequency tables of tags, classes and tag/class pairs.

    Each table starts with a blank line and a heading showing how many rows
    follow, then lists one indented ``key: count`` row per entry.

    Args:
        root: Base directory containing chapter JSON files.
        limit: Number of tag and class rows to display.
        pair_limit: Number of tag/class pair rows to display.
    """

    inventory = collect_html_inventory(Path(root))

    # (heading, counter, row cap) for each table, in print order.
    tables = (
        ("Tags", inventory.tag_counts, limit),
        ("Classes", inventory.class_counts, limit),
        ("Tag/Class pairs", inventory.tag_class_counts, pair_limit),
    )
    for heading, counter, cap in tables:
        rows = _top(counter, cap)
        print(f"\n{heading} (top {len(rows)}):")
        for key, count in rows:
            print(f"  {key}: {count}")


# Print the tables for the default data directory when run directly.
if __name__ == "__main__":
    main()


Tags (top 11):
  a: 117393
  li: 72807
  sup: 47192
  span: 28505
  ul: 25615
  small: 24189
  em: 1788
  strong: 143
  i: 88
  br: 39
  img: 3

Classes (top 6):
  scripture-ref: 70193
  footnote-link: 47200
  marker: 47192
  clarity-word: 21579
  small-caps: 6895
  uppercase: 31

Tag/Class pairs (top 6):
  ('a', 'scripture-ref'): 70193
  ('a', 'footnote-link'): 47200
  ('sup', 'marker'): 47192
  ('span', 'clarity-word'): 21579
  ('span', 'small-caps'): 6895
  ('span', 'uppercase'): 31


In [None]:
#!/usr/bin/env python3
from __future__ import annotations

import json
import random
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import DefaultDict, List, Tuple


def summarize_word_seq(text: str, limit: int = 10) -> str:
    """Return the first ``limit`` whitespace-separated tokens of ``text``.

    Args:
        text: Text to excerpt.
        limit: Upper bound used to slice the token list.

    Returns:
        The selected tokens joined by single spaces, or ``"(empty)"`` when no
        token is selected.
    """
    leading = re.findall(r"\S+", text)[:limit]
    if not leading:
        return "(empty)"
    return " ".join(leading)


def collect_paragraph_metadata(root: Path) -> Tuple[Counter[str], DefaultDict[str, List[Tuple[str, str, str]]]]:
    """Tally paragraphs by ``paragraphCategory`` and keep a preview of each one.

    Every ``*.json`` file under ``root`` is decoded as UTF-8 JSON and visited
    in sorted path order, so the candidate lists are built in the same order
    on every run. Files whose top level is not an object, or whose
    ``"paragraphs"`` value is not a list, are skipped.

    Args:
        root: Directory that contains the scraped en-json tree.

    Returns:
        ``(counter, candidates)``. ``counter`` maps each category
        (``"<missing>"`` when absent or falsy) to its paragraph count.
        ``candidates`` maps the same keys to ``(ref_label, paragraph_id,
        excerpt)`` tuples, where ``ref_label`` is the file's ``"abbrev"``,
        else its ``"name"``, else its path relative to ``root``.
    """
    counter: Counter[str] = Counter()
    candidates: DefaultDict[str, List[Tuple[str, str, str]]] = defaultdict(list)

    for json_path in sorted(root.rglob("*.json")):
        # Decode explicitly as UTF-8 rather than the locale default.
        data = json.loads(json_path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            continue
        paragraphs = data.get("paragraphs", [])
        if not isinstance(paragraphs, list):
            continue
        ref_label = (
            data.get("abbrev")
            or data.get("name")
            or str(json_path.relative_to(root))
        )
        for paragraph in paragraphs:
            category = paragraph.get("paragraphCategory") or "<missing>"
            counter[category] += 1
            # ``or ""`` also covers an explicit null "content" value, which
            # the regex in summarize_word_seq cannot accept.
            excerpt = summarize_word_seq(paragraph.get("content") or "")
            candidates[category].append(
                (ref_label, paragraph.get("id", "<no-id>"), excerpt)
            )
    return counter, candidates


def main(seed: int | None = 0, samples_per_category: int = 5) -> None:
    """Print paragraph counts per category with a few sample excerpts each.

    Categories are listed from most to least frequent. Under each heading up
    to ``samples_per_category`` randomly chosen paragraphs are shown as
    ``ref_label#paragraph_id: excerpt``, followed by a blank line.

    Args:
        seed: Seed for the private random generator used to pick samples. With
            a fixed seed the same candidate lists always give the same
            samples; pass ``None`` to draw fresh samples on every run.
        samples_per_category: Maximum number of samples per category; values
            below zero are treated as zero.
    """
    root = Path("external/python-scripture-scraper/_output/en-json")
    counts, candidates = collect_paragraph_metadata(root)
    # A private generator keeps the sampling repeatable without touching the
    # global ``random`` state.
    rng = random.Random(seed)
    for category, count in counts.most_common():
        print(f"{category}: {count}")
        pool = candidates.get(category, [])
        wanted = min(max(samples_per_category, 0), len(pool))
        for ref_label, para_id, excerpt in rng.sample(pool, wanted):
            print(f"  {ref_label}#{para_id}: {excerpt}")
        print()


# Print the category summary when this cell/script is executed directly.
if __name__ == "__main__":
    main()
