In [11]:
import json
import sys
from rich import print as rprint
from collections import defaultdict

from pathlib import Path
from itertools import islice

# The notebook's working directory; the project root is taken to be two
# directory levels above it.
nb_dir = Path.cwd()
project_root = nb_dir.parent.parent

# Prepend the project root to the module search path.
sys.path[:0] = [str(project_root)]


In [12]:
# Input files: prepared people records and the book-to-people link table.
people_records_prepped_file = project_root / "data" / "people" / "people_records_prepped.json"
books2people_prepped_file = project_root / "data" / "people" / "books2people_prepped.json"

# Directory that is iterated for the matched book files.
folder_matched = project_root / "data" / "matched"

# Target path for the validation failure log.
validation_log_file = project_root / "data" / "logs" / "validation_failed_log.json"

In [13]:
# Load the prepared people records and the book-to-people link rows.
# The encoding is given explicitly: without it, open() falls back to a
# platform-dependent default, and the data contains non-ASCII names.
with open(people_records_prepped_file, "r", encoding="utf-8") as f:
    people_records = json.load(f)

with open(books2people_prepped_file, "r", encoding="utf-8") as f:
    books2people_records = json.load(f)

# composite_id -> list of all link rows for that book.
books2people_dict = defaultdict(list)
for entry in books2people_records:
    books2people_dict[entry["composite_id"]].append(entry)

# unified_id -> person record (a later record with the same unified_id
# replaces an earlier one).
people_dict = {person["unified_id"]: person for person in people_records}


In [None]:
# Validate every matched book against the books2people lookup.
#
# Results are grouped by file stem:
#   failed_entries[stem]["not_found"]     composite_id has no books2people rows;
#                                         stores {"book": book}.
#   failed_entries[stem]["issues"]        a linked row has the unified_id "oops",
#                                         or the expected and counted totals of
#                                         people differ; stores the full book
#                                         plus a report.
#   validated_report[stem]["issues"]      totals agree but at least one per-role
#                                         count differs; stores the full book
#                                         plus a report.
#   validated_report[stem]["no_issues"]   everything agrees; only the
#                                         composite_id is stored.

# Role names; each link row carries a matching "is_<role>" flag.
_ROLES = ("author", "editor", "contributor", "translator")


def _expected_role_counts(parsed_entry):
    """Return how many people per role the parsed book entry lists.

    "authors", "editors" and "contributors" are counted with len() (a missing
    or empty value counts as 0); "translator" counts as 1 when truthy, else 0.
    """
    return {
        "author": len(parsed_entry.get("authors") or []),
        "editor": len(parsed_entry.get("editors") or []),
        "contributor": len(parsed_entry.get("contributors") or []),
        "translator": 1 if parsed_entry.get("translator") else 0,
    }


def _linked_role_counts(links):
    """Count the role flags over the books2people rows of one book.

    Rows whose unified_id is "oops" are not counted. Returns a tuple
    (counts, has_oops) where has_oops is True if any such row was seen.
    """
    counts = dict.fromkeys(_ROLES, 0)
    has_oops = False
    for link in links:
        if link["unified_id"] == "oops":
            has_oops = True
            continue
        for role in _ROLES:
            if link[f"is_{role}"]:
                counts[role] += 1
    return counts, has_oops


def _role_report(expected, found):
    """Build the expected-vs-found report stored next to a book."""
    return {
        "expected_total": sum(expected.values()),
        "found total": sum(found.values()),
        "authors_exp": expected["author"],
        "authors": found["author"],
        "editors_exp": expected["editor"],
        "editors": found["editor"],
        "contributors_exp": expected["contributor"],
        "contributors": found["contributor"],
        "translator_exp": expected["translator"],
        "translator": found["translator"],
    }


processed_file_count = 0
validated_entries_total = 0
failed_entries_total = 0

failed_entries = {}
validated_report = {}
unified_id_found = set()

# Sorted so that the reports and the printed summary have a stable order;
# iterdir() itself yields entries in arbitrary order.
for file in sorted(folder_matched.iterdir()):
    # Anything that is not a regular file (e.g. a sub-directory) cannot be
    # opened as JSON, so it is skipped.
    if not file.is_file():
        continue

    # Validated books of the current file; rebuilt for every file.
    validated_entries = {}
    validated_count = 0
    validated_with_issues_count = 0
    failed_count = 0
    entry_count = 0
    ids_found = 0
    # Per-file number of matched books whose count differs, per role.
    mismatch_counts = dict.fromkeys(_ROLES, 0)

    validated_report[file.stem] = {"issues": {}, "no_issues": []}
    failed_entries[file.stem] = {"not_found": {}, "issues": {}}

    with open(file, "r", encoding="utf-8") as f:
        books = json.load(f)
    processed_file_count += 1

    for composite_id, book in books.items():
        entry_count += 1
        parsed_entry = book["parsed_entry"]
        expected = _expected_role_counts(parsed_entry)

        # No link rows at all for this book.
        if composite_id not in books2people_dict:
            failed_entries[file.stem]["not_found"][composite_id] = {"book": book}
            failed_count += 1
            continue

        ids_found += 1
        books2people_data = books2people_dict[composite_id]
        found, has_oops = _linked_role_counts(books2people_data)

        mismatched_roles = [role for role in _ROLES if expected[role] != found[role]]
        for role in mismatched_roles:
            mismatch_counts[role] += 1

        report = _role_report(expected, found)

        # An "oops" link or a differing overall total fails the book.
        if has_oops or report["expected_total"] != report["found total"]:
            if has_oops:
                report["oops"] = True
            failed_entries[file.stem]["issues"][composite_id] = {
                "book": book,
                "report": report,
            }
            failed_count += 1
            continue

        # The overall number of people matches: the book counts as validated.
        unified_id_found.update(link["unified_id"] for link in books2people_data)

        # pop() removes "administrative" from parsed_entry in place, so the
        # "books" value (the same dict object) no longer contains it.
        validated_entries[composite_id] = {
            "books": parsed_entry,
            "admin": parsed_entry.pop("administrative"),
            "books2people": books2people_data,
        }

        if mismatched_roles:
            validated_report[file.stem]["issues"][composite_id] = {
                "book": book,
                "report": report,
            }
            validated_with_issues_count += 1
        else:
            validated_report[file.stem]["no_issues"].append(composite_id)
            validated_count += 1

    validated_entries_total += validated_count + validated_with_issues_count
    failed_entries_total += failed_count

    # Guard against a file without any entries.
    percent = (ids_found / entry_count) * 100 if entry_count else 0.0
    print(f"{file.name}: {entry_count} books, {ids_found} were matched")
    print(f"that's {percent:.2f}%")
    rprint(f"{validated_count + validated_with_issues_count} have a matching total")
    rprint(f"{validated_with_issues_count} of those have role mismatches")
    rprint(f"{failed_count} have failed")
    for role in _ROLES:
        rprint(f"found {mismatch_counts[role]} {role} mismatches")

# People that are not linked from any validated book.
people_not_validated = {
    unified_id: person
    for unified_id, person in people_dict.items()
    if unified_id not in unified_id_found
}


{'clauss_manfred', 'schneider_carl', 'gehrke_hans_joachim', 'friedell_egon', 'rostovtzeff_michael', 'brelich_angelo', 'bordt_michael', 'uehli_ernst', 'andreae_bernard', 'lippold_adolf', 'koerner_joseph_l', 'lindner_manfred', 'fuhrmann_manfred', 'koyre_alexandre', 'hopper_r_j', 'hose_martin', 'giebel_marion', 'ruck_carl_a', 'maertin_ralf_peter', 'maas_paul', 'dihle_albrecht', 'hoffmann_ernst', 'droysen_johann_g', 'lauenstein_diether', 'dalby_andrew', 'schadewaldt_wolfgang', 'rozanskij_ivan_d', 'malteso_georg_t', 'grant_michael', 'kesser_armin', 'stoll_h_w', 'fögen_marie_t', 'finley_m_i', 'kaschnitz_marie_l', 'meier_mischa', 'cahniotis_angelos', 'carandini_andrea', 'holscher_tonio', 'mann_christian', 'meier_christian', 'altrichter_rudolf', 'colpe_carsten', 'strümpell_ludwig', 'winterling_aloys', 'callies_horst', 'cumont_franz', 'curtius_ludwig', 'ramler_carl_w', 'wolfram_herwig', 'hofmann_albert', 'stroh_wilfried', 'tressan', 'sporschil_johann', 'iwersen_julia', 'wiesner_joseph', 'schwab

{'clauss_manfred', 'schneider_carl', 'gehrke_hans_joachim', 'julien_catherine', 'friedell_egon', 'rousso_henry', 'rostovtzeff_michael', 'brelich_angelo', 'crowley_roger', 'bordt_michael', 'uehli_ernst', 'andreae_bernard', 'schidrowitz_leo', 'matl_josef', 'bretholz_berthold', 'lippold_adolf', 'koerner_joseph_l', 'lindner_manfred', 'fuhrmann_manfred', 'koyre_alexandre', 'hopper_r_j', 'hose_martin', 'giebel_marion', 'ruck_carl_a', 'maertin_ralf_peter', 'maas_paul', 'dihle_albrecht', 'hoffmann_ernst', 'droysen_johann_g', 'lauenstein_diether', 'dalby_andrew', 'schadewaldt_wolfgang', 'bacci_massimo_l', 'rozanskij_ivan_d', 'malteso_georg_t', 'grant_michael', 'kesser_armin', 'stoll_h_w', 'fögen_marie_t', 'bolzenthal_heinrich', 'daru_pierre', 'boucheron_patrick', 'grataloup_christian', 'finley_m_i', 'kaschnitz_marie_l', 'meier_mischa', 'cahniotis_angelos', 'carandini_andrea', 'holscher_tonio', 'mann_christian', 'meier_christian', 'altrichter_rudolf', 'demel_jaroslav', 'colpe_carsten', 'strümpel