In [87]:
import json
import sys
from rich import print as rprint
from collections import defaultdict

from pathlib import Path
from itertools import islice

# Directory the notebook kernel was started in.
nb_dir = Path.cwd()
# The project root is taken to be two directory levels above the notebook.
project_root = nb_dir.parent.parent
# Prepend the project root to the module search path (slice assignment at
# index 0 is the same operation as an insert at position 0).
sys.path[:0] = [str(project_root)]


In [88]:
# Input files and folders, all resolved relative to project_root.
people_records_prepped_file = project_root / "data" / "people" / "people_records_prepped.json"
books2people_prepped_file = project_root / "data" / "people" / "books2people_prepped.json"
folder_matched = project_root / "data" / "matched"

# Log file location for failed validations.
validation_log_file = project_root / "data" / "logs" / "validation_failed_log.json"

In [89]:
# Start from empty lists so both names exist even if a load below raises.
people_records = []
books2people_records = []

# The encoding is passed explicitly: without it open() falls back to the
# platform locale, which mis-decodes non-ASCII text on non-UTF-8 systems.
with open(people_records_prepped_file, "r", encoding="utf-8") as f:
    people_records = json.load(f)

with open(books2people_prepped_file, "r", encoding="utf-8") as f:
    books2people_records = json.load(f)

# composite_id -> every book-to-person link record carrying that id.
books2people_dict = defaultdict(list)
for entry in books2people_records:
    books2people_dict[entry["composite_id"]].append(entry)

# unified_id -> person record; a repeated unified_id keeps the last record seen.
people_dict = {person["unified_id"]: person for person in people_records}


In [90]:
# unified_id value treated as a failed link record.
# NOTE(review): presumably a placeholder for a person that could not be
# resolved upstream -- confirm against the code that writes books2people.
PLACEHOLDER_UNIFIED_ID = "oops"

# Boolean role flags read from every book-to-person link record.
ROLE_FLAGS = ("is_author", "is_editor", "is_contributor", "is_translator")


def _count_roles(link_entries):
    """Tally the role flags over the link records of a single book.

    Args:
        link_entries: iterable of dicts, each with a "unified_id" key and
            the four keys listed in ROLE_FLAGS.

    Returns:
        A tuple ``(counts, placeholder_entries)``. ``counts`` maps each name
        in ROLE_FLAGS to the number of records whose flag is truthy.
        ``placeholder_entries`` lists the records whose unified_id equals
        PLACEHOLDER_UNIFIED_ID; those records are left out of ``counts``.

    Raises:
        KeyError: if a record lacks "unified_id" or, for non-placeholder
            records, one of the role flags.
    """
    counts = dict.fromkeys(ROLE_FLAGS, 0)
    placeholder_entries = []
    for entry in link_entries:
        if entry["unified_id"] == PLACEHOLDER_UNIFIED_ID:
            placeholder_entries.append(entry)
            continue
        for flag in ROLE_FLAGS:
            if entry[flag]:
                counts[flag] += 1
    return counts, placeholder_entries


# Run-level aggregates over all processed files.
processed_file_count = 0
validated_entries_total = 0  # books whose role total matched, summed over files
failed_entries_total = 0     # link records carrying the placeholder unified_id

# composite_id -> list of placeholder link records found for that book.
# Keyed by book so that one failed record no longer overwrites another.
failed_entries = {}

# Only regular *.json files are processed; sub-directories and stray files in
# the folder are skipped. Sorting makes the report order deterministic.
matched_files = sorted(
    path for path in folder_matched.iterdir()
    if path.is_file() and path.suffix == ".json"
)

for file in matched_files:
    # Per-file counters (reset for every file, reported after its loop).
    validated_count = 0
    failed_count = 0
    entry_count = 0
    ids_found = 0
    author_mismatch_count = 0

    with open(file, "r", encoding="utf-8") as f:
        books = json.load(f)
    processed_file_count += 1

    for composite_id, book in books.items():
        entry_count += 1
        parsed_entry = book["parsed_entry"]

        # Expected number of people per role, derived from the parsed entry.
        # Missing or null lists count as zero; any truthy translator value
        # counts as exactly one person.
        expected = {
            "is_author": len(parsed_entry.get("authors") or []),
            "is_editor": len(parsed_entry.get("editors") or []),
            "is_contributor": len(parsed_entry.get("contributors") or []),
            "is_translator": 1 if parsed_entry.get("translator") else 0,
        }

        # Books without any link record are counted in entry_count only;
        # they are neither validated nor failed.
        if composite_id not in books2people_dict:
            continue
        ids_found += 1

        found, placeholder_entries = _count_roles(books2people_dict[composite_id])

        if placeholder_entries:
            failed_entries.setdefault(composite_id, []).extend(placeholder_entries)
            failed_entries_total += len(placeholder_entries)

        if found["is_author"] != expected["is_author"]:
            author_mismatch_count += 1

        # A book validates when the overall number of role assignments
        # agrees, regardless of how they are spread over the roles.
        if sum(found.values()) == sum(expected.values()):
            validated_count += 1
        else:
            failed_count += 1

    validated_entries_total += validated_count

    # Guard against a file that contains an empty JSON object.
    percent = (ids_found / entry_count) * 100 if entry_count else 0.0
    print(f"{file.name}: {entry_count} books, {ids_found} were matched")
    print(f"that's {percent:.2f}%")
    rprint(f"{validated_count} have a matching total")
    rprint(f"{failed_count} have failed")
    rprint(f"found {author_mismatch_count} author mismatches")

griechenland.json: 228 books, 217 were matched
that's 95.18%


geschichte.json: 41 books, 38 were matched
that's 92.68%
