# Matches Per Author Name Parsing Rule

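This notebook reports, for each author-name parsing rule, how many distinct author–source pairs contain the rule's substring and how many different sources those matches come from, and then summarizes both counts across all rules.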

In [1]:
from copy import deepcopy
from typing import Dict, List, Union
from statistics import stdev

import pandas as pd


def get_name_source(visualizations: List[Dict]) -> List[Dict]:
    """Collect the distinct (author, source name) pairs of the visualizations.

    Args:
        visualizations: Entries exposing an ``"authors"`` value (an iterable
            of authors, or ``None``) and a ``"source"`` mapping with a
            ``"name"`` key.

    Returns:
        One ``{"author": ..., "source": ...}`` record per distinct pair.
        Entries whose ``"authors"`` is ``None`` contribute nothing.
    """
    # Flatten every entry into one record per listed author; the source name
    # is only looked up for entries that actually list at least one author.
    pairs = [
        {"author": name, "source": entry["source"]["name"]}
        for entry in visualizations
        if entry["authors"] is not None
        for name in entry["authors"]
    ]
    # De-duplicate through a DataFrame, then convert back to plain records.
    unique_pairs = pd.DataFrame(pairs).drop_duplicates()
    return unique_pairs.to_dict("records")


def get_rules_matches(rules: List[Dict], visualizations: List[Dict]) -> List[Dict]:
    """Pair every rule with the author/source records its substring occurs in.

    Args:
        rules: Rule dictionaries; each is read through its ``"substring"`` key.
        visualizations: Entries forwarded to ``get_name_source``.

    Returns:
        One ``{"rule": ..., "matches": [...]}`` record per rule, in the order
        of ``rules``. Both the rules and the matched records are deep copies,
        so mutating the result does not touch the inputs.
    """
    author_sources = get_name_source(visualizations)
    results = []
    # Copy the whole rule list at once so the caller's rules stay untouched.
    for rule in deepcopy(rules):
        hits = [
            deepcopy(pair)
            for pair in author_sources
            if rule["substring"] in pair["author"]
        ]
        results.append({"rule": rule, "matches": hits})
    return results


def profile(values: List[Union[int, float]]) -> None:
    """Print summary statistics of ``values`` to standard output.

    Prints the sample size, the minimum and maximum (each with the number of
    times it occurs), the arithmetic mean and the sample standard deviation.

    Args:
        values: The numbers to summarize. May be empty or hold a single value.

    Returns:
        None. The statistics are only printed.
    """
    n_values = len(values)
    print("N =", n_values)
    if n_values == 0:
        # min(), max() and the mean are undefined for an empty sample, so
        # stop here instead of raising half-way through the report.
        return

    min_value = min(values)
    n_min = sum(v == min_value for v in values)
    print(f"Min = {min_value} ({n_min} times)")

    max_value = max(values)
    n_max = sum(v == max_value for v in values)
    print(f"Max = {max_value} ({n_max} times)")

    print("Mean =", sum(values) / n_values)

    # statistics.stdev raises StatisticsError for fewer than two data points;
    # report the sample standard deviation as NaN in that case.
    sd = stdev(values) if n_values > 1 else float("nan")
    print("SD =", sd)

In [2]:
import sys

sys.path.append("../")

from builders.authors.rules.replace_substring import rules
from builders import build_visualizations
from _loader import load_zipped_processed_metadata

# Folder that holds one sub-directory per data source.
path_data_sources = "../../data-sources"
dataset_names = [
    "alabama-maps",
    "british-library-collection-items",
    "british-library-images-online",
    "david-rumsey-map-collection",
    "gallica",
    "internet-archive",
    "library-of-congress",
    "telefact",
]
dataset_paths = [f"{path_data_sources}/{name}" for name in dataset_names]

# Load the metadata, build the visualization entries and match the rules.
processed_metadata = load_zipped_processed_metadata(dataset_paths)
visualizations = build_visualizations(processed_metadata)
rules_matches = get_rules_matches(rules, visualizations)

# Annotate a deep copy so that `rules_matches` itself stays unmodified.
rules_matches_stats = deepcopy(rules_matches)
for entry in rules_matches_stats:
    matched_sources = list({match["source"] for match in entry["matches"]})
    entry["nMatches"] = len(entry["matches"])
    entry["matchedSources"] = matched_sources
    entry["nMatchedSources"] = len(matched_sources)

# Summarize both per-rule counts across all rules.
separator = "----------"
print(separator)
print("statistics for #matched entries for each rule:")
profile([entry_stats["nMatches"] for entry_stats in rules_matches_stats])
print(separator)
print("statistics for #matched sources for each rule")
profile([entry_stats["nMatchedSources"] for entry_stats in rules_matches_stats])
print(separator)

----------
statistics for #matched entries for each rule:
N = 214
Min = 0 (3 times)
Max = 8 (1 times)
Mean = 1.0981308411214954
SD = 0.6010134333935282
----------
statistics for #matched sources for each rule
N = 214
Min = 0 (3 times)
Max = 2 (4 times)
Mean = 1.0046728971962617
SD = 0.18122337399123967
----------
