In [None]:
import os
import pandas as pd
from functions_for_metrics import *

# from textdistance import DamerauLevenshtein
# import swifter

# --------------------------------------------------------------------
# Input: dataframe with token lists
# Columns this notebook reads from df_wd_dataset:
#   - "alias_name_list"  exploded into the vocabulary; compared as list_2 in the main loop
#   - "label_name_list"  exploded into the vocabulary; compared as list_1 in the main loop
#   - "person", "polish_label", "name_variant"  carried through to the output files
# NOTE(review): the two *_name_list columns are presumably per-row lists of
# name tokens -- confirm against the code that writes wd_dataset.parquet.
# --------------------------------------------------------------------
df_wd_dataset = pd.read_parquet("wd_dataset.parquet")  # relative path: resolved against the kernel's working directory

# --------------------------------------------------------------------
# 1) Define “configurations” (methods + hyperparameters)
# Adjust the lists to match what you actually want to iterate over.
# --------------------------------------------------------------------
CONFIGS = [
    # Phonetic matchers: no hyperparameters, only the encoder variant differs.
    *(
        {"name": label, "type": "phonetic", "variant": encoder}
        for label, encoder in (
            ("DM", "daitch_mokotoff"),           # Daitch–Mokotoff
            ("BM_exact", "beider_morse_exact"),   # Beider–Morse, exact variant
            ("BM_approx", "beider_morse_approx"), # Beider–Morse, approximate variant
        )
    ),
    # Distance matchers: one entry per (metric, max_dist) combination; the entry
    # name is "<prefix>_<max_dist>". The thresholds are upper bounds on a
    # distance (key "max_dist"), not lower bounds on a similarity.
    *(
        {"name": f"{prefix}_{limit}", "type": "distance", "variant": metric, "max_dist": limit}
        for prefix, metric, limits in (
            ("NJW", "jw", (0.1, 0.2)),                 # Jaro–Winkler; presumably a normalised distance -- confirm in functions_for_metrics
            ("LEV", "lev", (1, 2)),                    # Levenshtein, integer thresholds (presumably raw edit counts)
            ("NLEV", "n_lev", (0.1, 0.2, 0.3, 0.4)),   # normalised Levenshtein, fractional thresholds
        )
        for limit in limits
    ),
]

# Directory that receives one parquet file per configuration; created on first run.
OUTPUT_DIR = "coverage_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)




In [None]:
# --------------------------------------------------------------------
# 2) Build a vocabulary of unique words
# --------------------------------------------------------------------
# Every distinct token that appears in either list column, stored as the index
# of an otherwise column-less frame.
namewords = (
    pd.concat(
        [df_wd_dataset["alias_name_list"].explode(), df_wd_dataset["label_name_list"].explode()],
        ignore_index=True,
    )
    # explode() yields NaN for rows whose list is empty or missing; NaN is not a
    # word and must not reach the phonetic/distance matchers.
    .dropna()
    .drop_duplicates()
    .to_frame("word")
    .set_index("word")
)

# Same vocabulary as a Series whose index and values are both the word itself.
words = namewords.index.to_series()

In [5]:
# --------------------------------------------------------------------
# 3) “Matcher” factory: for each config, create matched_frozensets
# This is the only part that differs between DM/BM/JW/LEV.

# --------------------------------------------------------------------
def build_matched_pairs_frozensets(config, words):
    """Dispatch one configuration to the matcher that handles its ``type``.

    Parameters
    ----------
    config : dict
        One entry of ``CONFIGS``. ``"type"`` selects the branch and
        ``"variant"`` is forwarded to the helper; ``"distance"`` configs must
        also carry ``"max_dist"``.
    words : iterable
        The vocabulary to match within. It is materialised into a list for
        distance configs and forwarded unchanged for phonetic configs.

    Returns
    -------
    Whatever the selected helper returns: ``get_distance_frozensets`` for
    ``"distance"``, ``build_phonetic_matched_frozensets`` for ``"phonetic"``.
    NOTE(review): presumably a collection of frozensets of matching words --
    confirm in functions_for_metrics.

    Raises
    ------
    ValueError
        If ``config["type"]`` is neither ``"distance"`` nor ``"phonetic"``.
    """
    kind = config["type"]

    if kind == "phonetic":
        encoder = config["variant"]
        print(encoder)  # progress marker: shows which encoder is running
        return build_phonetic_matched_frozensets(words, encoder)

    if kind != "distance":
        raise ValueError(f"Unknown config type: {kind}")

    # Distance branch: score the vocabulary first, then keep the pairs that
    # get_distance_frozensets selects for this threshold.
    vocabulary = list(words)
    metric, limit = config["variant"], config["max_dist"]
    scored = distance_metrics_calculator(vocabulary, metric)
    return get_distance_frozensets(scored, limit)


In [6]:

# --------------------------------------------------------------------
# 4) Main loop:
# - compute matched_frozensets for config
# - compute coverage per row
# - save one file per config
# --------------------------------------------------------------------
# Threshold forwarded to row_coverage_calculator for every configuration.
# Per the original note, setting it reduces memory use.
COVERAGE_THRESHOLD = 0.5

# Columns written to each output file. Core columns are kept when present;
# "coverage_value" is mandatory.
CORE_OUTPUT_COLUMNS = ["person", "polish_label", "name_variant", "label_name_list", "alias_name_list"]
COVERAGE_COLUMN = "coverage_value"

# Constant key column shared by both sides of the comparison. It does not depend
# on the configuration, so it is assigned once instead of on every iteration.
# The column is still left on df_wd_dataset, as the original loop did.
df_wd_dataset["key"] = 1

for config in CONFIGS:
    config_name = config["name"]
    print(f"Computing coverage for config: {config_name}")

    # 0) Matched word sets for this configuration
    matched_frozensets = build_matched_pairs_frozensets(config, words)

    # 1) Left side (one row at a time): label records that will be matched to candidates
    df_left = df_wd_dataset[["label_name_list", "person", "polish_label", "key"]]

    # 2) Right side (candidate pool): alias records that share the key
    df_candidates = df_wd_dataset[["name_variant", "alias_name_list", "key"]]

    # 3) Compute candidate matches for each left row (each call returns a DataFrame).
    # NOTE(review): the `.swifter` accessor exists only once swifter has been
    # imported. `import swifter` is commented out at the top of this notebook, so
    # this presumably relies on functions_for_metrics importing it -- confirm.
    per_row_matches = df_left.swifter.apply(
        lambda left_row: row_coverage_calculator(
            left_row,
            df_candidates,
            treshold=COVERAGE_THRESHOLD,  # keyword spelling kept exactly as the helper is called
            matched_frozensets=matched_frozensets,
            list_1_name="label_name_list",
            list_2_name="alias_name_list",
        ),
        axis=1,
    )

    # 4) per_row_matches is a Series of DataFrames; drop missing/empty results
    #    so the concat below only sees real data
    match_frames = [m for m in per_row_matches.tolist() if m is not None and not m.empty]

    # 5) Concatenate into one output DataFrame. With no matches at all, build an
    #    empty frame that still has the expected columns; a bare pd.DataFrame()
    #    has none, and selecting "coverage_value" from it raises KeyError.
    if match_frames:
        df_out = pd.concat(match_frames, ignore_index=True)
    else:
        print(f"No matches returned for config: {config_name}; writing an empty file")
        df_out = pd.DataFrame(columns=CORE_OUTPUT_COLUMNS + [COVERAGE_COLUMN])

    # 6) Keep the file small & readable: core columns that exist + the coverage.
    #    COVERAGE_COLUMN is not filtered, so its absence still fails loudly.
    keep_cols = [c for c in CORE_OUTPUT_COLUMNS if c in df_out.columns] + [COVERAGE_COLUMN]

    out_path = os.path.join(OUTPUT_DIR, f"coverage_{config_name}.parquet")
    df_out[keep_cols].to_parquet(out_path, index=False)


Computing coverage for config: BM_exact
beider_morse_exact


Pandas Apply: 100%|██████████| 1464/1464 [00:07<00:00, 192.34it/s]
Pandas Apply: 100%|██████████| 1000/1000 [00:13<00:00, 72.59it/s]


Computing coverage for config: BM_approx
beider_morse_approx


Pandas Apply: 100%|██████████| 1464/1464 [00:09<00:00, 147.90it/s]
Pandas Apply: 100%|██████████| 1000/1000 [00:13<00:00, 72.94it/s]
