# Evaluation of coverage outputs (best F2 / recall / threshold)

## Purpose
This notebook loads multiple saved *candidate-pair result files* (one per configuration/metric),
evaluates them at several `match_count` thresholds, and produces a summary table with the
**best F2**, **recall**, and the corresponding **threshold**, as reported in the article.



In [8]:
import os
import glob
import pandas as pd

# Ground-truth pairs (must contain: plLabel, alias, and index)
GOLD_FILE = "wd_dataset.parquet"   # adjust if your gold is elsewhere

# Saved result files: one per configuration (e.g., coverage_JW_0.2.parquet etc.)
RESULTS_GLOB = "coverage_outputs/*.parquet"  # adjust path if needed

# Thresholds to test (matching your previous loop: 0.5..1.0) (minimum coverage to be considered a match)
THRESHOLDS = [t / 10 for t in range(5, 11)]



In [9]:
df_gold = pd.read_parquet(GOLD_FILE)

required_gold_cols = {"polish_label", "name_variant", "person"}
missing = required_gold_cols - set(df_gold.columns)
if missing:
    raise ValueError(f"Gold file missing columns: {missing}")

# Gold set of true pairs for fast set operations
gold_pairs = set(map(tuple, df_gold[["polish_label", "name_variant"]].to_records(index=False)))

print("Gold pairs:", len(gold_pairs))
df_gold.head(3)

Gold pairs: 991


Unnamed: 0,person,name_variant,polish_label,alias_name_list,label_name_list
0,Q163043,Józefa Maria ks. Wettin,Maria Józefa Wettyn,"[Wettin, ks., Maria, Józefa]","[Maria, Józefa, Wettyn]"
1,Q163043,Maria Józefa Karolina Saska,Maria Józefa Wettyn,"[Karolina, Maria, Saska, Józefa]","[Maria, Józefa, Wettyn]"
2,Q25776,Владислав Юзеф Сапєга ч. Ліс,Władysław Jozafat Sapieha,"[ч., Сапєга, Юзеф, Владислав, Ліс]","[Władysław, Sapieha, Jozafat]"


In [10]:
def fbeta(precision: float, recall: float, beta: float = 2.0) -> float:
    """
    Compute F_beta score (weighted harmonic mean of precision and recall),  in this case default is f2
    """
    if precision <= 0.0 and recall <= 0.0:
        return 0.0
    beta2 = beta * beta
    denom = (beta2 * precision) + recall
    return ((1 + beta2) * precision * recall / denom) if denom > 0 else 0.0


In [11]:
def evaluate_result_df(df_result: pd.DataFrame, gold_pairs: set, df_gold: pd.DataFrame, thresholds) -> pd.DataFrame:
    """
    Evaluate a single result dataframe over multiple thresholds.

    Returns a dataframe with one row per threshold containing:
    precision, recall, f2, counts, retrieved amount, etc.
    """
    rows = []

    for thr in thresholds:
        # Filter by coverage threshold
        df_filt = df_result[df_result["coverage_value"] >= thr]

        # False negatives: gold indices not retrieved (same logic you used)
        false_negative = df_gold[~df_gold["person"].isin(df_filt["person"])]
        # Predicted pair set at this threshold
        pred_pairs = set(map(tuple, df_filt[["polish_label", "name_variant"]].to_records(index=False)))

        # Set-based TP/FP
        tp_set = pred_pairs & gold_pairs
        fp_set = pred_pairs - gold_pairs

        tp = len(tp_set)
        fp = len(fp_set)
        fn = len(false_negative)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f2 = fbeta(precision, recall, beta=2.0)

        rows.append(
            {
                "threshold": thr,
                "retrieved": len(df_filt),
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "precision": precision,
                "recall": recall,
                "f2": f2,
            }
        )

    return pd.DataFrame(rows)


In [12]:
result_files = sorted(glob.glob(RESULTS_GLOB))
if not result_files:
    raise FileNotFoundError(f"No files matched: {RESULTS_GLOB}")

summary_rows = []

for path in result_files:
    df_res = pd.read_parquet(path)

    eval_table = evaluate_result_df(df_res, gold_pairs=gold_pairs, df_gold=df_gold, thresholds=THRESHOLDS)

    # Select best threshold by F2 (ties broken by higher recall, then higher precision)
    best = (
        eval_table.sort_values(["f2", "recall", "precision"], ascending=False)
        .iloc[0]
        .to_dict()
    )
    print(eval_table)
    summary_rows.append(
        {
            "file": os.path.basename(path),
            "best_threshold": best["threshold"],
            "precision": best["precision"],
            "recall": best["recall"],
            "f2": best["f2"],
            "retrieved": int(best["retrieved"]),
            "tp": int(best["tp"]),
            "fp": int(best["fp"]),
            "fn": int(best["fn"]),
        }
    )

# Export summary of best values
df_summary = pd.DataFrame(summary_rows).sort_values(["f2", "recall"], ascending=False)
df_summary.to_csv("summary_best_values.csv")


   threshold  retrieved   tp      fp  fn  precision  recall        f2
0        0.5     953209  955  520343   0   0.001832     1.0  0.009093
1        0.6     802095  803  438536   0   0.001828     1.0  0.009072
2        0.7     589510  606  323817   0   0.001868     1.0  0.009270
3        0.8     561948  562  308491   0   0.001818     1.0  0.009027
4        0.9     258296  287  141437   0   0.002025     1.0  0.010044
5        1.0     257995  287  141231   0   0.002028     1.0  0.010058
   threshold  retrieved   tp      fp  fn  precision  recall        f2
0        0.5     949501  955  518265   0   0.001839     1.0  0.009129
1        0.6     790668  802  432222   0   0.001852     1.0  0.009192
2        0.7     575000  604  315982   0   0.001908     1.0  0.009467
3        0.8     547496  559  300710   0   0.001855     1.0  0.009209
4        0.9     248340  284  136103   0   0.002082     1.0  0.010326
5        1.0     248051  284  135903   0   0.002085     1.0  0.010341


In [None]:
from collections import defaultdict

variant_to_rows = defaultdict(set)
for idx, variants in sets.items():
    for variant in variants:
        variant_to_rows[variant].add(idx)


phonetic_matches_series = pd.Series({
    idx: matching_rows(idx, variants, variant_to_rows)
    for idx, variants in sets.items()
}).dropna()
phonetic_matches_list = phonetic_matches_series.explode().reset_index().apply(lambda x: set([x['index'], x[0]]), axis=1).to_list()
matched_frozensets = set(frozenset(pair) for pair in phonetic_matches_list)
print(len(matched_frozensets))



NameError: name 'Union' is not defined