# Evaluation of coverage outputs (best F2 / recall / threshold)

## Purpose
This notebook loads multiple saved *candidate-pair result files* (one per configuration/metric),
merges them into a single table, evaluates each configuration at several coverage thresholds, and
produces a summary table with the **best F2**, the corresponding **precision**, **recall**, and **threshold**, as reported in the article.



In [None]:
import os
import glob
import pandas as pd

# Ground-truth pairs (must contain the columns: polish_label, name_variant, person)
GOLD_FILE = "wd_dataset.parquet"   # adjust if your gold is elsewhere

# Coverage thresholds to test: 0.6, 0.7, 0.8, 0.9, 1.0
# (minimum coverage for a candidate pair to be considered a match)
THRESHOLDS = [t / 10 for t in range(6, 11)]

# Number of samples from each class (balanced dataset size = 2*N) to separate the training data (used in other metrics)
N_SAMPLES_PER_CLASS = 50

# Seed for the NumPy random generator that draws the balanced sample
RANDOM_SEED = 42



In [None]:
# Read the ground-truth table and make sure it has the columns this notebook relies on.
df_gold = pd.read_parquet(GOLD_FILE)

required_gold_cols = {"polish_label", "name_variant", "person"}
missing = required_gold_cols.difference(df_gold.columns)
if missing:
    raise ValueError(f"Gold file missing columns: {missing}")

# Set of (polish_label, name_variant) tuples: O(1) membership tests and set algebra later on.
gold_pairs = set(
    df_gold[["polish_label", "name_variant"]].itertuples(index=False, name=None)
)

print("Gold pairs:", len(gold_pairs))
df_gold.head(3)

# Value substituted for missing coverage scores when the per-config results are merged.
# NOTE(review): presumably the minimum coverage kept by the upstream candidate generation
# (pairs below it were dropped to reduce memory usage) - confirm against that step.
prev_mem_based_threshold = 0.5


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob

# ============================================================================
# Load all coverage output files and merge into single feature matrix
# ============================================================================

COVERAGE_DIR = Path("coverage_outputs")

# Columns that identify a candidate pair; used as the merge key.
PAIR_COLS = ["polish_label", "name_variant"]

# Find all parquet files in coverage_outputs/ (sorted for a deterministic column order)
coverage_files = sorted(glob.glob(str(COVERAGE_DIR / "*.parquet")))

print(f"Found {len(coverage_files)} coverage files:")
for f in coverage_files:
    print(f"  - {Path(f).name}")

# Fail early with a clear message: everything below needs at least one file.
if not coverage_files:
    raise FileNotFoundError(f"No coverage files found in {COVERAGE_DIR}/")

# Load each file and keep only the pair columns + its coverage score,
# renamed to coverage_<CONFIG> so the configs can sit side by side.
dfs = []

for fpath in coverage_files:
    df_temp = pd.read_parquet(fpath)
    print(len(df_temp))
    # Extract config name from filename: coverage_<CONFIG>.parquet -> CONFIG
    config_name = Path(fpath).stem.replace("coverage_", "")

    coverage_col = "coverage_value"
    df_temp = df_temp[PAIR_COLS + [coverage_col]].rename(
        columns={coverage_col: f"coverage_{config_name}"}
    )
    dfs.append(df_temp)

# ============================================================================
# Merge all coverage DataFrames on (polish_label, name_variant)
# (outer join to keep all pairs, even if missing in some configs)
# ============================================================================

df_merged = dfs[0]

for df_next in dfs[1:]:
    df_merged = pd.merge(df_merged, df_next, on=PAIR_COLS, how="outer")
    # Drop fully identical rows after every merge step.
    df_merged = df_merged.drop_duplicates()

print(f"\nMerged shape: {df_merged.shape}")
print(f"Columns: {df_merged.columns.tolist()}")

# =========================================================================
# Load gold pairs and add truth label
# =========================================================================
# Read from GOLD_FILE so that changing the configured path is honoured here too.
df_gold = pd.read_parquet(GOLD_FILE)

# Create a gold set for matching
gold_pairs = set(zip(df_gold["polish_label"], df_gold["name_variant"]))

# (label, variant) tuple per merged row, built once and reused for both the
# truth flag and the merge_col column below.
merged_pairs = list(zip(df_merged["polish_label"], df_merged["name_variant"]))

# Add truth column: True if (label, variant) is in gold set
df_merged["truth"] = pd.Series(
    [pair in gold_pairs for pair in merged_pairs],
    index=df_merged.index,
    dtype=bool,
)

print("\nAdded truth label:")
print(f"  Positive (True) pairs: {df_merged['truth'].sum()}")
print(f"  Negative (False) pairs: {(~df_merged['truth']).sum()}")

# =========================================================================
# Optional: Add text length features (number of whitespace-separated tokens)
# =========================================================================
df_merged["len_label"] = df_merged["polish_label"].str.split().str.len()
df_merged["len_variant"] = df_merged["name_variant"].str.split().str.len()

print(f"\nFinal df_merged shape: {df_merged.shape}")
print(f"Final columns: {df_merged.columns.tolist()}")

# =========================================================================
# Fill missing coverage values with the previously defined threshold.
# A pair absent from one config's file ends up as NaN after the outer join.
# NOTE(review): presumably such pairs were dropped upstream to lower memory
# usage, so the cutoff is used as their coverage - confirm.
# =========================================================================
if prev_mem_based_threshold > 0:
    coverage_cols = [c for c in df_merged.columns if c.startswith("coverage_")]
    df_merged[coverage_cols] = df_merged[coverage_cols].fillna(prev_mem_based_threshold)

# Add merge_col (the (label, variant) tuple) for later filtering
df_merged["merge_col"] = merged_pairs

df_gold["merge_col"] = list(zip(df_gold["polish_label"], df_gold["name_variant"]))

# save merged df for future use
df_merged.to_parquet("df_merged.parquet")


In [None]:
# Draw a balanced sample (n positives + n negatives) and keep the remaining
# rows as the held-out part of df_merged.
y_full = df_merged["truth"].astype(int).values

rng = np.random.default_rng(RANDOM_SEED)

# Positional (0-based) row numbers of each class, NOT index labels.
pos_idx = np.where(y_full == 1)[0]
neg_idx = np.where(y_full == 0)[0]

# Never request more rows than the smaller class can provide.
n = min(N_SAMPLES_PER_CLASS, len(pos_idx), len(neg_idx))

sample_idx = np.concatenate([
    rng.choice(pos_idx, size=n, replace=False),
    rng.choice(neg_idx, size=n, replace=False),
])

# Boolean mask over row positions: True for rows that were not sampled.
mask = ~np.isin(np.arange(len(df_merged)), sample_idx)
df_not_in_sample = df_merged.iloc[mask]

# sample_idx holds positions, so the held-out frame must be selected by
# position as well. Comparing it with df_merged.index (labels) picks the wrong
# rows whenever the index is not 0..len-1, e.g. after drop_duplicates().
df_merged_test = df_merged.iloc[mask]


In [None]:
def fbeta(precision: float, recall: float, beta: float = 2.0) -> float:
    """
    Return the F_beta score for the given precision and recall.

    F_β = (1 + β²) × (P × R) / (β² × P + R)

    With the default beta of 2 this is the F2 score, which weights recall
    more heavily than precision. The result is 0.0 when both inputs are
    non-positive or when the denominator is not positive.
    """
    both_non_positive = precision <= 0.0 and recall <= 0.0
    if both_non_positive:
        return 0.0

    weight = beta * beta
    denom = (weight * precision) + recall

    # Guard against division by zero (or a non-positive denominator).
    if denom > 0:
        return (1 + weight) * precision * recall / denom
    return 0.0


In [None]:
def evaluate_result_df(df_result: pd.DataFrame, gold_pairs: set, df_gold: pd.DataFrame, thresholds, coverage_col_name) -> pd.DataFrame:
    """
    Evaluate a single result dataframe over multiple thresholds.

    For every threshold, the rows of ``df_result`` whose coverage is at least
    the threshold are treated as predicted matches and compared with
    ``gold_pairs``. TP, FP and FN are all counted on unique
    ``(polish_label, name_variant)`` pairs, so ``tp + fn`` always equals
    ``len(gold_pairs)``.

    Args:
        df_result: Candidate pairs; must contain ``polish_label``,
            ``name_variant`` and the column named by ``coverage_col_name``.
        gold_pairs: Set of true ``(polish_label, name_variant)`` tuples.
        df_gold: Kept for backward compatibility with existing callers; the
            metrics are derived from ``gold_pairs`` only.
        thresholds: Iterable of minimum coverage values to evaluate.
        coverage_col_name: Name of the coverage score column in ``df_result``.

    Returns:
        A dataframe with one row per threshold and the columns
        ``threshold, retrieved, tp, fp, fn, precision, recall, f2``.
        ``retrieved`` is the number of rows (not unique pairs) that passed
        the threshold.
    """
    pair_cols = ["polish_label", "name_variant"]
    columns = ["threshold", "retrieved", "tp", "fp", "fn", "precision", "recall", "f2"]
    rows = []

    for thr in thresholds:
        # Filter by coverage threshold
        df_filt = df_result[df_result[coverage_col_name] >= thr]

        # Predicted pair set at this threshold
        pred_pairs = set(df_filt[pair_cols].itertuples(index=False, name=None))

        # Set-based TP/FP/FN. FN uses the same unique-pair basis as TP, so
        # duplicated gold rows cannot inflate it.
        tp = len(pred_pairs & gold_pairs)
        fp = len(pred_pairs - gold_pairs)
        fn = len(gold_pairs - pred_pairs)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f2 = fbeta(precision, recall, beta=2.0)

        rows.append(
            {
                "threshold": thr,
                "retrieved": len(df_filt),
                "tp": tp,
                "fp": fp,
                "fn": fn,
                "precision": precision,
                "recall": recall,
                "f2": f2,
            }
        )

    # Explicit columns keep the schema stable even when no threshold was given.
    return pd.DataFrame(rows, columns=columns)


In [None]:
# One coverage_<config> column per configuration; evaluate each one separately.
coverage_cols = [name for name in df_merged.columns if name.startswith("coverage_")]

summary_rows = []
for c in coverage_cols:
    # Only the pair columns and this configuration's score are needed.
    df_res = df_merged[["name_variant", "merge_col", "polish_label", c]]

    eval_table = evaluate_result_df(
        df_res,
        gold_pairs=gold_pairs,
        df_gold=df_gold,
        thresholds=THRESHOLDS,
        coverage_col_name=c,
    )

    # Select best threshold by F2 (ties broken by higher recall, then higher precision)
    ranked = eval_table.sort_values(["f2", "recall", "precision"], ascending=False)
    best = ranked.iloc[0].to_dict()
    print(eval_table)

    # Assemble the summary row: scores stay floats, counts are cast back to int.
    summary_row = {
        "file": os.path.basename(c.replace("coverage_", "")),
        "best_threshold": best["threshold"],
    }
    for score_key in ("precision", "recall", "f2"):
        summary_row[score_key] = best[score_key]
    for count_key in ("retrieved", "tp", "fp", "fn"):
        summary_row[count_key] = int(best[count_key])
    summary_rows.append(summary_row)

# Export summary of best values, best configuration first
df_summary = pd.DataFrame(summary_rows)
df_summary = df_summary.sort_values(["f2", "recall"], ascending=False)
df_summary.to_csv("summary_best_values.csv")
