In [3]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Paths: walk upwards from the working directory until the duplicate report is found.
repo_root = Path.cwd()
while not (repo_root / "data" / "quality" / "duplicate_id_report.csv").exists():
    if repo_root.parent == repo_root:
        # Reached the filesystem root without finding the report.
        raise FileNotFoundError("Could not find data/quality/duplicate_id_report.csv from current directory.")
    repo_root = repo_root.parent

out_dir = repo_root / "data" / "governance" / "identity_resolution_policy"
out_dir.mkdir(parents=True, exist_ok=True)

dup = pd.read_csv(repo_root / "data" / "quality" / "duplicate_id_report.csv")

# Headline numbers: distinct IDs in the report and rows per classification label.
n_ids = dup["application_id"].nunique()
counts = dup["classification"].value_counts()
n_conflict = int(counts.get("conflict", 0))
n_versioned = int(counts.get("versioned", 0))

# Canonical reason (unique, non-missing reasons in sorted order)
reasons = sorted(set(dup["canonical_reason"].dropna().astype(str).unique().tolist()))
reason_text = ", ".join(reasons) if reasons else "n/a"

# Build short examples text (truncate for display)
examples = []
for _, r in dup.iterrows():
    raw_diff = r.get("example_differences", "")
    # A missing cell is NaN after read_csv; str(NaN) would print the literal
    # text "nan" on the card, so treat it like the absent-column default ("").
    ex = "" if pd.isna(raw_diff) else str(raw_diff).strip()
    if len(ex) > 80:
        ex = ex[:77] + "..."
    examples.append(f"{r['application_id']}: {r['classification']} | diffs: {ex}")

# Text-only card: an explicit figure/axes pair with the axes frame hidden.
fig, ax = plt.subplots(figsize=(10, 4.8))
ax.axis("off")

title = "Identity Resolution — Duplicate Summary (Evidence)"
lines = [
    f"Total duplicated application_ids: {n_ids}",
    f"Conflict duplicates: {n_conflict}   |   Versioned duplicates: {n_versioned}",
    "",
    f"Canonical selection rule observed: {reason_text}",
    "Governance note: fallback selection indicates a provenance/auditability gap (missing/unparseable timestamps).",
    "",
    "Examples:",
    *[f"• {e}" for e in examples],
]

ax.text(0.02, 0.95, title, fontsize=18, weight="bold", va="top")
ax.text(0.02, 0.88, "\n".join(lines), fontsize=12, va="top")

out_path = out_dir / "duplicate_types_evidence_card.png"
fig.tight_layout()
fig.savefig(out_path, dpi=200, bbox_inches="tight")
# Close this specific figure so it is not also rendered inline or kept in memory.
plt.close(fig)

In [4]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

try:
    from IPython.display import display
except Exception:
    display = print


def find_repo_root(start: Path) -> Path:
    """
    Return the nearest directory, starting at ``start`` and moving upwards,
    that contains both expected data files.

    ``start`` is resolved first; the search covers it and each of its parents
    in turn. Raises FileNotFoundError when no directory on that path has both
    files.
    """
    marker_files = (
        Path("data") / "quality" / "duplicate_id_report.csv",
        Path("data") / "curated" / "applications_analysis.csv",
    )
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        if all((candidate / marker).exists() for marker in marker_files):
            return candidate
    raise FileNotFoundError("Repo root not found. Expected data/quality/duplicate_id_report.csv and data/curated/applications_analysis.csv.")


def pick_column(df: pd.DataFrame, preferred_exact: list[str], preferred_contains: list[str]) -> str | None:
    """
    Pick a column from df.
    1) exact match (case-insensitive)
    2) contains match (case-insensitive, substring)
    Returns None if not found.
    """
    cols = list(df.columns)
    cols_lower = {c.lower(): c for c in cols}

    # exact
    for cand in preferred_exact:
        if cand.lower() in cols_lower:
            return cols_lower[cand.lower()]

    # contains
    for substr in preferred_contains:
        substr_l = substr.lower()
        for c in cols:
            if substr_l in c.lower():
                return c

    return None


# Locate repo root and define paths
repo_root = find_repo_root(Path.cwd())
dup_path = repo_root / "data" / "quality" / "duplicate_id_report.csv"
ana_path = repo_root / "data" / "curated" / "applications_analysis.csv"

out_dir = repo_root / "data" / "governance" / "identity_resolution_policy"
out_dir.mkdir(parents=True, exist_ok=True)

# Load files
dup = pd.read_csv(dup_path)
ana = pd.read_csv(ana_path)


# --- Identify the join key in both datasets (application_id vs _id etc.) ---
dup_id_col = pick_column(
    dup,
    preferred_exact=["application_id", "_id", "id", "app_id"],
    preferred_contains=["application", "app_", "_id"]
)
ana_id_col = pick_column(
    ana,
    preferred_exact=["application_id", "_id", "id", "app_id"],
    preferred_contains=["application", "app_", "_id"]
)

if dup_id_col is None or ana_id_col is None:
    raise KeyError(
        f"Could not find a join key.\n"
        f"dup columns: {list(dup.columns)}\n"
        f"ana columns: {list(ana.columns)}"
    )

# Normalize id column name to "application_id" for merging.
# rename() returns a new frame, so the loaded `dup` / `ana` stay untouched.
dup_norm = dup.rename(columns={dup_id_col: "application_id"})
ana_norm = ana.rename(columns={ana_id_col: "application_id"})

# The merge and the conflict flag below both need "classification"; fail here
# with an explicit message instead of an opaque KeyError after files are written.
if "classification" not in dup_norm.columns:
    raise KeyError(
        "duplicate_id_report.csv has no 'classification' column.\n"
        f"dup columns: {list(dup_norm.columns)}"
    )

# Pick group columns (gender + age band) robustly
gender_col = pick_column(
    ana_norm,
    preferred_exact=["gender", "applicant_gender", "gender_clean", "clean_gender"],
    preferred_contains=["gender"]
)
# NOTE(review): the bare "age" substring fallback also matches names that merely
# contain "age" (e.g. "average_x"); confirm the detected column in the printout.
age_col = pick_column(
    ana_norm,
    preferred_exact=["age_band", "applicant_age_band", "agegroup", "age_group"],
    preferred_contains=["age_band", "agegroup", "age_group", "age"]
)

print("\nDetected columns:")
print(" - join key:", "application_id")
print(" - gender col:", gender_col)
print(" - age col:", age_col)

# Evidence: keep only essential fields from duplicate log (those that exist)
optional_dup_cols = ["dup_count", "classification", "canonical_reason", "example_differences", "canonical_row_id"]
needed_dup_cols = ["application_id"] + [c for c in optional_dup_cols if c in dup_norm.columns]

dup_ids = dup_norm[needed_dup_cols].drop_duplicates()

# Save an explicit evidence file (good for governance pack)
dup_ids.to_csv(out_dir / "duplicated_ids_evidence.csv", index=False)

# Merge classification onto analysis
classification_by_id = dup_ids[["application_id", "classification"]].drop_duplicates()
df = ana_norm.merge(
    classification_by_id,
    on="application_id",
    how="left",
    # One ID carrying two different classifications would duplicate analysis
    # rows and inflate every group total; raise instead of double counting.
    validate="many_to_one",
    # Temporary marker recording whether the ID was present in the report.
    indicator="_dup_report_match",
)
# Flag by presence in the report rather than by a non-null classification, so
# an ID listed with a blank classification still counts as duplicated.
df["is_duplicated"] = (df.pop("_dup_report_match") == "both").astype(int)
df["is_conflict"] = (df["classification"] == "conflict").astype(int)

def group_check(col_name: str) -> pd.DataFrame:
    base = df[col_name].fillna("missing").value_counts().rename_axis(col_name).reset_index(name="n_total")
    sub = (
        df[df["is_duplicated"] == 1][col_name]
        .fillna("missing")
        .value_counts()
        .rename_axis(col_name)
        .reset_index(name="n_duplicated")
    )
    conf = (
        df[df["is_conflict"] == 1][col_name]
        .fillna("missing")
        .value_counts()
        .rename_axis(col_name)
        .reset_index(name="n_conflict")
    )

    out = base.merge(sub, on=col_name, how="left").merge(conf, on=col_name, how="left").fillna(0)
    out["n_duplicated"] = out["n_duplicated"].astype(int)
    out["n_conflict"] = out["n_conflict"].astype(int)
    out["dup_rate"] = out["n_duplicated"] / out["n_total"]
    out["conflict_rate"] = out["n_conflict"] / out["n_total"]
    return out.sort_values("n_total", ascending=False)

# Produce and save group checks: one CSV and one displayed table per detected
# grouping column; a warning listing the available columns otherwise.
if gender_col is None:
    print("\nWARNING: No gender column detected in applications_analysis.csv. Skipping gender check.")
    print("Available columns:", list(ana_norm.columns))
else:
    gender_check = group_check(gender_col)
    gender_csv = out_dir / "duplicate_group_check_gender.csv"
    gender_check.to_csv(gender_csv, index=False)
    display(gender_check)

if age_col is None:
    print("\nWARNING: No age/age_band column detected in applications_analysis.csv. Skipping age check.")
    print("Available columns:", list(ana_norm.columns))
else:
    age_check = group_check(age_col)
    age_csv = out_dir / "duplicate_group_check_age_band.csv"
    age_check.to_csv(age_csv, index=False)
    display(age_check)


Detected columns:
 - join key: application_id
 - gender col: clean_gender
 - age col: age_band


Unnamed: 0,clean_gender,n_total,n_duplicated,n_conflict,dup_rate,conflict_rate
0,Female,250,0,0,0.0,0.0
1,Male,247,1,0,0.004049,0.0
2,missing,3,1,1,0.333333,0.333333


Unnamed: 0,age_band,n_total,n_duplicated,n_conflict,dup_rate,conflict_rate
0,35-44,174,1,0,0.005747,0.0
1,25-34,152,0,0,0.0,0.0
2,45-54,87,0,0,0.0,0.0
3,55-64,56,0,0,0.0,0.0
4,65+,13,0,0,0.0,0.0
5,<25,13,0,0,0.0,0.0
6,missing,5,1,1,0.2,0.2
