# In [3]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Paths
# Walk from the current working directory up through its ancestors and stop at
# the first directory that contains the duplicate-ID report; that directory is
# treated as the repository root.
search_start = Path.cwd()
for repo_root in (search_start, *search_start.parents):
    if (repo_root / "data" / "quality" / "duplicate_id_report.csv").exists():
        break
else:
    # Reached the filesystem root without finding the report.
    raise FileNotFoundError("Could not find data/quality/duplicate_id_report.csv from current directory.")

# Output directory for the generated artifacts; created on demand.
out_dir = repo_root.joinpath("data", "governance", "identity_resolution_policy")
out_dir.mkdir(parents=True, exist_ok=True)

# Load the duplicate-ID report CSV found under data/quality.
report_csv = repo_root.joinpath("data", "quality", "duplicate_id_report.csv")
dup = pd.read_csv(report_csv)

# Headline numbers: distinct application IDs, and row counts per classification
# (a classification absent from the report counts as 0).
n_ids = dup["application_id"].nunique()
counts = dup["classification"].value_counts()
n_conflict, n_versioned = (
    int(counts.get(label, 0)) for label in ("conflict", "versioned")
)

# Canonical reason: distinct non-missing values, sorted for a stable display order.
reasons = sorted(dup["canonical_reason"].dropna().astype(str).unique().tolist())
if reasons:
    reason_text = ", ".join(reasons)
else:
    reason_text = "n/a"

# Build one short display line per report row (differences truncated for display).
# A blank `example_differences` cell is loaded by pandas as NaN; without the
# explicit check below it would be rendered as the literal text "nan".
examples = []
for _, r in dup.iterrows():
    # `.get` with a default keeps this working when the column is absent entirely.
    raw = r.get("example_differences", "")
    ex = "" if pd.isna(raw) else str(raw).strip()
    if len(ex) > 80:
        # 77 characters plus a three-character ellipsis keeps the result at 80.
        ex = ex[:77] + "..."
    examples.append(f"{r['application_id']}: {r['classification']} | diffs: {ex}")

# Text content of the evidence card: a bold title plus a multi-line body made of
# the summary figures, the governance note, and one bullet per example.
title = "Identity Resolution — Duplicate Summary (Evidence)"
lines = [
    f"Total duplicated application_ids: {n_ids}",
    f"Conflict duplicates: {n_conflict}   |   Versioned duplicates: {n_versioned}",
    "",
    f"Canonical selection rule observed: {reason_text}",
    "Governance note: fallback selection indicates a provenance/auditability gap (missing/unparseable timestamps).",
    "",
    "Examples:",
]
lines.extend(f"• {e}" for e in examples)
body_text = "\n".join(lines)

# Draw the text on a single axes with the axis decorations hidden.
fig, ax = plt.subplots(figsize=(10, 4.8))
ax.axis("off")
ax.text(0.02, 0.95, title, fontsize=18, weight="bold", va="top")
ax.text(0.02, 0.88, body_text, fontsize=12, va="top")

# Save the card as a PNG in the output directory, then release the figure.
out_path = out_dir / "duplicate_types_evidence_card.png"
fig.tight_layout()
fig.savefig(out_path, dpi=200, bbox_inches="tight")
plt.close(fig)