In [5]:
from pathlib import Path
import sys
import re
import pandas as pd
import numpy as np

try:
    from IPython.display import display
except Exception:
    display = print


# Helpers: robust path finding
def _is_repo_root(p: Path) -> bool:
    """Tell whether *p* looks like the repository root.

    A directory qualifies when it contains a ``data`` sub-directory and at
    least one of ``src``, ``.git``, ``pyproject.toml`` or ``requirements.txt``.
    """
    if (p / "data").is_dir():
        for marker_name in ("src", ".git", "pyproject.toml", "requirements.txt"):
            if (p / marker_name).exists():
                return True
    return False


def find_repo_root(start: Path) -> Path | None:
    """Walk upward from *start* looking for the repository root.

    Two passes are made over *start* (resolved) and each of its parents,
    nearest first:

    1. the first directory accepted by ``_is_repo_root``;
    2. otherwise, the first directory that contains ``data/curated`` or
       ``data/quality``.

    Returns ``None`` when neither pass finds a match.
    """
    origin = start.resolve()
    lineage = [origin, *origin.parents]

    # Pass 1: strict check (data/ plus a code marker).
    strict_match = next((d for d in lineage if _is_repo_root(d)), None)
    if strict_match is not None:
        return strict_match

    # Pass 2: looser check based only on the expected data sub-folders.
    for d in lineage:
        data_dir = d / "data"
        if (data_dir / "curated").is_dir() or (data_dir / "quality").is_dir():
            return d
    return None


def first_existing(paths: list[Path]) -> Path | None:
    for p in paths:
        if p is not None and p.exists() and p.is_file():
            return p
    return None


def rglob_first(base: Path, filename: str) -> Path | None:
    if base is None or not base.exists():
        return None
    try:
        for p in base.rglob(filename):
            if p.is_file():
                return p
    except Exception:
        return None
    return None


def find_dataset_path(repo_root: Path | None, filename: str) -> Path:
    """Locate *filename* in the known data locations.

    Search order:

    1. Fixed candidates under ``repo_root/data`` (``curated``, ``quality``
       and its ``reports/pre``, ``reports/post``, ``duplicates`` and
       ``catalogs`` sub-folders); skipped when *repo_root* is ``None``.
    2. ``/mnt/data/<filename>`` and the same name with ``.csv`` replaced by
       ``.xlsx``.
    3. A recursive search under ``repo_root/data`` (when *repo_root* is set).
    4. A recursive search under ``/mnt/data``.

    NOTE: step 2 can return an ``.xlsx`` path even though a ``.csv`` name was
    requested, so callers should not assume the result is a CSV file.

    Raises:
        FileNotFoundError: when nothing is found; the message lists the
            locations that were checked.
    """
    mnt_data = Path("/mnt/data")

    candidates: list[Path] = []
    if repo_root is not None:
        data_dir = repo_root / "data"
        quality_dir = data_dir / "quality"
        candidates.extend(
            [
                data_dir / "curated" / filename,
                quality_dir / filename,
                quality_dir / "reports" / "pre" / filename,
                quality_dir / "reports" / "post" / filename,
                quality_dir / "duplicates" / filename,
                quality_dir / "catalogs" / filename,
            ]
        )
    candidates.append(mnt_data / filename)
    candidates.append(mnt_data / filename.replace(".csv", ".xlsx"))

    direct_hit = first_existing(candidates)
    if direct_hit is not None:
        return direct_hit

    # Nothing at the fixed locations: fall back to recursive searches.
    search_roots = []
    if repo_root is not None:
        search_roots.append(repo_root / "data")
    search_roots.append(mnt_data)
    for search_root in search_roots:
        deep_hit = rglob_first(search_root, filename)
        if deep_hit is not None:
            return deep_hit

    if repo_root is not None:
        repo_note = "  - (searched under repo_root/data/**)"
    else:
        repo_note = "  - (repo_root not found; skipped repo search)"
    report_lines = [f"  - {p}" for p in candidates]
    report_lines.append(repo_note)
    report_lines.append("  - (searched under /mnt/data/**)")
    searched = "\n".join(report_lines)
    raise FileNotFoundError(f"Could not find {filename}.\nSearched:\n{searched}")


# Locate the repository (searching upward from the working directory) and
# pick the folder that receives this cell's output files.
START = Path.cwd()
REPO_ROOT = find_repo_root(START)

if REPO_ROOT is None:
    # No repository detected: fall back to a folder under /mnt/data.
    OUT_DIR = Path("/mnt/data") / "pseudonymization_study"
else:
    # Put the repository root on sys.path (once) before choosing the folder.
    repo_root_str = str(REPO_ROOT)
    if repo_root_str not in sys.path:
        sys.path.insert(0, repo_root_str)
    # Force outputs to pseudonymization_study
    OUT_DIR = REPO_ROOT / "data" / "governance" / "pseudonymization_study"

OUT_DIR.mkdir(parents=True, exist_ok=True)


# Resolve the two input files (raises FileNotFoundError if either is missing).
ANALYSIS_PATH = find_dataset_path(REPO_ROOT, "applications_analysis.csv")
CURATED_FULL_PATH = find_dataset_path(REPO_ROOT, "applications_curated_full.csv")

# Load the analysis dataset in full.
analysis = pd.read_csv(ANALYSIS_PATH)

# Columns the analysis dataset is expected to carry.
required_analysis = [
    "applicant_pseudo_id",
    "pseudo_id_source",
    "pseudo_id_fallback_used_flag",
]
# Direct identifiers; the checks below record which of them are present.
direct_pii_cols = [
    "raw_applicant_full_name",
    "raw_applicant_email",
    "raw_applicant_ssn",
    "raw_applicant_ip_address",
    "raw_applicant_date_of_birth",
    "clean_email",
    "clean_date_of_birth",
]

required_present = {name: (name in analysis.columns) for name in required_analysis}
direct_pii_present_in_analysis = [
    name for name in direct_pii_cols if name in analysis.columns
]

# Normalize the fallback flag to booleans when it was read as text:
# "true"/"false" (any case, surrounding whitespace ignored) map to
# True/False; every other value becomes NaN.
flag_col = "pseudo_id_fallback_used_flag"
if flag_col in analysis.columns and analysis[flag_col].dtype == object:
    flag_as_text = analysis[flag_col].astype(str).str.strip().str.lower()
    analysis[flag_col] = flag_as_text.map({"true": True, "false": False})


# Load curated_full, restricted to the columns this cell actually uses.
# The header is read first (nrows=0) so that only existing columns are requested.
curated_cols = pd.read_csv(CURATED_FULL_PATH, nrows=0).columns.tolist()

curated_cols_for_shot = [
    "application_id",
    "application_row_id",
    "is_canonical_for_analysis",
    "raw_applicant_full_name",
    "raw_applicant_email",
    "raw_applicant_ssn",
    "raw_applicant_ip_address",
    "raw_applicant_date_of_birth",
    "clean_email",
    "clean_date_of_birth",
    "raw_applicant_zip_code",
    "clean_zip_code",
    "clean_gender",
    "clean_annual_income",
]
usecols = [wanted for wanted in curated_cols_for_shot if wanted in curated_cols]
curated = pd.read_csv(CURATED_FULL_PATH, usecols=usecols)

# Which direct identifiers the curated file holds, and how many rows have a value.
direct_pii_present_in_curated = [
    name for name in direct_pii_cols if name in curated.columns
]
pii_non_null_counts = {
    name: int(non_null)
    for name, non_null in curated[direct_pii_present_in_curated].count().items()
}


# Redaction helpers 
def _mask_text(v, replacement="[REDACTED]"):
    """Replace any non-blank value with *replacement*.

    ``None``, float NaN and values whose string form is empty or whitespace
    are returned unchanged.
    """
    is_missing = v is None or (isinstance(v, float) and np.isnan(v))
    if is_missing or str(v).strip() == "":
        return v
    return replacement


def _mask_email(v):
    """Mask the local part of an e-mail address, keeping its first character.

    ``"jane@example.com"`` becomes ``"j***@example.com"``; the domain (the
    text after the first ``@``) is kept as is. A non-blank value without
    ``@`` becomes ``"[REDACTED_EMAIL]"``. ``None``, float NaN and blank
    values are returned unchanged.
    """
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return v
    text = str(v).strip()
    if not text:
        return v
    local, at_sign, domain = text.partition("@")
    if not at_sign:
        return "[REDACTED_EMAIL]"
    # An empty local part yields just "***".
    return f"{local[:1]}***@{domain}"


def _mask_ssn(v):
    """Mask an SSN, keeping only its last four digits.

    ``"123-45-6789"`` becomes ``"***-**-6789"``. ``None``, float NaN and
    blank values are returned unchanged.

    Only digits are ever shown, and a value with four or fewer digits is
    masked completely (``"***-**-****"``): appending the trailing characters
    of such a value would reveal all of it. A float with an integral value
    (an all-digit column containing blanks is parsed as float) is converted
    to an integer first so the ``".0"`` suffix is not treated as SSN content.
    """
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return v
    # 123456789.0 -> 123456789; infinities are left alone (not integral).
    shown = int(v) if isinstance(v, float) and v.is_integer() else v
    s = str(shown).strip()
    if not s:
        return v
    digits = re.sub(r"[^0-9]", "", s)
    if len(digits) <= 4:
        return "***-**-****"
    return "***-**-" + digits[-4:]


def _mask_ip(v):
    """Replace any non-blank IP address value with ``"[REDACTED_IP]"``.

    ``None``, float NaN and blank values are returned unchanged.
    """
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return v
    if str(v).strip():
        return "[REDACTED_IP]"
    return v


def _mask_zip(v):
    """Replace any non-blank ZIP code value with ``"[REDACTED_ZIP]"``.

    ``None``, float NaN and blank values are returned unchanged.
    """
    missing = v is None or (isinstance(v, float) and np.isnan(v))
    if missing:
        return v
    return v if str(v).strip() == "" else "[REDACTED_ZIP]"


def _mask_dob(v):
    """Reduce a date of birth to its year: ``"1990-05-17"`` -> ``"1990-**-**"``.

    The year is the first four-digit run starting with 19 or 20 found in the
    value; when there is none the result is ``"XXXX-**-**"``. ``None``,
    float NaN and blank values are returned unchanged.
    """
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return v
    text = str(v).strip()
    if not text:
        return v
    year_match = re.search(r"(19|20)\d{2}", text)
    if year_match is None:
        return "XXXX-**-**"
    return year_match.group(0) + "-**-**"


# Build a redacted preview from the first 10 curated rows.
preview = curated.head(10).copy()

# Ordered (substring, masker) rules matched against the lower-cased column
# name; the first rule whose substring occurs in the name is applied.
preview_mask_rules = [
    ("ssn", _mask_ssn),
    ("email", _mask_email),
    ("ip_address", _mask_ip),
    ("date_of_birth", _mask_dob),
    ("zip_code", _mask_zip),
]

for col in preview.columns:
    col_l = col.lower()
    for name_part, masker in preview_mask_rules:
        if name_part in col_l:
            preview[col] = preview[col].apply(masker)
            break
    else:
        # No substring rule matched: treat name-like columns as personal names.
        if "full_name" in col_l or col_l.endswith("name"):
            preview[col] = preview[col].apply(lambda x: _mask_text(x, "[REDACTED_NAME]"))


# Pseudonymization monitoring metrics
# Every metric key is always set; a metric whose source column is absent is
# stored as None instead of being omitted.
metrics = {}

metrics["analysis_rows"] = int(len(analysis))
metrics["analysis_cols"] = int(analysis.shape[1])

if "applicant_pseudo_id" in analysis.columns:
    metrics["missing_applicant_pseudo_id"] = int(analysis["applicant_pseudo_id"].isna().sum())
    metrics["unique_applicant_pseudo_id"] = int(analysis["applicant_pseudo_id"].nunique(dropna=True))
    # duplicated() marks every repeat after the first occurrence of a value.
    metrics["duplicate_applicant_pseudo_id_count"] = int(analysis["applicant_pseudo_id"].duplicated().sum())
else:
    metrics["missing_applicant_pseudo_id"] = None
    metrics["unique_applicant_pseudo_id"] = None
    metrics["duplicate_applicant_pseudo_id_count"] = None

# Rows per pseudo-ID source; a missing source is bucketed as "MISSING" and
# pct is relative to all analysis rows.
if "pseudo_id_source" in analysis.columns:
    pseudo_source_dist = (
        analysis["pseudo_id_source"]
        .fillna("MISSING")
        .value_counts(dropna=False)
        .rename("count")
        .to_frame()
        .assign(pct=lambda d: (100 * d["count"] / len(analysis)).round(2))
        .reset_index(names="pseudo_id_source")
    )
else:
    pseudo_source_dist = pd.DataFrame(columns=["pseudo_id_source", "count", "pct"])

if "pseudo_id_fallback_used_flag" in analysis.columns:
    # Count and rate use the same denominator (all analysis rows), with a
    # missing flag counted as "fallback not used". Taking the mean over
    # non-null flags only would make the percentage disagree with the row
    # count whenever flags are missing, and yield NaN when all are missing.
    fallback_count = int(analysis["pseudo_id_fallback_used_flag"].eq(True).sum())
    fallback_rate = fallback_count / len(analysis) if len(analysis) > 0 else 0.0
    metrics["fallback_used_count"] = fallback_count
    metrics["fallback_used_rate_pct"] = round(100 * fallback_rate, 2)
else:
    metrics["fallback_used_count"] = None
    metrics["fallback_used_rate_pct"] = None

# Optional: canonical share in curated_full (if column exists)
if "is_canonical_for_analysis" in curated.columns:
    canon_series = curated["is_canonical_for_analysis"]
    if canon_series.dtype == object:
        # "true"/"false" text (any case, surrounding whitespace ignored)
        # becomes booleans; any other value becomes NaN and is left out of
        # the mean below.
        canon_series = canon_series.astype(str).str.strip().str.lower().map({"true": True, "false": False})
    metrics["curated_canonical_rate_pct"] = round(100 * float(pd.Series(canon_series).dropna().mean()), 2)
else:
    metrics["curated_canonical_rate_pct"] = None


# Assemble the evidence summary, show it, then write all artefacts to OUT_DIR.
summary = [
    "=== Evidence Summary (A + B) ===",
    f"ANALYSIS  rows={len(analysis):,} cols={analysis.shape[1]:,}",
    f"CURATED   rows={len(curated):,} cols={curated.shape[1]:,}",
    "",
    "A) applications_analysis.csv checks",
    f"Required fields present: {required_present}",
    f"Direct PII columns present in ANALYSIS (should be []): {direct_pii_present_in_analysis}",
    "",
    "B) applications_curated_full.csv checks",
    f"Direct PII columns present in CURATED_FULL (should be non-empty): {direct_pii_present_in_curated}",
    f"Non-null counts for PII cols (CURATED_FULL): {pii_non_null_counts}",
    "",
    "Monitoring metrics",
    f"Fallback used: {metrics.get('fallback_used_count')} rows ({metrics.get('fallback_used_rate_pct')}%)",
    f"Missing applicant_pseudo_id: {metrics.get('missing_applicant_pseudo_id')}",
    f"Duplicate applicant_pseudo_id (note: may be expected across multiple applications): {metrics.get('duplicate_applicant_pseudo_id_count')}",
]
# The canonical-rate line is only reported when that metric was computed.
if metrics.get("curated_canonical_rate_pct") is not None:
    summary.append(f"Curated canonical rate (is_canonical_for_analysis): {metrics.get('curated_canonical_rate_pct')}%")
summary += [
    "",
    f"Curated preview columns used (redacted): {list(preview.columns)}",
]

summary_text = "\n".join(summary)

# Show the summary and both tables in the notebook.
print(summary_text)
print("\n=== Curated preview (redacted, 10 rows) ===")
display(preview)

print("\n=== Pseudo ID source distribution (analysis) ===")
display(pseudo_source_dist)

# Save the text artefacts.
(OUT_DIR / "Summary.txt").write_text(summary_text, encoding="utf-8")
(OUT_DIR / "analysis_columns.txt").write_text("\n".join(analysis.columns), encoding="utf-8")

# Save the tabular artefacts (no index column).
for table, csv_name in (
    (preview, "curated_full_preview_redacted_10rows.csv"),
    (pseudo_source_dist, "pseudo_id_source_distribution.csv"),
    (pd.DataFrame([metrics]), "pseudonymization_metrics.csv"),
):
    table.to_csv(OUT_DIR / csv_name, index=False)

=== Evidence Summary (A + B) ===
ANALYSIS  rows=500 cols=16
CURATED   rows=502 cols=14

A) applications_analysis.csv checks
Required fields present: {'applicant_pseudo_id': True, 'pseudo_id_source': True, 'pseudo_id_fallback_used_flag': True}
Direct PII columns present in ANALYSIS (should be []): []

B) applications_curated_full.csv checks
Direct PII columns present in CURATED_FULL (should be non-empty): ['raw_applicant_full_name', 'raw_applicant_email', 'raw_applicant_ssn', 'raw_applicant_ip_address', 'raw_applicant_date_of_birth', 'clean_email', 'clean_date_of_birth']
Non-null counts for PII cols (CURATED_FULL): {'raw_applicant_full_name': 502, 'raw_applicant_email': 495, 'raw_applicant_ssn': 497, 'raw_applicant_ip_address': 497, 'raw_applicant_date_of_birth': 497, 'clean_email': 495, 'clean_date_of_birth': 497}

Monitoring metrics
Fallback used: 5 rows (1.0%)
Missing applicant_pseudo_id: 0
Duplicate applicant_pseudo_id (note: may be expected across multiple applications): 2
Curated 

Unnamed: 0,application_row_id,application_id,raw_applicant_full_name,raw_applicant_email,raw_applicant_ssn,raw_applicant_ip_address,raw_applicant_date_of_birth,raw_applicant_zip_code,clean_email,clean_gender,clean_date_of_birth,clean_zip_code,clean_annual_income,is_canonical_for_analysis
0,0,app_200,[REDACTED_NAME],j***@hotmail.com,***-**-4340,[REDACTED_IP],2001-**-**,[REDACTED_ZIP],j***@hotmail.com,Male,2001-**-**,[REDACTED_ZIP],73000.0,True
1,1,app_037,[REDACTED_NAME],b***@yahoo.com,***-**-4784,[REDACTED_IP],1992-**-**,[REDACTED_ZIP],b***@yahoo.com,Male,1992-**-**,[REDACTED_ZIP],78000.0,True
2,2,app_215,[REDACTED_NAME],s***@mail.com,***-**-5178,[REDACTED_IP],1989-**-**,[REDACTED_ZIP],s***@mail.com,Male,1989-**-**,[REDACTED_ZIP],61000.0,True
3,3,app_024,[REDACTED_NAME],t***@protonmail.com,***-**-1833,[REDACTED_IP],1983-**-**,[REDACTED_ZIP],t***@protonmail.com,Male,1983-**-**,[REDACTED_ZIP],103000.0,True
4,4,app_184,[REDACTED_NAME],b***@aol.com,***-**-2475,[REDACTED_IP],1999-**-**,[REDACTED_ZIP],b***@aol.com,Male,1999-**-**,[REDACTED_ZIP],57000.0,True
5,5,app_275,[REDACTED_NAME],m***@outlook.com,***-**-4912,[REDACTED_IP],1982-**-**,[REDACTED_ZIP],m***@outlook.com,Female,1982-**-**,[REDACTED_ZIP],110000.0,True
6,6,app_099,[REDACTED_NAME],n***@outlook.com,***-**-2503,[REDACTED_IP],1990-**-**,[REDACTED_ZIP],n***@outlook.com,Male,1990-**-**,[REDACTED_ZIP],55000.0,True
7,7,app_246,[REDACTED_NAME],s***@gmail.com,***-**-1864,[REDACTED_IP],1991-**-**,[REDACTED_ZIP],s***@gmail.com,Female,1991-**-**,[REDACTED_ZIP],82000.0,True
8,8,app_042,[REDACTED_NAME],j***@gmail.com,***-**-5530,[REDACTED_IP],1990-**-**,[REDACTED_ZIP],j***@gmail.com,Male,1990-**-**,[REDACTED_ZIP],69000.0,False
9,9,app_348,[REDACTED_NAME],m***@hotmail.com,***-**-8400,[REDACTED_IP],1989-**-**,[REDACTED_ZIP],m***@hotmail.com,Male,1989-**-**,[REDACTED_ZIP],55000.0,True



=== Pseudo ID source distribution (analysis) ===


Unnamed: 0,pseudo_id_source,count,pct
0,ssn,495,99.0
1,name_dob_zip_fallback,4,0.8
2,email_fallback,1,0.2
