In [None]:
from pathlib import Path
import re
import sys
import pandas as pd

# Optional: use IPython's `display` when it can be imported; otherwise fall
# back to the built-in `print` so the later `display(...)` calls still work.
try:
    from IPython.display import display
except Exception:
    display = print


# Helpers: robust path finding
def _is_repo_root(p: Path) -> bool:
    """Return True when 'p' looks like the repository root.

    The heuristic requires a 'data' directory plus at least one of the code
    markers 'src', '.git', 'pyproject.toml' or 'requirements.txt'.
    """
    has_data_dir = p.joinpath("data").is_dir()
    marker_names = ("src", ".git", "pyproject.toml", "requirements.txt")
    return has_data_dir and any((p / marker).exists() for marker in marker_names)


def find_repo_root(start: Path) -> Path:
    """Locate the repo root by climbing from 'start' through its parents.

    The first pass accepts the nearest folder that satisfies _is_repo_root.
    If none does, a second pass accepts the nearest folder that contains
    data/curated or data/quality.

    Raises:
        FileNotFoundError: when neither pass finds a matching folder.
    """
    start = start.resolve()
    lineage = (start, *start.parents)

    strict_match = next((folder for folder in lineage if _is_repo_root(folder)), None)
    if strict_match is not None:
        return strict_match

    # Fallback: look for a folder that contains data/curated or data/quality
    for folder in lineage:
        data_dir = folder / "data"
        if any((data_dir / layer).is_dir() for layer in ("curated", "quality")):
            return folder

    raise FileNotFoundError(
        "Could not locate REPO_ROOT. I looked upward from:\n"
        f"  {start}\n"
        "Expected a folder containing 'data' and (src/.git/pyproject.toml/requirements.txt)."
    )


def first_existing(paths: list[Path]) -> Path | None:
    for p in paths:
        if p is not None and p.exists():
            return p
    return None


def find_file_in_data(repo_root: Path, filename: str) -> Path | None:
    """Search inside repo_root/data for the first occurrence of filename."""
    data_dir = repo_root / "data"
    if not data_dir.is_dir():
        return None

    try:
        for p in data_dir.rglob(filename):
            if p.is_file():
                return p
    except Exception:
        return None

    return None



# Locate repo + stable folders
# The upward search for the repo root starts at the current working directory.
START = Path.cwd()
REPO_ROOT = find_repo_root(START)

# Put the repo root at the front of sys.path; the membership check skips the
# insert when it is already listed, so no duplicate entry is added.
repo_root_str = str(REPO_ROOT)
if repo_root_str not in sys.path:
    sys.path.insert(0, repo_root_str)

print("REPO_ROOT =", REPO_ROOT)

# Always write outputs to the canonical governance study folder inside the repo
# (the folder and any missing parents are created here if absent).
GOV_DIR = REPO_ROOT / "data" / "governance" / "pii_fields_study"
GOV_DIR.mkdir(parents=True, exist_ok=True)
print("GOV_DIR (outputs will be saved here) =", GOV_DIR)



# Load pii_inventory.csv
# Known locations are tried in order; a recursive search under data/ is the
# last resort.
pii_candidates = [
    GOV_DIR / "pii_inventory.csv",
    GOV_DIR.parent / "pii_inventory.csv",
    REPO_ROOT / "data" / "quality" / "catalogs" / "pii_inventory.csv",
]

PII_PATH = first_existing(pii_candidates)
if PII_PATH is None:
    PII_PATH = find_file_in_data(REPO_ROOT, "pii_inventory.csv")

if PII_PATH is None:
    searched_lines = [f"  - {candidate}" for candidate in pii_candidates]
    searched_lines.append("  - (searched under data/**/pii_inventory.csv)")
    searched = "\n".join(searched_lines)
    raise FileNotFoundError(
        "Could not find pii_inventory.csv.\nSearched:\n"
        f"{searched}\n\n"
        "Tip: confirm the file exists locally (OneDrive: right click -> 'Always keep on this device')."
    )

print("Using pii_inventory.csv =", PII_PATH)

# Re-raise read failures with the resolved path so the broken file is obvious.
try:
    pii = pd.read_csv(PII_PATH)
except Exception as exc:
    raise RuntimeError(f"Found pii_inventory.csv at {PII_PATH} but failed to read it: {exc}") from exc


# Confirm expected columns
# Stop early when the inventory lacks any column the later steps read.
expected_cols = {"field_path", "classification", "notes/purpose", "present_in"}
missing = expected_cols.difference(pii.columns)
if missing:
    problem = f"Missing columns in pii_inventory.csv: {missing}. Current columns: {pii.columns.tolist()}"
    raise ValueError(problem)



# Build present_in_* columns
def parse_presence(x: str) -> set[str]:
    """Split a 'present_in' cell into a set of lower-cased tokens.

    Tokens may be separated by '|', ',', ';' or a space. Missing values and
    cells without any token give an empty set.
    """
    if pd.isna(x):
        return set()
    # Map every alternative separator onto '|' so a single split is enough.
    to_pipe = str.maketrans({",": "|", ";": "|", " ": "|"})
    unified = str(x).strip().lower().translate(to_pipe)
    return {token for token in unified.split("|") if token}


# One boolean column per layer, True when the layer name is among the
# tokens parsed from "present_in".
presence_sets = pii["present_in"].apply(parse_presence)
for layer in ("raw", "curated", "analysis"):
    pii[f"present_in_{layer}"] = presence_sets.apply(lambda tokens, layer=layer: layer in tokens)


# Normalize classification
def norm_class(x: str) -> str:
    """Map a raw classification value onto a canonical label.

    Returns:
        "Unknown" for missing, empty or whitespace-only values;
        "PII" when the stripped value equals "pii" (case-insensitive);
        "Quasi-PII" when it contains "quasi";
        "Non-PII" when it contains "non";
        otherwise the stripped value unchanged.
    """
    if pd.isna(x):
        return "Unknown"
    label = str(x).strip()
    # A blank cell carries no more information than a missing one; without
    # this guard it would come back as an empty class label.
    if not label:
        return "Unknown"
    lowered = label.lower()
    if lowered == "pii":
        return "PII"
    # "quasi" is tested before "non", so a value containing both is Quasi-PII.
    if "quasi" in lowered:
        return "Quasi-PII"
    if "non" in lowered:
        return "Non-PII"
    return label


# Add a "pii_class" column holding norm_class applied to each "classification" value.
pii["pii_class"] = pii["classification"].apply(norm_class)



# Create final matrix + export
# Keep only the reporting columns, give two of them shorter names, and order
# the rows by class and then by field name.
matrix_source_columns = [
    "field_path",
    "pii_class",
    "present_in_raw",
    "present_in_curated",
    "present_in_analysis",
    "notes/purpose",
]
matrix_renames = {
    "field_path": "field_name",
    "notes/purpose": "notes",
}
matrix = pii[matrix_source_columns].rename(columns=matrix_renames)
matrix = matrix.sort_values(["pii_class", "field_name"], ascending=[True, True])

OUT_MATRIX = GOV_DIR / "pii_presence_matrix.csv"
matrix.to_csv(OUT_MATRIX, index=False)
print("Exported:", OUT_MATRIX)
display(matrix.head(10))


# Direct PII list saved into GOV_DIR
# Sorted, de-duplicated field paths of every row classified as "PII".
is_direct_pii = pii["pii_class"] == "PII"
direct_field_series = pii.loc[is_direct_pii, "field_path"].dropna().astype(str)
direct_pii_fields = sorted(set(direct_field_series))

OUT_DIRECT = GOV_DIR / "direct_pii_fields_list.txt"
# One field path per line.
OUT_DIRECT.write_text("\n".join(direct_pii_fields), encoding="utf-8")
print("Direct PII list saved to:", OUT_DIRECT)
print("Sample direct PII fields:", direct_pii_fields[:10])



# Load applications_analysis.csv
# Two file names under data/curated are tried first; a recursive search under
# data/ for applications_analysis.csv is the last resort.
curated_dir = REPO_ROOT / "data" / "curated"
analysis_candidates = [
    curated_dir / "applications_analysis.csv",
    curated_dir / "applications_analysis_clean.csv",
]

analysis_path = first_existing(analysis_candidates)
if analysis_path is None:
    analysis_path = find_file_in_data(REPO_ROOT, "applications_analysis.csv")

if analysis_path is None:
    searched_lines = [f"  - {candidate}" for candidate in analysis_candidates]
    searched_lines.append("  - (searched under data/**/applications_analysis.csv)")
    searched = "\n".join(searched_lines)
    raise FileNotFoundError(
        "Could not find applications_analysis.csv.\nSearched:\n"
        f"{searched}"
    )

analysis = pd.read_csv(analysis_path)
print("Loaded analysis dataset:", analysis_path)
print("Rows, Cols:", analysis.shape)


# Exact match check
# Analysis columns whose name is identical to a direct-PII field path.
direct_set = set(direct_pii_fields)
exact_present = [column for column in analysis.columns if column in direct_set]

# Leaf-name check (in case columns are flattened/renamed)
# Compare case-insensitively against the segment after the last "." of each path.
direct_leaf = {field.rpartition(".")[2].lower() for field in direct_pii_fields}
leaf_present = [column for column in analysis.columns if column.lower() in direct_leaf]

print("Direct PII columns found (exact match):", exact_present)
print("Direct PII columns found (leaf-name match):", leaf_present)



# Leakage scan (object columns only, count-only outputs)
# Regex-based check for PII-looking values in the analysis dataset. Only
# counts and column names are reported or saved, never the matching values.
scan_df = analysis.select_dtypes(include=["object"]).fillna("").astype(str)

# Guard: empty object columns
if scan_df.shape[1] == 0:
    print("No object columns to scan for leakage patterns.")
    leak_summary = pd.DataFrame(columns=["pattern", "sample_size_rows", "columns_scanned", "cell_hits", "columns_with_hits"])
    display(leak_summary)
else:
    # Scan at most 300 rows; the fixed seed makes the sample reproducible.
    sample = scan_df.sample(min(len(scan_df), 300), random_state=42) if len(scan_df) > 0 else scan_df

    patterns = {
        "email_like": re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
        "ip_like": re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
        "ssn_like": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    }

    # Count matching cells per column once per pattern; both reports below are
    # derived from these counts, so the sample is not scanned a second time.
    hits_by_pattern = {
        name: sample.apply(lambda col, pat=pat: col.str.contains(pat, regex=True, na=False).sum())
        for name, pat in patterns.items()
    }

    # 1) Summary table
    results = [
        {
            "pattern": name,
            "sample_size_rows": len(sample),
            "columns_scanned": sample.shape[1],
            "cell_hits": int(per_col_hits.sum()),
            "columns_with_hits": int((per_col_hits > 0).sum()),
        }
        for name, per_col_hits in hits_by_pattern.items()
    ]

    leak_summary = pd.DataFrame(results)

    out_summary = GOV_DIR / "analysis_pii_leakage_scan_summary.csv"
    leak_summary.to_csv(out_summary, index=False)
    print("Saved:", out_summary)

    # 2) Column-level details (only column names + counts, no sensitive values)
    hit_details = []
    for name, per_col_hits in hits_by_pattern.items():
        cols_with_hits = per_col_hits[per_col_hits > 0].sort_values(ascending=False)
        hit_details.extend(
            {"pattern": name, "column": col_name, "hits": int(n)}
            for col_name, n in cols_with_hits.items()
        )

    # Pin the columns so the CSV still gets its header row when nothing matched.
    hit_details_df = pd.DataFrame(hit_details, columns=["pattern", "column", "hits"])
    out_cols = GOV_DIR / "analysis_pii_leakage_by_column.csv"
    hit_details_df.to_csv(out_cols, index=False)
    print("Saved:", out_cols)

    display(leak_summary)

REPO_ROOT = C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03
GOV_DIR (outputs will be saved here) = C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study
Using pii_inventory.csv = C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\quality\catalogs\pii_inventory.csv
Exported: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study\pii_presence_matrix.csv


Unnamed: 0,field_name,pii_class,present_in_raw,present_in_curated,present_in_analysis,notes
1,age_band,Non-PII,False,False,True,Privacy-preserving derived age representation.
10,decision.approved_amount,Non-PII,True,True,False,Required when loan_approved=True.
11,decision.interest_rate,Non-PII,True,True,False,Required when loan_approved=True.
12,decision.loan_approved,Non-PII,True,True,False,Decision flag.
13,decision.rejection_reason,Non-PII,True,True,False,Required when loan_approved=False.
14,financials.annual_income,Non-PII,True,True,False,May drift into annual_salary.
15,financials.annual_salary,Non-PII,True,True,False,Field drift variant for annual_income.
16,financials.credit_history_months,Non-PII,True,True,False,Must be >= 0.
17,financials.debt_to_income,Non-PII,True,True,False,"Expected range [0, 1]."
18,financials.savings_balance,Non-PII,True,True,False,Must be >= 0.


Direct PII list saved to: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study\direct_pii_fields_list.txt
Sample direct PII fields: ['applicant_info.date_of_birth', 'applicant_info.email', 'applicant_info.full_name', 'applicant_info.ip_address', 'applicant_info.ssn']
Loaded analysis dataset: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\curated\applications_analysis.csv
Rows, Cols: (500, 16)
Direct PII columns found (exact match): []
Direct PII columns found (leaf-name match): []
Saved: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study\analysis_pii_leakage_scan_summary.csv
Saved: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study\analysis_pii_leakage_by_column.csv


Unnamed: 0,pattern,sample_size_rows,columns_scanned,cell_hits,columns_with_hits
0,email_like,300,6,0,0
1,ip_like,300,6,0,0
2,ssn_like,300,6,0,0
