In [None]:
from pathlib import Path
import re
import sys
import pandas as pd

try:
    from IPython.display import display
except Exception:
    display = print



# Repo root discovery + helpers
def _is_repo_root(p: Path) -> bool:
    """Repo root should contain 'data' and at least one code marker."""
    if not (p / "data").is_dir():
        return False
    code_markers = [p / "src", p / ".git", p / "pyproject.toml", p / "requirements.txt", p / "README.md"]
    return any(m.exists() for m in code_markers)

def find_repo_root(start: Path) -> Path:
    """Walk upward from *start* and return the first directory that looks like the repo root.

    Two passes are made over *start* (resolved) and its ancestors, nearest first:
      1. strict: the directory satisfies ``_is_repo_root``;
      2. fallback: the directory contains ``data/curated`` or ``data/quality``.

    Raises:
        FileNotFoundError: neither pass finds a matching directory.
    """
    origin = start.resolve()
    lineage = [origin, *origin.parents]

    strict_match = next((folder for folder in lineage if _is_repo_root(folder)), None)
    if strict_match is not None:
        return strict_match

    for folder in lineage:
        data_dir = folder / "data"
        if (data_dir / "curated").is_dir() or (data_dir / "quality").is_dir():
            return folder

    raise FileNotFoundError(
        "Could not locate REPO_ROOT. Expected a folder containing 'data' and (src/.git/pyproject.toml/requirements.txt/README.md)."
    )

def first_existing(paths: list[Path]) -> Path | None:
    for p in paths:
        if p is not None and p.exists():
            return p
    return None

def find_file_in_data(repo_root: Path, filename: str) -> Path | None:
    """Search inside repo_root/data for the first occurrence of filename."""
    data_dir = repo_root / "data"
    if not data_dir.is_dir():
        return None
    try:
        for p in data_dir.rglob(filename):
            if p.is_file():
                return p
    except Exception:
        return None
    return None

def find_all(repo_root: Path, filename: str) -> list[Path]:
    """Find all occurrences of a file under the repo.

    Args:
        repo_root: Directory to search recursively.
        filename: File name (or glob pattern) passed to ``Path.rglob``.

    Returns:
        Every matching regular file (directories are excluded), sorted by path
        so the result does not depend on filesystem enumeration order.
    """
    return sorted(p for p in repo_root.rglob(filename) if p.is_file())


# Locate repo + stable output folder
REPO_ROOT = find_repo_root(Path.cwd())
repo_root_str = str(REPO_ROOT)
# Put the repo root at the front of sys.path, but only once
# (presumably so project modules can be imported regardless of the working directory)
if sys.path.count(repo_root_str) == 0:
    sys.path.insert(0, repo_root_str)

print("REPO_ROOT =", REPO_ROOT)

# Every output of this cell is written below this single governance folder
GOV_DIR = REPO_ROOT.joinpath("data", "governance", "pii_fields_study")
GOV_DIR.mkdir(parents=True, exist_ok=True)
print("GOV_DIR (outputs will be saved here) =", GOV_DIR)


# Load the correct pii_inventory.csv (handle multiple versions)
def pick_best_pii_inventory(paths: list[Path]) -> tuple[Path, pd.DataFrame]:
    """Read every candidate CSV and return the one whose columns best match a known schema.

    Scoring (summed per file):
      +3  canonical columns present: field_path, classification, present_in
      +1  a 'notes/purpose' column is present
      +2  pre-processed columns present: field_path, classification,
          present_in_raw, present_in_curated, present_in_analysis

    Candidates that ``pd.read_csv`` cannot load are skipped. Among equal
    scores the file with fewer columns is preferred. When more than one
    candidate was readable, the top five paths are printed.

    Returns:
        (path, dataframe) of the highest-scoring candidate.

    Raises:
        FileNotFoundError: no candidate could be read.
        ValueError: candidates were read but none matched an expected schema.
    """
    canonical_cols = {"field_path", "classification", "present_in"}
    processed_cols = {"field_path", "classification", "present_in_raw", "present_in_curated", "present_in_analysis"}

    candidates = []
    for csv_path in paths:
        try:
            frame = pd.read_csv(csv_path)
        except Exception:
            # Unreadable candidate: ignore it and keep evaluating the others
            continue

        present = set(frame.columns)
        points = 0
        if canonical_cols <= present:
            points += 3
        if "notes/purpose" in present:
            points += 1
        if processed_cols <= present:
            points += 2
        candidates.append((points, csv_path, frame))

    if not candidates:
        raise FileNotFoundError("Could not read any pii_inventory.csv found in the repository.")

    # Highest score first; among equal scores, fewer columns first
    candidates.sort(key=lambda item: (-item[0], len(item[2].columns)))
    best_score, best_path, best_df = candidates[0]

    if best_score == 0:
        listing = "\n".join(str(item[1]) for item in candidates[:10])
        raise ValueError(
            "Found pii_inventory.csv files but none match expected schemas.\n"
            "Candidates:\n" + listing
        )

    # Let the user see which other copies were considered
    if len(candidates) > 1:
        print("PII inventory candidates (top 5 by schema score):")
        for _, cand_path, _ in candidates[:5]:
            print(" -", str(cand_path))

    return best_path, best_df

# Collect every pii_inventory.csv in the repo, then keep the best schema match
all_pii_paths = find_all(REPO_ROOT, "pii_inventory.csv")
if not all_pii_paths:
    raise FileNotFoundError("No pii_inventory.csv found anywhere under the repo.")

PII_PATH, pii = pick_best_pii_inventory(all_pii_paths)

print("Using pii_inventory.csv =", PII_PATH)
print("pii_inventory columns:", pii.columns.tolist())

# Bring either accepted schema to one internal layout with the columns:
# field_path, classification, notes/purpose, present_in_raw, present_in_curated, present_in_analysis
if "present_in" not in pii.columns:
    # Pre-processed schema: the three presence columns must already be there
    needed = {"present_in_raw", "present_in_curated", "present_in_analysis"}
    if not needed.issubset(set(pii.columns)):
        raise ValueError(
            "pii_inventory.csv does not contain 'present_in' nor the processed 'present_in_*' columns.\n"
            f"Columns found: {pii.columns.tolist()}"
        )
    if "notes/purpose" not in pii.columns:
        pii["notes/purpose"] = ""

else:
    # Canonical schema: derive the three presence flags from the free-text 'present_in' column
    if "notes/purpose" not in pii.columns:
        pii["notes/purpose"] = ""

    def parse_presence(x: str) -> set[str]:
        """Split a 'present_in' cell into lower-cased tokens.

        '|', ',', ';' and the space character all act as separators; empty
        tokens are dropped. A missing value yields an empty set.
        """
        if pd.isna(x):
            return set()
        unified = str(x).strip().lower().translate(str.maketrans(",; ", "|||"))
        return {token for token in unified.split("|") if token}

    presence_sets = pii["present_in"].apply(parse_presence)
    for layer in ("raw", "curated", "analysis"):
        # Bind 'layer' as a default so each lambda tests its own token
        pii[f"present_in_{layer}"] = presence_sets.apply(lambda tokens, layer=layer: layer in tokens)

# Final guard: every column used further down must exist at this point
required = {"field_path", "classification", "notes/purpose", "present_in_raw", "present_in_curated", "present_in_analysis"}
missing = required.difference(pii.columns)
if missing:
    raise ValueError(f"After normalization, missing required columns: {missing}. Columns: {pii.columns.tolist()}")


# Normalize classification to pii_class
def norm_class(x: str) -> str:
    """Map a raw classification value to a normalized label.

    Rules, applied in order on the stripped, case-insensitive text:
      - missing (NaN/None) or blank          -> "Unknown"
      - exactly "pii"                        -> "PII"
      - contains "quasi"                     -> "Quasi-PII"
      - contains "non"                       -> "Non-PII"
      - anything else is returned stripped but otherwise unchanged.

    Blank and whitespace-only values are treated like missing ones so that
    they do not end up as an empty-string class.
    """
    if pd.isna(x):
        return "Unknown"
    s = str(x).strip()
    if not s:
        return "Unknown"
    s_lower = s.lower()
    if s_lower == "pii":
        return "PII"
    if "quasi" in s_lower:
        return "Quasi-PII"
    if "non" in s_lower:
        return "Non-PII"
    return s

pii["pii_class"] = pii["classification"].apply(norm_class)


# PII presence matrix: one row per field, ordered by class then field name, saved as CSV
matrix = pii.loc[
    :,
    ["field_path", "pii_class", "present_in_raw", "present_in_curated", "present_in_analysis", "notes/purpose"],
]
matrix = matrix.rename(columns={"field_path": "field_name", "notes/purpose": "notes"})
matrix = matrix.sort_values(by=["pii_class", "field_name"])

OUT_MATRIX = GOV_DIR / "pii_presence_matrix.csv"
matrix.to_csv(OUT_MATRIX, index=False)
print("Exported:", OUT_MATRIX)
display(matrix.head(10))


# Direct PII list: unique, sorted field paths whose class is exactly "PII", one per line
direct_pii_fields = sorted(
    set(pii.loc[pii["pii_class"] == "PII", "field_path"].dropna().astype(str))
)

OUT_DIRECT = GOV_DIR / "direct_pii_fields_list.txt"
OUT_DIRECT.write_text("\n".join(direct_pii_fields), encoding="utf-8")
print("Direct PII list saved to:", OUT_DIRECT)
print("Sample direct PII fields:", direct_pii_fields[:10])


# Load the analysis dataset: try the known locations first, then search under data/
analysis_candidates = [
    REPO_ROOT / "data" / "curated" / name
    for name in ("applications_analysis.csv", "applications_analysis_clean.csv")
]

analysis_path = first_existing(analysis_candidates)
if analysis_path is None:
    analysis_path = find_file_in_data(REPO_ROOT, "applications_analysis.csv")
if analysis_path is None:
    searched = "\n".join(
        [*(f" - {cand}" for cand in analysis_candidates), " - (searched under data/**/applications_analysis.csv)"]
    )
    raise FileNotFoundError("Could not find applications_analysis.csv.\nSearched:\n" + searched)

analysis = pd.read_csv(analysis_path)
print("Loaded analysis dataset:", analysis_path)
print("Rows, Cols:", analysis.shape)


# Direct PII column checks: full field path first, then only the last dotted segment (case-insensitive)
direct_set = set(direct_pii_fields)
exact_present = [column for column in analysis.columns if column in direct_set]

direct_leaf = {field.rsplit(".", 1)[-1].lower() for field in direct_pii_fields}
leaf_present = [column for column in analysis.columns if column.lower() in direct_leaf]

print("Direct PII columns found (exact match):", exact_present)
print("Direct PII columns found (leaf-name match):", leaf_present)



# Leakage scan (object columns only, count-only outputs)
# Cells are compared as text: missing values become "" and every value is cast to str.
scan_df = analysis.select_dtypes(include=["object"]).fillna("").astype(str)

if scan_df.shape[1] == 0:
    print("No object columns to scan for leakage patterns.")
    leak_summary = pd.DataFrame(columns=["pattern", "sample_size_rows", "columns_scanned", "cell_hits", "columns_with_hits"])
    display(leak_summary)
else:
    # Scan at most 300 rows; the fixed seed keeps the sample reproducible across runs
    sample = scan_df.sample(min(len(scan_df), 300), random_state=42)

    patterns = {
        "email_like": re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
        "ip_like": re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
        "ssn_like": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    }

    # A single pass per pattern feeds both the summary table and the per-column details
    results = []
    hit_details = []
    for name, pat in patterns.items():
        # Number of matching cells per column, counted column by column so that
        # the result is always one integer per column (also for a zero-row sample)
        per_col_hits = pd.Series(
            [int(col.str.contains(pat, regex=True, na=False).sum()) for _, col in sample.items()],
            index=sample.columns,
            dtype="int64",
        )
        cell_hits = int(per_col_hits.sum())
        col_hits = int((per_col_hits > 0).sum())
        results.append({
            "pattern": name,
            "sample_size_rows": len(sample),
            "columns_scanned": sample.shape[1],
            "cell_hits": cell_hits,
            "columns_with_hits": col_hits,
        })

        # Column-level details (only column names + counts, no values)
        cols_with_hits = per_col_hits[per_col_hits > 0].sort_values(ascending=False)
        for col_name, n in cols_with_hits.items():
            hit_details.append({"pattern": name, "column": col_name, "hits": int(n)})

    # Summary table
    leak_summary = pd.DataFrame(results)

    OUT_SUMMARY = GOV_DIR / "analysis_pii_leakage_scan_summary.csv"
    leak_summary.to_csv(OUT_SUMMARY, index=False)
    print("Saved:", OUT_SUMMARY)

    # Explicit columns keep the CSV header in place even when no pattern matched anywhere;
    # without them an empty result would be written as a file with no header row
    hit_details_df = pd.DataFrame(hit_details, columns=["pattern", "column", "hits"])

    OUT_COLS = GOV_DIR / "analysis_pii_leakage_by_column.csv"
    hit_details_df.to_csv(OUT_COLS, index=False)
    print("Saved:", OUT_COLS)

    display(leak_summary)

REPO_ROOT = C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03
GOV_DIR (outputs will be saved here) = C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study
Using pii_inventory.csv = C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\quality\pii_inventory.csv
pii_inventory columns: ['field_path', 'classification', 'present_in_raw', 'present_in_curated', 'present_in_analysis']
Exported: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study\pii_presence_matrix.csv


Unnamed: 0,field_name,pii_class,present_in_raw,present_in_curated,present_in_analysis,notes
0,age_band,Non-PII,False,False,True,
1,applicant_info.date_of_birth,PII,True,True,False,
2,applicant_info.email,PII,True,True,False,
3,applicant_info.full_name,PII,True,True,False,
5,applicant_info.ip_address,PII,True,True,False,
6,applicant_info.ssn,PII,True,True,False,
4,applicant_info.gender,Quasi-PII,True,True,True,
7,applicant_info.zip_code,Quasi-PII,True,True,True,
8,applicant_pseudo_id,Quasi-PII,False,False,True,
9,application_id,Quasi-PII,True,True,True,


Direct PII list saved to: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study\direct_pii_fields_list.txt
Sample direct PII fields: ['applicant_info.date_of_birth', 'applicant_info.email', 'applicant_info.full_name', 'applicant_info.ip_address', 'applicant_info.ssn']
Loaded analysis dataset: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\curated\applications_analysis.csv
Rows, Cols: (500, 16)
Direct PII columns found (exact match): []
Direct PII columns found (leaf-name match): []
Saved: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study\analysis_pii_leakage_scan_summary.csv
Saved: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\governance\pii_fields_study\analysis_pii_leakage_by_column.csv


Unnamed: 0,pattern,sample_size_rows,columns_scanned,cell_hits,columns_with_hits
0,email_like,300,6,0,0
1,ip_like,300,6,0,0
2,ssn_like,300,6,0,0
