In [4]:
from pathlib import Path
import sys
import pandas as pd


# Locate the repository root: starting at the current working directory and
# walking upwards, pick the first directory that contains a "src" entry. When
# no such directory exists, the search ends at the filesystem root.
_search_dirs = [Path.cwd().resolve()]
_search_dirs.extend(_search_dirs[0].parents)
REPO_ROOT = next(
    (candidate for candidate in _search_dirs if (candidate / "src").exists()),
    _search_dirs[-1],
)

# Put the repository root at the front of the import search path.
sys.path.insert(0, str(REPO_ROOT))
print("REPO_ROOT =", REPO_ROOT)


# Output directory: the current working directory (this is the notebook's own
# folder when the notebook is launched from there).
GOV_DIR = Path.cwd().resolve()
print("GOV_DIR (outputs will be saved here) =", GOV_DIR)


# Load pii_inventory.csv from the folder one level above GOV_DIR
# (PII_PATH is built from GOV_DIR.parent, not from GOV_DIR itself).
PII_PATH = GOV_DIR.parent / "pii_inventory.csv"
if not PII_PATH.exists():
    # Report the path that was actually checked so the message cannot point
    # the reader at the wrong directory.
    raise FileNotFoundError(
        f"Could not find pii_inventory.csv at {PII_PATH}. "
        "Make sure the notebook is opened in the Governance folder or adjust PII_PATH."
    )

pii = pd.read_csv(PII_PATH)


# Confirm expected columns: every later step reads these four by name, so fail
# early with the list of what is missing and what was actually found.
expected_cols = {"field_path", "classification", "notes/purpose", "present_in"}
missing = expected_cols - set(pii.columns)
if missing:
    raise ValueError(
        f"Missing columns in pii_inventory.csv: {missing}. Current columns: {pii.columns.tolist()}"
    )

# Build present_in_* columns from 'present_in'
def parse_presence(x: str) -> set[str]:
    """Split a ``present_in`` cell into a set of lowercase tokens.

    Missing values (as detected by ``pd.isna``) yield an empty set. Any other
    value is converted to ``str``, stripped, lowercased and split on ``|``,
    ``,``, ``;`` or a single space; empty fragments are discarded.
    """
    if pd.isna(x):
        return set()
    # Map every alternative separator onto "|" in one pass, then split once.
    unified = str(x).strip().lower().translate(str.maketrans(",; ", "|||"))
    return {token for token in unified.split("|") if token}

# Parse each 'present_in' cell once, then derive one boolean column per layer.
presence_sets = pii["present_in"].apply(parse_presence)

for _layer in ("raw", "curated", "analysis"):
    # Bind the layer name as a default argument so each lambda tests its own
    # layer rather than the loop variable's final value.
    pii[f"present_in_{_layer}"] = presence_sets.apply(
        lambda tokens, layer=_layer: layer in tokens
    )

# Normalize classification 
def norm_class(x: str) -> str:
    """Map a raw ``classification`` value onto a canonical label.

    Returns:
        ``"Unknown"`` for missing values (per ``pd.isna``) and for values that
        are empty after stripping whitespace; ``"PII"`` when the stripped,
        lowercased value is exactly ``"pii"``; ``"Quasi-PII"`` when it
        contains ``"quasi"``; ``"Non-PII"`` when it contains ``"non"``;
        otherwise the stripped original text.
    """
    if pd.isna(x):
        return "Unknown"
    s = str(x).strip()
    if not s:
        # A blank cell carries no more information than a missing one; treat
        # it the same way instead of emitting an empty label.
        return "Unknown"
    s_lower = s.lower()
    if s_lower == "pii":
        return "PII"
    # "quasi" is tested before "non" so a value containing both is reported
    # as Quasi-PII.
    if "quasi" in s_lower:
        return "Quasi-PII"
    # NOTE(review): this is a substring test, so any label containing "non"
    # (for example "anonymized") is reported as Non-PII - confirm this is the
    # intended vocabulary.
    if "non" in s_lower:
        return "Non-PII"
    return s

# Attach the canonical class label used for sorting and filtering below.
pii["pii_class"] = pii["classification"].apply(norm_class)

# Assemble the presence matrix: pick the reporting columns, give them their
# export names, then order rows by class and field name (both ascending).
_matrix_columns = [
    "field_path",
    "pii_class",
    "present_in_raw",
    "present_in_curated",
    "present_in_analysis",
    "notes/purpose",
]
_export_names = {"field_path": "field_name", "notes/purpose": "notes"}

matrix = pii[_matrix_columns]
matrix = matrix.rename(columns=_export_names)
matrix = matrix.sort_values(by=["pii_class", "field_name"])

# Write the matrix next to the notebook, without the DataFrame index.
OUT_MATRIX = GOV_DIR / "pii_presence_matrix.csv"
matrix.to_csv(OUT_MATRIX, index=False)

print(f"Exported: {OUT_MATRIX}")
display(matrix.head(10))

# Direct PII list (for Phase C): the distinct, non-null field paths whose
# class is exactly "PII", in sorted order, written one per line into GOV_DIR.
_direct_mask = pii["pii_class"].eq("PII")
_direct_names = pii.loc[_direct_mask, "field_path"].dropna().astype(str)
direct_pii_fields = _direct_names.drop_duplicates().sort_values().tolist()

OUT_DIRECT = GOV_DIR / "direct_pii_fields_list.txt"
_direct_text = "\n".join(direct_pii_fields)
OUT_DIRECT.write_text(_direct_text, encoding="utf-8")

print(f"Direct PII list saved to: {OUT_DIRECT}")
print("Sample direct PII fields:", direct_pii_fields[:10])

REPO_ROOT = C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03
GOV_DIR (outputs will be saved here) = C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\quality\catalogs\governance
Exported: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\quality\catalogs\governance\pii_presence_matrix.csv


Unnamed: 0,field_name,pii_class,present_in_raw,present_in_curated,present_in_analysis,notes
1,age_band,Non-PII,False,False,True,Privacy-preserving derived age representation.
10,decision.approved_amount,Non-PII,True,True,True,Required when loan_approved=True.
11,decision.interest_rate,Non-PII,True,True,True,Required when loan_approved=True.
12,decision.loan_approved,Non-PII,True,True,True,Decision flag.
13,decision.rejection_reason,Non-PII,True,True,True,Required when loan_approved=False.
14,financials.annual_income,Non-PII,True,True,True,May drift into annual_salary.
15,financials.annual_salary,Non-PII,True,True,True,Field drift variant for annual_income.
16,financials.credit_history_months,Non-PII,True,True,True,Must be >= 0.
17,financials.debt_to_income,Non-PII,True,True,True,"Expected range [0, 1]."
18,financials.savings_balance,Non-PII,True,True,True,Must be >= 0.


Direct PII list saved to: C:\Users\anton\OneDrive - Nova SBE\Nova\S2\DEGO\DEGO_GP\DEGO_Project_Group03\data\quality\catalogs\governance\direct_pii_fields_list.txt
Sample direct PII fields: ['applicant_info.date_of_birth', 'applicant_info.email', 'applicant_info.full_name', 'applicant_info.ip_address', 'applicant_info.ssn']
