# 🛡️ Data Governance Lab: Redaction, License Check, and Provenance
This notebook helps you apply the ethics guidelines, PII filtering, and metadata tracking covered in Module 10.

## ✅ Step 1: Load a Dataset and Print a Sample

In [None]:
from pathlib import Path
import json

# Raw, unfiltered input for the lab; every later cell derives its data from this file.
input_path = Path("data/internal_curated/sample.jsonl")

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with input_path.open(encoding="utf-8") as f:
    # One JSON document per line. Whitespace-only lines are skipped because
    # json.loads() raises JSONDecodeError on them.
    lines = [json.loads(raw_line) for raw_line in f if raw_line.strip()]

print(f"Loaded {len(lines)} examples.")

# Preview of the first two records. These are shown before any PII filtering
# has run, so clear this output before sharing the notebook.
lines[:2]

## 🔍 Step 2: Detect PII and Secrets

In [None]:
import re
# Both lists are rebuilt from scratch on every run, so re-executing this cell
# never double-counts records.
violations = []
clean = []

# Deliberately loose: anything shaped like "local@domain" is treated as an email.
EMAIL_RE = re.compile(r"[\w\.-]+@[\w\.-]+")
# Matched as case-insensitive substrings, so e.g. "tokenizer" also trips "token".
KEYWORDS = ["secret", "token", "password"]

for line in lines:
    # Scan every top-level string field, not only "instruction": the whole
    # record is kept in `clean`, so PII in any field would otherwise slip
    # through. Nested containers are not inspected.
    risky_field = None
    for field, value in line.items():
        if not isinstance(value, str):
            # null / numeric / nested values cannot be regex-searched.
            continue
        lowered = value.lower()  # lower-case once, not once per keyword
        if EMAIL_RE.search(value) or any(kw in lowered for kw in KEYWORDS):
            risky_field = field
            break

    if risky_field is not None:
        # One entry per risky record; "text" holds the first offending field's
        # content and "field" records which field that was.
        violations.append({"field": risky_field, "text": line[risky_field]})
    else:
        clean.append(line)

print(f"Found {len(violations)} risky lines.")

## ✂️ Step 3: Save Cleaned and Flagged Outputs

In [None]:
import csv

# Records that passed the PII/secret scan, one JSON object per line.
with open("data/internal_curated/clean.jsonl", "w", encoding="utf-8") as f:
    for item in clean:
        f.write(json.dumps(item) + "\n")

# Flagged texts go to a one-column CSV for manual review. The csv module
# quotes fields containing commas or double quotes, which hand-built lines
# do not. newline="" lets the csv writer control line endings itself.
with open("data/internal_curated/violations.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["text"])
    for item in violations:
        # Flatten every kind of line break so each violation occupies
        # exactly one physical line in the file.
        flat_text = (
            item["text"]
            .replace("\r\n", " ")
            .replace("\n", " ")
            .replace("\r", " ")
        )
        writer.writerow([flat_text])

print("✅ Clean + violations saved.")

## 🧾 Step 4: Generate Dataset Registry Entry

In [None]:
import hashlib

from datetime import date

# Fingerprint of the input file's exact bytes.
# NOTE(review): this hashes the raw input, while the notes below describe
# redacted data — confirm which artifact the registry is meant to identify.
sha256 = hashlib.sha256(input_path.read_bytes()).hexdigest()
entry = {
    "name": input_path.name,
    "sha256": sha256,
    "license": "MIT",
    "source": "internal/manual",
    # Stamp the actual run date instead of a hard-coded one that goes stale.
    "notes": f"Redacted for PII, cleaned {date.today().isoformat()}",
}

registry_path = Path("data/registry.jsonl")


def _same_dataset(raw_line):
    """Return True if a registry line describes the same name + sha256 as `entry`."""
    try:
        existing = json.loads(raw_line)
    except json.JSONDecodeError:
        # An unparsable line cannot be a match; leave it untouched.
        return False
    return (
        isinstance(existing, dict)
        and existing.get("name") == entry["name"]
        and existing.get("sha256") == entry["sha256"]
    )


# Re-running the notebook must not pile up duplicate rows, so only append
# when this exact file (name + content hash) is not registered yet.
already_registered = False
if registry_path.exists():
    with registry_path.open(encoding="utf-8") as f:
        already_registered = any(_same_dataset(raw) for raw in f if raw.strip())

if already_registered:
    print("ℹ️ Registry already contains this dataset; nothing appended.")
else:
    with registry_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(entry) + "\n")
    print("✅ Registry entry appended.")