# 02 — Discover + Classify (Tags + Risk Scoring)

Builds a dataset inventory and applies a simple sensitivity classifier (PII-like patterns) to tag data and compute risk scores.

In [15]:
import os
import pandas as pd

# Resolve <parent of cwd>/outputs and create it when it is missing.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
OUT_DIR = os.path.join(_parent_dir, "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

# NOTE(review): `df` (with a "tags" column) is not created in this cell or above
# it in the visible notebook; it has to exist in the kernel already.
classified_path = os.path.join(OUT_DIR, "classified_telemetry.parquet")
df.to_parquet(classified_path, index=False)
print("Wrote:", classified_path)

# Fraction of rows whose tag collection contains "PII".
has_pii = df["tags"].apply(lambda row_tags: "PII" in row_tags)
print("PII rate:", has_pii.mean())

Wrote: /outputs/classified_telemetry.parquet
PII rate: 0.0


In [16]:
import pandas as pd
import os

# Rebuild the path of the classified parquet file under <parent of cwd>/outputs.
OUT_DIR = os.path.join(os.path.abspath(os.path.join(os.getcwd(), "..")), "outputs")
classified_path = os.path.join(OUT_DIR, "classified_telemetry.parquet")

# A missing file is reported with a message (no exception); otherwise the file
# is loaded and its first rows are shown.
if not os.path.exists(classified_path):
    print(f"Error: File not found at {classified_path}. Please ensure the previous cells have been executed.")
else:
    loaded_df = pd.read_parquet(classified_path)
    print("Successfully loaded:", classified_path, "rows:", len(loaded_df))
    display(loaded_df.head())

Successfully loaded: /outputs/classified_telemetry.parquet rows: 17000


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,tags,risk_score
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,[],0.1
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,[],0.1
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,[],0.1
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,[],0.1
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0,[],0.1


The path `/outputs/classified_telemetry.parquet` is a file, not a directory. Load it into a pandas DataFrame with `pd.read_parquet()`.

In [17]:
import pandas as pd
import os

# NOTE(review): this cell performs the same load-and-preview as the cell before it.
OUT_DIR = os.path.join(os.path.abspath(os.path.join(os.getcwd(), "..")), "outputs")
classified_path = os.path.join(OUT_DIR, "classified_telemetry.parquet")

_parquet_present = os.path.exists(classified_path)
if not _parquet_present:
    # Best-effort: print a hint instead of raising when the file is absent.
    print(f"Error: File not found at {classified_path}. Please ensure the previous cells have been executed.")
else:
    loaded_df = pd.read_parquet(classified_path)
    print("Successfully loaded:", classified_path, "rows:", len(loaded_df))
    display(loaded_df.head())

Successfully loaded: /outputs/classified_telemetry.parquet rows: 17000


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,tags,risk_score
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,[],0.1
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,[],0.1
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,[],0.1
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,[],0.1
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0,[],0.1


In [18]:
import os, json, math, random, re, time
from datetime import datetime, timedelta, timezone
import pandas as pd

# BASE_DIR is the parent of the current working directory; outputs live under it.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
OUT_DIR = os.path.join(BASE_DIR, "outputs")

# Create the output folder and its audit_logs subfolder (no error if present).
for _needed_dir in (OUT_DIR, os.path.join(OUT_DIR, "audit_logs")):
    os.makedirs(_needed_dir, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("OUT_DIR:", OUT_DIR)

BASE_DIR: /
OUT_DIR: /outputs


In [19]:
# Path was changed to an existing sample file (hard-coded absolute location).
data_path = "/content/sample_data/california_housing_train.csv"
df = pd.read_csv(data_path)

# Row count taken from the frame's shape; the preview is the cell's output.
print("Loaded:", data_path, "rows:", df.shape[0])
df.head()

Loaded: /content/sample_data/california_housing_train.csv rows: 17000


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [20]:
# DISCOVER: Basic inventory + schema signals
# NOTE(review): "sample_locations" / "sample_devices" are filled from the
# median_income / housing_median_age columns — the key names do not describe
# those columns; confirm this is intended before relying on the JSON.
first_rows = df.head(5)
inventory = {
    "dataset": "california_housing_train",
    "rows": int(df.shape[0]),
    "columns": df.columns.tolist(),
    "sample_locations": first_rows["median_income"].unique().tolist(),
    "sample_devices": first_rows["housing_median_age"].unique().tolist(),
}

# Serialize first, then write the text out in one go.
inventory_path = os.path.join(OUT_DIR, "discover_inventory.json")
inventory_json = json.dumps(inventory, indent=2)
with open(inventory_path, "w") as f:
    f.write(inventory_json)
print("Wrote:", inventory_path)

Wrote: /outputs/discover_inventory.json


In [21]:

# CLASSIFY: Tag sensitive signals using simple patterns (public-safe demo)
PII_PATTERNS = {
    "EMAIL": re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"),
    # The country-code prefix is optional, so plain 10-digit numbers such as
    # 555-123-4567 are detected in addition to forms like +1 (555) 123-4567.
    "PHONE": re.compile(r"(?:\+?\d{1,2}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"),
    "SSN_LIKE": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
}

def classify_note(note: str) -> list:
    """Return the sorted, de-duplicated sensitivity tags found in ``note``.

    Args:
        note: Text to scan. Must be a ``str``; callers are expected to cast
            other values (e.g. with ``.astype(str)``) before calling.

    Returns:
        A sorted list drawn from ``"PII"``, ``"OPERATIONAL"`` and
        ``"HIGH_SENSITIVITY"``; empty when nothing matched.
    """
    tags = set()
    # Any one of the e-mail / phone / SSN-like patterns is enough for "PII".
    if any(pattern.search(note) for pattern in PII_PATTERNS.values()):
        tags.add("PII")
    # Sensitivity heuristics
    if "maintenance" in note.lower():
        tags.add("OPERATIONAL")
    # Every tagged note (PII or OPERATIONAL) is also marked HIGH_SENSITIVITY.
    if tags:
        tags.add("HIGH_SENSITIVITY")
    return sorted(tags)

def _risk_from_tags(tag_list):
    """Map a tag list to a risk score: 0.9 for PII, 0.5 for OPERATIONAL, else 0.1."""
    if "PII" in tag_list:
        return 0.9
    if "OPERATIONAL" in tag_list:
        return 0.5
    return 0.1

# NOTE(review): the classifier runs on median_income cast to text — a numeric
# column; confirm this is the intended input for the PII patterns.
income_as_text = df["median_income"].astype(str)
df["tags"] = income_as_text.apply(classify_note)
df["risk_score"] = df["tags"].apply(_risk_from_tags)

preview_columns = ["median_income", "tags", "risk_score"]
df[preview_columns].head(10)


Unnamed: 0,median_income,tags,risk_score
0,1.4936,[],0.1
1,1.82,[],0.1
2,1.6509,[],0.1
3,3.1917,[],0.1
4,1.925,[],0.1
5,3.3438,[],0.1
6,2.6768,[],0.1
7,1.7083,[],0.1
8,2.1782,[],0.1
9,2.1908,[],0.1


In [22]:

# Persist the tagged frame (without the index) under OUT_DIR.
classified_path = os.path.join(OUT_DIR, "classified_telemetry.parquet")
df.to_parquet(classified_path, index=False)
print("Wrote:", classified_path)

# Fraction of rows whose tag collection contains "PII".
pii_flags = df["tags"].apply(lambda row_tags: "PII" in row_tags)
print("PII rate:", pii_flags.mean())


Wrote: /outputs/classified_telemetry.parquet
PII rate: 0.0
