# 04 — Destroy (Simulated)

Generates a simulated deletion plan, tombstones, and audit logs based on retention days and legal hold flags. No destructive actions occur.

In [10]:
import os, json, math, random, re, time
from datetime import datetime, timedelta, timezone
import pandas as pd

# Project root is the parent of the current working directory; all artifacts
# produced by this notebook live under <root>/outputs.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
OUT_DIR = os.path.join(BASE_DIR, "outputs")

# Make sure both the output folder and its audit-log subfolder exist.
for _dir in (OUT_DIR, os.path.join(OUT_DIR, "audit_logs")):
    os.makedirs(_dir, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("OUT_DIR:", OUT_DIR)

BASE_DIR: /
OUT_DIR: /outputs


In [11]:

# Preview a governance plan left on disk by an earlier run, if there is one.
# The plan is (re)generated by the "Generate `governance_plan.parquet`" cells
# further down, so on a fresh checkout the file may not exist yet; in that case
# report it instead of aborting a top-to-bottom run.
governance_path = os.path.join(OUT_DIR, "governance_plan.parquet")
if os.path.exists(governance_path):
    df = pd.read_parquet(governance_path)
    display(df.head())
else:
    print("Not found yet:", governance_path, "- it is created by the generation cells below.")


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,timestamp,tags,event_id,device_id,retention_days
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,2025-12-01T16:28:00.389070+00:00,[],event_0,device_0,30
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,2025-11-30T16:28:00.389070+00:00,[],event_1,device_1,30
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,2025-11-29T16:28:00.389070+00:00,[],event_2,device_2,30
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,2025-11-28T16:28:00.389070+00:00,[],event_3,device_3,30
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0,2025-11-27T16:28:00.389070+00:00,[],event_4,device_4,30


In [12]:
import json
import os

# Show the most recent destroy audit log found on disk.
# The file is looked up in OUT_DIR/audit_logs rather than taken from a kernel
# variable set by a later cell, so this cell also works on a fresh kernel.
# Audit files are named audit_destroy_<UTC timestamp>.json, so the
# lexicographically last name is the newest one.
audit_dir = os.path.join(OUT_DIR, "audit_logs")
audit_names = []
if os.path.isdir(audit_dir):
    audit_names = sorted(
        name for name in os.listdir(audit_dir)
        if name.startswith("audit_destroy_") and name.endswith(".json")
    )

if audit_names:
    latest_audit_path = os.path.join(audit_dir, audit_names[-1])
    with open(latest_audit_path, "r") as f:
        audit_log_content = json.load(f)

    print("Audit Log Content:")
    display(audit_log_content)
else:
    # Nothing has been written yet (the audit entry is produced by the last cell).
    print("No audit log found in", audit_dir, "- run the destroy cells below first.")

Audit Log Content:


{'run_ts': '2026-01-30T16:28:21.368490+00:00',
 'dataset': 'aiot_telemetry',
 'destroy_mode': 'SIMULATED',
 'records_marked': 17000,
 'legal_holds': 0,
 'notes': 'No deletes executed; created destroy_plan.csv + tombstones.jsonl only.'}

### Generate `classified_telemetry.parquet`

This step creates the `classified_telemetry.parquet` file that the governance step reads. It loads `california_housing_train.csv`, adds synthetic `timestamp`, `tags`, `event_id`, and `device_id` columns, and saves the result.

In [13]:
california_housing_path = "/content/sample_data/california_housing_train.csv"
telemetry_df = pd.read_csv(california_housing_path)
row_ids = range(len(telemetry_df))

# Synthetic timestamps: the first row is 60 days old and every following row is
# one day older, so all rows are in the past and have a meaningful age in days.
start_date = datetime.now(timezone.utc) - timedelta(days=60)
telemetry_df["timestamp"] = [
    (start_date - timedelta(days=offset)).isoformat()
    for offset in row_ids
]

# Placeholder tags: every row gets its own empty list (no PII was detected).
telemetry_df["tags"] = [[] for _ in row_ids]

# Placeholder identifiers so the frame looks like typical telemetry;
# device ids cycle through 100 distinct devices.
telemetry_df["event_id"] = [f"event_{i}" for i in row_ids]
telemetry_df["device_id"] = [f"device_{i % 100}" for i in row_ids]

# Persist the classified telemetry for the governance step.
classified_telemetry_path = os.path.join(OUT_DIR, "classified_telemetry.parquet")
telemetry_df.to_parquet(classified_telemetry_path, index=False)

print(f"Created: {classified_telemetry_path}, rows: {len(telemetry_df)}")
display(telemetry_df.head())

Created: /outputs/classified_telemetry.parquet, rows: 17000


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,timestamp,tags,event_id,device_id
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,2025-12-01T16:30:15.760375+00:00,[],event_0,device_0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,2025-11-30T16:30:15.760375+00:00,[],event_1,device_1
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,2025-11-29T16:30:15.760375+00:00,[],event_2,device_2
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,2025-11-28T16:30:15.760375+00:00,[],event_3,device_3
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0,2025-11-27T16:30:15.760375+00:00,[],event_4,device_4


In [14]:
# Read back the classified telemetry written under OUT_DIR.
classified_telemetry_path = os.path.join(OUT_DIR, "classified_telemetry.parquet")
df = pd.read_parquet(path=classified_telemetry_path)

# Assign retention days based on tags (e.g., longer retention for PII)
def assign_retention_days(tags):
    """Return the retention period, in days, for one record's tags.

    Parameters
    ----------
    tags : list-like of str, or any scalar / None
        Tags attached to the record. Any list-like container is accepted
        (list, tuple, set, numpy array, ...). This matters because list
        columns typically come back from a parquet round-trip as numpy
        arrays, which a plain ``isinstance(tags, list)`` check would miss.

    Returns
    -------
    int
        365 when the tags contain "PII", otherwise 30. Non list-like values
        (None, NaN, a bare string) are treated as "no tags" and get 30.
    """
    # list(...) normalises arrays/sets so membership is a plain element test.
    if pd.api.types.is_list_like(tags) and "PII" in list(tags):
        return 365  # 1 year retention for PII
    return 30      # 30 days retention for non-PII

# Derive each row's retention period from its tags.
df["retention_days"] = df["tags"].map(assign_retention_days)

# Persist the enriched frame as the governance plan.
governance_path = os.path.join(OUT_DIR, "governance_plan.parquet")
df.to_parquet(governance_path, index=False)

print(f"Wrote: {governance_path}, rows: {len(df)}")
display(df.head())

Wrote: /outputs/governance_plan.parquet, rows: 17000


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,timestamp,tags,event_id,device_id,retention_days
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,2025-12-01T16:30:15.760375+00:00,[],event_0,device_0,30
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,2025-11-30T16:30:15.760375+00:00,[],event_1,device_1,30
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,2025-11-29T16:30:15.760375+00:00,[],event_2,device_2,30
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,2025-11-28T16:30:15.760375+00:00,[],event_3,device_3,30
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0,2025-11-27T16:30:15.760375+00:00,[],event_4,device_4,30


### Generate `governance_plan.parquet`

Based on the `classified_telemetry.parquet` file, we will now generate the `governance_plan.parquet` by adding `retention_days`. We'll assume a simple rule: data containing PII (`tags` includes 'PII') will have a retention of 365 days, while other data will have a retention of 30 days.

In [15]:
# Input for the governance plan: the classified telemetry stored under OUT_DIR.
classified_telemetry_path = os.path.join(OUT_DIR, "classified_telemetry.parquet")
df = pd.read_parquet(path=classified_telemetry_path)

# Assign retention days based on tags (e.g., longer retention for PII)
def assign_retention_days(tags):
    """Map one record's tags to its retention period in days.

    Parameters
    ----------
    tags : list-like of str, or any scalar / None
        Tags attached to the record. Lists, tuples, sets and numpy arrays are
        all accepted; list columns read back from parquet usually arrive as
        numpy arrays, so checking only for ``list`` would never match them.

    Returns
    -------
    int
        365 if "PII" is among the tags, else 30. Values that are not
        list-like (None, NaN, a bare string) count as untagged and get 30.
    """
    # Convert to a list first so the membership test behaves the same for
    # every container type.
    if pd.api.types.is_list_like(tags) and "PII" in list(tags):
        return 365  # 1 year retention for PII
    return 30      # 30 days retention for non-PII

# Compute the retention period per row from the tags column.
df["retention_days"] = df["tags"].map(assign_retention_days)

# Write the result out as the governance plan consumed by the destroy step.
governance_path = os.path.join(OUT_DIR, "governance_plan.parquet")
df.to_parquet(governance_path, index=False)

print(f"Wrote: {governance_path}, rows: {len(df)}")
display(df.head())

Wrote: /outputs/governance_plan.parquet, rows: 17000


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,timestamp,tags,event_id,device_id,retention_days
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,2025-12-01T16:30:15.760375+00:00,[],event_0,device_0,30
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,2025-11-30T16:30:15.760375+00:00,[],event_1,device_1,30
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,2025-11-29T16:30:15.760375+00:00,[],event_2,device_2,30
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,2025-11-28T16:30:15.760375+00:00,[],event_3,device_3,30
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0,2025-11-27T16:30:15.760375+00:00,[],event_4,device_4,30


In [16]:

# DESTROY (SIMULATED): Generate deletion plan + tombstones + audit logs
# No hard delete is performed. Records are marked and logged only.

# Single reference instant (UTC) used for record ages, tombstones and the audit entry.
now = datetime.now(timezone.utc)

def parse_ts(ts):
    """Parse an ISO-8601 timestamp string into a timezone-aware datetime.

    Parameters
    ----------
    ts : str
        Timestamp such as "2025-12-01T16:30:15.760375+00:00". A trailing "Z"
        is accepted as UTC, and a timestamp without any offset is assumed to
        be UTC.

    Returns
    -------
    datetime
        Always timezone-aware, so it can be subtracted from the aware `now`.

    Raises
    ------
    ValueError
        If the string is not a valid ISO-8601 timestamp.
    TypeError
        If `ts` is not a string.
    """
    # datetime.fromisoformat only understands the "Z" suffix from Python 3.11 on.
    if isinstance(ts, str) and ts.endswith("Z"):
        ts = ts[:-1] + "+00:00"
    parsed = datetime.fromisoformat(ts)
    # Mixing naive and aware datetimes makes the age subtraction fail.
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed

# Age of every record, in (fractional) days, relative to `now`.
df["timestamp_dt"] = df["timestamp"].apply(parse_ts)
elapsed = now - df["timestamp_dt"]
df["age_days"] = elapsed.dt.total_seconds() / (24 * 3600)

# Legal holds are simulated: any record tagged "PII" counts as being on hold.
df["legal_hold"] = df["tags"].apply(lambda t: "PII" in t)

# A record is eligible for (simulated) deletion once it is older than its
# retention period, unless it is under legal hold.
df["eligible_destroy_simulated"] = df["age_days"].gt(df["retention_days"]) & ~df["legal_hold"]

# The plan keeps only the eligible rows and the columns needed to act on them.
plan_columns = ["event_id", "device_id", "timestamp", "retention_days", "age_days", "tags"]
plan = df.loc[df["eligible_destroy_simulated"], plan_columns].assign(destroy_mode="SIMULATED")

plan_path = os.path.join(OUT_DIR, "destroy_plan.csv")
plan.to_csv(plan_path, index=False)
print("Wrote:", plan_path, "rows:", len(plan))
plan.head(10)


Wrote: /outputs/destroy_plan.csv rows: 17000


Unnamed: 0,event_id,device_id,timestamp,retention_days,age_days,tags,destroy_mode
0,event_0,device_0,2025-12-01T16:30:15.760375+00:00,30,60.000006,[],SIMULATED
1,event_1,device_1,2025-11-30T16:30:15.760375+00:00,30,61.000006,[],SIMULATED
2,event_2,device_2,2025-11-29T16:30:15.760375+00:00,30,62.000006,[],SIMULATED
3,event_3,device_3,2025-11-28T16:30:15.760375+00:00,30,63.000006,[],SIMULATED
4,event_4,device_4,2025-11-27T16:30:15.760375+00:00,30,64.000006,[],SIMULATED
5,event_5,device_5,2025-11-26T16:30:15.760375+00:00,30,65.000006,[],SIMULATED
6,event_6,device_6,2025-11-25T16:30:15.760375+00:00,30,66.000006,[],SIMULATED
7,event_7,device_7,2025-11-24T16:30:15.760375+00:00,30,67.000006,[],SIMULATED
8,event_8,device_8,2025-11-23T16:30:15.760375+00:00,30,68.000006,[],SIMULATED
9,event_9,device_9,2025-11-22T16:30:15.760375+00:00,30,69.000006,[],SIMULATED


In [17]:

# Tombstones: one minimal JSON line per planned record, stating the (simulated)
# deletion intent and when it was recorded.
tombstone_path = os.path.join(OUT_DIR, "tombstones.jsonl")
tombstones = plan[["event_id", "destroy_mode"]].assign(tombstone_ts=now.isoformat())

with open(tombstone_path, "w") as f:
    # Each row becomes a dict keyed by column name, serialised on its own line.
    f.writelines(
        json.dumps(record) + "\n"
        for record in tombstones.to_dict(orient="records")
    )
print("Wrote:", tombstone_path, "rows:", len(tombstones))


Wrote: /outputs/tombstones.jsonl rows: 17000


In [18]:

# Audit log entry (single summary) describing this simulated destroy run:
# when it ran, how many records were marked, and how many were on legal hold.
audit = {
    "run_ts": now.isoformat(),
    "dataset": "aiot_telemetry",
    "destroy_mode": "SIMULATED",
    "records_marked": int(len(plan)),
    "legal_holds": int(df["legal_hold"].sum()),
    "notes": "No deletes executed; created destroy_plan.csv + tombstones.jsonl only."
}

# Format the timestamp outside the f-string: reusing double quotes inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
audit_stamp = now.strftime("%Y%m%dT%H%M%SZ")
audit_file = os.path.join(OUT_DIR, "audit_logs", f"audit_destroy_{audit_stamp}.json")
with open(audit_file, "w") as f:
    json.dump(audit, f, indent=2)
print("Wrote:", audit_file)
audit


Wrote: /outputs/audit_logs/audit_destroy_20260130T163016Z.json


{'run_ts': '2026-01-30T16:30:16.236280+00:00',
 'dataset': 'aiot_telemetry',
 'destroy_mode': 'SIMULATED',
 'records_marked': 17000,
 'legal_holds': 0,
 'notes': 'No deletes executed; created destroy_plan.csv + tombstones.jsonl only.'}