
# 09 — Data Quality Audit: Accuracy, Completeness, Consistency, Validity, Uniqueness, Timeliness

Simulate a small dataset and run basic audit checks with clear flags and counts.


In [None]:

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Seed NumPy's global generator so the simulated records are reproducible.
np.random.seed(123)

n = 30
today = datetime(2025, 9, 24)  # fixed reference date for the timeliness check

# --- Simulate the raw columns -------------------------------------------
# The random draws happen in the order age -> region -> income -> update lag,
# so the seeded stream is always consumed in the same sequence.
ids = [f"u{i:03d}" for i in range(n)]
ages = np.random.randint(-5, 130, size=n)  # range deliberately includes invalid ages
emails = [f"user{i}@example.com" for i in range(n)]
regions = np.random.choice(["North", "South", "East", "West"], size=n)
income_levels = [np.nan, 20, 25, 30, 40, 50, 60, 80]
income_probs = [0.2, 0.1, 0.1, 0.15, 0.15, 0.15, 0.1, 0.05]
incomes = np.random.choice(income_levels, size=n, p=income_probs)
days_since_update = np.random.randint(0, 60, size=n)
update_times = [today - timedelta(days=int(lag)) for lag in days_since_update]

df = pd.DataFrame(
    {
        "id": ids,
        "age": ages,
        "email": emails,
        "region": regions,
        "income": incomes,
        "updated_at": update_times,
    }
)

# --- Inject known data-quality problems ----------------------------------
# Row 5 gets an email without an '@' (invalid format).
df.loc[5, "email"] = "bademail.example.com"
# Row 3 is appended again at the end, producing an exact duplicate record.
repeated_row = df.iloc[[3]]
df = pd.concat([df, repeated_row], ignore_index=True)
# Row 10 reuses row 3's id, so that id is shared by different records.
df.loc[10, "id"] = df.loc[3, "id"]

print(df.head())

# --- Audit checks: each mask is True where a row FAILS the check ----------
# Accuracy/Validity: age must lie in the closed interval [0, 110].
age_in_range = df["age"].between(0, 110)
mask_age_bad = ~age_in_range

# Validity: an email must contain a literal '@'.
has_at_sign = df["email"].astype(str).str.contains("@", regex=False)
mask_email_bad = ~has_at_sign

# Completeness: income is missing.
mask_income_na = df["income"].isna()

# Uniqueness: keep=False marks every row whose id occurs more than once.
mask_id_dup = df["id"].duplicated(keep=False)

# Timeliness: a record is stale when it was last updated more than 30 days
# before `today`.
days_old = (today - df["updated_at"]).dt.days
mask_stale = days_old > 30

# --- Summarize: one row per check, a single "count" column ----------------
issue_masks = {
    "age_bad": mask_age_bad,
    "email_bad": mask_email_bad,
    "income_na": mask_income_na,
    "id_dups": mask_id_dup,
    "stale": mask_stale,
}
issue_counts = {label: mask.sum() for label, mask in issue_masks.items()}
summary = pd.DataFrame(issue_counts, index=["count"]).T

print("\nIssue counts:\n", summary)

# ---- Classwork ----
# 1) Write a function 'audit_report(df)' returning per-column missingness rates and basic validity checks.
# 2) Define and enforce an SLA: e.g., <=10% missing income, <=1% invalid emails; assert these conditions.
# 3) Deduplicate: keep latest 'updated_at' for duplicated ids; verify uniqueness post-fix.
