In [0]:
from pyspark.sql import functions as F

# Select the catalog first, then the schema, as the session's current namespace.
for context_stmt in ("USE CATALOG dq_demo", "USE SCHEMA core"):
    spark.sql(context_stmt)

from dqcopilot.profiling.profiler import compute_profiles_for_table
from dqcopilot.profiling.anomalies import (
    detect_null_spike,
    detect_amount_drift,
    detect_volume_drop,
)

In [0]:
# Start from empty profile and incident tables so this run's metrics are not
# mixed with rows left behind by earlier runs.
profiles_reset_sql = "TRUNCATE TABLE dq_demo.core.dq_profiles"
incidents_reset_sql = "TRUNCATE TABLE dq_demo.core.dq_incidents"

spark.sql(profiles_reset_sql)
spark.sql(incidents_reset_sql)


In [0]:
# Profile the transactions table (dq_demo.core.transactions).
profile_options = {
    "table_name": "transactions",
    "date_col": "created_date",
    # The profiles table was already truncated in the reset cell, so the
    # profiler is told not to truncate again.
    "truncate_existing": False,
}
rows_written = compute_profiles_for_table(spark, **profile_options)

print(f"Profiles written: {rows_written}")

In [0]:
# All three detectors target the same table.
detector_target = {"table_name": "transactions"}

# NULL spike check on the customer_id column.
null_spike_incidents = detect_null_spike(
    spark, column_name="customer_id", **detector_target
)

# Drift check on the amount column.
amount_drift_incidents = detect_amount_drift(
    spark, column_name="amount", **detector_target
)

# Volume drop check; transaction_id is passed as the volume proxy column.
volume_drop_incidents = detect_volume_drop(
    spark, volume_column="transaction_id", **detector_target
)

# Report how many incidents each detector returned, in detector order.
incident_summaries = (
    ("NULL_SPIKE incidents:", null_spike_incidents),
    ("AMOUNT_DRIFT incidents:", amount_drift_incidents),
    ("VOLUME_DROP incidents:", volume_drop_incidents),
)
for summary_label, incidents_df in incident_summaries:
    print(summary_label, incidents_df.count())

In [0]:
# Preview up to ten profile rows recorded for the transactions table.
profiles_preview = (
    spark.table("dq_demo.core.dq_profiles")
    .where("table_name = 'transactions'")
    .limit(10)
)
profiles_preview.show(truncate=False)

# Display incidents sorted by batch_date, then incident_type.
# NOTE: show() prints only the first 20 rows unless a row count is passed.
incidents_sorted = spark.table("dq_demo.core.dq_incidents").orderBy(
    "batch_date", "incident_type"
)
incidents_sorted.show(truncate=False)
