In [0]:
%sql
-- Point the session at the catalog and schema that hold the data-quality tables.
USE CATALOG dq_demo;
USE SCHEMA core;

-- Registry of data-quality expectations (rules) to evaluate.
CREATE TABLE IF NOT EXISTS dq_expectations (
  expectation_id STRING,
  table_name STRING,
  -- Target column, or '*ROW_COUNT*' for volume rules.
  column_name STRING,
  -- Rule kind, e.g. 'NOT_NULL', 'RANGE', 'VOLUME_MIN'.
  expectation_type STRING,
  -- JSON document with thresholds, categories, etc.
  params_json STRING,
  -- Origin of the rule: 'baseline' vs 'agent'.
  source STRING,
  created_ts TIMESTAMP
) USING DELTA;

-- Evaluation results: one row per expectation and batch_date per evaluation run.
CREATE TABLE IF NOT EXISTS dq_expectation_metrics (
  expectation_id STRING,
  table_name STRING,
  column_name STRING,
  batch_date DATE,
  rows_checked BIGINT,
  rows_violated BIGINT,
  violation_fraction DOUBLE,
  eval_ts TIMESTAMP
) USING DELTA;


In [0]:
%sql
-- Seed the baseline expectations for the 'transactions' table.
-- MERGE (rather than a plain INSERT) keeps this cell idempotent: dq_expectations
-- persists across runs, so re-running an INSERT would add duplicate E1-E3 rows
-- and every metric would then be evaluated and written more than once.
-- Rows whose expectation_id already exists are left untouched.
MERGE INTO dq_expectations AS tgt
USING (
  SELECT * FROM VALUES
    ('E1', 'transactions', 'customer_id', 'NOT_NULL',   '{"min_non_null_fraction": 0.99}'),
    ('E2', 'transactions', 'amount',      'RANGE',      '{"min": 0.0, "max": 400.0}'),
    ('E3', 'transactions', '*ROW_COUNT*', 'VOLUME_MIN', '{"min_rows": 1000}')
  AS seed_rows(expectation_id, table_name, column_name, expectation_type, params_json)
) AS seed
ON tgt.expectation_id = seed.expectation_id
WHEN NOT MATCHED THEN
  INSERT (expectation_id, table_name, column_name, expectation_type,
          params_json, `source`, created_ts)
  VALUES (seed.expectation_id, seed.table_name, seed.column_name, seed.expectation_type,
          seed.params_json, 'baseline', current_timestamp());


In [0]:
import json

from pyspark.sql import functions as F

spark.sql("USE CATALOG dq_demo")
spark.sql("USE SCHEMA core")

METRICS_TABLE = "dq_demo.core.dq_expectation_metrics"
METRIC_COLUMNS = [
    "expectation_id", "table_name", "column_name", "batch_date",
    "rows_checked", "rows_violated", "violation_fraction", "eval_ts",
]


def build_batch_metrics(tx_df, etype, column_name, params):
    """Compute per-``batch_date`` metrics for a single expectation.

    Args:
        tx_df: DataFrame to check; must contain a ``batch_date`` column.
        etype: Expectation type: 'NOT_NULL', 'RANGE' or 'VOLUME_MIN'.
        column_name: Column the rule targets (ignored for 'VOLUME_MIN').
        params: Dict parsed from the expectation's ``params_json``.

    Returns:
        A DataFrame with ``batch_date``, ``rows_checked``, ``rows_violated``
        and ``violation_fraction``, or ``None`` when ``etype`` is unsupported.

    Raises:
        ValueError: If a 'VOLUME_MIN' rule has no positive ``min_rows``.
    """
    per_batch = tx_df.groupBy("batch_date")
    rows_checked = F.count(F.lit(1)).alias("rows_checked")

    if etype == "NOT_NULL":
        null_rows = F.count(F.when(F.col(column_name).isNull(), 1))
        return (
            per_batch.agg(rows_checked, null_rows.alias("rows_violated"))
            .withColumn("violation_fraction", F.col("rows_violated") / F.col("rows_checked"))
        )

    if etype == "RANGE":
        value = F.col(column_name)
        # Start from FALSE and OR in whichever bounds are configured. A NULL
        # value makes the condition NULL, so NULLs are not counted as range
        # violations (the NOT_NULL rule covers them).
        out_of_range = F.lit(False)
        if params.get("min") is not None:
            out_of_range = out_of_range | (value < F.lit(params["min"]))
        if params.get("max") is not None:
            out_of_range = out_of_range | (value > F.lit(params["max"]))
        violating_rows = F.count(F.when(out_of_range, 1))
        return (
            per_batch.agg(rows_checked, violating_rows.alias("rows_violated"))
            .withColumn("violation_fraction", F.col("rows_violated") / F.col("rows_checked"))
        )

    if etype == "VOLUME_MIN":
        min_rows = params.get("min_rows")
        if min_rows is None or int(min_rows) <= 0:
            raise ValueError(f"VOLUME_MIN requires a positive 'min_rows', got {min_rows!r}")
        min_rows = int(min_rows)
        # NOTE(review): convention chosen here -- rows_violated is the row
        # shortfall against min_rows and violation_fraction is the missing share
        # of the expected volume (0.0 when the batch is large enough). Confirm
        # this matches how downstream consumers read volume metrics.
        # Batches with no rows at all do not appear in the groupBy output.
        shortfall = F.greatest(F.lit(min_rows) - F.col("rows_checked"), F.lit(0)).cast("long")
        return (
            per_batch.agg(rows_checked)
            .withColumn("rows_violated", shortfall)
            .withColumn("violation_fraction", F.col("rows_violated") / F.lit(min_rows))
        )

    return None


exp_df = spark.table("dq_expectations").where("table_name = 'transactions'")

tx = (
    spark.table("dq_demo.core.transactions")
    .withColumn("batch_date", F.col("created_date").cast("date"))
)

# Evaluate every registered expectation and append its per-batch metrics.
# Each run appends a fresh set of rows stamped with the evaluation time (eval_ts).
for row in exp_df.collect():
    eid = row["expectation_id"]
    column_name = row["column_name"]
    etype = row["expectation_type"]
    params = json.loads(row["params_json"] or "{}")

    agg = build_batch_metrics(tx, etype, column_name, params)
    if agg is None:
        # Skip explicitly so an unsupported rule can never re-write the metrics
        # DataFrame left over from a previous iteration.
        print(f"Skipping {eid}: unsupported expectation_type {etype!r}")
        continue

    (
        agg.withColumn("expectation_id", F.lit(eid))
        .withColumn("table_name", F.lit("transactions"))
        .withColumn("column_name", F.lit(column_name))
        .withColumn("eval_ts", F.current_timestamp())
        .select(*METRIC_COLUMNS)
        .write.mode("append").format("delta")
        .saveAsTable(METRICS_TABLE)
    )


In [0]:
%sql
-- How many columns have at least one expectation?
-- The '*ROW_COUNT*' placeholder used by volume rules is not a real column,
-- so it is excluded; otherwise the count is inflated by one.
SELECT COUNT(DISTINCT column_name) AS columns_with_expectations
FROM dq_expectations
WHERE table_name = 'transactions'
  AND column_name <> '*ROW_COUNT*';

-- Expectation metrics summary: mean violation_fraction per expectation
-- over every row recorded in dq_expectation_metrics.
SELECT
  expectation_id,
  AVG(violation_fraction) AS avg_violation_fraction
FROM dq_expectation_metrics
GROUP BY expectation_id;
