In [0]:
%sql
-- Reset the data-quality bookkeeping tables so a re-run of the notebook
-- starts from empty ground-truth / incident / profile tables.
-- NOTE(review): this cell appears before the cell that runs
-- CREATE CATALOG / CREATE SCHEMA IF NOT EXISTS, so on a workspace where
-- dq_demo.core does not exist yet these USE statements presumably fail;
-- confirm the intended cell order.
USE CATALOG dq_demo;
USE SCHEMA core;

-- TRUNCATE TABLE removes all rows but keeps the table definition; each table
-- must already exist.
-- NOTE(review): dq_incidents and dq_profiles are not created anywhere in the
-- visible notebook, and dq_ground_truth is only created by the later append
-- writes; verify they are provisioned before this cell runs.
TRUNCATE TABLE dq_ground_truth;
TRUNCATE TABLE dq_incidents;
TRUNCATE TABLE dq_profiles;

In [0]:
%sql
-- Create the demo catalog if it is missing and make it the current catalog.
-- IF NOT EXISTS keeps the statement safe to re-run.
CREATE CATALOG IF NOT EXISTS dq_demo;
USE CATALOG dq_demo;

-- Create the working schema inside dq_demo if it is missing and make it the
-- current schema, so unqualified table names resolve to dq_demo.core.
CREATE SCHEMA IF NOT EXISTS core;
USE SCHEMA core;


In [0]:
%python
# Point the Python Spark session at the demo catalog, then at its schema.
for use_statement in ("USE CATALOG dq_demo", "USE SCHEMA core"):
    spark.sql(use_statement)


In [0]:
%python
from pyspark.sql import functions as F
from pyspark.sql import types as T
import uuid
from datetime import date, timedelta

# 1. Date range: one batch date per calendar day, starting at start_date.
start_date = date(2024, 1, 1)
num_days = 90
dates = [start_date + timedelta(days=i) for i in range(num_days)]

# Single-column DataFrame holding the batch dates.
date_df = spark.createDataFrame(
    [(d,) for d in dates],
    schema=T.StructType([T.StructField("batch_date", T.DateType(), False)])
)

# 2. Customers (simple synthetic)
# Every random column uses its own fixed seed so that re-running the notebook
# regenerates the same customers instead of a different population each time,
# and so the three columns are not driven by one shared stream.
# NOTE(review): seeded rand() repeats only while the partitioning of
# spark.range() stays the same (e.g. same cluster parallelism) -- confirm this
# is acceptable for the demo.
num_customers = 5000
customers_df = (
    spark.range(0, num_customers)
    # Ids look like "C0" .. "C4999"; the numeric id is implicitly cast to string.
    .withColumn("customer_id", F.concat(F.lit("C"), F.col("id")))
    .withColumn(
        "signup_date",
        # Signed up between 0 and 364 days before the first batch date.
        F.date_sub(
            F.lit(start_date),
            F.floor(F.rand(seed=11) * 365).cast("int")
        )
    )
    # Roughly 30% premium / 70% standard.
    .withColumn("segment", F.when(F.rand(seed=12) < 0.3, F.lit("premium")).otherwise(F.lit("standard")))
    # Roughly even split between the two regions.
    .withColumn("region", F.when(F.rand(seed=13) < 0.5, F.lit("NA")).otherwise(F.lit("EU")))
    .drop("id")
)

# Write customers as Delta, replacing any previous version of the table.
(
    customers_df
    .write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("dq_demo.core.customers")
)


In [0]:
# 3. Transactions
import random

# A dedicated, seeded generator makes the synthetic transactions reproducible
# from run to run without touching the global `random` state.
rng = random.Random(42)

payment_methods = ["card", "cash", "finance"]
payment_weights = [0.6, 0.25, 0.15]
regions = ["NA", "EU"]

# Rows are built on the driver as plain tuples in the column order of `schema`.
rows = []
for d in dates:
    # variable number of transactions per day
    n_tx = rng.randint(2000, 4000)
    for i in range(n_tx):
        # Customer ids follow the "C<number>" pattern used by the customers table.
        cust_id = f"C{rng.randint(0, num_customers-1)}"
        # Gaussian amount (mean 100, std 30), floored at 1.0.
        amount = max(1.0, rng.gauss(100.0, 30.0))
        payment_method = rng.choices(payment_methods, weights=payment_weights)[0]
        region = rng.choice(regions)
        # Version-4 formatted id drawn from the seeded generator (uuid.uuid4()
        # would differ on every run). Stable ids matter because the NULL_SPIKE
        # step picks its rows by transaction_id order.
        transaction_id = str(uuid.UUID(int=rng.getrandbits(128), version=4))
        # event_time and created_date both carry the batch day.
        rows.append((transaction_id, cust_id, d, amount, payment_method, region, d))

schema = T.StructType([
    T.StructField("transaction_id", T.StringType(), False),
    T.StructField("customer_id", T.StringType(), False),
    T.StructField("event_time", T.DateType(), False),
    T.StructField("amount", T.DoubleType(), False),
    T.StructField("payment_method", T.StringType(), False),
    T.StructField("region", T.StringType(), False),
    T.StructField("created_date", T.DateType(), False),
])

transactions_df = spark.createDataFrame(rows, schema=schema)

# Replace the transactions table, one Delta partition per created_date.
(
    transactions_df
    .write
    .mode("overwrite")
    .format("delta")
    .partitionBy("created_date")
    .saveAsTable("dq_demo.core.transactions")
)


In [0]:
from pyspark.sql.window import Window

# Reload transactions
tx = spark.table("dq_demo.core.transactions")

# NULL_SPIKE: on one specific day, set customer_id to NULL for a share of rows.
target_date = start_date + timedelta(days=30)
null_fraction = 0.3

# Number the target day's rows by transaction_id so the selection is
# deterministic for a given set of ids. The window has no partitioning, which
# pulls these rows into a single task; only one day of data passes through it.
to_corrupt = (
    tx
    .where(F.col("created_date") == F.lit(target_date))
    .withColumn("row_num", F.row_number().over(Window.orderBy("transaction_id")))
)

# Count once on the driver and pass the threshold as a literal.
n_to_null = int(to_corrupt.count() * null_fraction)

# corrupt the first `n_to_null` rows (30% of the day) in transaction_id order
corrupted = (
    to_corrupt
    .withColumn(
        "customer_id",
        F.when(F.col("row_num") <= F.lit(n_to_null),
               F.lit(None).cast("string"))
        .otherwise(F.col("customer_id"))
    )
    .drop("row_num")
)

# Overwrite only that day's partition: replaceWhere swaps the rows matching the
# predicate for `corrupted` and leaves every other created_date untouched,
# instead of reading and rewriting the whole table.
(
    corrupted
    .write
    .mode("overwrite")
    .format("delta")
    .partitionBy("created_date")
    .option("replaceWhere", f"created_date = '{target_date.isoformat()}'")
    .saveAsTable("dq_demo.core.transactions")
)


In [0]:
from pyspark.sql import Row

# Column layout shared by every ground-truth row: (name, type, nullable).
gt_field_specs = [
    ("id", T.StringType(), False),
    ("table_name", T.StringType(), False),
    ("column_name", T.StringType(), False),
    ("batch_date", T.DateType(), False),
    ("anomaly_type", T.StringType(), False),
    ("injected_value_stats", T.StringType(), True),
]
gt_schema = T.StructType([T.StructField(*spec) for spec in gt_field_specs])

# Ground-truth record describing the NULL_SPIKE injected on target_date.
null_spike_truth = Row(
    id=str(uuid.uuid4()),
    table_name="transactions",
    column_name="customer_id",
    batch_date=target_date,
    anomaly_type="NULL_SPIKE",
    injected_value_stats="30% customer_id set to NULL on this date"
)
gt_rows = [null_spike_truth]

gt_df = spark.createDataFrame(gt_rows, schema=gt_schema)

# Append (not overwrite) so ground truth accumulates across the anomaly cells.
gt_writer = gt_df.write.mode("append").format("delta")
gt_writer.saveAsTable("dq_demo.core.dq_ground_truth")


In [0]:
# Sanity check: print a handful of customers.
customers_tbl = spark.table("dq_demo.core.customers")
customers_tbl.limit(5).show()

# Sanity check: print transaction counts per created_date, in date order.
daily_tx_counts = (
    spark.table("dq_demo.core.transactions")
    .groupBy("created_date")
    .count()
    .orderBy("created_date")
)
daily_tx_counts.show()


In [0]:

# 6. AMOUNT_DRIFT anomaly on a later date
drift_date = start_date + timedelta(days=45)  # 2024-02-15

tx = spark.table("dq_demo.core.transactions")

drift_partition = tx.where(F.col("created_date") == F.lit(drift_date))

# Inflate amount and add noise to simulate drift. The noise is seeded so a
# fresh run of the notebook produces the same drifted values.
# NOTE(review): re-running only this cell multiplies the already-drifted
# amounts again and appends a second ground-truth row; it is meant to run once
# per full regeneration.
drifted = (
    drift_partition
    .withColumn("amount", F.col("amount") * 3.0 + F.rand(seed=45) * 10.0)
)

# Overwrite only the drift day's partition: replaceWhere swaps the rows that
# match the predicate for `drifted` and leaves all other dates untouched,
# instead of reading and rewriting the whole table.
(
    drifted
    .write
    .mode("overwrite")
    .format("delta")
    .partitionBy("created_date")
    .option("replaceWhere", f"created_date = '{drift_date.isoformat()}'")
    .saveAsTable("dq_demo.core.transactions")
)

# Ground truth row for AMOUNT_DRIFT
gt_rows_drift = [
    Row(
        id=str(uuid.uuid4()),
        table_name="transactions",
        column_name="amount",
        batch_date=drift_date,
        anomaly_type="AMOUNT_DRIFT",
        injected_value_stats="amount multiplied by 3 and noise added on this date"
    )
]

gt_df_drift = spark.createDataFrame(gt_rows_drift, schema=gt_schema)

# Append so earlier ground-truth rows are preserved.
(
    gt_df_drift
    .write
    .mode("append")
    .format("delta")
    .saveAsTable("dq_demo.core.dq_ground_truth")
)


In [0]:
# 7. VOLUME_DROP anomaly (fewer rows than normal)
volume_drop_date = start_date + timedelta(days=60)  # 2024-03-01

tx = spark.table("dq_demo.core.transactions")

drop_partition = tx.where(F.col("created_date") == F.lit(volume_drop_date))

# Keep only about 20 percent of rows on that date. sample() draws each row
# independently, so the retained share is approximate; the fixed seed keeps the
# draw repeatable.
# NOTE(review): re-running only this cell samples the already-reduced
# partition again and appends a second ground-truth row.
fraction = 0.2
dropped = drop_partition.sample(withReplacement=False, fraction=fraction, seed=42)

# Overwrite only the affected day's partition: replaceWhere swaps the rows that
# match the predicate for `dropped` and leaves all other dates untouched,
# instead of reading and rewriting the whole table.
(
    dropped
    .write
    .mode("overwrite")
    .format("delta")
    .partitionBy("created_date")
    .option("replaceWhere", f"created_date = '{volume_drop_date.isoformat()}'")
    .saveAsTable("dq_demo.core.transactions")
)

# Ground truth row for VOLUME_DROP on the table (row_count anomaly)
gt_rows_drop = [
    Row(
        id=str(uuid.uuid4()),
        table_name="transactions",
        column_name="*ROW_COUNT*",  # indicate it is about volume, not one column
        batch_date=volume_drop_date,
        anomaly_type="VOLUME_DROP",
        injected_value_stats=f"row_count reduced to {int(fraction*100)}% on this date"
    )
]

gt_df_drop = spark.createDataFrame(gt_rows_drop, schema=gt_schema)

# Append so earlier ground-truth rows are preserved.
(
    gt_df_drop
    .write
    .mode("append")
    .format("delta")
    .saveAsTable("dq_demo.core.dq_ground_truth")
)


In [0]:

# Print every recorded ground-truth anomaly with full-width column values.
ground_truth_tbl = spark.table("dq_demo.core.dq_ground_truth")
ground_truth_tbl.show(truncate=False)