In [None]:
!pip install spark-expectations

In [None]:
# Drop any existing rules table so the notebook always starts from a clean definition
spark.sql("drop table if exists default.coffeeco_tin_dq_rules")

# Create the Delta table that stores the data-quality rules.
# Its columns mirror the keys of each entry in `rules_yaml` further down.
spark.sql("""create table if not exists default.coffeeco_tin_dq_rules (
    product_id STRING,  
    table_name STRING,  
    rule_type STRING,  
    rule STRING,  
    column_name STRING,  
    expectation STRING,  
    action_if_failed STRING,  
    tag STRING,  
    description STRING,  
    enable_for_source_dq_validation BOOLEAN,  
    enable_for_target_dq_validation BOOLEAN,  
    is_active BOOLEAN,  
    enable_error_drop_alert BOOLEAN,  
    error_drop_threshold INT,  
    query_dq_delimiter STRING,  
    enable_querydq_custom_output BOOLEAN
)
    USING delta""") 

In [None]:
# Print the column names and types of the rules table to confirm the DDL was applied.
spark.sql("select * from default.coffeeco_tin_dq_rules").printSchema()

In [None]:
# Data-quality rules for default.coffeeco_v1_orders_bronze, expressed as YAML.
# Each list entry carries the same keys as the columns of default.coffeeco_tin_dq_rules.
#
# Notes on quoting / SQL inside the YAML:
#   * An expectation that begins with a backtick must be quoted: "`" is a reserved YAML
#     indicator and cannot start a plain scalar (yaml.safe_load raises a scanner error).
#   * The duplicate-detection query joins on both keys with `and` and qualifies the
#     selected columns, since both sides of the join expose order_created / store_id.
rules_yaml = """
rules:
- action_if_failed: fail
  column_name: total_nanos
  description: total_nanos should not be null
  enable_error_drop_alert: false
  enable_for_source_dq_validation: false
  enable_for_target_dq_validation: false
  enable_querydq_custom_output: null
  error_drop_threshold: 0
  expectation: total_nanos is not null
  is_active: true
  product_id: coffeeco
  query_dq_delimiter: null
  rule: total_nanos_is_not_null
  rule_type: row_dq
  table_name: default.coffeeco_v1_orders_bronze
  tag: accuracy
- action_if_failed: drop
  column_name: store_closed_permanently_on
  description: store_closed_permanently_on should be greater than when the order is created, as closed stores cannot have orders
  enable_error_drop_alert: false
  enable_for_source_dq_validation: true
  enable_for_target_dq_validation: true
  enable_querydq_custom_output: null
  error_drop_threshold: 0
  expectation: '`store_closed_permanently_on` > `timestamp`'
  is_active: true
  product_id: coffeeco
  query_dq_delimiter: null
  rule: closed_store_shouldnt_have_orders
  rule_type: row_dq
  table_name: default.coffeeco_v1_orders_bronze
  tag: validity
- action_if_failed: ignore
  column_name: total_units
  description: sum of total units should be greater than 200
  enable_error_drop_alert: false
  enable_for_source_dq_validation: false
  enable_for_target_dq_validation: true
  enable_querydq_custom_output: null
  error_drop_threshold: 0
  expectation: sum(total_units) > 200
  is_active: true
  product_id: coffeeco
  query_dq_delimiter: null
  rule: sum_total_units_gt_200
  rule_type: agg_dq
  table_name: default.coffeeco_v1_orders_bronze
  tag: validity
- action_if_failed: ignore
  column_name: order_created,store_id
  description: detect duplicates
  enable_error_drop_alert: false
  enable_for_source_dq_validation: false
  enable_for_target_dq_validation: true
  enable_querydq_custom_output: True
  error_drop_threshold: 0
  expectation: (select count(*) from (select t1.order_created, t1.store_id from coffeeco_bronze_view t1 join default.coffeeco_v1_orders_bronze t2 on t1.order_created=t2.order_created and t1.store_id=t2.store_id)) = 0
  is_active: true
  product_id: coffeeco
  query_dq_delimiter: null
  rule: duplication_records_should_not_exist
  rule_type: query_dq
  table_name: default.coffeeco_v1_orders_bronze
  tag: validity
"""

In [None]:
# Insert Rules
import yaml
from pyspark.sql.types import StringType, BooleanType, IntegerType, StructType, StructField

# Explicit schema for the rules DataFrame; field names match both the YAML keys and the
# columns of default.coffeeco_tin_dq_rules. Only the two query_dq-specific fields may be null.
schema = StructType([
    StructField("product_id", StringType(), nullable=False),
    StructField("table_name", StringType(), nullable=False),
    StructField("rule_type", StringType(), nullable=False),
    StructField("rule", StringType(), nullable=False),
    StructField("column_name", StringType(), nullable=False),
    StructField("expectation", StringType(), nullable=False),
    StructField("action_if_failed", StringType(), nullable=False),
    StructField("tag", StringType(), nullable=False),
    StructField("description", StringType(), nullable=False),
    StructField("enable_for_source_dq_validation", BooleanType(), nullable=False),
    StructField("enable_for_target_dq_validation", BooleanType(), nullable=False),
    StructField("is_active", BooleanType(), nullable=False),
    StructField("enable_error_drop_alert", BooleanType(), nullable=False),
    StructField("error_drop_threshold", IntegerType(), nullable=False),
    StructField("query_dq_delimiter", StringType(), nullable=True),
    StructField("enable_querydq_custom_output", BooleanType(), nullable=True)
])

# Parse the YAML document and turn the list under "rules" into a DataFrame.
rules_data = yaml.safe_load(rules_yaml)
rules_df = spark.createDataFrame(rules_data["rules"], schema)

# Write into the rules table that the other cells create and read
# (default.coffeeco_tin_dq_rules). The previous f-string referenced `database` and
# `rule_table_name`, which are not defined anywhere in this notebook.
rules_df.write.mode("overwrite").format("delta").saveAsTable("default.coffeeco_tin_dq_rules")

In [None]:
# Read the Tin table as a streaming source
# Streaming read of the Tin Delta table, with a 10-second watermark on `timestamp`.
coffee_orders_df = (
    spark.readStream.format("delta")
    .option("withEventTimeOrder", "true")
    .table("default.coffeeco_v1_orders_tin")
    .withWatermark("timestamp", "10 seconds")
)

In [54]:
# Denorm the data in Tin as needed
from pyspark.sql.functions import col

def denorm(df):
    """Flatten the nested ``order`` struct of a Tin DataFrame into top-level columns.

    Args:
        df: DataFrame with ``date``, ``timestamp`` and a nested ``order`` struct
            (as selected below).

    Returns:
        The flattened DataFrame plus one extra copy of its first row whose
        ``store_closed_permanently_on`` is overwritten with ``2024-01-01 00:00:00``.
        NOTE(review): the extra row presumably exists to give the
        ``closed_store_shouldnt_have_orders`` rule something to flag -- confirm.
    """
    # Local import: the enclosing cell only imports `col`.
    from pyspark.sql.functions import lit

    # Select from the DataFrame that was passed in, not from the global streaming
    # `coffee_orders_df`, so the function works on the micro-batch it is given.
    denorm_df = df.select(
        "date",
        "timestamp",
        col("order.order_created").alias("order_created"),
        col("order.purchased_at.store_id").alias("store_id"),
        col("order.purchased_at.created").alias("store_created"),
        col("order.purchased_at.opened_on").alias("store_opened_on"),
        col("order.purchased_at.closed_permanently_on").alias("store_closed_permanently_on"),
        col("order.purchased_at.status").alias("store_status"),
        col("order.customer.name").alias("customer_name"),
        col("order.customer.uuid").alias("customer_uuid"),
        col("order.customer.first_seen").alias("customer_first_seen"),
        col("order.customer.customer_type").alias("customer_type"),
        col("order.customer.loyalty_member_id").alias("loyalty_member_id"),
        col("order.items").alias("items"),
        col("order.total.currency").alias("total_currency"),
        col("order.total.units").alias("total_units"),
        col("order.total.nanos").alias("total_nanos")
    )

    # Cast the injected literal to the column's existing type so the union does not
    # change the schema of `store_closed_permanently_on`.
    closed_on_type = denorm_df.schema["store_closed_permanently_on"].dataType
    first_row = denorm_df.limit(1)
    row_to_copy = first_row.withColumn(
        "store_closed_permanently_on",
        lit("2024-01-01 00:00:00").cast(closed_on_type),
    )
    return denorm_df.union(row_to_copy)



In [None]:
# For each batch run  Spark Expectations before writing into Bronze Layer
from datetime import date
from spark_expectations.core.expectations import (
    SparkExpectations,
    WrappedDataFrameWriter,
)
from spark_expectations.config.user_config import Constants as user_config

def write_to_bronze(df, batch_id):
    """Validate one micro-batch with Spark Expectations and append it to Bronze.

    Intended as a ``foreachBatch`` callback: the batch is flattened with ``denorm``
    and passed through a function decorated with ``with_expectations`` that targets
    ``default.coffeeco_v1_orders_bronze``.

    Args:
        df: The micro-batch DataFrame.
        batch_id: Batch identifier supplied by ``foreachBatch``; not used here.
    """
    # One append-mode Delta writer is shared by the stats table and the target/error tables.
    delta_writer = WrappedDataFrameWriter().mode("append").format("delta")
    expectations = SparkExpectations(
        product_id="coffeeco",
        rules_df=spark.table("default.coffeeco_tin_dq_rules"),
        stats_table="default.coffeeco_dq_stats",
        stats_table_writer=delta_writer,
        target_and_error_table_writer=delta_writer,
        debugger=False,
        # Left disabled: stats_streaming_options={user_config.se_enable_streaming: False},
    )

    # Free-form job metadata; it is passed to the configuration as a string.
    job_metadata = dict(
        job="coffeeco_load_tin_bronze",
        Region="NA",
        Snapshot=date.today(),
    )

    # E-mail notification settings (e-mail is switched off).
    email_settings = {
        user_config.se_notifications_enable_email: False,
        user_config.se_notifications_email_smtp_host: "mailhost.com",
        user_config.se_notifications_email_smtp_port: 25,
        user_config.se_notifications_email_from: "",
        user_config.se_notifications_email_to_other_mail_id: "",
        user_config.se_notifications_email_subject: "spark expectations - data quality - notifications",
    }
    # Slack notification settings (Slack is switched off).
    slack_settings = {
        user_config.se_notifications_enable_slack: False,
        user_config.se_notifications_slack_webhook_url: "",
    }
    # Which events notify, and the error-drop threshold setting.
    notification_triggers = {
        user_config.se_notifications_on_start: True,
        user_config.se_notifications_on_completion: True,
        user_config.se_notifications_on_fail: True,
        user_config.se_notifications_on_error_drop_exceeds_threshold_breach: True,
        user_config.se_notifications_on_error_drop_threshold: 15,
    }
    # Detailed result flags, error table, rule parameters and job metadata.
    output_settings = {
        user_config.se_enable_query_dq_detailed_result: True,
        user_config.se_enable_agg_dq_detailed_result: True,
        # Left disabled: user_config.querydq_output_custom_table_name: "default.dq_stats_detailed_outputt",
        user_config.se_enable_error_table: True,
        user_config.se_dq_rules_params: {
            "env": "dev",
            "table": "coffeeco_v1_orders_bronze",
        },
        user_config.se_job_metadata: str(job_metadata),
    }
    # Merged in the same key order as a single literal would have.
    se_conf = {
        **email_settings,
        **slack_settings,
        **notification_triggers,
        **output_settings,
    }

    @expectations.with_expectations(
        target_table="default.coffeeco_v1_orders_bronze",
        write_to_table=True,
        write_to_temp_table=False,
        user_conf=se_conf,
        target_table_view="coffeeco_bronze_view"
    )
    def build_bronze(bronze_df):
        # Pass-through: the decorator is what applies the rules and writes the output.
        return bronze_df

    flattened_batch = denorm(df)
    build_bronze(flattened_batch)

In [None]:
# Write to bronze Layer
# Start the streaming query that hands every micro-batch to `write_to_bronze`.
# The chain is wrapped in parentheses because a comment placed inside a backslash
# continuation ends the logical line and makes the following `.start()` a syntax error.
# NOTE(review): `checkpoint_path` is not defined in the visible notebook -- it must be
# set to a checkpoint location before this cell runs.
query = (
    coffee_orders_df.writeStream
    .foreachBatch(write_to_bronze)
    .queryName("WriteToBronzeWithSE")
    .option("checkpointLocation", checkpoint_path)
    # .trigger(processingTime='5 seconds')
    .start()
)

# Blocks this cell until the streaming query stops or fails.
query.awaitTermination()

In [None]:
# List the tables visible in the current catalog database.
spark.catalog.listTables()

In [None]:
# Load the Bronze table written by the streaming query
coffee_v1_bronze_df = spark.sql("select * from default.coffeeco_v1_orders_bronze")

In [None]:
# Print the column names and types of the Bronze table.
coffee_v1_bronze_df.printSchema()

In [None]:
# Load the Bronze error table, which holds the records dropped by Spark Expectations
coffee_v1_bronze_errors_df = spark.sql("select * from default.coffeeco_v1_orders_bronze_error")

In [None]:
# Print the column names and types of the Bronze error table.
coffee_v1_bronze_errors_df.printSchema()

In [None]:
# Display the error rows without truncating long column values.
coffee_v1_bronze_errors_df.show(truncate=False)

In [None]:
# Load the stats table (the `stats_table` passed to SparkExpectations)
stats_df = spark.sql("select * from default.coffeeco_dq_stats")

In [None]:
# Print the column names and types of the stats table.
stats_df.printSchema()

In [None]:
# Display the stats rows without truncating long column values.
stats_df.show(truncate=False)

In [None]:
# Load the detailed stats table
stats_detailed_df = spark.sql("select * from default.coffeeco_dq_stats_detailed")

In [None]:
# Print the column names and types of the detailed stats table.
stats_detailed_df.printSchema()

In [None]:
# Display the detailed stats rows without truncating long column values.
stats_detailed_df.show(truncate=False)