In [0]:
# 00_validate_bronze_encounters.ipynb
# SOURCE: Bronze Encounters table
# OUTPUT: kardia_meta.bronze_qc (one-row validation summary appended per run)

from pyspark.sql import functions as F

# Notebook configuration: the label written into the validation summary row,
# and the Delta storage location of the Bronze Encounters data.
BRONZE_ENCOUNTERS_TBL = "bronze_encounters"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_encounters"

# Read the Bronze Encounters Delta table directly from its storage path.
encounters_df = spark.read.load(BRONZE_PATH, format="delta")

In [0]:
# 1. Profile the Bronze table in one aggregation pass: total rows, distinct
#    IDs, and NULL counts for the ID and PATIENT columns.
def _null_count(column_name):
    """Build an aggregate that sums a 1/0 flag marking NULLs in `column_name`.

    Note: as a SUM, the result is NULL (None in Python) when there are no rows.
    """
    is_null_flag = F.when(F.col(column_name).isNull(), 1).otherwise(0)
    return F.sum(is_null_flag)


stats_aggregates = [
    F.count("*").alias("row_count"),
    # countDistinct skips NULLs, so NULL IDs are reported separately below.
    F.countDistinct("ID").alias("distinct_ids"),
    _null_count("ID").alias("null_id_count"),
    _null_count("PATIENT").alias("null_patient_count"),
]

# A global aggregate always yields exactly one row; pull it to the driver.
stats_row = encounters_df.agg(*stats_aggregates).first()

print("Validation stats:", stats_row.asDict())

In [None]:
# 2. Run assertions to catch critical data quality issues.
#    Any failure raises AssertionError and stops the notebook run.

# Guard the empty-table case first: with zero rows the NULL counters come back
# as None (SUM over no rows is NULL), which would otherwise surface as the
# misleading failure message "None null ID(s) found".
assert stats_row.row_count > 0, "Bronze Encounters table is empty - nothing to validate"

# Required columns must be fully populated.
assert stats_row.null_id_count == 0, f"{stats_row.null_id_count} null ID(s) found"
assert stats_row.null_patient_count == 0, f"{stats_row.null_patient_count} null PATIENT(s) found"

# distinct_ids ignores NULLs, so this comparison is only a true uniqueness
# check once the null-ID assertion above has passed.
assert stats_row.distinct_ids == stats_row.row_count, "Duplicate Encounter ID(s) detected"
print("Bronze Encounters validation passed")

In [None]:
# 3. Record this run's validation stats as one appended row in the QC
#    tracking table, so record counts can be compared across runs.
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_meta")

# Start from the collected stats, then tag the row with the table it
# describes and the time of this run.
summary_record = stats_row.asDict()
validation_summary_df = spark.createDataFrame([summary_record]).select(
    "*",
    F.lit(BRONZE_ENCOUNTERS_TBL).alias("table_name"),
    F.current_timestamp().alias("_run_ts"),
)

# Append only: earlier runs' rows are kept as history.
validation_summary_df.write.saveAsTable("kardia_meta.bronze_qc", mode="append")