In [0]:
# 01_validate_bronze_patients.ipynb
# SOURCE: Bronze Patients table
# OUTPUT: kardia_validation.bronze_patients_summary (one summary row appended per run)

from pyspark.sql import functions as F

# Bronze Patients locations: the short table name used to label the
# validation summary row, and the Delta path that is read below.
BRONZE_PATIENTS_TBL = "bronze_patients"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_patients"

# Load the Bronze Patients Delta table that this notebook validates.
patients_df = (
    spark.read
    .format("delta")
    .load(BRONZE_PATH)
)

In [0]:
# 1. Profile the Bronze Patients table (NULLs, uniqueness, GENDER domain)
#    in a single aggregation pass.
#
# Violation counters use count(when(cond, 1)): when() without otherwise()
# yields NULL for non-matching rows and count() skips NULLs, so each counter
# is the number of matching rows and is 0 (never NULL) for an empty table.
# A sum() over zero rows would return NULL instead, which surfaces as None
# in stats_row and cannot be type-inferred when the summary row is built.
stats_row = patients_df.agg(
    F.count("*").alias("row_count"),
    # countDistinct ignores NULL IDs; those are tallied separately below.
    F.countDistinct("ID").alias("distinct_ids"),
    F.count(F.when(F.col("ID").isNull(), 1)).alias("null_id_count"),
    # isin() evaluates to NULL for a NULL GENDER and ~NULL is still NULL, so
    # a missing GENDER must be tested explicitly or it is counted as valid.
    F.count(
        F.when(F.col("GENDER").isNull() | ~F.col("GENDER").isin("M", "F"), 1)
    ).alias("invalid_gender_count"),
).first()

# Validation outcome; updated by the rule checks that follow.
validation_passed = True
error_messages = []

In [0]:
# 2. Evaluate the profiling stats against the critical data quality rules.
#
# The outcome is rebuilt from scratch here so that re-running this cell is
# idempotent and never appends the same message twice.
error_messages = []

# Aggregates over zero rows can come back as None; treat them as "no violations".
# NOTE(review): an empty table is therefore not flagged by itself - confirm
# whether row_count == 0 should be a failure.
null_id_count = stats_row.null_id_count or 0
invalid_gender_count = stats_row.invalid_gender_count or 0

if null_id_count != 0:
    error_messages.append(f"{null_id_count} null ID(s) found")
if invalid_gender_count != 0:
    error_messages.append(f"{invalid_gender_count} invalid GENDER values")
# countDistinct skips NULLs, so compare against the non-null row count;
# otherwise every NULL ID would also be reported as a duplicate.
if stats_row.distinct_ids != stats_row.row_count - null_id_count:
    error_messages.append("Duplicate Patient ID(s) detected")

validation_passed = not error_messages

# Optional printout
print("Validation stats:", stats_row.asDict())
if validation_passed:
    print("Bronze Patients validation passed")
else:
    print("Bronze Patients validation failed:", "; ".join(error_messages))

In [0]:
# 3. Persist a one-row summary of this run (stats plus verdict) so record
#    counts and data quality results can be tracked over time.
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_validation")

# Start from the profiling stats and attach the run metadata in one projection.
validation_summary_df = spark.createDataFrame([stats_row.asDict()]).select(
    "*",
    F.lit(BRONZE_PATIENTS_TBL).alias("table_name"),
    F.lit(validation_passed).alias("passed"),
    F.lit(", ".join(error_messages)).alias("errors"),
    F.current_timestamp().alias("_run_ts"),
)

# Append to the summary table; mergeSchema lets new columns be added to it.
summary_writer = validation_summary_df.write.mode("append").option("mergeSchema", "true")
summary_writer.saveAsTable("kardia_validation.bronze_patients_summary")

In [None]:
# 4. Render the Bronze Patients validation verdict as a coloured banner:
#    green when every check passed, red with the joined error list otherwise.
if validation_passed:
    banner_color, banner_text = "green", "Bronze Patients validation passed"
else:
    banner_color, banner_text = "red", f"Validation failed: {'; '.join(error_messages)}"

displayHTML(f"<div style='color:{banner_color}; font-weight:bold'>{banner_text}</div>")