In [0]:
# 00_validate_bronze_patients.ipynb
# SOURCE: Bronze Patients table
# OUTPUT: N/A

from pyspark.sql import functions as F

# Location of the Bronze Patients Delta table.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_patients"

# Load the Delta table into `df`; the validation cell below aggregates over it.
df = (
    spark.read
    .format("delta")
    .load(BRONZE_PATH)
)

In [0]:
# Validate the Bronze Patients table after ingestion. The checks are:
#   1. the table is not empty,
#   2. ID is never NULL,
#   3. GENDER is exactly "M" or "F" (a NULL GENDER counts as invalid),
#   4. ID is unique across all rows.
# Any failed check raises AssertionError so the notebook run stops.
stats = df.agg(
    F.count("*").alias("row_count"),
    # countDistinct ignores NULLs; NULL IDs are caught by null_id_count below.
    F.countDistinct("ID").alias("distinct_ids"),
    # count() over when() without otherwise() counts matching rows and yields
    # 0 (not NULL) when nothing matches or the table is empty.
    F.count(F.when(F.col("ID").isNull(), 1)).alias("null_id_count"),
    # `~isin(...)` evaluates to NULL for a NULL GENDER, which when() treats as
    # "no match"; the explicit isNull() makes NULL genders count as invalid.
    F.count(
        F.when(
            F.col("GENDER").isNull() | ~F.col("GENDER").isin("M", "F"),
            1,
        )
    ).alias("invalid_gender_count"),
).first()
print("Validation stats:", stats.asDict())

# Explicit raises instead of `assert`: assert statements are removed when
# Python runs with -O, which would silently disable the validation.
if stats.row_count == 0:
    raise AssertionError("Bronze Patients table is empty")
if stats.null_id_count != 0:
    raise AssertionError(f"{stats.null_id_count} null ID(s) found")
if stats.invalid_gender_count != 0:
    raise AssertionError(f"{stats.invalid_gender_count} invalid GENDER values")
if stats.distinct_ids != stats.row_count:
    raise AssertionError("Duplicate Patient ID(s) detected")

print("Bronze Patients validation passed")