In [0]:
# 00_validate_raw_encounters.ipynb
# SOURCE: Bronze Encounters table
# OUTPUT: N/A

from pyspark.sql import functions as F

# Location of the Bronze Encounters Delta table validated by this notebook.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_encounters"

# Read the table as a DataFrame; the format is passed directly to load().
df = spark.read.load(BRONZE_PATH, format="delta")

In [0]:
# Validate Bronze Encounters in a single aggregation pass: the table must be
# non-empty, ID and PATIENT must never be NULL, and ID must be unique.
stats = df.agg(
    F.count("*").alias("row_count"),
    F.countDistinct("ID").alias("distinct_ids"),
    # when() without otherwise() yields NULL for non-matching rows and count()
    # skips NULLs, so each expression counts exactly the rows where the column
    # is NULL. Unlike sum(), count() returns 0 (not NULL) over zero rows, so
    # the counters are always integers.
    F.count(F.when(F.col("ID").isNull(), 1)).alias("null_id_count"),
    F.count(F.when(F.col("PATIENT").isNull(), 1)).alias("null_patient_count"),
).first()
print("Validation stats:", stats.asDict())

# With zero rows every check below would pass vacuously, so reject an empty
# table explicitly.
assert stats.row_count > 0, "Bronze Encounters table is empty"
assert stats.null_id_count == 0, f"{stats.null_id_count} null ID(s) found"
assert stats.null_patient_count == 0, f"{stats.null_patient_count} null PATIENT(s) found"
# countDistinct ignores NULLs; the null-ID assertion above has already passed,
# so any gap between distinct IDs and row count can only come from duplicates.
assert stats.distinct_ids == stats.row_count, "Duplicate Encounter ID(s) detected"

print("Bronze Encounters validation passed")