In [0]:
# 00_validate_bronze_claims.ipynb
# SOURCE: Bronze Claims table
# OUTPUT: kardia_meta.bronze_qc (one validation-summary row appended per run)

from pyspark.sql import functions as F

# Storage path of the Bronze claims Delta table, plus the logical table name
# that is attached to the validation summary row written at the end.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_claims"
BRONZE_CLAIMS_TBL = "bronze_claims"

# Read the Delta table by path; the validation aggregates run over this DataFrame.
claims_df = spark.read.load(BRONZE_PATH, format="delta")

In [0]:
# 1. Profile the Bronze table with one agg() call: total rows, distinct
#    ClaimIDs, NULL counts for the key columns, and the ClaimAmount range.
def _null_count(column_name):
    """Return an aggregate expression that counts rows where `column_name` is NULL."""
    is_missing = F.col(column_name).isNull()
    return F.sum(F.when(is_missing, 1).otherwise(0))


profile_exprs = [
    F.count("*").alias("row_count"),
    F.countDistinct("ClaimID").alias("distinct_ids"),
    _null_count("ClaimID").alias("null_claimid"),
    _null_count("PatientID").alias("null_patientid"),
    F.min("ClaimAmount").alias("min_amount"),
    F.max("ClaimAmount").alias("max_amount"),
]

# An ungrouped aggregation produces a single row; fetch it as a Row object
# so the individual stats can be read as attributes by later cells.
stats_row = claims_df.agg(*profile_exprs).head()

print("Validation stats:", stats_row.asDict())

In [None]:
# 2. Run assertions to catch critical data quality issues.
#
# SUM/MIN/MAX over zero rows evaluate to NULL, which arrives here as None.
# Check for an empty table first so the failure says what is actually wrong
# instead of reporting "None null ClaimID(s)".
assert stats_row.row_count > 0, "Bronze Claims table is empty"
assert stats_row.null_claimid == 0, f"{stats_row.null_claimid} null ClaimID(s)"
assert stats_row.null_patientid == 0, f"{stats_row.null_patientid} null PatientID(s)"
# countDistinct skips NULLs, but NULL ClaimIDs were already ruled out above,
# so any gap between the two counts means duplicated ClaimIDs.
assert stats_row.distinct_ids == stats_row.row_count, "Duplicate ClaimID(s) found"
# MIN ignores NULLs, so it is None only when every ClaimAmount is NULL.
# Guard explicitly: `None >= 0` would raise TypeError rather than a clear
# validation failure.
assert stats_row.min_amount is not None, "ClaimAmount is NULL for every row"
assert stats_row.min_amount >= 0, f"Negative ClaimAmount: {stats_row.min_amount}"
print("Bronze Claims validation passed")

In [None]:
# 3. Persist this run's stats as one appended row in the QC table so the
#    record count (and the other metrics) can be tracked from run to run.
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_meta")

# Build a one-row DataFrame from the stats and tag it with the table the
# stats describe and the time of this run.
summary_record = stats_row.asDict()
validation_summary_df = spark.createDataFrame([summary_record]).select(
    "*",
    F.lit(BRONZE_CLAIMS_TBL).alias("table_name"),
    F.current_timestamp().alias("_run_ts"),
)

# Append mode keeps the rows written by earlier runs.
validation_summary_df.write.saveAsTable("kardia_meta.bronze_qc", mode="append")