In [0]:
# 00_validate_bronze_claims.ipynb
# SOURCE: Bronze Claims table
# OUTPUT: N/A

from pyspark.sql import functions as F

# Location of the Bronze Claims Delta table.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_claims"

# Read the Delta table; the validation cell below runs against `df`.
df = spark.read.load(BRONZE_PATH, format="delta")

In [0]:
# Validate the Bronze Claims table after ingestion:
#   - the table is not empty
#   - ClaimID and PatientID contain no NULLs
#   - ClaimID is unique
#   - ClaimAmount has at least one value and is never negative
# All metrics are computed in a single aggregation.
stats = df.agg(
    F.count("*").alias("row_count"),
    # countDistinct ignores NULLs; the NULL assertion below runs before the
    # uniqueness assertion, so a mismatch there means genuine duplicates.
    F.countDistinct("ClaimID").alias("distinct_ids"),
    # when() without otherwise() yields NULL for non-matching rows and count()
    # skips NULLs, so this counts the NULL keys. Unlike sum(), count() returns
    # 0 (not NULL) on an empty table, keeping these counters integers.
    F.count(F.when(F.col("ClaimID").isNull(), 1)).alias("null_claimid"),
    F.count(F.when(F.col("PatientID").isNull(), 1)).alias("null_patientid"),
    # min/max skip NULLs and come back as None when no non-NULL value exists.
    F.min("ClaimAmount").alias("min_amount"),
    F.max("ClaimAmount").alias("max_amount"),
).first()
print("Validation stats:", stats.asDict())

# Fail fast with a clear message instead of a confusing downstream failure.
assert stats.row_count > 0, "Bronze Claims table is empty"
assert stats.null_claimid == 0, f"{stats.null_claimid} null ClaimID(s)"
assert stats.null_patientid == 0, f"{stats.null_patientid} null PatientID(s)"
assert stats.distinct_ids == stats.row_count, "Duplicate ClaimID(s) found"
# Guard the None case explicitly: comparing None >= 0 would raise TypeError.
assert stats.min_amount is not None, "ClaimAmount is NULL for every row"
assert stats.min_amount >= 0, f"Negative ClaimAmount: {stats.min_amount}"
print("Bronze Claims validation passed")