In [0]:
# 01_validate_bronze_claims.ipynb
# SOURCE: Bronze Claims table
# OUTPUT: kardia_validation.bronze_claims_summary (one summary row appended per run)

from pyspark.sql import functions as F

# Logical table name; it is also the final segment of the Bronze Delta path.
BRONZE_CLAIMS_TBL = "bronze_claims"
# Storage location the Bronze Claims table is loaded from.
BRONZE_PATH = f"dbfs:/kardia/bronze/{BRONZE_CLAIMS_TBL}"

In [0]:
# Load the Bronze Delta table and, for validation purposes only, view
# ClaimAmount as a double so the min/max range checks are numeric.
# NOTE(review): depending on the cluster's ANSI setting, values that cannot be
# cast may silently become NULL and then be skipped by min/max - confirm.
claims_df = (
    spark.read
    .format("delta")
    .load(BRONZE_PATH)
    .withColumn("ClaimAmount", F.col("ClaimAmount").cast("double"))
)

In [0]:
# 1. Check for NULLs and uniqueness after Bronze ingestion.
def _null_flag_sum(column_name, alias):
    """Build a SUM(CASE WHEN <column> IS NULL THEN 1 ELSE 0 END) aggregate."""
    is_missing = F.col(column_name).isNull()
    return F.sum(F.when(is_missing, 1).otherwise(0)).alias(alias)


# Every metric is computed in a single aggregation pass over the table.
claim_metrics = [
    F.count("*").alias("row_count"),
    F.countDistinct("ClaimID").alias("distinct_ids"),
    _null_flag_sum("ClaimID", "null_claimid"),
    _null_flag_sum("PatientID", "null_patientid"),
    F.min("ClaimAmount").alias("min_amount"),
    F.max("ClaimAmount").alias("max_amount"),
]
stats_row = claims_df.agg(*claim_metrics).first()

# Track validation status
error_messages = []
validation_passed = True

In [0]:
# 2. Run assertions to catch critical data quality issues
#
# Every check is evaluated and every failure is recorded, so the summary row
# and the status banner list all problems instead of only the first one hit.

# Reset the status here so re-running this cell never duplicates messages.
validation_passed = True
error_messages = []

# SUM() over zero rows yields NULL, so the NULL counters arrive as None when
# the table is empty; normalise them to 0 before comparing.
row_count = stats_row.row_count or 0
null_claimid = stats_row.null_claimid or 0
null_patientid = stats_row.null_patientid or 0

# An empty table keeps failing validation, but with an accurate message
# rather than "None null ClaimID(s)".
if row_count == 0:
    error_messages.append("Bronze Claims table is empty")

if null_claimid != 0:
    error_messages.append(f"{null_claimid} null ClaimID(s)")

if null_patientid != 0:
    error_messages.append(f"{null_patientid} null PatientID(s)")

# countDistinct ignores NULLs, so compare against the non-null row count;
# otherwise NULL ClaimIDs would also be reported as duplicates.
if stats_row.distinct_ids != row_count - null_claimid:
    error_messages.append("Duplicate ClaimID(s) found")

# min_amount is None when there is no non-null ClaimAmount to inspect.
if stats_row.min_amount is not None and stats_row.min_amount < 0:
    error_messages.append(f"Negative ClaimAmount: {stats_row.min_amount}")

validation_passed = not error_messages

if validation_passed:
    print("Bronze Claims validation passed")
else:
    print("Bronze Claims validation failed:", "; ".join(error_messages))

In [0]:
# 3. Append one-row summary to track record count (data quality check)
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_validation")

# Explicit column types for the aggregated metrics. Schema inference cannot
# type a None value, and min/max (and the NULL counters) are None when the
# table is empty or ClaimAmount is entirely NULL - exactly the runs that most
# need to be recorded.
summary_stat_fields = [
    ("distinct_ids", "BIGINT"),
    ("max_amount", "DOUBLE"),
    ("min_amount", "DOUBLE"),
    ("null_claimid", "BIGINT"),
    ("null_patientid", "BIGINT"),
    ("row_count", "BIGINT"),
]
summary_schema = ", ".join(f"{name} {dtype}" for name, dtype in summary_stat_fields)

# Lay the metrics out as a tuple in schema order.
stats_by_name = stats_row.asDict()
summary_values = tuple(stats_by_name[name] for name, _ in summary_stat_fields)

validation_summary_df = (
    spark.createDataFrame([summary_values], schema=summary_schema)
    .withColumn("table_name", F.lit(BRONZE_CLAIMS_TBL))
    .withColumn("passed", F.lit(validation_passed))
    .withColumn("errors", F.lit(", ".join(error_messages)))
    .withColumn("_run_ts", F.current_timestamp())
)

# Append so each run adds a row to the history; mergeSchema lets the table
# pick up columns introduced by later versions of this notebook.
(validation_summary_df.write
                      .mode("append")
                      .option("mergeSchema", "true")
                      .saveAsTable("kardia_validation.bronze_claims_summary"))

In [None]:
# 4. Display Bronze Claims validation status
# Pick the banner colour and text first, then render a single HTML snippet.
banner_color, banner_text = (
    ("green", "Bronze Claims validation passed")
    if validation_passed
    else ("red", "Validation failed: " + "; ".join(error_messages))
)
displayHTML(f"<div style='color:{banner_color}; font-weight:bold'>{banner_text}</div>")