In [0]:
# 01_validate_bronze_providers.ipynb
# SOURCE: Bronze Providers table
# OUTPUT: kardia_validation.bronze_providers_summary (one summary row appended per run)

from pyspark.sql import functions as F

# Label recorded in the validation summary, and the Delta path the table is read from.
BRONZE_PROVIDERS_TBL = "bronze_providers"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_providers"

# Load the Bronze Providers table straight from its Delta location.
providers_df = spark.read.load(BRONZE_PATH, format="delta")

In [0]:
# 1. Profile the Bronze table in a single pass: total rows, distinct ProviderIDs,
#    and NULL counts for ProviderID, ProviderSpecialty and ProviderLocation.
def _null_count(column_name):
    """Return an aggregate expression counting rows where `column_name` is NULL.

    COUNT over a conditional is used instead of SUM so that the result is 0
    (not NULL) when the table has no rows; a NULL here would surface as None
    in `stats_row` and break the checks and the summary write further down.
    """
    # F.when without .otherwise yields NULL for non-matching rows, and COUNT
    # only counts non-NULL values, so this counts exactly the NULL rows.
    return F.count(F.when(F.col(column_name).isNull(), 1))


stats_row = providers_df.agg(
    F.count("*").alias("row_count"),
    # countDistinct skips NULLs, so this covers non-null ProviderIDs only.
    F.countDistinct("ProviderID").alias("distinct_ids"),
    _null_count("ProviderID").alias("null_ids"),
    _null_count("ProviderSpecialty").alias("null_specialty"),
    _null_count("ProviderLocation").alias("null_location"),
).first()

# Track validation result
validation_passed = True
error_messages = []

In [0]:
# 2. Evaluate the critical data quality rules.
#    Nothing is asserted or raised here: the outcome is only recorded in
#    `validation_passed` / `error_messages` (used by later cells) and printed.

# Start from a clean slate so re-running this cell does not duplicate messages.
error_messages = []

# An aggregate over zero rows can come back as None; treat that as "no NULLs".
# NOTE(review): an empty table is therefore not flagged as a failure here —
# confirm whether row_count == 0 should be treated as an error.
null_id_count = stats_row.null_ids or 0

if null_id_count != 0:
    error_messages.append(f"{null_id_count} null ProviderID(s)")

# distinct_ids excludes NULL ProviderIDs (countDistinct skips NULLs), so it must be
# compared with the number of non-null rows. Comparing against row_count would
# report every NULL ProviderID as a duplicate as well.
non_null_id_rows = stats_row.row_count - null_id_count
if stats_row.distinct_ids != non_null_id_rows:
    error_messages.append("Duplicate ProviderID(s) found")

validation_passed = not error_messages

# Print the raw stats and the verdict; execution continues either way.
print("Validation stats:", stats_row.asDict())
if validation_passed:
    print("Bronze Providers validation passed")
else:
    print("Bronze Providers validation failed:", "; ".join(error_messages))

In [0]:
# 3. Persist a one-row summary of this run (stats + verdict) to the validation table.
summary_table = "kardia_validation.bronze_providers_summary"

# Start from the aggregated stats, then tack on the run metadata columns.
stats_only_df = spark.createDataFrame([stats_row.asDict()])
validation_summary_df = stats_only_df.select(
    "*",
    F.lit(BRONZE_PROVIDERS_TBL).alias("table_name"),
    F.lit(validation_passed).alias("passed"),
    F.lit(", ".join(error_messages)).alias("errors"),
    F.current_timestamp().alias("_run_ts"),
)

# Append to the summary table; mergeSchema is enabled on the write.
summary_writer = validation_summary_df.write.mode("append").option("mergeSchema", "true")
summary_writer.saveAsTable(summary_table)

# Display the five most recent summary rows for a quick visual check.
latest_runs_sql = "SELECT * FROM kardia_validation.bronze_providers_summary ORDER BY _run_ts DESC LIMIT 5"
display(spark.sql(latest_runs_sql))