In [0]:
# 01_validate_bronze_providers.ipynb
# SOURCE: Bronze Providers table
# OUTPUT: kardia_validation.bronze_providers_summary (one summary row appended per run)

from pyspark.sql import functions as F

# Logical name of the table under validation, and the DBFS path its Delta
# files are read from.
BRONZE_PROVIDERS_TBL = "bronze_providers"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_providers"

# Load the Bronze Providers data by path, using the Delta reader.
providers_df = spark.read.load(BRONZE_PATH, format="delta")

In [0]:
# 1. Check for NULLs and uniqueness after Bronze ingestion.
def _null_count(column_name, alias):
    """Return an aggregate expression that adds 1 per row where `column_name` is NULL."""
    is_missing = F.col(column_name).isNull()
    return F.sum(F.when(is_missing, 1).otherwise(0)).alias(alias)


# All metrics are computed in a single agg() call; the order here is the
# field order of the resulting Row.
profile_exprs = [
    F.count("*").alias("row_count"),
    F.countDistinct("ProviderID").alias("distinct_ids"),
    _null_count("ProviderID", "null_ids"),
    _null_count("ProviderSpecialty", "null_specialty"),
    _null_count("ProviderLocation", "null_location"),
]

# agg() yields one row; first() pulls it to the driver as a Row.
stats_row = providers_df.agg(*profile_exprs).first()

print("Validation stats:", stats_row.asDict())

In [None]:
# 2. Run assertions to catch critical data quality issues.

# Report an empty table explicitly. With zero input rows the SUM-based null
# counters in stats_row are None rather than 0, so without this guard the
# null-ID check below would fail with the misleading message
# "None null ProviderID(s)".
assert stats_row.row_count > 0, "Bronze Providers table is empty - nothing to validate"

# ProviderID must be populated on every row.
assert stats_row.null_ids == 0, f"{stats_row.null_ids} null ProviderID(s)"

# Once NULL ids are ruled out, the distinct count equals the row count
# exactly when no ProviderID appears more than once.
assert stats_row.distinct_ids == stats_row.row_count, "Duplicate ProviderID(s) found"

print("Bronze Providers validation passed")

In [None]:
# 3. Append one-row summary to track record count (data quality check)
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_validation")

# Build a single-record DataFrame from the aggregated stats, then add the
# name of the validated table and the timestamp of this run as trailing columns.
stats_record = stats_row.asDict()
stats_only_df = spark.createDataFrame([stats_record])
validation_summary_df = stats_only_df.select(
    "*",
    F.lit(BRONZE_PROVIDERS_TBL).alias("table_name"),
    F.current_timestamp().alias("_run_ts"),
)

# Append (never overwrite) so earlier runs' rows are kept in the summary table.
validation_summary_df.write.saveAsTable(
    "kardia_validation.bronze_providers_summary",
    mode="append",
)