In [0]:
# 01_bronze_stream_claims_autoloader.ipynb
# SOURCE: Avro claim files in dbfs:/kardia/raw/claims/
# OUTPUT: `kardia_bronze.bronze_claims` with Change Data Feed enabled
# TRIGGER: availableNow (incremental batch); appends to the Delta table using a fixed, explicitly declared schema.

from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, DateType)

# Metastore identifiers for the Bronze claims table
BRONZE_DB = "kardia_bronze"
BRONZE_CLAIMS_TABLE = BRONZE_DB + ".bronze_claims"

# Storage locations: raw input files and the Bronze Delta table
RAW_PATH = "dbfs:/kardia/raw/claims/"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_claims"

# Streaming bookkeeping locations, passed later in this notebook as the
# Auto Loader schema location, the stream checkpoint location, and the
# bad-records path respectively.
SCHEMA_PATH = "dbfs:/kardia/_schemas/bronze_claims"
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/bronze_claims"
BAD_PATH = "dbfs:/kardia/_quarantine/raw/bad_claims"

In [0]:
# Explicit schema applied to the incoming claim files.
# Each entry is (column name, Spark type); every column is declared nullable.
# Note that ClaimDate is declared as a string here, not as a date type.
_CLAIM_COLUMNS = [
    ("ClaimID", StringType()),
    ("PatientID", StringType()),
    ("ProviderID", StringType()),
    ("ClaimAmount", DoubleType()),
    ("ClaimDate", StringType()),
    ("DiagnosisCode", StringType()),
    ("ProcedureCode", StringType()),
    ("ClaimStatus", StringType()),
    ("ClaimType", StringType()),
    ("ClaimSubmissionMethod", StringType()),
]

claims_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _CLAIM_COLUMNS]
)

In [0]:
# 1. Ensure database / table exist
# Both statements use IF NOT EXISTS, so an existing database or table is
# left as-is.
create_db_ddl = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

# The table is declared without a column list, pointed at BRONZE_PATH, and
# has the Delta Change Data Feed property switched on.
create_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_CLAIMS_TABLE}
    USING DELTA
    COMMENT 'Bronze Avro ingest of claim records.'
    LOCATION '{BRONZE_PATH}'
    TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
    """

spark.sql(create_db_ddl)
spark.sql(create_table_ddl)

In [0]:
# 2. Auto Loader incremental batch
# Reader side: Auto Loader ("cloudFiles") over Avro files under RAW_PATH,
# read with the explicit claims_schema.
autoloader_options = {
    "cloudFiles.format": "avro",
    "cloudFiles.schemaLocation": SCHEMA_PATH,
    "cloudFiles.includeExistingFiles": "true",
    "badRecordsPath": BAD_PATH,
}

claims_raw = (
    spark.readStream
         .format("cloudFiles")
         .options(**autoloader_options)
         .schema(claims_schema)
         .load(RAW_PATH)
)

# Writer side: append to the Delta location at BRONZE_PATH, tracking progress
# in CHECKPOINT_PATH, using the availableNow trigger.
stream = (
    claims_raw.writeStream
              .format("delta")
              .option("checkpointLocation", CHECKPOINT_PATH)
              .option("mergeSchema", "true")
              .outputMode("append")
              .trigger(availableNow=True)
              .start(BRONZE_PATH)
)

# Block this cell until the streaming query terminates.
stream.awaitTermination()

print(f"Bronze ingest complete: from {RAW_PATH} to {BRONZE_CLAIMS_TABLE}")

In [0]:
# 3. Quick sanity check
# Read the Delta data back by path, report the total row count, and preview
# five rows.
delta_reader = spark.read.format("delta")
df = delta_reader.load(BRONZE_PATH)

row_count = df.count()
print(f"Row count: {row_count}")
display(df.limit(5))

# Show up to three rows of the Delta history output for the same path,
# restricted to the version, timestamp and operation columns.
history_query = f"DESCRIBE HISTORY delta.`{BRONZE_PATH}`"
history_df = spark.sql(history_query).select("version", "timestamp", "operation")
display(history_df.limit(3))