In [None]:
# 01_bronze_claims_autoloader.ipynb
# SOURCE:  Avro claim files in dbfs:/kardia/raw/claims/
# OUTPUT: `kardia_bronze.bronze_claims` with Change Data Feed enabled
# TRIGGER: Incremental batch; append to Delta table with schema evolution enabled.

# NOTE: For self-describing formats like Avro/Parquet, we let the file's embedded
# schema drive the Bronze ingestion, and cast/enforce types later in Silver.

# Target database and the fully qualified name of the Bronze claims table.
BRONZE_DB = "kardia_bronze"
BRONZE_CLAIMS_TABLE = BRONZE_DB + ".bronze_claims"

# Raw landing folder that is read, and the storage location of the Delta table.
RAW_PATH = "dbfs:/kardia/raw/claims/"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_claims"

# Supporting locations: Auto Loader schema tracking, streaming checkpoint,
# and the quarantine folder passed as `badRecordsPath`.
SCHEMA_PATH = "dbfs:/kardia/_schemas/bronze_claims"
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/bronze_claims"
BAD_PATH = "dbfs:/kardia/_quarantine/raw/bad_claims"

In [None]:
# 2. Make sure the Bronze database and the claims table are present.
#    Both statements use IF NOT EXISTS, so re-running this cell does not
#    recreate objects that are already there.
create_database_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

# The table is declared without columns; it is a Delta table at BRONZE_PATH
# with Change Data Feed switched on through its table properties.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_CLAIMS_TABLE}
    USING DELTA
    COMMENT 'Bronze Avro ingest of claim records.'
    LOCATION '{BRONZE_PATH}'
    TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
    """

# Database first: the table name is qualified with it.
for ddl in (create_database_sql, create_table_sql):
    spark.sql(ddl)

In [None]:
# 3. Define an incremental batch pipeline using Auto Loader.
#    Auto Loader discovers new files under RAW_PATH and the stream appends
#    them to the Bronze Delta table.

# `F` is needed for the audit columns below and is not imported anywhere else
# in this notebook; without this line a fresh-kernel run fails with NameError.
from pyspark.sql import functions as F

stream = (
    spark.readStream
         .format("cloudFiles")
         .option("cloudFiles.format", "avro")
         # Where Auto Loader persists the inferred schema and its later changes.
         .option("cloudFiles.schemaLocation", SCHEMA_PATH)
         # Also pick up files that were already in RAW_PATH before the first run.
         .option("cloudFiles.includeExistingFiles", "true")
         .option("badRecordsPath", BAD_PATH)
         .load(RAW_PATH)

         # Add audit columns: ingest time, originating file, and the job run id
         # (falls back to "manual" when the conf key is not set).
         .withColumn("_ingest_ts", F.current_timestamp())
         # NOTE(review): `input_file_name()` is reportedly unavailable on some
         # Databricks cluster modes, where `_metadata.file_path` is the
         # replacement - confirm against the target runtime before switching.
         .withColumn("_source_file", F.input_file_name())
         .withColumn("_batch_id", F.lit(spark.conf.get("spark.databricks.job.runId", "manual")))

         .writeStream
         .option("checkpointLocation", CHECKPOINT_PATH)
         # Let columns that appear in the source be added to the Delta table.
         .option("mergeSchema", "true")
         # Process everything currently available, then stop (batch-style run).
         .trigger(availableNow=True)
         .toTable(BRONZE_CLAIMS_TABLE)
)
# Block until the availableNow run has drained the backlog and stopped.
stream.awaitTermination()

print(f"Bronze ingest complete: from {RAW_PATH} to {BRONZE_CLAIMS_TABLE}")

# NOTE:
# Auto Loader handles the read-side logic, maintaining a file-discovery ledger.
# Structured Streaming handles the write-side logic, using a WAL and offset tracking.
# The ledger, WAL, and offsets are stored in the checkpoint directory (CHECKPOINT_PATH).
# The schema evolution log is kept separately, under `cloudFiles.schemaLocation` (SCHEMA_PATH).
# `cloudFiles.schemaLocation` tells Auto Loader where to persist the evolving schema history.

In [None]:
# 4. Stream finished - verify the Bronze table and its commit history.

# Load the Bronze claims table, report its size, and preview a few rows.
df = spark.table(BRONZE_CLAIMS_TABLE)
row_count = df.count()
print(f"Row count: {row_count}")
display(df.limit(5))

# Show the first three rows of the Delta history for the table location,
# keeping only the version, timestamp, and operation columns.
history_query = f"DESCRIBE HISTORY delta.`{BRONZE_PATH}`"
history_df = spark.sql(history_query).select("version", "timestamp", "operation")
display(history_df.limit(3))