In [0]:
# 01_bronze_stream_encounters_autoloader.ipynb
# SOURCE: Ingest raw encounter Avro files into the Bronze layer using Auto Loader.
# OUTPUT: `kardia_bronze.bronze_encounters` with CDF enabled.
# TRIGGER: Incremental batch; append to Delta table with schema evolution enabled.

# --- Catalog objects -------------------------------------------------------
# Database that holds the Bronze layer, and the fully qualified table name.
BRONZE_DB = "kardia_bronze"
BRONZE_ENC_TABLE = BRONZE_DB + ".bronze_encounters"

# --- Storage locations -----------------------------------------------------
# Every path used by this notebook lives under the same DBFS root.
_KARDIA_ROOT = "dbfs:/kardia"

# Landing folder that is read as the stream source, and the Delta table location.
RAW_PATH = f"{_KARDIA_ROOT}/raw/encounters/"
BRONZE_PATH = f"{_KARDIA_ROOT}/bronze/bronze_encounters"

# Passed to Auto Loader as `cloudFiles.schemaLocation` and to the stream
# writer as `checkpointLocation`, respectively.
SCHEMA_PATH = f"{_KARDIA_ROOT}/_schemas/bronze_encounters"
CHECKPOINT_PATH = f"{_KARDIA_ROOT}/_checkpoints/bronze_encounters"

In [0]:
# 1. Ensure the Bronze DB and Bronze Encounters table exist.
# Both statements are guarded with IF NOT EXISTS, so they only create what is
# missing. The table is declared as Delta at BRONZE_PATH with the Change Data
# Feed table property set to 'true'.
ddl_statements = (
    f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}",
    f"""
CREATE TABLE IF NOT EXISTS {BRONZE_ENC_TABLE}
USING DELTA
COMMENT 'Bronze Avro ingest of encounter records.'
LOCATION '{BRONZE_PATH}'
TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
""",
)

# Run in order: the database must exist before the table is created in it.
for ddl in ddl_statements:
    spark.sql(ddl)

In [0]:
# 2. Define an incremental batch pipeline using Auto Loader.
#
# Read side:  Auto Loader (`cloudFiles`) picks up Avro files under RAW_PATH and
#             keeps the inferred schema in SCHEMA_PATH. Files that were already
#             present before the first run are included.
# Write side: rows are written to BRONZE_ENC_TABLE, with stream progress
#             tracked in CHECKPOINT_PATH so that reruns only process new files.
stream = (
    spark.readStream
         .format("cloudFiles")
         .option("cloudFiles.format",             "avro")
         .option("cloudFiles.schemaLocation",     SCHEMA_PATH)
         .option("cloudFiles.includeExistingFiles","true")
         .load(RAW_PATH)

         .writeStream
         .option("checkpointLocation", CHECKPOINT_PATH)
         # Let the Delta sink accept columns that Auto Loader adds to the
         # inferred schema. Without this, a write whose schema differs from the
         # table's is expected to be rejected by Delta schema enforcement, which
         # would contradict the "schema evolution enabled" intent of this notebook.
         .option("mergeSchema", "true")
         # availableNow: process everything currently available, then stop.
         .trigger(availableNow=True)
         .toTable(BRONZE_ENC_TABLE)
)
# Block the cell until the triggered run has finished, so the verification
# step that follows sees the ingested data.
stream.awaitTermination()

In [0]:
# 3. Stream finished - Verify Bronze table and ingestion history.
print(f"Loaded data from {RAW_PATH} to {BRONZE_PATH}")

# Load the Bronze Encounters Delta data directly from its storage path,
# report how many rows it holds, and preview a handful of them.
bronze_reader = spark.read.format("delta")
df = bronze_reader.load(BRONZE_PATH)
row_count = df.count()
print(f"Row count: {row_count}")
display(df.limit(5))

# Query the Delta Lake history for the same path and keep only the columns
# needed to check CDF and ingest details.
history_columns = ["version", "timestamp", "operation", "operationParameters"]
history_query = f""" DESCRIBE HISTORY delta.`{BRONZE_PATH}` """
history_df = spark.sql(history_query).select(*history_columns)

display(history_df.limit(5))