In [0]:
# 01_bronze_encounters_autoloader.ipynb
# SOURCE: Ingest raw encounter Avro files into the Bronze layer using Auto Loader.
# OUTPUT: `kardia_bronze.bronze_encounters` with CDF enabled.
# TRIGGER: Continuously running micro-batch stream (30-second processing-time trigger); appends to the Delta table with schema evolution enabled.

import pyspark.sql.functions as F

# Target database and the fully qualified Bronze table name.
BRONZE_DB = "kardia_bronze"
BRONZE_ENCOUNTERS_TBL = f"{BRONZE_DB}.bronze_encounters"

# Data locations: raw Avro landing zone, bad-records quarantine, and the
# storage location of the Bronze Delta table.
RAW_PATH = "dbfs:/kardia/raw/encounters/"
BAD_PATH = "dbfs:/kardia/_quarantine/raw/bad_encounters"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_encounters"

# Streaming state: Auto Loader schema-tracking location and the
# write-side checkpoint location.
SCHEMA_PATH = "dbfs:/kardia/_schemas/bronze_encounters"
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/bronze_encounters"

In [0]:
# 1. Bootstrap the Bronze database and the Bronze Encounters Delta table.
#    Both statements use IF NOT EXISTS, so re-running this cell is safe.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

# The table is declared without columns and is registered at BRONZE_PATH
# with Change Data Feed enabled.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_ENCOUNTERS_TBL}
    USING DELTA
    COMMENT 'Bronze Avro ingest of encounter records.'
    LOCATION '{BRONZE_PATH}'
    TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
    """

# Database first, then the table that lives inside it.
for ddl_statement in (create_db_sql, create_table_sql):
    spark.sql(ddl_statement)

In [0]:
# 2. Define and start the Auto Loader streaming pipeline:
#    raw Avro files under RAW_PATH -> audit columns -> append to the Bronze Delta table.
stream = (
    spark.readStream
         .format("cloudFiles")
         .option("cloudFiles.format", "avro")
         # Auto Loader persists the schema it tracks for this source at SCHEMA_PATH.
         .option("cloudFiles.schemaLocation", SCHEMA_PATH)
         .option("cloudFiles.includeExistingFiles", "true")
         # Records that cannot be read are written under BAD_PATH.
         .option("badRecordsPath", BAD_PATH)
         .load(RAW_PATH)

         # Add audit columns
         .withColumn("_ingest_ts", F.current_timestamp())
         # Use the file metadata column instead of input_file_name(): the latter is
         # deprecated on recent Databricks runtimes and is not supported on
         # Unity Catalog shared / serverless compute.
         .withColumn("_source_file", F.col("_metadata.file_path"))
         # NOTE: the conf value is read once, when the query is defined, so every row
         # written by this run carries the same job run id (or "manual" when the
         # conf key is not set) -- it is not a per-micro-batch id.
         .withColumn("_batch_id", F.lit(spark.conf.get("spark.databricks.job.runId", "manual")))

         .writeStream
         .option("checkpointLocation", CHECKPOINT_PATH)
         # Let new source columns be added to the Delta table schema on write.
         .option("mergeSchema", "true")
         .trigger(processingTime="30 seconds")
         .toTable(BRONZE_ENCOUNTERS_TBL)
)

# Blocks this cell until the query is stopped or fails. With a processing-time
# trigger the query does not finish on its own, so any cell below runs only
# after the stream has been stopped.
stream.awaitTermination()

In [0]:
# 3. Verify the Bronze Encounters table and its ingestion history.
#    NOTE: the streaming cell above blocks on awaitTermination(), so this cell
#    executes only once that stream has stopped.

# Snapshot of the Bronze table: total row count plus a small sample.
bronze_encounters_df = spark.table(BRONZE_ENCOUNTERS_TBL)
row_count = bronze_encounters_df.count()

displayHTML(f"<div style='color:green; font-weight:bold'>Bronze Encounters row count: {row_count}</div>")
display(bronze_encounters_df.limit(5))

# Delta history for the table location, trimmed to the columns of interest,
# to check which operations were committed.
history_columns = ["version", "timestamp", "operation", "operationParameters"]
delta_history = (
    spark.sql(f"DESCRIBE HISTORY delta.`{BRONZE_PATH}`")
         .select(*history_columns)
)

displayHTML("<div style='margin-top:10px; font-weight:bold'>Recent Delta History:</div>")
display(delta_history.limit(5))