In [0]:
# 01_bronze_stream_encounters_autoloader.ipynb
# SOURCE: Ingest raw encounter Excel (.xlsx) files into the Bronze layer using Auto Loader.
# OUTPUT: `kardia_bronze.bronze_encounters` with Change Data Feed enabled.
# TRIGGER: Incremental batch; append to Delta table with schema evolution enabled.

from pyspark.sql.types import StructType, StructField, StringType, DateType

# Bronze database and the fully qualified name of the encounters table.
BRONZE_DB = "kardia_bronze"
BRONZE_ENCOUNTERS_TABLE = ".".join((BRONZE_DB, "bronze_encounters"))

# DBFS locations: raw landing folder and the Bronze Delta table.
RAW_PATH = "dbfs:/kardia/raw/encounters/"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_encounters"

# Checkpoint directory for the streaming query.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/bronze_encounters"

In [0]:
# Explicit schema for the core encounter columns; ID and PATIENT are declared non-nullable,
# every other column is nullable.
# NOTE(review): no reader in the visible cells is given this schema - confirm whether it is
# still needed or should be applied downstream.
encounters_schema = (
    StructType()
    .add("ID", StringType(), False)
    .add("DATE", DateType(), True)
    .add("PATIENT", StringType(), False)
    .add("CODE", StringType(), True)
    .add("DESCRIPTION", StringType(), True)
    .add("REASONCODE", StringType(), True)
    .add("REASONDESCRIPTION", StringType(), True)
)

In [0]:
# 1. Make sure the Bronze database and the Bronze Encounters table are present.
#    Both statements use IF NOT EXISTS, so re-running this cell does not fail when they already exist.
_bronze_ddl = (
    f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}",
    # External Delta table at BRONZE_PATH with Change Data Feed switched on.
    f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_ENCOUNTERS_TABLE}
    USING DELTA
    COMMENT 'Bronze table for batch Auto Loader ingest of encounter records.'
    LOCATION '{BRONZE_PATH}'
    TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
    """,
)

for _statement in _bronze_ddl:
    spark.sql(_statement)

In [None]:
# 2. Define an incremental batch pipeline using Auto‑loader.
#    Auto‑loader discovers new files in cloud storage and writes to a Delta table.

def _ingest_excel(batch_df, _):
    """Load one micro-batch of discovered .xlsx files and append the rows to Bronze.

    Used as the `foreachBatch` callback of the Auto Loader stream. The workbooks
    named in the batch are read with spark-excel, the core encounter columns are
    projected first (with DATE cast via `to_date`), and any additional source
    columns are kept after them.

    Args:
        batch_df: Micro-batch DataFrame; only its `path` column is read here.
        _: Second positional argument supplied by `foreachBatch` (unused).

    Returns:
        None. Rows are appended to the Delta table at BRONZE_PATH; an empty
        batch is a no-op.
    """
    core_columns = (
        "ID",
        "DATE",
        "PATIENT",
        "CODE",
        "DESCRIPTION",
        "REASONCODE",
        "REASONDESCRIPTION",
    )

    # toLocalIterator() brings the rows to the driver as an iterator; the list
    # below still holds every file path of the batch.
    excel_paths = [r.path for r in batch_df.select("path").toLocalIterator()]
    if not excel_paths:
        return

    raw_df = (spark.read
                   .format("com.crealytics.spark.excel")
                   .option("header", "true")
                   .option("inferSchema", "true")
                   .load(excel_paths))

    # Keep only the source columns that are NOT core columns. Selecting "*" after
    # the core columns would emit every core column twice, and Delta rejects
    # duplicate column names on write. Names are compared case-insensitively and
    # backtick-quoted so headers with spaces or symbols still parse.
    core_lookup = {name.lower() for name in core_columns}
    extra_exprs = [
        "`{}`".format(name.replace("`", "``"))
        for name in raw_df.columns
        if name.lower() not in core_lookup
    ]

    df = raw_df.selectExpr(
        "ID",
        "to_date(DATE) as DATE",
        "PATIENT",
        "CODE",
        "DESCRIPTION",
        "REASONCODE",
        "REASONDESCRIPTION",
        *extra_exprs
    )

    # mergeSchema lets newly seen extra columns be added to the table schema.
    (df.write
       .format("delta")
       .mode("append")
       .option("mergeSchema", "true")
       .save(BRONZE_PATH)
    )

# NOTE:
# Databricks does not natively support parsing Excel files, so Auto Loader is used only to watch
# the folder for new .xlsx files. Each batch of discovered files is passed to _ingest_excel, where
# Spark loads the workbooks with spark-excel (`inferSchema = true` infers the column types),
# selects and casts the core columns, and then keeps any additional columns from the source.
#
# The resulting DataFrame is appended to the Bronze table with `mergeSchema = true`, which allows
# the table schema to evolve forward automatically.

In [None]:
# Source: Auto Loader lists files under RAW_PATH as binary files, restricted to *.xlsx,
# including files that were already present in the folder.
_xlsx_files = (
    spark.readStream
         .format("cloudFiles")
         .option("cloudFiles.format", "binaryFile")
         .option("pathGlobFilter", "*.xlsx")
         .option("cloudFiles.includeExistingFiles", "true")
         .load(RAW_PATH)
)

# Sink: every micro-batch is handed to _ingest_excel; progress is tracked in CHECKPOINT_PATH.
_ingest_writer = (
    _xlsx_files.writeStream
               .foreachBatch(_ingest_excel)
               .option("checkpointLocation", CHECKPOINT_PATH)
               .trigger(availableNow=True)
)

# Start the query and block until it terminates.
stream = _ingest_writer.start()
stream.awaitTermination()

In [0]:
# 3. Stream finished - inspect the Bronze table and its Delta history.
print(f"Loaded data from {RAW_PATH} to {BRONZE_PATH}")

# Load the Bronze Encounters table, report its size and preview a few rows.
df = spark.read.format("delta").load(BRONZE_PATH)
print(f"Row count: {df.count()}")
display(df.limit(5))

# Show the commit history of the Delta table, limited to the columns of interest.
_history_columns = ["version", "timestamp", "operation", "operationParameters"]
history_df = spark.sql(
    f""" DESCRIBE HISTORY delta.`{BRONZE_PATH}` """
).select(*_history_columns)

display(history_df.limit(5))