In [0]:
# bronze_encounters_autoloader.ipynb
# SOURCE:  Ingests raw Avro files from ADLS using Auto Loader.
# OUTPUT:  `kardia_bronze.bronze_encounters` (Delta with CDF enabled).
# TRIGGER: Selected by the `mode` widget (defaults to "batch").
#   batch  - reads all currently available data, then exits.
#   stream - runs continuous 30s micro-batches.

%pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow
from kflow.config import BRONZE_DB, bronze_paths, ensure_adls_auth
from kflow.etl_utils import add_audit_cols

import pyspark.sql.functions as F

# Run the kflow ADLS auth helper first, ahead of any path lookups or Spark I/O.
# NOTE(review): presumably sets the Spark credentials for the storage account —
# confirm against kflow.config.
ensure_adls_auth()

# Load Bronze paths
# `P` bundles the locations for the "encounters" dataset; the cells below read
# P.table, P.bronze, P.schema, P.bad, P.raw and P.checkpoint from it.
P = bronze_paths("encounters")
# Alias for the target table name. The cells visible here reference P.table
# directly, so this name may only matter to code outside this view.
BRONZE_TABLE = P.table

In [None]:
# Mode widget & flags (batch = drain & stop; stream = continuous)
#
# Defines:
#   MODE       - "batch" or "stream", read from the `mode` widget when dbutils
#                is available, otherwise "batch".
#   IS_BATCH   - True when MODE == "batch".
#   CHECKPOINT - streaming checkpoint location, suffixed with MODE.
VALID_MODES = ("batch", "stream")

if "dbutils" in globals():
    try:
        dbutils.widgets.dropdown("mode", "batch", list(VALID_MODES))
    except Exception as widget_err:
        # Widget creation stays best-effort, but only ordinary errors are
        # tolerated (a bare `except:` would also swallow KeyboardInterrupt /
        # SystemExit) and the failure is surfaced instead of hidden.
        print(f"[warn] Could not create 'mode' widget: {widget_err}")
    MODE = dbutils.widgets.get("mode")
else:
    # No dbutils in this session: fall back to the default mode.
    MODE = "batch"

# Fail fast on an unexpected value (e.g. a mistyped job parameter). Without
# this check any non-"batch" value would silently run as a continuous stream
# under a brand-new checkpoint folder.
if MODE not in VALID_MODES:
    raise ValueError(
        f"Unsupported mode {MODE!r}; expected one of {', '.join(VALID_MODES)}"
    )

IS_BATCH = MODE == "batch"

# Use a mode-suffixed checkpoint to avoid clobbering state
CHECKPOINT = f"{P.checkpoint}/{MODE}"

In [0]:
# 1. Ensure Bronze DB and Encounters table exists
# Both statements use IF NOT EXISTS, so re-running this cell leaves an
# existing database/table untouched.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}")

# The table is declared without a column list at the Bronze storage location.
# NOTE(review): the schema is presumably established by the first streaming
# write (the writer sets mergeSchema) — confirm on the target runtime.
# Because of IF NOT EXISTS, the COMMENT and the Change Data Feed property are
# only applied when the table is first created.
# The COMMENT text now describes this table: encounter records, ingested in
# either batch or stream mode (it previously read "batch ... patient records").
spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS {P.table}
    USING DELTA
    COMMENT 'Bronze table for Auto Loader ingest of raw encounter records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
)

In [0]:
# 2. Define a streaming pipeline using Auto Loader
# Source side: Auto Loader ("cloudFiles") over the raw Avro files, with the
# audit-column transform applied to the resulting streaming DataFrame.
source_options = {
    "cloudFiles.format": "avro",
    "cloudFiles.schemaLocation": P.schema,
    "cloudFiles.includeExistingFiles": "true",
    "badRecordsPath": P.bad,
}
reader = (
    spark.readStream.format("cloudFiles")
    .options(**source_options)
    .load(P.raw)
    .transform(add_audit_cols)
)

# Sink side: checkpoint under the mode-specific folder, schema merge enabled.
sink_options = {
    "checkpointLocation": CHECKPOINT,
    "mergeSchema": "true",
}
writer = reader.writeStream.options(**sink_options)

# Pick the trigger and the status line for the selected mode, then start the
# query once. The order matches the original: start, print, (batch only) wait.
if IS_BATCH:
    trigger_kwargs = {"availableNow": True}
    status_line = f"[demo] Draining to {P.table} with checkpoint={CHECKPOINT} …"
else:
    trigger_kwargs = {"processingTime": "30 seconds"}
    status_line = f"[live] Continuous 30s micro-batches to {P.table} with checkpoint={CHECKPOINT}"

q = writer.trigger(**trigger_kwargs).toTable(P.table)
print(status_line)

if IS_BATCH:
    # Drain all available input and stop so the job can finish
    q.awaitTermination()
# Stream mode: no awaitTermination() here; the query keeps running.