In [None]:
%md
### Kardiaflow - Bronze Encounters Auto Loader

**Source:** Raw Avro files in ADLS

**Target:** `kardia_bronze.bronze_encounters` (CDF enabled)

**Trigger:** Configurable via job param `mode`:
  - `batch` (default) → process every available file not yet ingested, then stop
  - `stream` → continuous 30s micro-batches

Notes:
- Avro is self-describing. We rely on schema inference and enforce types in Silver.
- Separate checkpoint directories keep batch/stream states isolated.

In [0]:
import pyspark.sql.functions as F

from kflow.config import BRONZE_DB, bronze_paths
from kflow.etl_utils import add_audit_cols
from kflow.notebook_utils import init

# 1. Initialize notebook environment (auth and catalog)
# NOTE(review): init() comes from kflow.notebook_utils and its body is not visible here;
# it is called before anything touches `spark`, so keep it as the first statement.
init()

# Load table paths and names for the Encounters dataset (paths, table, schema, etc.)
# Attributes of P read later in this notebook:
#   P.table      -> name of the Bronze table (aliased below as BRONZE_TABLE)
#   P.bronze     -> LOCATION of the Bronze Delta table
#   P.raw        -> path the Auto Loader stream reads from
#   P.schema     -> cloudFiles.schemaLocation
#   P.bad        -> badRecordsPath
#   P.checkpoint -> base directory for the per-mode streaming checkpoints
P = bronze_paths("encounters")
BRONZE_TABLE = P.table

In [None]:
# 2. Retrieve runtime mode from job widget: "batch" (default) or "stream"
#  - "batch" mode loads all available data and stops
#  - "stream" mode runs continuous 30s micro-batches
# Mode is passed as a job parameter named "mode"
_VALID_MODES = ("batch", "stream")
_DEFAULT_MODE = "batch"

# Registering the widget is best-effort: it must never prevent the load from running.
try:
    dbutils.widgets.dropdown("mode", _DEFAULT_MODE, list(_VALID_MODES))
except NameError:
    # `dbutils` is not defined in this session; the default mode is used below.
    pass
except Exception as exc:
    # Keep going, but surface the reason instead of hiding it. Catching Exception
    # (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
    print(f"[warn] Could not register the 'mode' widget: {exc}")

_raw_mode = dbutils.widgets.get("mode") if "dbutils" in globals() else _DEFAULT_MODE

# Normalize case/whitespace, then fail fast on anything unexpected. Without this check a
# typo such as "streaming" would fall into the continuous-stream branch and create a
# checkpoint directory named after the typo.
MODE = str(_raw_mode).strip().lower()
if MODE not in _VALID_MODES:
    raise ValueError(
        f"Invalid mode {_raw_mode!r}; expected one of {list(_VALID_MODES)}"
    )
IS_BATCH = (MODE == "batch")

# Use a separate checkpoint directory per mode to keep batch and stream state isolated
CHECKPOINT = f"{P.checkpoint}/{MODE}"

In [0]:
# 3. Bootstrap the Bronze database and the Encounters table.
#    Both statements use IF NOT EXISTS, so re-running this cell is harmless.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"
spark.sql(create_db_sql)

# Delta table at the Bronze location with Change Data Feed switched on.
# No column list is given in the DDL.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE}
    USING DELTA
    COMMENT 'Bronze Avro ingest of Encounter records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
spark.sql(create_table_sql)

In [0]:
# 4. Assemble the Auto Loader source and the sink definition.
#    Nothing executes here; the query is started by the trigger cell further down.

# Source: Avro files under P.raw, read through Auto Loader ("cloudFiles").
raw_stream = (
    spark.readStream.format("cloudFiles")
         .option("cloudFiles.format", "avro")
         .option("cloudFiles.schemaLocation", P.schema)
         .option("cloudFiles.includeExistingFiles", "true")
         .option("badRecordsPath", P.bad)
         .load(P.raw)
)

# A record is kept only when both required fields are present.
has_required_fields = F.col("ID").isNotNull() & F.col("PATIENT").isNotNull()

# Drop incomplete records, then attach the audit columns via the shared helper.
reader = raw_stream.filter(has_required_fields).transform(add_audit_cols)

# Sink: checkpoint per runtime mode, with schema merging enabled on write.
writer = reader.writeStream.option("checkpointLocation", CHECKPOINT)
writer = writer.option("mergeSchema", "true")

In [None]:
# 5. Run as either batch or stream depending on runtime mode
#    Toggle trigger mode via job param "mode"
if IS_BATCH:
    # Batch mode: process all available files once and exit
    query = writer.trigger(availableNow=True).toTable(BRONZE_TABLE)
    # Wait for the query to finish BEFORE reporting success. awaitTermination() raises
    # if the query failed, so the "Wrote to" line is only printed for a completed load.
    query.awaitTermination()
    print(f"[batch] Wrote to {BRONZE_TABLE} with checkpoint={CHECKPOINT} …")
else:
    # Streaming mode: run continuously every 30s
    # NOTE(review): the query is started but not awaited, so this cell returns
    # immediately and `query` stays available for inspection or query.stop().
    # Confirm that the job runner keeps the run alive and reports stream failures.
    query = writer.trigger(processingTime="30 seconds").toTable(BRONZE_TABLE)
    print(f"[live] Continuous 30s micro-batches to {BRONZE_TABLE} with checkpoint={CHECKPOINT}")