In [0]:
# bronze_encounters_autoloader.ipynb
# SOURCE: Raw AVRO files in ADLS
# TARGET: `kardia_bronze.bronze_encounters` (CDF)
# TRIGGER: Batch or streaming (set mode via Job param); append to Bronze Encounters table
# NOTE: For self‑describing formats like AVRO, we use schema inference and enforce types in Silver.

# Optional library bootstrap for ephemeral job clusters
%run ../../99_utilities/bootstrap_kflow

import pyspark.sql.functions as F

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import BRONZE_DB, bronze_paths
from kflow.etl_utils import add_audit_cols

# --- Session setup shared by the later cells --------------------------------
# Storage auth: `ensure_adls_oauth` (from kflow.auth_adls) is expected to
# configure Spark with ADLS OAuth credentials and return the base ABFS path;
# its actual behaviour is defined in that module, not in this notebook.
# NOTE(review): `abfss_base` is not referenced again in the cells shown here.
abfss_base = ensure_adls_oauth()

# Set catalog to Hive Metastore (required when not using Unity Catalog).
# All subsequent unqualified database/table names resolve against this catalog.
spark.sql("USE CATALOG hive_metastore")

# Look up the path/name bundle for the Encounters dataset. Later cells read
# P.table, P.bronze, P.checkpoint, P.schema, P.bad and P.raw from it.
P = bronze_paths("encounters")
# Convenience alias for the target table name (same value as P.table).
BRONZE_TABLE = P.table

In [None]:
# Retrieve runtime mode from job widget: "batch" (default) or "stream"
# - "batch" mode loads all available data and stops
# - "stream" mode runs continuous 30s micro-batches
# Mode is passed as a job parameter named "mode"
VALID_MODES = ("batch", "stream")
DEFAULT_MODE = "batch"

try:
    dbutils.widgets.dropdown("mode", DEFAULT_MODE, list(VALID_MODES))
except Exception as widget_err:
    # Widget creation is best-effort (e.g. `dbutils` is not defined outside
    # Databricks). Catch `Exception` only, so KeyboardInterrupt / SystemExit
    # still propagate, and leave a trace instead of failing silently.
    print(f"[warn] Could not create 'mode' widget: {widget_err!r}")

MODE = dbutils.widgets.get("mode") if "dbutils" in globals() else DEFAULT_MODE

# Fail fast on an unknown mode. Without this check any value other than
# "batch" (a typo, wrong casing, ...) would be treated as streaming by the
# downstream cell and would also create a brand-new checkpoint directory.
if MODE not in VALID_MODES:
    raise ValueError(
        f"Unsupported mode {MODE!r}; expected one of {VALID_MODES}"
    )

IS_BATCH = MODE == "batch"

# Use a separate checkpoint directory per mode to keep batch and stream state isolated
CHECKPOINT = f"{P.checkpoint}/{MODE}"

In [0]:
# 1. Bootstrap the Bronze database and the Encounters Delta table.
#    Both statements use IF NOT EXISTS, so re-running this cell is harmless.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

# Delta table registered at the Bronze location with Change Data Feed enabled.
# No column list is declared in this statement.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {P.table}
    USING DELTA
    COMMENT 'Bronze Avro ingest of Encounter records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """

# Database first, then the table that lives inside it.
for ddl_statement in (create_db_sql, create_table_sql):
    spark.sql(ddl_statement)

In [0]:
# 2. Build the Auto Loader stream and write it to the Bronze table

# Source options: Avro files, persisted schema location, pick up files that
# already exist in the raw path, and route unreadable records to P.bad.
source_options = {
    "cloudFiles.format": "avro",
    "cloudFiles.schemaLocation": P.schema,
    "cloudFiles.includeExistingFiles": "true",
    "badRecordsPath": P.bad,
}
raw_encounters = (
    spark.readStream.format("cloudFiles")
         .options(**source_options)
         .load(P.raw)
)

# Required fields: rows missing either one are dropped before the write.
has_id = F.col("ID").isNotNull()
has_patient = F.col("PATIENT").isNotNull()

# `add_audit_cols` comes from kflow.etl_utils; per the original note it adds
# ingest timestamp, source file and batch ID columns.
reader = raw_encounters.filter(has_id & has_patient).transform(add_audit_cols)

# Sink options: mode-specific checkpoint plus schema merging on write.
sink_options = {
    "checkpointLocation": CHECKPOINT,
    "mergeSchema": "true",
}
writer = reader.writeStream.options(**sink_options)

# Pick trigger and status message from the runtime mode (job param "mode").
if IS_BATCH:
    # Batch mode: process all available files once and exit
    trigger_kwargs = {"availableNow": True}
    status_message = f"[demo] Writing to {P.table} with checkpoint={CHECKPOINT} …"
else:
    # Streaming mode: run continuously every 30s
    trigger_kwargs = {"processingTime": "30 seconds"}
    status_message = f"[live] Continuous 30s micro-batches to {P.table} with checkpoint={CHECKPOINT}"

# Start the query, then report what was started.
q = writer.trigger(**trigger_kwargs).toTable(P.table)
print(status_message)

# Only the batch run blocks until the query has finished.
if IS_BATCH:
    q.awaitTermination()