In [0]:
# 01_bronze_encounters_autoloader.ipynb
# SOURCE: Ingest raw encounter Avro files into the Bronze layer using Auto Loader.
# OUTPUT: `kardia_bronze.bronze_encounters` with CDF enabled.
# TRIGGER: Continuously running stream (30-second processing-time micro-batches); appends to the Delta table with schema evolution enabled.

from kflow.config import BRONZE_DB, bronze_paths, current_batch_id
from kflow.etl_utils import add_audit_cols

import pyspark.sql.functions as F

# Load Bronze paths
# `bronze_paths("encounters")` returns the path bundle for this dataset; the
# cells below read its `table`, `bronze`, `schema`, `bad`, `raw` and
# `checkpoint` attributes.
P = bronze_paths("encounters")
# Alias for the target table name (same value as `P.table`).
# NOTE(review): not referenced in the visible cells, which use `P.table` directly.
BRONZE_TABLE = P.table

In [0]:
# 1. Ensure Bronze Encounters table exists
# Idempotent: IF NOT EXISTS makes this a no-op when the table is already
# registered, so a pre-existing table keeps its current comment and properties.
# No column list is declared in this statement. Change Data Feed is switched on
# via TBLPROPERTIES at creation time.
spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS {P.table}
    USING DELTA
    COMMENT 'Bronze table for streaming Auto Loader ingest of encounter records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
)

In [0]:
# 2. Define a streaming pipeline using Auto Loader
# The pipeline is built in named stages so each one can be inspected on its own:
# reader options -> source frame -> audited frame -> Delta sink.

# Auto Loader ("cloudFiles") reader settings for the raw Avro drop zone.
reader_opts = {
    "cloudFiles.format": "avro",
    "cloudFiles.schemaLocation": P.schema,
    "cloudFiles.includeExistingFiles": "true",
    "badRecordsPath": P.bad,
}

# Delta sink settings: checkpoint location plus schema merging on write.
writer_opts = {
    "checkpointLocation": P.checkpoint,
    "mergeSchema": "true",
}

# Streaming source over the raw encounters path.
encounters_src = (
    spark.readStream
    .format("cloudFiles")
    .options(**reader_opts)
    .load(P.raw)
)

# Pass the frame through the shared audit-column helper before writing.
encounters_audited = encounters_src.transform(add_audit_cols)

# Start the query: a micro-batch every 30 seconds, written to the Bronze table.
# `stream` holds the handle returned by `toTable`.
stream = (
    encounters_audited.writeStream
    .options(**writer_opts)
    .trigger(processingTime="30 seconds")
    .toTable(P.table)
)

print(f"Stream bronze_encounters started. Source: raw/encounters, Sink: {P.table}")