In [0]:
# 01_bronze_stream_encounters_autoloader.ipynb
# -------------------------------------------------------
# Streams CSV files from /raw/encounters/ into a Bronze Delta table with CDF enabled.

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Storage locations used by this notebook (all on DBFS under /kardia)

# Landing folder the stream reads from
RAW_PATH = "dbfs:/kardia/raw/encounters/"

# Target location of the Bronze Delta table
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_encounters"

# Structured Streaming checkpoint directory for the Bronze write
CHKPT_LOC = "dbfs:/kardia/_checkpoints/bronze_encounters"

# Directory handed to Auto Loader as its schema location
SCHEMA_LOC = "dbfs:/kardia/_schemas/encounters"

In [0]:
# Fixed column contract for the encounters CSV files.
# Every field is declared nullable; DATE is the only non-string column.
encounters_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ID",                StringType()),
        ("DATE",              TimestampType()),
        ("PATIENT",           StringType()),
        ("CODE",              StringType()),
        ("DESCRIPTION",       StringType()),
        ("REASONCODE",        StringType()),
        ("REASONDESCRIPTION", StringType()),
    )
])

In [0]:
# Obtain (or reuse) the Spark session for this job.
# A single shuffle partition keeps dev runs cheap (cost-safe dev setting).
spark = SparkSession.builder.appName("bronze_encounters_autoloader").config(
    "spark.sql.shuffle.partitions", "1"
).getOrCreate()

In [0]:
# 1. Create table with CDF ON before starting the stream
#
# Make sure the target database exists first: nothing earlier in this notebook
# creates `kardia_bronze`, so on a fresh workspace the CREATE TABLE below would
# fail on the missing database. IF NOT EXISTS makes this a no-op on re-runs.
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_bronze")

# Register the Delta table at BRONZE_PATH with Change Data Feed enabled.
# No columns are declared here.
# NOTE(review): presumably the columns come from the streaming write into
# BRONZE_PATH (it sets mergeSchema) — confirm this is intended.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS kardia_bronze.bronze_encounters
USING DELTA
TBLPROPERTIES (delta.enableChangeDataFeed = true)
LOCATION '{BRONZE_PATH}'
""")

In [0]:
# 2. Run the Auto Loader ingest: raw CSV files -> Bronze Delta

# Source side: cloudFiles reader over RAW_PATH, CSV with a header row,
# using the explicit encounters_schema.
raw_encounters = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("header", True)
    .option("cloudFiles.schemaLocation", SCHEMA_LOC)
    .schema(encounters_schema)
    .load(RAW_PATH)
)

# Sink side: append-only Delta write, checkpointed at CHKPT_LOC, with
# mergeSchema switched on for the target.
bronze_writer = (
    raw_encounters.writeStream.format("delta")
    .option("checkpointLocation", CHKPT_LOC)
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
)

# Start the query against BRONZE_PATH and block this cell until it terminates.
enc_stream = bronze_writer.start(BRONZE_PATH)
enc_stream.awaitTermination()

In [0]:
# 3. Post-run verification of the Bronze table
print(f"\nBronze stream completed: {RAW_PATH} → {BRONZE_PATH}")

# Read the Delta table back from its path and report size and schema.
bronze_df = spark.read.format("delta").load(BRONZE_PATH)
row_total = bronze_df.count()
print(f"Row count: {row_total}")
print("\nSchema:")
bronze_df.printSchema()

# Show the table's Delta history, trimmed to the columns of interest.
print("\nRecent Delta History:")
history_df = spark.sql(f"""
      DESCRIBE HISTORY delta.`{BRONZE_PATH}`
    """)
history_columns = ["version", "timestamp", "operation", "operationParameters"]
display(history_df.select(*history_columns))
