In [0]:
# 01_bronze_stream_encounters_autoloader.ipynb
# Ingest CSV encounter files into a Bronze Delta table with Change Data Feed enabled.

from pyspark.sql import SparkSession
from pyspark.sql.types import *

# --- Metastore identifiers -------------------------------------------------
DB = "kardia_bronze"                  # database that holds the Bronze table
TABLE = f"{DB}.bronze_encounters"     # database-qualified Bronze table name

# --- DBFS locations ----------------------------------------------------------
RAW_PATH = "dbfs:/kardia/raw/encounters/"                   # folder the stream loads CSV files from
TABLE_PATH = "dbfs:/kardia/bronze/bronze_encounters"        # storage location of the Delta table
SCHEMA_LOC = "dbfs:/kardia/_schemas/encounters"             # value for cloudFiles.schemaLocation
CHKPT_LOC = "dbfs:/kardia/_checkpoints/bronze_encounters"   # streaming checkpoint directory

# Small test datasets: a single shuffle partition keeps shuffle overhead minimal.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
# Explicit schema for the encounter CSV files, so no schema inference is needed.
# All seven columns are nullable; DATE is a timestamp, everything else is a string.
encounters_schema = (
    StructType()
    .add("ID", StringType(), nullable=True)
    .add("DATE", TimestampType(), nullable=True)
    .add("PATIENT", StringType(), nullable=True)
    .add("CODE", StringType(), nullable=True)
    .add("DESCRIPTION", StringType(), nullable=True)
    .add("REASONCODE", StringType(), nullable=True)
    .add("REASONDESCRIPTION", StringType(), nullable=True)
)

In [0]:
# 1. Ensure the Bronze database and its Delta table exist before streaming.
#    Both statements use IF NOT EXISTS, so re-running this cell is harmless.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {DB}"

# Delta table stored at TABLE_PATH with Change Data Feed switched on.
# The statement declares no columns.
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {TABLE}
USING DELTA
TBLPROPERTIES (delta.enableChangeDataFeed = true)
LOCATION '{TABLE_PATH}'
"""

spark.sql(create_db_sql)
spark.sql(create_table_sql)

In [0]:
# 2. Run the Auto Loader ingest: CSV files under RAW_PATH -> Delta at TABLE_PATH.

# Source side: cloudFiles reader over CSV files that carry a header row,
# parsed with the explicit encounters_schema.
encounters_raw = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("header", True)
    .option("cloudFiles.schemaLocation", SCHEMA_LOC)
    .schema(encounters_schema)
    .load(RAW_PATH)
)

# Sink side: append to the Delta location, tracking progress in CHKPT_LOC.
# mergeSchema is enabled on the write; the availableNow trigger is used so the
# query works through the data that is available and then stops.
stream = (
    encounters_raw.writeStream
    .format("delta")
    .option("checkpointLocation", CHKPT_LOC)
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(TABLE_PATH)
)

# Block this cell until the query has terminated.
stream.awaitTermination()

In [0]:
# 3. Post-run check: the streaming query has terminated, so inspect what landed.
print(f"Loaded data from {RAW_PATH} to {TABLE_PATH}")

# Batch-read the Delta location, report its size and show a small sample.
df = spark.read.format("delta").load(TABLE_PATH)
print(f"Row count: {df.count()}")
display(df.limit(5))

# Delta history for the table path, trimmed to the columns that describe each
# commit (version, time, operation and its parameters); show up to five rows.
history_sql = f"""
    DESCRIBE HISTORY delta.`{TABLE_PATH}`
"""
history_columns = ["version", "timestamp", "operation", "operationParameters"]
history_df = spark.sql(history_sql).select(*history_columns)
display(history_df.limit(5))