In [0]:
# 01_bronze_patients_autoloader.ipynb
# Ingest CSV patient files into a Bronze Delta table with CDF enabled.

from pyspark.sql.types import StructType, StructField, StringType, DateType

# --- Catalog objects ---------------------------------------------------------
BRONZE_DB = "kardia_bronze"
BRONZE_PATIENTS_TABLE = BRONZE_DB + ".bronze_patients"

# --- Storage locations (all under dbfs:/kardia) ------------------------------
# Folder the raw patient CSV files are read from.
RAW_PATH = "dbfs:/kardia/raw/patients/"
# Location of the Bronze Delta table data.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_patients"
# Passed to Auto Loader as its schema location.
SCHEMA_PATH = "dbfs:/kardia/_schemas/bronze_patients"
# Checkpoint directory for the streaming write.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/bronze_patients"
# Quarantine folder handed to the reader as its bad-records path.
BAD_PATH = "dbfs:/kardia/_quarantine/raw/bad_patients"

# --- Session tuning ----------------------------------------------------------
# The test datasets are small, so a single shuffle partition keeps overhead low.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
# Data contract for the raw patient CSV files, declared up front so the reader
# does not have to infer it. Each entry is (column name, Spark type, nullable);
# only ID is declared non-nullable.
# NOTE(review): file-based sources may relax declared nullability on read —
# confirm whether the ID constraint is actually enforced downstream.
patients_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("ID",         StringType(), False),
        ("BIRTHDATE",  DateType(),   True),
        ("DEATHDATE",  DateType(),   True),
        ("SSN",        StringType(), True),
        ("DRIVERS",    StringType(), True),
        ("PASSPORT",   StringType(), True),
        ("PREFIX",     StringType(), True),
        ("FIRST",      StringType(), True),
        ("LAST",       StringType(), True),
        ("MARITAL",    StringType(), True),
        ("RACE",       StringType(), True),
        ("ETHNICITY",  StringType(), True),
        ("GENDER",     StringType(), True),
        ("BIRTHPLACE", StringType(), True),
        ("ADDRESS",    StringType(), True),
    )
])

In [0]:
# 1. Make sure the Bronze database and the patients table are in place.
#    Both statements use IF NOT EXISTS, so existing objects are left alone.
create_db_ddl = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

# The table is declared without a column list, registered at BRONZE_PATH, and
# has Change Data Feed switched on through its table properties.
create_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_PATIENTS_TABLE}
    USING DELTA
    COMMENT 'Bronze table for batch Auto Loader ingest of patient records.'
    LOCATION '{BRONZE_PATH}'
    TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
    """

spark.sql(create_db_ddl)
spark.sql(create_table_ddl)

In [0]:
# 2. Auto Loader ingest: raw CSV files -> Bronze Delta location

# Reader settings: CSV with a header row, schema location for Auto Loader,
# a quarantine path for bad records, and a "_rest" rescued-data column.
reader_options = {
    "cloudFiles.format": "csv",
    "cloudFiles.schemaLocation": SCHEMA_PATH,
    "header": "true",
    "badRecordsPath": BAD_PATH,
    "rescuedDataColumn": "_rest",
}

# Writer settings: allow schema merging on write and track progress in the
# checkpoint directory.
writer_options = {
    "mergeSchema": "true",
    "checkpointLocation": CHECKPOINT_PATH,
}

# Source: streaming read of RAW_PATH using the explicit patients schema.
patients_raw = (
    spark.readStream
    .format("cloudFiles")
    .options(**reader_options)
    .schema(patients_schema)
    .load(RAW_PATH)
)

# Sink: append to the Delta location with an availableNow trigger.
stream = (
    patients_raw.writeStream
    .format("delta")
    .options(**writer_options)
    .outputMode("append")
    .trigger(availableNow=True)
    .start(BRONZE_PATH)
)

# Wait for the query to terminate before reporting completion.
stream.awaitTermination()
print(f"Bronze ingest complete: from {RAW_PATH} to {BRONZE_PATH}")

In [0]:
# 3. Sanity-check what landed in Bronze, then inspect the Delta history

# Read the Delta data straight from its storage path.
df = spark.read.format("delta").load(BRONZE_PATH)
row_count = df.count()
print(f"Row count: {row_count}")
display(df.limit(5))

# Keep only the history columns that describe each commit and its parameters.
print("Recent Delta history:")
history_columns = ["version", "timestamp", "operation", "operationParameters"]
history = spark.sql(
    f"""
    DESCRIBE HISTORY delta.`{BRONZE_PATH}`
    """
).select(*history_columns)

display(history.limit(5))