In [0]:
# 01_bronze_patients_autoloader.ipynb
# Ingest CSV patient files into a Bronze Delta table with CDF enabled.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DateType

# Paths and table names
# Every storage location hangs off a single root, so the whole layout can be
# re-pointed (for example at another mount) by editing one line.
KARDIA_ROOT      = "dbfs:/kardia"

DB               = "kardia_bronze"                                # Bronze database
TABLE            = f"{DB}.bronze_patients"                        # Database-qualified table name
RAW_PATH         = f"{KARDIA_ROOT}/raw/patients/"                 # Folder the source CSV files are read from
TABLE_PATH       = f"{KARDIA_ROOT}/bronze/bronze_patients"        # Storage location of the Delta table
SCHEMA_LOC       = f"{KARDIA_ROOT}/_schemas/patients"             # Passed as cloudFiles.schemaLocation
CHKPT_LOC        = f"{KARDIA_ROOT}/_checkpoints/bronze_patients"  # Passed as the stream's checkpointLocation
BAD_PATH         = f"{KARDIA_ROOT}/_quarantine/raw/bad_patients"  # Passed as badRecordsPath

# The test datasets are small, so a single shuffle partition is used to keep
# shuffle overhead down. This is a session-level setting.
spark.conf.set(
    "spark.sql.shuffle.partitions",
    "1",
)

In [0]:
# Explicit schema for the patient CSV files, so the reader does not need to infer one.
# Only the two date columns get a non-string type; every field is nullable.
patients_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ID",         StringType()),
        ("BIRTHDATE",  DateType()),
        ("DEATHDATE",  DateType()),
        ("SSN",        StringType()),
        ("DRIVERS",    StringType()),
        ("PASSPORT",   StringType()),
        ("PREFIX",     StringType()),
        ("FIRST",      StringType()),
        ("LAST",       StringType()),
        ("MARITAL",    StringType()),
        ("RACE",       StringType()),
        ("ETHNICITY",  StringType()),
        ("GENDER",     StringType()),
        ("BIRTHPLACE", StringType()),
        ("ADDRESS",    StringType()),
    )
])

In [0]:
# 1. Ensure the Bronze DB and table exist
# Both statements use IF NOT EXISTS, so this cell can be re-run.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {DB}"

# The table is declared without a column list, as an external Delta table at
# TABLE_PATH with the change data feed property set. The streaming write below
# sets mergeSchema, presumably so it can supply the columns on first ingest.
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {TABLE}
USING DELTA
TBLPROPERTIES (delta.enableChangeDataFeed = true)
LOCATION '{TABLE_PATH}'
"""

spark.sql(create_db_sql)
spark.sql(create_table_sql)

In [0]:
# 2. Auto-loader stream
# Reader settings: CSV files with a header row, picked up through the
# cloudFiles source, existing files included. Bad records are written to
# BAD_PATH, and the rescued-data column is named "_rest".
reader_options = {
    "cloudFiles.format": "csv",
    "header": True,
    "cloudFiles.includeExistingFiles": "true",
    "cloudFiles.schemaLocation": SCHEMA_LOC,
    "badRecordsPath": BAD_PATH,
    "rescuedDataColumn": "_rest",
}

# Writer settings: progress is tracked in CHKPT_LOC; mergeSchema lets the
# write add columns to the target table.
writer_options = {
    "checkpointLocation": CHKPT_LOC,
    "mergeSchema": "true",
}

# Source: streaming read of the raw folder with the explicit patient schema.
patients_raw = (
    spark.readStream
         .format("cloudFiles")
         .options(**reader_options)
         .schema(patients_schema)
         .load(RAW_PATH)
)

# Sink: append to the Delta table location. The availableNow trigger is
# expected to process what is currently available and then stop, which is
# what lets awaitTermination() below return.
stream = (
    patients_raw.writeStream
                .format("delta")
                .options(**writer_options)
                .outputMode("append")
                .trigger(availableNow=True)
                .start(TABLE_PATH)
)

# Block until the run has finished before reporting completion.
stream.awaitTermination()
print(f"Bronze ingest complete: {RAW_PATH} → {TABLE_PATH}")

In [0]:
# 3. Verify Bronze table contents and ingestion history
df = spark.read.format("delta").load(TABLE_PATH)
print(f"Row count: {df.count()}")

# Patient rows carry direct identifiers (SSN, licence and passport numbers,
# names, address). Cell output is stored with the notebook and goes wherever
# the notebook is shared, so these columns are left out of the preview.
# "_rest" is hidden too because rescued raw values may repeat the same fields.
# drop() ignores names that are not present, so this is safe on any schema.
# `df` itself is untouched; only the rendered sample is narrowed.
PREVIEW_HIDDEN_COLS = ["SSN", "DRIVERS", "PASSPORT", "FIRST", "LAST", "ADDRESS", "_rest"]
display(df.drop(*PREVIEW_HIDDEN_COLS).limit(5))

# Show the first five rows of the Delta history for the table path
# (version, timestamp, operation and its parameters) to review ingest details.
print("Recent Delta history:")
history = spark.sql(f"""
    DESCRIBE HISTORY delta.`{TABLE_PATH}`
""").select("version", "timestamp", "operation", "operationParameters")
display(history.limit(5))