In [0]:
# 01_bronze_patients_autoloader.ipynb
# Incrementally ingests CSV files from dbfs:/kardia/raw/patients/ into a Bronze Delta
# table using Auto Loader (availableNow trigger), with Change Data Feed (CDF) enabled.

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

# Path config: every location used by this notebook hangs off one DBFS root.
KARDIA_ROOT = "dbfs:/kardia"

RAW_PATH = f"{KARDIA_ROOT}/raw/patients/"                   # landing folder read by the stream
BRONZE_PATH = f"{KARDIA_ROOT}/bronze/bronze_patients"       # Bronze Delta table location
CHKPT_LOC = f"{KARDIA_ROOT}/_checkpoints/bronze_patients"   # streaming checkpoint
SCHEMA_LOC = f"{KARDIA_ROOT}/_schemas/patients"             # Auto Loader schema location
BAD_PATH = f"{KARDIA_ROOT}/_quarantine/raw/bad_patients"    # badRecordsPath target

In [0]:
# Explicit schema for the patients CSV feed.
# All fields are nullable; the two date fields are typed as dates and everything
# else is kept as a string.
patients_schema = (
    StructType()
    .add("ID",         StringType(), True)
    .add("BIRTHDATE",  DateType(),   True)
    .add("DEATHDATE",  DateType(),   True)
    .add("SSN",        StringType(), True)
    .add("DRIVERS",    StringType(), True)
    .add("PASSPORT",   StringType(), True)
    .add("PREFIX",     StringType(), True)
    .add("FIRST",      StringType(), True)
    .add("LAST",       StringType(), True)
    .add("MARITAL",    StringType(), True)
    .add("RACE",       StringType(), True)
    .add("ETHNICITY",  StringType(), True)
    .add("GENDER",     StringType(), True)
    .add("BIRTHPLACE", StringType(), True)
    .add("ADDRESS",    StringType(), True)
)

In [0]:
# Build (or reuse) the Spark session for this notebook.
# Settings are kept in one mapping so they are easy to scan and tune.
SESSION_CONF = {
    "spark.sql.shuffle.partitions": "1",  # dev-friendly
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName("bronze_patients_autoloader")
for conf_key, conf_value in SESSION_CONF.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

# Only surface errors in the driver log output.
spark.sparkContext.setLogLevel("ERROR")

In [0]:
# Step 0: make sure the Bronze database is present.
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_bronze")

# Step 1: declare the Bronze table at BRONZE_PATH with Change Data Feed switched on.
bronze_table_ddl = f"""
CREATE TABLE IF NOT EXISTS kardia_bronze.bronze_patients
USING DELTA
TBLPROPERTIES (delta.enableChangeDataFeed = true)
LOCATION '{BRONZE_PATH}'
"""
spark.sql(bronze_table_ddl)

# Step 2: Auto Loader (cloudFiles) CSV source -> Delta sink.
# Reader options, applied in the order listed.
source_options = {
    "cloudFiles.format": "csv",
    "cloudFiles.includeExistingFiles": "true",
    "header": True,
    "cloudFiles.schemaLocation": SCHEMA_LOC,
    "badRecordsPath": BAD_PATH,
    "rescuedDataColumn": "_rest",
}

# Writer options, applied in the order listed.
sink_options = {
    "checkpointLocation": CHKPT_LOC,
    "mergeSchema": "true",
}

patients_raw = (
    spark.readStream
    .format("cloudFiles")
    .options(**source_options)
    .schema(patients_schema)
    .load(RAW_PATH)
)

stream = (
    patients_raw.writeStream
    .format("delta")
    .options(**sink_options)
    .outputMode("append")
    .trigger(availableNow=True)
    .start(BRONZE_PATH)
)

# Block this cell until the query has terminated.
stream.awaitTermination()

print("Bronze ingest complete")

In [0]:

# Sanity-check the Bronze write: row count, a small preview, the schema,
# and the most recent Delta commits.
print(f"Verifying Bronze Delta Table at: {BRONZE_PATH}\n")

bronze_df = spark.read.format("delta").load(BRONZE_PATH)
print(f"Bronze row count: {bronze_df.count()}")

# The table carries direct identifiers (SSN, licence, passport, names, address).
# Keep them -- and the rescued-data column, which can hold raw values of any
# field -- out of the rendered preview so they are not saved with the notebook
# output. DataFrame.drop ignores names that are not present, so this stays safe
# if the table schema changes. The full schema is still printed below.
PREVIEW_EXCLUDED_COLS = ["SSN", "DRIVERS", "PASSPORT", "FIRST", "LAST", "ADDRESS", "_rest"]
display(bronze_df.drop(*PREVIEW_EXCLUDED_COLS).limit(5))

print("\nSchema:")
bronze_df.printSchema()

print("\nRecent Delta history:")
display(
    spark.sql(f"""
        DESCRIBE HISTORY delta.`{BRONZE_PATH}`
    """).select("version","timestamp","operation","operationParameters").limit(5)
)

In [0]:
# Metastore registration. Both statements use IF NOT EXISTS, so re-running
# this cell is safe.
registration_ddl = [
    # Database that owns the Bronze tables.
    "CREATE DATABASE IF NOT EXISTS kardia_bronze",
    # Point a metastore table at the physical Delta path.
    f"""
CREATE TABLE IF NOT EXISTS kardia_bronze.bronze_patients
USING DELTA
LOCATION '{BRONZE_PATH}'
""",
]
for ddl in registration_ddl:
    spark.sql(ddl)

# Hand a success marker back to the caller when this notebook runs as a job.
dbutils.notebook.exit("bronze_patients_ingest_success")


In [0]:
### Bronze ingest complete
### Proceed to `make_cdf_view` to build the Change Data Feed temp view.

In [0]:
%sql
-- Ad-hoc preview of the Bronze table.
-- Direct identifiers (SSN, DRIVERS, PASSPORT, FIRST, LAST, ADDRESS) are left out
-- and the result is capped so the cell does not dump the whole table into the
-- notebook output.
SELECT ID, BIRTHDATE, DEATHDATE, PREFIX, MARITAL, RACE, ETHNICITY, GENDER, BIRTHPLACE
FROM kardia_bronze.bronze_patients
LIMIT 10