In [None]:
# 01_bronze_patients_autoloader.ipynb
# SOURCE: Ingest raw patient CSVs into Bronze using Auto Loader.
# OUTPUT: `kardia_bronze.bronze_patients` with Change Data Feed enabled.
# PATTERN: Auto Loader over raw CSV files (incremental batch).
# TRIGGER: availableNow (incremental batch); appends to the Delta table with mergeSchema enabled on write.

from kflow.config import BRONZE_DB, bronze_paths, current_batch_id
from kflow.display_utils import banner, show_history, show_head
from kflow.etl_utils import add_audit_cols

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

# Resolve the path/table bundle for the "patients" dataset.
# Later cells read P.raw, P.schema, P.checkpoint and P.bronze from this object;
# BRONZE_TABLE is the table name used for both the DDL and the streaming write.
P = bronze_paths("patients")
BRONZE_TABLE = P.table

In [None]:
# Explicit schema for the raw patient CSVs. Supplying it up front enforces the
# expected structure and improves Auto Loader performance.
# Every column is read as a string; only ID is declared non-nullable.
_OPTIONAL_PATIENT_COLUMNS = (
    "BIRTHDATE", "DEATHDATE", "SSN", "DRIVERS", "PASSPORT",
    "PREFIX", "FIRST", "LAST", "SUFFIX", "MAIDEN",
    "MARITAL", "RACE", "ETHNICITY", "GENDER", "BIRTHPLACE", "ADDRESS",
)

patients_schema = StructType(
    [StructField("ID", StringType(), False)]
    + [StructField(column, StringType(), True) for column in _OPTIONAL_PATIENT_COLUMNS]
)

In [None]:
# 1. Ensure the Bronze Patients table exists, with Change Data Feed enabled.
#    - No column list is declared here; the streaming write below supplies the
#      columns (it runs with mergeSchema enabled).
#    - IF NOT EXISTS keeps this cell safe to re-run, but it also means an
#      already-registered table is left untouched. If the table predates the
#      TBLPROPERTIES clause, enable CDF once with
#      ALTER TABLE ... SET TBLPROPERTIES (delta.enableChangeDataFeed = true).
spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE}
    USING DELTA
    COMMENT 'Bronze table for batch Auto Loader ingest of patient records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
)

In [None]:
# 2. Incremental batch ingest with Auto Loader, built in three steps:
#    reader -> audit columns -> availableNow write into the Bronze table.
autoloader_options = {
    "cloudFiles.format": "csv",
    "cloudFiles.schemaLocation": P.schema,
    "cloudFiles.includeExistingFiles": "true",
    "header": "true",
}

# Read the raw CSV files with the explicit schema defined above.
patients_raw = (
    spark.readStream.format("cloudFiles")
    .options(**autoloader_options)
    .schema(patients_schema)
    .load(P.raw)
)

# Apply the shared audit-column helper from kflow.etl_utils.
patients_audited = patients_raw.transform(add_audit_cols)

# Write to the Bronze table; the checkpoint records ingestion progress and
# mergeSchema lets the write add columns that the table does not have yet.
stream = (
    patients_audited.writeStream
    .option("checkpointLocation", P.checkpoint)
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
    .toTable(BRONZE_TABLE)
)

# Block this cell until the query has terminated.
stream.awaitTermination()

In [None]:
# 3. Ingest complete - report the row count, preview a few rows, and show the
#    most recent entries of the table history.
PREVIEW_ROWS = 5

df = spark.table(BRONZE_TABLE)
row_count = df.count()

banner(f"Bronze Patients row count: {row_count}", True)
show_head(df, PREVIEW_ROWS)
show_history(P.bronze, PREVIEW_ROWS)