In [0]:
# bronze_patients_autoloader.ipynb
# SOURCE: Raw CSV files in ADLS
# TARGET: `kardia_bronze.bronze_patients` (CDF)
# TRIGGER: Incremental batch via Auto Loader; append to Bronze Patients table

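# Optional library bootstrap for ephemeral job clusters.
# NOTE(review): Databricks documents that %run must be in a cell by itself;
# confirm the code after it in this cell still executes, or move %run to its own cell.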
%run ../../99_utilities/bootstrap_kflow

from pyspark.sql.types import StructType, StructField, StringType

import pyspark.sql.functions as F

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import BRONZE_DB, bronze_paths
from kflow.display_utils import show_history
from kflow.etl_utils import add_audit_cols

# Configure Spark with ADLS OAuth credentials and return base ABFS path.
# NOTE(review): the helper lives in kflow.auth_adls and is not visible here;
# it is called first, presumably so later storage access is authenticated — confirm.
# `abfss_base` is not referenced again in the cells visible in this notebook.
abfss_base = ensure_adls_oauth()

# Set catalog to Hive Metastore (required when not using Unity Catalog)
spark.sql("USE CATALOG hive_metastore")

# Load table paths and names for the Patients dataset (paths, table, schema, etc.)
# Later cells read P.table, P.bronze, P.schema, P.raw and P.checkpoint from this object.
P = bronze_paths("patients")
# Table identifier used by the CREATE TABLE statement, the stream sink and the final check.
BRONZE_TABLE = P.table

In [0]:
# Explicit schema for the Patients CSV input.
# CSV files carry no schema metadata and inference is unreliable, so every
# column is declared as a string. Only ID is declared non-nullable; all other
# columns are nullable. Column order below is the order of the declared schema.
patients_schema = StructType([
    StructField(column_name, StringType(), column_name != "ID")
    for column_name in (
        "ID",
        "BIRTHDATE", "DEATHDATE",
        "SSN", "DRIVERS", "PASSPORT",
        "PREFIX", "FIRST", "LAST", "SUFFIX", "MAIDEN",
        "MARITAL", "RACE", "ETHNICITY", "GENDER",
        "BIRTHPLACE", "ADDRESS",
    )
])

In [0]:
# 1. Ensure Bronze DB and Patients table exist
# Both statements use IF NOT EXISTS, so re-running this cell is a no-op once
# the objects are in place.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"
spark.sql(create_db_sql)

# The table is declared without columns: it is a Delta table at the Bronze
# path with Change Data Feed enabled; the streaming write below supplies the
# columns (it sets mergeSchema).
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE}
    USING DELTA
    COMMENT 'Bronze CSV ingest of Patient records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
spark.sql(create_table_sql)

In [0]:
# 3. Define an incremental batch pipeline using Auto Loader
# Reader settings: CSV files with a header row, blank lines skipped, files
# already present in the source path included.
autoloader_options = {
    "cloudFiles.format": "csv",
    "cloudFiles.schemaLocation": P.schema,
    "cloudFiles.includeExistingFiles": "true",
    "header": "true",
    "ignoreEmptyLines": "true",
}

# Read stage: stream raw Patients CSVs with the explicit schema.
patients_raw = (
    spark.readStream.format("cloudFiles")
    .options(**autoloader_options)
    .schema(patients_schema)
    .load(P.raw)
)

# Transform stage: drop rows with a null ID (no usable primary key), then apply
# the project's audit-column helper (per the original note it adds ingest
# timestamp, source file and batch ID — see kflow.etl_utils to confirm).
patients_audited = (
    patients_raw
    .filter(F.col("ID").isNotNull())
    .transform(add_audit_cols)
)

# Write stage: append to the Bronze table. availableNow=True runs the stream
# as a batch over whatever input is currently available, then stops.
stream = (
    patients_audited.writeStream
    .option("checkpointLocation", P.checkpoint)
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
    .toTable(BRONZE_TABLE)
)

# Block until the run has finished so the next cell sees the data.
stream.awaitTermination()

In [0]:
# 4. Batch finished - Verify Bronze Patients table and ingestion history.
df = spark.table(BRONZE_TABLE)
print(f"Bronze Patients row count: {df.count():,}")

# Preview a few rows WITHOUT direct identifiers: rendered notebook output is
# saved and shared with the notebook, so SSN, licence, passport, names and
# street address must not be displayed. DataFrame.drop ignores any listed
# column that is absent, so this stays safe if the table schema changes.
# The stored table itself is untouched; only the preview is narrowed.
PII_PREVIEW_EXCLUDE = (
    "SSN", "DRIVERS", "PASSPORT",
    "FIRST", "LAST", "MAIDEN",
    "ADDRESS",
)
display(df.drop(*PII_PREVIEW_EXCLUDE).limit(5))

# Show the table history for the Bronze path via the project helper.
show_history(P.bronze)