In [0]:
# 01_bronze_providers_autoloader.ipynb
# SOURCE: Raw provider TSV files, ingested into the Bronze layer using Auto Loader.
# OUTPUT: `kardia_bronze.bronze_providers` with Change Data Feed enabled.
# TRIGGER: Incremental batch; append to Delta table with schema evolution enabled.

from kflow.adls import set_sas
from kflow.config import BRONZE_DB, bronze_paths, current_batch_id, adls_raw_path
from kflow.display_utils import banner, show_history, show_head
from kflow.etl_utils import add_audit_cols

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

# 1. Authenticate to ADLS, then resolve the table name and raw path for providers.
# The SAS token is read from the "kardia" secret scope rather than written in the notebook.
ACCOUNT = "kardiaadlsdemo"
sas = dbutils.secrets.get(scope="kardia", key="adls_raw_sas")
set_sas(ACCOUNT, sas)

# P carries the Bronze table name and the storage locations read by later cells.
P = bronze_paths("providers")
BRONZE_TABLE, RAW_PATH = P.table, adls_raw_path("providers")

In [0]:
# 2. Explicit provider schema: each field is a nullable string.
#    Supplying it up front enforces the expected structure and improves Auto Loader performance.
provider_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("ProviderID", "ProviderSpecialty", "ProviderLocation")
    ]
)

In [0]:
# 3. Create the Bronze Providers Delta table when it does not exist yet.
#    Columns: the three provider fields plus _ingest_ts and _source_file.
#    Change Data Feed is enabled through TBLPROPERTIES at creation time.
create_bronze_providers_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE} (
        ProviderID        STRING,
        ProviderSpecialty STRING,
        ProviderLocation  STRING,
        _ingest_ts        TIMESTAMP,
        _source_file      STRING
    )
    USING DELTA
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """
spark.sql(create_bronze_providers_sql)

In [0]:
# 4. Incremental batch ingest with Auto Loader, split into read / transform / write stages.

# Read: tab-delimited files with a header row, parsed against provider_schema.
# badRecordsPath points at P.bad; the rescued-data column is named _rescued_data.
provider_files = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("delimiter", "\t")
    .option("cloudFiles.includeExistingFiles", "true")
    .option("header", "true")
    .option("badRecordsPath", P.bad)
    .option("rescuedDataColumn", "_rescued_data")
    .schema(provider_schema)
    .load(RAW_PATH)
)

# Transform: apply the shared audit-column helper.
audited_providers = provider_files.transform(add_audit_cols)

# Write: stream into the Bronze table with mergeSchema on, tracking progress at P.checkpoint.
# trigger(availableNow=True) is what makes this an incremental batch, per the cell title.
bronze_writer = (
    audited_providers.writeStream
    .option("checkpointLocation", P.checkpoint)
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
)
stream = bronze_writer.toTable(BRONZE_TABLE)

# Block until the triggered run has completed before moving on to verification.
stream.awaitTermination()

In [0]:
# 5. Post-run verification: row count, a five-row sample, and the last five history entries.
df = spark.table(BRONZE_TABLE)
row_count = df.count()
banner(f"Bronze Providers row count: {row_count}", ok=True)
show_head(df, 5)
show_history(P.bronze, 5)