In [0]:
# 01_bronze_stream_providers_autoloader.ipynb
# SOURCE: Ingest raw provider CSVs into the Bronze layer using Auto Loader.
# OUTPUT: `kardia_bronze.bronze_providers` with Change Data Feed enabled.
# TRIGGER: Incremental batch; append to Delta table with fixed schema.

from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# --- Metastore objects -------------------------------------------------------
# Bronze database and the fully qualified name of the providers table.
BRONZE_DB = "kardia_bronze"
BRONZE_PROVIDERS_TABLE = f"{BRONZE_DB}.bronze_providers"

# --- Raw landing zone (ADLS Gen2) --------------------------------------------
# Replace <your_adls_account> with your storage account name before running.
ACCOUNT = "<your_adls_account>"
RAW_PATH = f"abfss://raw@{ACCOUNT}.dfs.core.windows.net/providers/"

# --- DBFS locations ----------------------------------------------------------
# Every pipeline-managed path hangs off a single DBFS root.
_KARDIA_ROOT = "dbfs:/kardia"
BRONZE_PATH = f"{_KARDIA_ROOT}/bronze/bronze_providers"            # Delta table location
SCHEMA_PATH = f"{_KARDIA_ROOT}/_schemas/bronze_providers"          # cloudFiles.schemaLocation
CHECKPOINT_PATH = f"{_KARDIA_ROOT}/_checkpoints/bronze_providers"  # streaming checkpoint
BAD_PATH = f"{_KARDIA_ROOT}/_quarantine/raw/bad_providers"         # badRecordsPath target

In [0]:
# Authenticate to the raw ADLS account with a fixed SAS token.
# The token is read from the "kardia" secret scope (key "adls_raw_sas"),
# so it never appears in the notebook source.
SAS_TOKEN = dbutils.secrets.get("kardia", "adls_raw_sas")

# Each setting is scoped to this one storage account's DFS endpoint.
_adls_host = f"{ACCOUNT}.dfs.core.windows.net"
_sas_settings = (
    # Use SAS as the auth mechanism for this account.
    (f"fs.azure.account.auth.type.{_adls_host}", "SAS"),
    # Hadoop ABFS provider class that serves a single fixed token.
    (f"fs.azure.sas.token.provider.type.{_adls_host}",
     "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"),
    # The token itself.
    (f"fs.azure.sas.fixed.token.{_adls_host}", SAS_TOKEN),
)
for _conf_key, _conf_value in _sas_settings:
    spark.conf.set(_conf_key, _conf_value)

In [0]:
# 1. Ensure the Bronze database and the providers table exist.
#    Both statements use IF NOT EXISTS, so existing objects are left untouched.
#    The table declares no columns here; it is registered at BRONZE_PATH with
#    Change Data Feed enabled via the table property.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"
create_table_sql = f"""
  CREATE TABLE IF NOT EXISTS {BRONZE_PROVIDERS_TABLE}
  USING DELTA
  COMMENT 'Bronze Auto Loader ingest of provider reference data.'
  LOCATION '{BRONZE_PATH}'
  TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
"""

# The database must exist before the table is created, so order matters.
for ddl_statement in (create_db_sql, create_table_sql):
    spark.sql(ddl_statement)

In [0]:
# 2. Explicit schema for the providers CSV.
#    (column name, Spark type) pairs in schema order; every column is nullable.
_provider_columns = [
    ("provider_id", StringType()),
    ("name",        StringType()),
    ("address",     StringType()),
    ("updated_at",  TimestampType()),
]

providers_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _provider_columns]
)

In [0]:
# 3. Auto Loader incremental ingest.
#    Reads provider CSVs from RAW_PATH with the explicit schema and appends
#    them to the Delta table at BRONZE_PATH. The availableNow trigger runs the
#    stream as a one-time batch, so awaitTermination() returns once it is done.

# Reader options: Auto Loader settings plus CSV parsing behaviour.
autoloader_options = {
    "cloudFiles.format": "csv",
    "cloudFiles.schemaLocation": SCHEMA_PATH,
    "cloudFiles.includeExistingFiles": "true",
    "header": "true",
    "badRecordsPath": BAD_PATH,
    "rescuedDataColumn": "_rest",
}

raw_providers = (
    spark.readStream
    .format("cloudFiles")
    .options(**autoloader_options)
    .schema(providers_schema)
    .load(RAW_PATH)
)

# Writer: append-only Delta sink. The checkpoint lets later runs resume
# incrementally instead of starting over.
providers_writer = (
    raw_providers.writeStream
    .format("delta")
    .option("mergeSchema", "true")
    .option("checkpointLocation", CHECKPOINT_PATH)
    .outputMode("append")
    .trigger(availableNow=True)
)

stream = providers_writer.start(BRONZE_PATH)
stream.awaitTermination()
print(f"Bronze ingest complete: from {RAW_PATH} to {BRONZE_PATH}")

In [0]:
# 4. Verification: row count and a small sample of the Bronze table,
#    followed by the most recent entries of its Delta history.
print(f"Loaded data from {RAW_PATH} to {BRONZE_PATH}")

# Read the table back from its storage path, not through the metastore name.
df = spark.read.format("delta").load(BRONZE_PATH)
row_count = df.count()
print(f"Row count: {row_count}")
display(df.limit(5))

# Keep only the history columns that describe what was written and how.
history_query = f"DESCRIBE HISTORY delta.`{BRONZE_PATH}`"
history_columns = ["version", "timestamp", "operation", "operationParameters"]
history_df = spark.sql(history_query).select(*history_columns)
display(history_df.limit(5))