In [0]:
# 01_bronze_providers_autoloader.ipynb
# SOURCE:  Ingest raw provider TSVs into the Bronze layer using Auto Loader.
# OUTPUT: `kardia_bronze.bronze_providers` with Change Data Feed enabled.
# TRIGGER: Incremental batch; append to Delta table with schema evolution enabled.

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

# --- Storage account -------------------------------------------------------
ADLS_ACCOUNT = "kardiaadlsdemo"
SUFFIX = "core.windows.net"

# --- Bronze table name and storage locations -------------------------------
BRONZE_DB = "kardia_bronze"
BRONZE_PROVIDERS_TABLE = f"{BRONZE_DB}.bronze_providers"

RAW_PATH = f"abfss://raw@{ADLS_ACCOUNT}.dfs.{SUFFIX}/providers/"
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_providers"

CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/bronze_providers"
BAD_PATH = "dbfs:/kardia/_quarantine/raw/bad_providers"

# --- SAS authentication ----------------------------------------------------
# The token is read from the "kardia" secret scope; any leading "?" is
# stripped before it is stored in the session configuration.
sas = dbutils.secrets.get("kardia", "adls_raw_sas").lstrip('?')

# Session-level ABFS settings for this storage account, applied in order:
# auth type, token-provider class, then the fixed SAS token itself.
for conf_key, conf_value in {
    f"fs.azure.account.auth.type.{ADLS_ACCOUNT}.dfs.{SUFFIX}": "SAS",
    f"fs.azure.sas.token.provider.type.{ADLS_ACCOUNT}.dfs.{SUFFIX}":
        "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
    f"fs.azure.sas.fixed.token.{ADLS_ACCOUNT}.dfs.{SUFFIX}": sas,
}.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
# 2. Explicit schema for the raw provider files.
# Every column is declared as a nullable string; passing this schema to the
# reader means the column layout is fixed up front instead of being inferred.
provider_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("ProviderID", "ProviderSpecialty", "ProviderLocation")
])

In [0]:
# 3. Make sure the Bronze database and the providers table are in place.
# Both statements use IF NOT EXISTS, so re-running this cell does not fail
# when the objects are already there.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

# Delta table stored at BRONZE_PATH with Change Data Feed turned on.
# The two underscore-prefixed columns are audit fields filled in at ingest.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_PROVIDERS_TABLE} (
        ProviderID        STRING,
        ProviderSpecialty STRING,
        ProviderLocation  STRING,
        _ingest_ts        TIMESTAMP,
        _source_file      STRING
    )
    USING DELTA
    LOCATION '{BRONZE_PATH}'
    TBLPROPERTIES ('delta.enableChangeDataFeed'='true')
    """

# The database must exist before the table, so keep this order.
for ddl_statement in (create_db_sql, create_table_sql):
    spark.sql(ddl_statement)

In [0]:
# 4. Define an incremental batch pipeline using Auto Loader.

# Stage 1 - read: stream the tab-delimited provider files under RAW_PATH with
# the explicit schema. Files already present in the directory are included,
# and badRecordsPath points the reader at BAD_PATH for records it rejects.
raw_providers = (
    spark.readStream
         .format("cloudFiles")
         .option("cloudFiles.format", "csv")
         .option("delimiter", "\t")
         .option("cloudFiles.includeExistingFiles", "true")
         .option("header", "true")
         .option("badRecordsPath", BAD_PATH)
         .schema(provider_schema)
         .load(RAW_PATH)
)

# Stage 2 - enrich: add the audit columns expected by the Bronze table.
# The source file path is taken from the hidden `_metadata` file-metadata
# column instead of F.input_file_name(): Databricks has deprecated
# input_file_name() and it is not supported on Unity Catalog shared-access
# compute, whereas `_metadata.file_path` is the documented replacement.
bronze_providers = (
    raw_providers
        .withColumn("_ingest_ts",   F.current_timestamp())
        .withColumn("_source_file", F.col("_metadata.file_path"))
)

# Stage 3 - write: append into the Bronze Delta table. The availableNow
# trigger processes whatever is currently pending and then stops, so this
# cell behaves like an incremental batch job; progress is tracked in
# CHECKPOINT_PATH.
query = (
    bronze_providers.writeStream
        .option("checkpointLocation", CHECKPOINT_PATH)
        .option("mergeSchema", "true")
        .trigger(availableNow=True)
        .toTable(BRONZE_PROVIDERS_TABLE)
)

# Block until the run finishes so the following cells see the new data.
query.awaitTermination()

print(f"Bronze ingest complete: from {RAW_PATH} to {BRONZE_PATH}")

In [0]:
# 5. Stream finished - verify the Bronze table and its ingestion history.

# Load the Bronze Providers table, report its size, and preview the ten
# most recently ingested rows.
df = spark.table(BRONZE_PROVIDERS_TABLE)
row_count = df.count()
print(f"Rows in Bronze Providers: {row_count}")

latest_rows = df.orderBy(F.desc("_ingest_ts")).limit(10)
display(latest_rows)

# Show a few entries of the Delta history for the table location
# (version, timestamp and operation only) as a check on the ingest commits.
history_df = (
    spark.sql(f"DESCRIBE HISTORY delta.`{BRONZE_PATH}`")
         .select("version", "timestamp", "operation")
)
display(history_df.limit(5))