In [None]:
# 01_bronze_stream_providers_autoloader.ipynb
# SOURCE: Raw provider CSVs under `providers/` in the ADLS `raw` container, ingested with Auto Loader.
# OUTPUT: `kardia_bronze.bronze_providers` with Change Data Feed enabled.
# TRIGGER: Incremental batch (`availableNow`); rows are appended to the Delta table using an explicit schema.

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

# 0. Configure SAS auth via secret
# Storage account coordinates; together they form the DFS endpoint host name.
ADLS_ACCOUNT = "kardiaadlsdemo"
SUFFIX       = "core.windows.net"

# The SAS token is read from Databricks secrets ("kardia" / "adls_raw_sas").
# Any leading '?' is stripped before the token is stored in the Spark conf.
sas = dbutils.secrets.get("kardia", "adls_raw_sas").lstrip('?')

# All three settings below are keyed by the account's DFS endpoint host.
_adls_host = f"{ADLS_ACCOUNT}.dfs.{SUFFIX}"
_sas_settings = (
    (f"fs.azure.account.auth.type.{_adls_host}", "SAS"),
    (
        f"fs.azure.sas.token.provider.type.{_adls_host}",
        "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
    ),
    (f"fs.azure.sas.fixed.token.{_adls_host}", sas),
)
for _conf_key, _conf_value in _sas_settings:
    spark.conf.set(_conf_key, _conf_value)

# Directory Auto Loader reads from: `providers/` inside the `raw` container.
SRC_PATH = f"abfss://raw@{_adls_host}/providers/"

In [None]:
# ─── 2. Bronze Table Creation ───────────────────────────
# Creates the Bronze database and an empty external Delta table (Change Data
# Feed enabled) at BRONZE_PATH. Both statements use IF NOT EXISTS, so re-running
# this cell against an already-registered table is a no-op.
BRONZE_DB    = "kardia_bronze"
BRONZE_TABLE = f"{BRONZE_DB}.bronze_providers"
BRONZE_PATH  = "dbfs:/kardia/bronze/bronze_providers"

spark.sql(f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}")

# Declare the full column list up front instead of bootstrapping the table from
# a `SELECT NULL AS ProviderID` placeholder: that placeholder yields a single
# untyped (NULL/void) column and leaves the real schema to be filled in by
# `mergeSchema` on the first stream write. The columns here are the four
# provider CSV fields plus the two audit columns the ingest stream appends.
spark.sql(f"""
  CREATE TABLE IF NOT EXISTS {BRONZE_TABLE} (
    ProviderID        STRING,
    ProviderName      STRING,
    ProviderSpecialty STRING,
    ProviderLocation  STRING,
    `_ingest_ts`      TIMESTAMP,
    `_source_file`    STRING
  )
  USING DELTA
  LOCATION '{BRONZE_PATH}'
  TBLPROPERTIES ('delta.enableChangeDataFeed'='true')
""")
print(f"Bronze table ready: {BRONZE_TABLE}")

In [None]:
# ─── 3. Auto Loader from ADLS ──────────────────────────
# Every provider field is read as a nullable string; no casting happens here.
provider_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "ProviderID",
        "ProviderName",
        "ProviderSpecialty",
        "ProviderLocation",
    )
])

SRC_PATH        = f"abfss://raw@{ADLS_ACCOUNT}.dfs.{SUFFIX}/providers/"
CHECKPOINT      = "dbfs:/kardia/_checkpoints/bronze_providers"
SCHEMA_LOCATION = "dbfs:/kardia/_schemas/bronze_providers"

# Stage 1 — Auto Loader source over the CSV directory, with the explicit schema.
# NOTE(review): no `header` option is set, so the CSV reader's default applies —
# confirm whether the source files carry a header row.
raw_providers = (
    spark.readStream
         .format("cloudFiles")
         .options(**{
             "cloudFiles.format":               "csv",
             "cloudFiles.includeExistingFiles": "true",
             "cloudFiles.schemaLocation":       SCHEMA_LOCATION,
         })
         .schema(provider_schema)
         .load(SRC_PATH)
)

# Stage 2 — add audit columns: ingest timestamp and originating file name.
bronze_rows = (
    raw_providers
        .withColumn("_ingest_ts",   F.current_timestamp())
        .withColumn("_source_file", F.input_file_name())
)

# Stage 3 — append to the Bronze Delta table. `availableNow` processes what is
# currently available and then stops, so the wait below returns.
stream = (
    bronze_rows.writeStream
        .format("delta")
        .options(checkpointLocation=CHECKPOINT, mergeSchema="true")
        .outputMode("append")
        .trigger(availableNow=True)
        .toTable(BRONZE_TABLE)
)
stream.awaitTermination()

In [None]:
# ─── 4. Sanity Check ───────────────────────────────────
# Read the Bronze table back and report its total row count.
df = spark.table(BRONZE_TABLE)
row_count = df.count()
print(f"Rows in Bronze Providers: {row_count}")

# Show the ten most recently ingested rows.
newest_first = F.col("_ingest_ts").desc()
latest_rows = df.orderBy(newest_first).limit(10)
display(latest_rows)

# Show the Delta transaction history for the table.
table_history = spark.sql(f"DESCRIBE HISTORY {BRONZE_TABLE}")
display(table_history)