In [0]:
# 01_bronze_feedback_autoloader.ipynb  
# SOURCE:  JSON‑lines feedback files in ADLS at abfss://raw@kardiaadlsdemo.dfs.core.windows.net/feedback/  
# OUTPUT: `kardia_bronze.bronze_feedback` with Change Data Feed enabled  
# TRIGGER: Incremental batch; append to Delta table with schema evolution enabled  

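# NOTE: Using Auto Loader for JSONL. The reader is given an explicit schema, so by
#       default Auto Loader does not infer or evolve the source columns (fields not in
#       that schema are not added automatically). schemaLocation is still configured,
#       and `mergeSchema` on the writer lets the Delta table accept added columns.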

import pyspark.sql.functions as F
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType,
    ArrayType, MapType
)

# SAS auth via secret
ADLS_ACCOUNT = "kardiaadlsdemo"
SUFFIX       = "core.windows.net"
sas_token    = dbutils.secrets.get("kardia","adls_raw_sas").lstrip('?')

# Table & path configs
BRONZE_DB             = "kardia_bronze"
BRONZE_FEEDBACK_TABLE = f"{BRONZE_DB}.bronze_feedback"

RAW_PATH        = f"abfss://raw@{ADLS_ACCOUNT}.dfs.{SUFFIX}/feedback/"
BRONZE_PATH     = "dbfs:/kardia/bronze/bronze_feedback"
SCHEMA_PATH     = "dbfs:/kardia/_schemas/bronze_feedback"
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/bronze_feedback"
BAD_PATH        = "dbfs:/kardia/_quarantine/raw/bad_feedback"

In [0]:
# mount auth configs
spark.conf.set(f"fs.azure.account.auth.type.{ADLS_ACCOUNT}.dfs.{SUFFIX}", "SAS")
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{ADLS_ACCOUNT}.dfs.{SUFFIX}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
)
spark.conf.set(
    f"fs.azure.sas.fixed.token.{ADLS_ACCOUNT}.dfs.{SUFFIX}",
    sas_token
)

In [0]:
# 2. Define explicit JSON schema for performance & type safety.
feedback_schema = StructType([
    StructField("feedback_id",        StringType(), True),
    StructField("provider_id",        StringType(), True),
    StructField("timestamp",          StringType(), True),
    StructField("visit_id",           StringType(), True),
    StructField("satisfaction_score", IntegerType(), True),
    StructField("comments",           StringType(), True),
    StructField("source",             StringType(), True),
    StructField("tags",               ArrayType(StringType()), True),
    StructField("metadata",           MapType(StringType(), StringType()), True),
])

In [0]:
# 3. Ensure Bronze DB and table exist.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}")
spark.sql(f"""
  CREATE TABLE IF NOT EXISTS {BRONZE_FEEDBACK_TABLE} (
    feedback_id        STRING,
    provider_id        STRING,
    timestamp          STRING,
    visit_id           STRING,
    satisfaction_score INT,
    comments           STRING,
    source             STRING,
    tags               ARRAY<STRING>,
    metadata           MAP<STRING,STRING>,
    _ingest_ts         TIMESTAMP,
    _source_file       STRING,
    _batch_id          STRING
  )
  USING DELTA
  LOCATION '{BRONZE_PATH}'
  TBLPROPERTIES ('delta.enableChangeDataFeed'='true')
""")

In [0]:
# 4. Define incremental Auto-loader stream for JSONL files.
stream = (
    spark.readStream
         .format("cloudFiles")
         .option("cloudFiles.format", "json")
         .option("cloudFiles.schemaLocation", SCHEMA_PATH)
         .option("cloudFiles.includeExistingFiles", "true")
         .option("badRecordsPath", BAD_PATH)
         .schema(feedback_schema)
         .load(RAW_PATH)

         # Add audit columns
         .withColumn("_ingest_ts",  F.current_timestamp())
         .withColumn("_source_file", F.input_file_name())
         .withColumn("_batch_id",    F.lit(spark.conf.get("spark.databricks.job.runId","manual")))
         
         .writeStream
         .option("checkpointLocation", CHECKPOINT_PATH)
         .option("mergeSchema", "true")
         .trigger(availableNow=True)
         .toTable(BRONZE_FEEDBACK_TABLE)
)
stream.awaitTermination()
print(f"Bronze ingest complete: from {RAW_PATH} to {BRONZE_FEEDBACK_TABLE}")

In [0]:
# 5. Stream finished – Verify Bronze table and ingestion history.
df = spark.table(BRONZE_FEEDBACK_TABLE)
print(f"Rows in Bronze Feedback: {df.count()}")
display(df.orderBy(F.col("_ingest_ts").desc()).limit(5))

history_df = spark.sql(f"DESCRIBE HISTORY delta.`{BRONZE_PATH}`") \
                  .select("version","timestamp","operation")
display(history_df.limit(5))