In [0]:
# bronze_claims_autoloader.ipynb
# SOURCE:  Parquet claim files in abfss://raw@kardiaadlsdemo.dfs.core.windows.net/claims/
# OUTPUT: `kardia_bronze.bronze_claims` with Change Data Feed enabled
# TRIGGER: Incremental batch; append to Delta table with schema evolution enabled.

# NOTE: For self‑describing formats like Parquet, we let the file’s embedded
# schema drive the Bronze ingestion, and cast/enforce types later in Silver.

# ───── DEBUG: Confirm wheel exists in DBFS ─────
display(dbutils.fs.ls("dbfs:/Shared/libs"))

import os
print("wheel exists?", os.path.exists("/dbfs/Shared/libs/kflow-0.2.4-py3-none-any.whl"))

# ───── INSTALL: Use local offline wheel ─────
%pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow

import kflow, importlib
importlib.reload(kflow)
print("kflow version:", getattr(kflow, "__version__", "unknown"))

from kflow.config import BRONZE_DB, bronze_paths, ensure_adls_auth
from kflow.display_utils import show_history
from kflow.etl_utils import add_audit_cols

import pyspark.sql.functions as F

# Set up ADLS access before any storage path is touched.
# NOTE(review): presumably configures Spark credentials for the storage
# account - confirm against kflow.config.ensure_adls_auth.
ensure_adls_auth()

# Load Bronze paths
# `P` bundles every location this notebook uses for the "claims" dataset:
# P.raw, P.bronze, P.schema, P.checkpoint, P.bad and P.table.
P = bronze_paths("claims")
# Name of the target Bronze table, used by the DDL, the stream sink, and the
# verification step.
BRONZE_TABLE = P.table

In [0]:
# 2. Make sure the Bronze database and the Claims table are in place.
#    Both statements are IF NOT EXISTS, so re-running this cell is harmless.
create_db_stmt = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

# External Delta table at P.bronze with Change Data Feed switched on.
# No column list is declared here; the table is created without one.
create_table_stmt = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE}
    USING DELTA
    COMMENT 'Bronze Parquet ingest of claim records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """

spark.sql(create_db_stmt)
spark.sql(create_table_stmt)

In [0]:
# 3. Incremental batch ingest with Auto Loader.
#    Auto Loader picks up new Parquet files under P.raw; the result is appended
#    to the Bronze Delta table.

# Read side: options for the Auto Loader ("cloudFiles") source.
autoloader_options = {
    "cloudFiles.format": "parquet",
    "cloudFiles.schemaLocation": P.schema,
    "cloudFiles.includeExistingFiles": "true",
    "badRecordsPath": P.bad,
}
raw_claims = (
    spark.readStream
         .format("cloudFiles")
         .options(**autoloader_options)
         .load(P.raw)
)

# Attach the audit columns supplied by kflow.etl_utils.add_audit_cols.
claims_with_audit = raw_claims.transform(add_audit_cols)

# Write side: append into the Bronze table, allowing new columns to be merged
# into the table schema. availableNow=True runs the stream as a batch over the
# files currently available instead of running continuously.
stream = (
    claims_with_audit.writeStream
                     .option("checkpointLocation", P.checkpoint)
                     .option("mergeSchema", "true")
                     .trigger(availableNow=True)
                     .toTable(BRONZE_TABLE)
)

# Block until the available-now run has finished.
stream.awaitTermination()

# NOTE:
# Auto Loader handles the read-side logic, maintaining a file-discovery ledger.
# Structured Streaming handles the write-side logic, using a WAL and offset tracking.
# The ledger, WAL, and offsets are stored under the checkpoint location (`P.checkpoint`).
# The schema evolution log is persisted at `cloudFiles.schemaLocation`, which this notebook sets to `P.schema`.

In [0]:
# 4. Batch finished - sanity-check the Bronze Claims table and its ingestion history.
df = spark.table(BRONZE_TABLE)

# Row count, formatted with thousands separators.
row_count = df.count()
print(f"Bronze Claims row count: {row_count:,}")

# Small sample of rows, then the history for the Bronze location.
display(df.limit(5))
show_history(P.bronze)