In [0]:
# bronze_providers_autoloader.ipynb
# SOURCE: Raw TSV files in ADLS
# TARGET: `kardia_bronze.bronze_providers` (CDF)
# TRIGGER: Incremental batch via Auto Loader; append to Bronze Providers table

# Install kflow from a local wheel so it is importable during job execution.
# Flags on the command below:
#   -q            quiet output
#   --no-deps     install kflow only, without pulling in its dependencies
#                 (presumably already present on the cluster - TODO confirm)
#   --no-index    do not contact a package index; resolve via --find-links only
#   --find-links  directory that is searched for the kflow wheel
%pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType

from kflow.config import BRONZE_DB, bronze_paths, raw_path
from kflow.display_utils import show_history
from kflow.etl_utils import add_audit_cols

# Pin the session to the Hive Metastore catalog (needed when Unity Catalog is not in use)
spark.sql("USE CATALOG hive_metastore")

# Resolve the Providers dataset settings from kflow config:
#   P            - Bronze path bundle (table name plus related storage paths)
#   RAW_PATH     - location the raw source files are read from
#   BRONZE_TABLE - name of the target Bronze table
P = bronze_paths("providers")
RAW_PATH = raw_path("providers")
BRONZE_TABLE = P.table

In [0]:
# Define schema explicitly for TSV input
# TSVs don't include schema metadata and inference is unreliable
# Every Providers column is read as a nullable string.
provider_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("ProviderID", "ProviderSpecialty", "ProviderLocation")
    ]
)

In [0]:
# 1. Make sure the Bronze database and the Providers table are in place.
#    Both statements are IF NOT EXISTS, so re-running this cell is harmless.
#    The table is a Delta table at P.bronze with Change Data Feed enabled.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {BRONZE_DB}"

create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {BRONZE_TABLE} (
        ProviderID        STRING,
        ProviderSpecialty STRING,
        ProviderLocation  STRING,
        _ingest_ts        TIMESTAMP,
        _source_file      STRING
    )
    USING DELTA
    COMMENT 'Bronze TSV ingest of Provider records.'
    LOCATION '{P.bronze}'
    TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """

# The database must exist before the table is created, so order matters here.
for ddl_statement in (create_db_sql, create_table_sql):
    spark.sql(ddl_statement)

In [None]:
# 3. Incremental batch ingest of Providers files using Auto Loader

# Reader options, grouped by purpose
auto_loader_opts = {
    # Auto Loader (cloudFiles) settings
    "cloudFiles.format": "csv",
    "cloudFiles.includeExistingFiles": "true",
    "cloudFiles.schemaLocation": P.schema,
    # CSV parsing: tab-separated input with a header row
    "delimiter": "\t",
    "header": "true",
    "ignoreEmptyLines": "true",
    # Where problem records / unparsed data end up
    "badRecordsPath": P.bad,
    "rescuedDataColumn": "_rescued_data",
}

# Source: stream the raw files using the explicit Providers schema
providers_raw = (
    spark.readStream
    .format("cloudFiles")
    .options(**auto_loader_opts)
    .schema(provider_schema)
    .load(RAW_PATH)
)

# Keep only rows that carry a primary key, then attach audit columns
# (kflow helper; per the original note: ingest timestamp, source file, batch ID).
providers_keyed = providers_raw.where(F.col("ProviderID").isNotNull())
providers_audited = providers_keyed.transform(add_audit_cols)

# Sink: write to the Bronze table, allowing new columns to be merged into
# its schema. availableNow processes what is currently available and then stops.
stream = (
    providers_audited.writeStream
    .options(checkpointLocation=P.checkpoint, mergeSchema="true")
    .trigger(availableNow=True)
    .toTable(BRONZE_TABLE)
)

# Block until this run's query has finished
stream.awaitTermination()

In [0]:
# 4. Ingest complete - sanity-check the Bronze Providers table:
#    total row count, a five-row preview, and the history for P.bronze.
df = spark.table(BRONZE_TABLE)

row_count = df.count()
print(f"Bronze Providers row count: {row_count:,}")

preview = df.limit(5)
display(preview)

show_history(P.bronze)