In [None]:
# silver_encounters_scd1.ipynb
# SOURCE: `kardia_bronze.bronze_encounters` (CDF)
# TARGET: `kardia_silver.silver_encounters` (SCD1 upsert)
# TRIGGER: - "batch" mode: one-time processing of available data
#          - "stream" mode: continuous micro-batches every 30 seconds

# Optional library bootstrap for ephemeral jobs clusters
# %run ../../99_utilities/bootstrap_kflow

from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import BRONZE_DB, bronze_table, CHANGE_TYPES, SILVER_DB, silver_paths

# Set up ADLS OAuth on the Spark session; the helper hands back the base ABFS path
abfss_base = ensure_adls_oauth()

# Pin the session to the Hive Metastore catalog (needed when Unity Catalog is not in use)
spark.sql("USE CATALOG hive_metastore")

# Resolve the Bronze source and Silver target identifiers for the Encounters dataset
DATASET = "encounters"
S = silver_paths(DATASET)
SRC_TABLE, TGT_TABLE = bronze_table(DATASET), S.table

In [None]:
# Retrieve runtime mode from job widget: "batch" (default) or "stream"
VALID_MODES = ("batch", "stream")

try:
    dbutils.widgets.dropdown("mode", "batch", list(VALID_MODES))
except Exception:
    # Best effort: widget creation is optional (e.g. `dbutils` is undefined
    # outside Databricks). Catch Exception rather than using a bare `except`
    # so KeyboardInterrupt / SystemExit are never swallowed.
    pass

# Normalise the value so job parameters such as "Batch" or " stream " still work.
MODE = (dbutils.widgets.get("mode") if "dbutils" in globals() else "batch").strip().lower()

# Fail fast on an unknown mode: otherwise any non-"batch" value would silently
# start a continuous stream with its own checkpoint directory.
if MODE not in VALID_MODES:
    raise ValueError(f"Unsupported mode {MODE!r}; expected one of {VALID_MODES}")

IS_BATCH   = (MODE == "batch")
# Each mode keeps its own checkpoint directory under the dataset checkpoint root.
CHECKPOINT = f"{S.checkpoint}/{MODE}"

In [None]:
# 1. Make sure the Silver database and the Encounters Delta table are present
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {S.db}"
spark.sql(create_db_sql)

# Table DDL: external Delta table at S.path; both key columns are NOT NULL
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
      encounter_id       STRING  NOT NULL,
      patient_id         STRING  NOT NULL,
      START_TS           TIMESTAMP,
      CODE               STRING,
      DESCRIPTION        STRING,
      REASONCODE         STRING,
      REASONDESCRIPTION  STRING,
      _ingest_ts         TIMESTAMP,
      _batch_id          STRING,
      _source_file       STRING
    ) USING DELTA
    LOCATION '{S.path}'
    """
spark.sql(create_table_sql)

In [None]:
# 2. Define upsert logic to apply SCD1 updates to Silver Encounters
def upsert_to_silver(batch_df, batch_id):
    """Merge one CDF micro-batch into the Silver Encounters table (SCD1).

    Intended as a ``foreachBatch`` callback: keeps a single row per
    ``encounter_id`` (the one with the highest commit version, then the
    highest commit timestamp) and merges it into ``TGT_TABLE``.

    Args:
        batch_df: Micro-batch DataFrame. Must contain ``encounter_id``,
            ``_commit_version`` and ``_commit_timestamp``; all remaining
            columns are written to the target.
        batch_id: Micro-batch id passed by ``foreachBatch``; not used here.
    """
    # Prefer the session bound to the micro-batch (recommended inside
    # foreachBatch); fall back to the notebook-global `spark` on runtimes
    # where DataFrame.sparkSession is not available.
    session = getattr(batch_df, "sparkSession", None) or spark

    # Retain the latest record per encounter_id, ordered by commit version
    w_latest = (
        Window.partitionBy("encounter_id")
              .orderBy(
                  F.col("_commit_version").desc_nulls_last(),
                  F.col("_commit_timestamp").desc_nulls_last()
              )
    )

    # One source row per key is required for MERGE; the ordering columns are
    # dropped afterwards because the Silver table does not define them.
    latest = (
        batch_df
        .withColumn("rn", F.row_number().over(w_latest))
        .filter("rn = 1")
        .drop("rn", "_commit_version", "_commit_timestamp")
    )

    # Upsert into Silver (update existing Encounters, insert new ones)
    (DeltaTable.forName(session, TGT_TABLE)
               .alias("t")
               .merge(latest.alias("s"), "t.encounter_id = s.encounter_id")
               .whenMatchedUpdateAll()
               .whenNotMatchedInsertAll()
               .execute())

In [None]:
# 3a. Stream the Bronze change data feed and shape it into Silver-ready rows
bronze_changes = (
    spark.readStream
         .format("delta")
         .option("readChangeFeed", "true")
         .option("startingVersion", 1)  # Skip first commit with no CDF rows
         .table(SRC_TABLE)
)

# Row-level predicates: wanted change types only, and both keys present
wanted_change = F.col("_change_type").isin(*CHANGE_TYPES)
has_encounter_id = F.col("ID").isNotNull()
has_patient_id = F.col("PATIENT").isNotNull()

# Projection: rename keys, parse the date, force codes to string,
# then pass the CDF ordering columns and ingest metadata through unchanged
silver_columns = [
    F.col("ID").alias("encounter_id"),
    F.col("PATIENT").alias("patient_id"),
    F.to_timestamp("DATE", "yyyy-MM-dd").alias("START_TS"),
    F.col("CODE").cast("string").alias("CODE"),
    F.col("DESCRIPTION"),
    F.col("REASONCODE").cast("string").alias("REASONCODE"),
    F.col("REASONDESCRIPTION"),
    *[
        F.col(passthrough)
        for passthrough in (
            "_commit_version",
            "_commit_timestamp",
            "_ingest_ts",
            "_batch_id",
            "_source_file",
        )
    ],
]

silver_ready = (
    bronze_changes
    .where(wanted_change & has_encounter_id & has_patient_id)
    .select(*silver_columns)
)

In [None]:
# 3b. Route every micro-batch through upsert_to_silver (dedupe + merge) so that
#     Silver always holds the latest version of each encounter
writer = (
    silver_ready.writeStream
                .option("checkpointLocation", CHECKPOINT)
                .foreachBatch(upsert_to_silver)
)

In [None]:
# Start the query with the trigger that matches the selected runtime mode
if IS_BATCH:
    # Batch mode: drain everything currently available, then stop
    trigger_kwargs = {"availableNow": True}
    banner = f"[demo] Reading CDF to {TGT_TABLE} (checkpoint={CHECKPOINT}) …"
else:
    # Streaming mode: a new micro-batch every 30 seconds, running continuously
    trigger_kwargs = {"processingTime": "30 seconds"}
    banner = f"[live] Continuous 30s CDF upserts to {TGT_TABLE} (checkpoint={CHECKPOINT})"

q = writer.trigger(**trigger_kwargs).start()
print(banner)

# Only the one-shot batch run blocks until the query has finished
if IS_BATCH:
    q.awaitTermination()