In [None]:
%md
### Kardiaflow - Silver Encounters (SCD1)

**Source:** `kardia_bronze.bronze_encounters` (CDF)

**Target:** `kardia_silver.silver_encounters` (SCD1 upsert)

**Pattern:** Deduplicate by encounter; MERGE to update or insert (SCD1)

**Trigger:** (configurable via job param `mode`)
- **Batch mode:** one-time processing of available data
- **Stream mode:** continuous micro-batches (30s)

**Description:** The Silver layer is where raw data becomes trustworthy and usable. Here we enforce constraints,
standardize types, rename fields to consistent names, mask PHI, and apply deduplication, SCD1/SCD2 handling, and
timezone normalization. In dbt, staging models handle renaming and typing while refined models handle business rules, which keeps lineage and documentation transparent in its SQL-first world. In Kardiaflow, we combine both steps in Silver, following the Delta Lake convention in which Silver covers both adaptation and core business logic.

In [None]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from kflow.config import BRONZE_DB, bronze_table, CHANGE_TYPES, SILVER_DB, silver_paths
from kflow.notebook_utils import init

# Notebook bootstrap provided by kflow.notebook_utils
init()

# Resolve Encounters locations once:
#   S          -> Silver path bundle (its .db, .path, .table and .checkpoint are read below)
#   SRC_TABLE  -> Bronze table whose change feed is consumed
#   TGT_TABLE  -> Silver table that receives the MERGE
S = silver_paths("encounters")
SRC_TABLE, TGT_TABLE = bronze_table("encounters"), S.table

In [None]:
# 1. Retrieve runtime mode from job widget: "batch" (default) or "stream"
VALID_MODES = ("batch", "stream")

if "dbutils" in globals():
    try:
        dbutils.widgets.dropdown("mode", "batch", list(VALID_MODES))
    except Exception as exc:
        # Widget creation stays best-effort, but only ordinary errors are
        # tolerated (a bare `except:` would also swallow KeyboardInterrupt /
        # SystemExit) and the reason is surfaced instead of being dropped.
        print(f"[warn] could not create 'mode' widget: {exc}")
    MODE = dbutils.widgets.get("mode")
else:
    # No dbutils in scope: fall back to the default mode.
    MODE = "batch"

# Normalise and validate. Without this, any value other than exactly "batch"
# (e.g. "Batch" or a typo) would silently start the never-ending streaming
# branch and write to its own checkpoint directory.
MODE = MODE.strip().lower()
if MODE not in VALID_MODES:
    raise ValueError(f"Unsupported mode {MODE!r}; expected one of {VALID_MODES}")

IS_BATCH   = (MODE == "batch")
CHECKPOINT = f"{S.checkpoint}/{MODE}"  # one checkpoint directory per mode

In [None]:
# 2. Ensure Silver DB and Encounters table exist
# Both statements use IF NOT EXISTS, so re-running this cell leaves existing objects alone.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {S.db}"

create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
      encounter_id       STRING  NOT NULL,
      patient_id         STRING  NOT NULL,
      START_TS           TIMESTAMP,
      CODE               STRING,
      DESCRIPTION        STRING,
      REASONCODE         STRING,
      REASONDESCRIPTION  STRING,
      _ingest_ts         TIMESTAMP,
      _batch_id          STRING,
      _source_file       STRING
    ) USING DELTA
    LOCATION '{S.path}'
    """

# Database first, then the table that lives in it.
for ddl in (create_db_sql, create_table_sql):
    spark.sql(ddl)

In [None]:
# 3. Define upsert logic to apply SCD1 updates to Silver Encounters
def upsert_to_silver(batch_df, batch_id):
    """Collapse a CDF micro-batch to one row per encounter and MERGE it into Silver.

    Intended as the `foreachBatch` callback of the Bronze change-feed stream.

    Args:
        batch_df: Micro-batch DataFrame. Must contain `encounter_id`,
            `_commit_version`, `_commit_timestamp` and `_ingest_ts` alongside
            the columns of the Silver table.
        batch_id: Micro-batch id passed in by `foreachBatch`; not used here.

    Returns:
        None. The effect is the MERGE into `TGT_TABLE`: matched encounters are
        overwritten, unmatched ones are inserted (SCD1).
    """
    # Newest change first. `_ingest_ts` is a further tie-break for rows that
    # share a commit; rows that tie on all three keys are still picked
    # arbitrarily by row_number().
    w_latest = (
        Window.partitionBy("encounter_id")
              .orderBy(
                  F.col("_commit_version").desc_nulls_last(),
                  F.col("_commit_timestamp").desc_nulls_last(),
                  F.col("_ingest_ts").desc_nulls_last(),
              )
    )

    # Keep one row per encounter and drop the helper / CDF columns, which are
    # not part of the Silver schema.
    latest_df = (
        batch_df
          .withColumn("rn", F.row_number().over(w_latest))
          .filter(F.col("rn") == 1)
          .drop("rn", "_commit_version", "_commit_timestamp")
    )

    # Use the session that owns the micro-batch rather than the notebook-global
    # `spark`, as recommended for code running inside foreachBatch.
    session = batch_df.sparkSession

    # Upsert into Silver (update existing Encounters, insert new ones)
    (
        DeltaTable.forName(session, TGT_TABLE)
                  .alias("t")
                  .merge(latest_df.alias("s"), "t.encounter_id = s.encounter_id")
                  .whenMatchedUpdateAll()
                  .whenNotMatchedInsertAll()
                  .execute()
    )

In [None]:
# 4a. Read new and changed rows from Bronze via CDF
bronze_changes = (
    spark.readStream
         .format("delta")
         .option("readChangeFeed", "true")
         # Skip v0; table creation/metadata. Checkpoint will track after first run.
         .option("startingVersion", 1)
         .table(SRC_TABLE)
)

# Keep only the configured change types, and only rows carrying both keys
# (the Silver table declares encounter_id and patient_id NOT NULL).
wanted_change = F.col("_change_type").isin(*CHANGE_TYPES)
has_both_keys = F.col("ID").isNotNull() & F.col("PATIENT").isNotNull()

# Silver projection: renamed keys, parsed start timestamp, string-typed codes,
# plus the commit columns that the upsert step uses for ordering.
silver_columns = [
    F.col("ID").alias("encounter_id"),
    F.col("PATIENT").alias("patient_id"),
    F.to_timestamp("DATE", "yyyy-MM-dd").alias("START_TS"),
    F.col("CODE").cast("string").alias("CODE"),
    "DESCRIPTION",
    F.col("REASONCODE").cast("string").alias("REASONCODE"),
    "REASONDESCRIPTION",
    "_commit_version",
    "_commit_timestamp",
    "_ingest_ts",
    "_batch_id",
    "_source_file",
]

silver_ready = bronze_changes.filter(wanted_change & has_both_keys).select(*silver_columns)

In [None]:
# 4b. Route every micro-batch through upsert_to_silver (dedupe + MERGE) so that
#     Silver always reflects the latest encounter data. No trigger is set and
#     nothing is started here; the next cell does both.
writer = silver_ready.writeStream.foreachBatch(upsert_to_silver).option(
    "checkpointLocation", CHECKPOINT
)

In [None]:
# 5. Run as either batch or stream depending on runtime mode
#    batch  -> availableNow: drain the changes currently available, then stop
#    stream -> a micro-batch every 30 seconds, running continuously
trigger_kwargs = {"availableNow": True} if IS_BATCH else {"processingTime": "30 seconds"}
query = writer.trigger(**trigger_kwargs).start()

if IS_BATCH:
    print(f"[batch] Read CDF to {TGT_TABLE} (checkpoint={CHECKPOINT}) …")
    # Block until the one-off run has finished.
    query.awaitTermination()
else:
    # The streaming branch returns right after start(); the query is not awaited here.
    print(f"[live] Continuous 30s CDF upserts to {TGT_TABLE} (checkpoint={CHECKPOINT})")