In [None]:
%md
### Kardiaflow - Silver Patients (SCD1)

**Source:** `kardia_bronze.bronze_patients` (CDF)

**Target:** `kardia_silver.silver_patients` (SCD1 upsert)

**Pattern:** Deduplicate to the latest change per patient, mask PHI, derive `birth_year`, then MERGE update/insert (SCD1)

**Trigger:** Incremental batch (`availableNow` stream over the Bronze change data feed)

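**Notes:**
- Masks sensitive PHI fields by replacing their values with NULL.
- Derives `birth_year` from `BIRTHDATE` (parsed as `yyyy-MM-dd`); the full birth date is not carried into Silver.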

In [0]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from kflow.config import  bronze_table, CHANGE_TYPES, PHI_COLS_MASK, silver_paths
from kflow.notebook_utils import init, show_history

# Notebook bootstrap helper from kflow (its behavior is defined outside this notebook)
init()

# Resolve the Silver locations and the Bronze source for the Patients dataset
S = silver_paths("patients")
SRC_TABLE = bronze_table("patients")

# Silver target table name and streaming checkpoint location
TGT_TABLE, CKPT = S.table, S.checkpoint

In [0]:
# 1. Make sure the Silver database and the Patients table are present.
#    Both statements use IF NOT EXISTS, so re-running this cell is harmless.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {S.db}")

# DDL for the Silver Patients Delta table, stored at the Silver path
silver_patients_ddl = f"""
    CREATE TABLE IF NOT EXISTS {TGT_TABLE} (
        id             STRING  NOT NULL,
        birth_year     INT,
        deathdate      STRING,
        first          STRING,
        last           STRING,
        ssn            STRING,
        drivers        STRING,
        passport       STRING,
        birthplace     STRING,
        marital        STRING,
        race           STRING,
        ethnicity      STRING,
        gender         STRING,
        _ingest_ts     TIMESTAMP,
        _batch_id      STRING,
        _source_file   STRING
    ) USING DELTA
    LOCATION '{S.path}'
    """
spark.sql(silver_patients_ddl)

In [0]:
# 2. Define upsert logic to apply SCD1 updates to Silver Patients
def upsert_to_silver(batch_df, _):
    """Apply one micro-batch of Bronze change-feed rows to Silver Patients (SCD1).

    Steps: keep the relevant change types, reduce to one row per patient,
    mask PHI / derive `birth_year`, then MERGE into the Silver table so that
    matched ids are overwritten and new ids are inserted.

    Args:
        batch_df: Micro-batch DataFrame read from the Bronze change data feed.
            Must carry `_change_type`, `_commit_version`, `_commit_timestamp`,
            `_ingest_ts`, `_batch_id`, `_source_file` and the Bronze patient
            columns referenced below (`ID`, `BIRTHDATE`, `MARITAL`, ...).
        _: Micro-batch id passed by `foreachBatch`; not used.

    Returns:
        None. The only effect is the MERGE into `TGT_TABLE`.
    """
    # Keep only the change types listed in CHANGE_TYPES (presumably inserts and
    # post-update images - confirm in kflow.config) and drop rows without a
    # patient key, which could never match in the MERGE.
    filtered = (
        batch_df
          .filter(F.col("_change_type").isin(*CHANGE_TYPES))
          .filter(F.col("ID").isNotNull())
    )

    # Retain the latest record per patient: newest CDF commit first, with
    # _ingest_ts as a tie-breaker so duplicates of the same ID inside a single
    # commit are not resolved arbitrarily. (Rows identical on all three keys
    # are still picked arbitrarily.)
    w_latest = (
        Window.partitionBy("ID")
              .orderBy(
                  F.col("_commit_version").desc_nulls_last(),
                  F.col("_commit_timestamp").desc_nulls_last(),
                  F.col("_ingest_ts").desc_nulls_last(),
              )
    )

    # MERGE needs at most one source row per key, hence rn = 1.
    deduped_df = (
        filtered
          .withColumn("rn", F.row_number().over(w_latest))
          .filter("rn = 1")
          .drop("rn", "_commit_version", "_commit_timestamp")
    )

    # Mask PHI, standardize column names, derive birth_year
    latest_df = (
        deduped_df.select(
            F.col("ID").alias("id"),
            # Only the year is carried forward; unparseable dates yield NULL.
            F.year(F.to_date("BIRTHDATE", "yyyy-MM-dd")).alias("birth_year"),
            # Every column in PHI_COLS_MASK is replaced by a NULL string.
            *[F.lit(None).cast("string").alias(c.lower()) for c in PHI_COLS_MASK],
            F.col("MARITAL").alias("marital"),
            F.col("RACE").alias("race"),
            F.col("ETHNICITY").alias("ethnicity"),
            F.col("GENDER").alias("gender"),
            F.col("_ingest_ts"),
            F.col("_batch_id"),
            F.col("_source_file"),
        )
    )

    # Upsert into Silver. Use the session attached to the micro-batch rather
    # than the notebook-global `spark`, as recommended for foreachBatch handlers.
    (
        DeltaTable.forName(batch_df.sparkSession, TGT_TABLE)
            .alias("t")
            .merge(latest_df.alias("s"), "t.id = s.id")
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
    )

In [0]:
# 3. Incremental batch run: read the Bronze change feed and upsert each micro-batch into Silver
bronze_changes = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    # Skip v0 (table creation/metadata); after the first run the checkpoint tracks progress.
    .option("startingVersion", 1)
    .table(SRC_TABLE)
)

silver_query = (
    bronze_changes.writeStream
    .foreachBatch(upsert_to_silver)
    .option("checkpointLocation", CKPT)
    # availableNow: process what is currently available, then stop
    .trigger(availableNow=True)
    .start()
)

# Block until the run has finished
silver_query.awaitTermination()

In [0]:
# 4. Verify Silver Patients: row count, the most recently ingested rows, and
#    the show_history output for the Silver path.
df = spark.table(TGT_TABLE)

row_count = df.count()
print(f"Silver Patients row count: {row_count:,}")

# Newest rows first; rows without an ingest timestamp go last
newest_first = F.col("_ingest_ts").desc_nulls_last()
display(df.orderBy(newest_first).limit(5))

show_history(S.path)