In [0]:
# 02_silver_patients_scd1_batch.py
# SOURCE: kardia_bronze.bronze_patients (Delta CDF)
# OUTPUT: `kardia_silver.silver_patients`, updated incrementally.
# PATTERN: Read Change Data Feed from Bronze, mask PHI columns, derive `BIRTH_YEAR` from `BIRTHDATE`.
# TRIGGER: Incremental batch job

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# Database and table identifiers. The Silver table name is derived from
# SILVER_DB so the database created in step 1 and the table written in step 2
# cannot drift apart if the database name is ever changed.
SILVER_DB = "kardia_silver"
BRONZE_PATIENTS_TBL = "kardia_bronze.bronze_patients"
SILVER_PATIENTS_TBL = f"{SILVER_DB}.silver_patients"

# Streaming checkpoint directory used by the incremental run.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_patients"

# CDF `_change_type` values that feed the upsert: new rows and the post-update
# image of changed rows. Any other change type is filtered out before the merge.
CHANGE_TYPES = ("insert", "update_postimage")

# PHI columns that are written to Silver as NULL strings instead of their
# Bronze values.
PHI_COLS_MASK = ["DEATHDATE","SSN","DRIVERS","PASSPORT","FIRST","LAST","BIRTHPLACE"]

In [0]:
# 1. Bootstrap: create the Silver database and the Silver Patients Delta table
#    when they are missing. Both statements use IF NOT EXISTS, so re-running
#    this cell leaves an existing database/table in place.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

# DDL for the Silver Patients table: ID, derived BIRTH_YEAR, the PHI columns,
# the demographic columns, and two CHECK constraints (non-null ID, GENDER in M/F).
create_patients_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {SILVER_PATIENTS_TBL} (
        ID           STRING,
        BIRTH_YEAR   INT,
        DEATHDATE    STRING,
        FIRST        STRING,
        LAST         STRING,
        SSN          STRING,
        DRIVERS      STRING,
        PASSPORT     STRING,
        BIRTHPLACE   STRING,
        MARITAL      STRING,
        RACE         STRING,
        ETHNICITY    STRING,
        GENDER       STRING,
        CONSTRAINT id_nn CHECK (ID IS NOT NULL),
        CONSTRAINT gender_ck CHECK (GENDER IN ('M','F'))
        ) USING DELTA
        """

spark.sql(create_patients_table_ddl)

In [0]:
# 2. Merge the latest version of each patient record into the Silver Patients table.
#    `batch_df` is a static DF containing the latest new and updated rows from Bronze CDF.
#    Use the `_commit_version` column from CDF to identify the most recent change per ID.
def upsert_to_silver_patients(batch_df, _):
    """Merge the newest change per patient ID from one micro-batch into Silver.

    Designed to be passed to ``writeStream.foreachBatch``. The batch is reduced
    to one row per ``ID`` (the row with the highest ``_commit_version`` among
    the change types in ``CHANGE_TYPES``), ``BIRTH_YEAR`` is derived from
    ``BIRTHDATE``, every column in ``PHI_COLS_MASK`` is replaced by a NULL
    string, and the result is merged into ``SILVER_PATIENTS_TBL`` keyed on
    ``ID`` (update all columns on match, insert otherwise).

    Args:
        batch_df: Static DataFrame holding one micro-batch of Bronze change
            feed rows. The code reads ``_change_type``, ``_commit_version``,
            ``ID``, ``BIRTHDATE``, ``MARITAL``, ``RACE``, ``ETHNICITY`` and
            ``GENDER`` from it.
        _: Batch id supplied by ``foreachBatch``; not used.

    Returns:
        None. The only effect is the MERGE into the Silver table.
    """
    # Newest commit first within each patient ID.
    # NOTE(review): rows sharing both ID and _commit_version have no further
    # tiebreaker, so which of them gets rn = 1 is not defined here.
    newest_first = Window.partitionBy("ID").orderBy(F.col("_commit_version").desc())

    # PHI columns are emitted as typed NULLs so Bronze values never reach Silver.
    masked_phi_cols = [F.lit(None).cast("string").alias(c) for c in PHI_COLS_MASK]

    latest = (
        batch_df
        # Filter before ranking so excluded change types can never win rn = 1.
        .filter(F.col("_change_type").isin(*CHANGE_TYPES))
        .withColumn("rn", F.row_number().over(newest_first))
        .filter("rn = 1")
        .withColumn("BIRTH_YEAR", F.year("BIRTHDATE"))
        # Explicit projection also drops `rn`, BIRTHDATE and the CDF metadata
        # columns, which updateAll/insertAll would otherwise try to write.
        .select(
            "ID",
            "BIRTH_YEAR",
            *masked_phi_cols,
            "MARITAL", "RACE", "ETHNICITY", "GENDER",
        )
    )

    # Use the session attached to this micro-batch rather than the notebook
    # global `spark`: foreachBatch code should run against the session that
    # owns `batch_df`.
    batch_session = batch_df.sparkSession

    (DeltaTable.forName(batch_session, SILVER_PATIENTS_TBL)
               .alias("t")
               .merge(latest.alias("s"), "t.ID = s.ID")
               .whenMatchedUpdateAll()
               .whenNotMatchedInsertAll()
               .execute())


In [0]:
# 3. Incremental batch run
#    Source: the Bronze patients table read as a stream with the change feed on.
bronze_patient_changes = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .table(BRONZE_PATIENTS_TBL)
)

#    Sink: each micro-batch is handed to the upsert function; progress is
#    tracked in CHECKPOINT_PATH and the query uses the availableNow trigger.
silver_patients_query = (
    bronze_patient_changes.writeStream
    .foreachBatch(upsert_to_silver_patients)
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(availableNow=True)
    .start()
)

#    Block this cell until the query has terminated.
silver_patients_query.awaitTermination()

In [0]:
# 4. Report the Silver table's row count and where the stream checkpoint lives
silver_row_count = spark.table(SILVER_PATIENTS_TBL).count()
print(f"Silver patients row count: {silver_row_count}")
print(f"Checkpoint: {CHECKPOINT_PATH}")

# NOTE: A Delta Lake merge is not a join in the relational sense.
# It is a mutation command where the source table drives the operation.
#
# The CDF stream delivers every change committed since the offset stored in the checkpoint.
# A single batch may therefore contain multiple versions of the same ID from different commits.
# We deduplicate with row_number() ordered by `_commit_version` (descending) to keep only the latest version per patient ID.