In [0]:
# 02_silver_patients_transform.py
# SOURCE: Ingest patient records from Bronze and promote them to Silver.
# OUTPUT: `kardia_silver.silver_patients`, updated incrementally.
# PATTERN: Read Change Data Feed from Bronze, mask PHI columns, derive `BIRTH_YEAR` from `BIRTHDATE`.
# TRIGGER: Incremental batch job (since patient records arrive infrequently).

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# --- Catalog objects -------------------------------------------------------
# Database that holds the Silver tables created by this notebook.
SILVER_DB = "kardia_silver"

# Fully qualified source (Bronze) and target (Silver) table names.
BRONZE_TABLE = "kardia_bronze.bronze_patients"
SILVER_TABLE = "kardia_silver.silver_patients"

# --- Streaming state -------------------------------------------------------
# Checkpoint directory passed to the stream writer.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_patients"

# Change Data Feed row types that are carried forward into Silver.
CHANGE_TYPES = ["insert", "update_postimage"]

# --- Masking ---------------------------------------------------------------
# Columns written to Silver as NULL strings instead of their Bronze values.
PHI_COLUMNS = [
    "DEATHDATE",
    "FIRST", "LAST", "MAIDEN",
    "SSN", "DRIVERS", "PASSPORT",
    "BIRTHPLACE",
]

In [0]:
# 1. Ensure the Silver DB and Silver Patients table exist before writing.
# Create the database first so the qualified table name below resolves.
create_db_ddl = f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}"
spark.sql(create_db_ddl)

# Target schema: ID, the derived BIRTH_YEAR, the masked PHI columns, and the
# four demographic columns. IF NOT EXISTS keeps re-runs from failing.
create_table_ddl = f"""
    CREATE TABLE IF NOT EXISTS {SILVER_TABLE} (
      ID           STRING,
      BIRTH_YEAR   INT,
      DEATHDATE    STRING,
      FIRST        STRING,
      LAST         STRING,
      MAIDEN       STRING,
      SSN          STRING,
      DRIVERS      STRING,
      PASSPORT     STRING,
      BIRTHPLACE   STRING,
      MARITAL      STRING,
      RACE         STRING,
      ETHNICITY    STRING,
      GENDER       STRING
    ) USING DELTA
    """
spark.sql(create_table_ddl)

In [0]:
# 2. Merge the latest version of each patient record into the Silver Patients table.
#    `batch_df` is a static DF containing the latest new and updated rows from Bronze CDF.
#    Use the `_commit_version` column from CDF to identify the most recent change per ID.
def upsert(batch_df, _):
    """Merge one micro-batch of Bronze change-feed rows into the Silver table.

    Args:
        batch_df: Static DataFrame for the current micro-batch. The columns
            read here are `_change_type`, `_commit_version`, `ID`,
            `BIRTHDATE`, `MARITAL`, `RACE`, `ETHNICITY` and `GENDER`.
        _: Second positional argument supplied by `foreachBatch`; unused.

    Only rows whose `_change_type` is listed in `CHANGE_TYPES` are kept, and
    for each `ID` only the row with the highest `_commit_version` survives.
    `BIRTH_YEAR` is derived from `BIRTHDATE`, and every column named in
    `PHI_COLUMNS` is emitted as a NULL string before the merge.
    """
    # Newest commit first within each patient, so row number 1 is the latest.
    # NOTE(review): rows of one ID that share a _commit_version are ordered
    # arbitrarily; confirm Bronze cannot produce such ties.
    newest_first = Window.partitionBy("ID").orderBy(F.col("_commit_version").desc())

    relevant_changes = batch_df.filter(F.col("_change_type").isin(*CHANGE_TYPES))
    newest_per_patient = (
        relevant_changes
        .withColumn("rn", F.row_number().over(newest_first))
        .filter("rn = 1")
    )

    # One typed-NULL column per PHI field, in PHI_COLUMNS order.
    masked_phi = [F.lit(None).cast("string").alias(name) for name in PHI_COLUMNS]

    silver_rows = (
        newest_per_patient
        .withColumn("BIRTH_YEAR", F.year("BIRTHDATE"))
        .select(
            "ID",
            "BIRTH_YEAR",
            *masked_phi,
            "MARITAL", "RACE", "ETHNICITY", "GENDER",
        )
    )

    # Upsert keyed on ID: matched rows are overwritten, new IDs are inserted.
    silver_table = DeltaTable.forName(spark, SILVER_TABLE)
    merge_builder = (
        silver_table.alias("t")
        .merge(silver_rows.alias("s"), "t.ID = s.ID")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
    )
    merge_builder.execute()


In [0]:
# 3. Incremental batch run
# Source: the Bronze table read as a stream of Change Data Feed rows.
bronze_changes = (
    spark.readStream
    .format("delta")
    .option("readChangeFeed", "true")
    .table(BRONZE_TABLE)
)

# Sink: hand every micro-batch to `upsert`; progress is tracked in the
# checkpoint directory so a later run resumes where this one stopped.
silver_writer = (
    bronze_changes.writeStream
    .foreachBatch(upsert)
    .option("checkpointLocation", CHECKPOINT_PATH)
    .trigger(availableNow=True)
)

# Start the query and block this cell until it terminates.
silver_query = silver_writer.start()
silver_query.awaitTermination()

In [0]:
# 4. Print final row count and checkpoint location
# Count the Silver table once, then report it alongside the checkpoint path.
silver_row_count = spark.table(SILVER_TABLE).count()
print(f"Silver patients row count: {silver_row_count}")
print(f"Checkpoint: {CHECKPOINT_PATH}")


# NOTE: A Delta Lake merge is not a join in the relational sense.
# It is a mutation command where the source table drives the operation.
#
# Delta CDF emits all changes since the last checkpoint.
# This may include multiple versions of the same ID across different commits.
# We deduplicate using row_number() to keep only the latest version per patient ID.