In [0]:
# 02_silver_patients_transform.py
# SOURCE: Patient records containing PHI, ingested from Bronze and promoted to Silver.
# OUTPUT: `kardia_silver.silver_patients`, updated incrementally.
# TRANSFORM: Read Change Data Feed from Bronze, mask PHI columns, derive `BIRTH_YEAR` from `BIRTHDATE`.
# TRIGGER: Incremental batch job (since patient records arrive infrequently).

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# Catalog locations: source table, target table, and the stream checkpoint.
SILVER_DB = "kardia_silver"
SILVER_TABLE = f"{SILVER_DB}.silver_patients"
BRONZE_TABLE = "kardia_bronze.bronze_patients"
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_patients"

# Change Data Feed row types that are read from Bronze.
CHANGE_TYPES = ["insert", "update_postimage"]

# Columns treated as PHI; their values are masked before reaching Silver.
PHI_COLUMNS = [
    "DEATHDATE", "FIRST", "LAST", "MAIDEN",
    "SSN", "DRIVERS", "PASSPORT", "BIRTHPLACE",
]

In [0]:
# 1. Bootstrap the target: create the Silver database and the Silver Patients
#    Delta table when they are missing, so later writes always have a target.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}")

# The schema has BIRTH_YEAR (INT) and no BIRTHDATE column; every other column is STRING.
_SILVER_PATIENTS_DDL = f"""
    CREATE TABLE IF NOT EXISTS {SILVER_TABLE} (
      ID           STRING,
      BIRTH_YEAR   INT,
      DEATHDATE    STRING,
      FIRST        STRING,
      LAST         STRING,
      MAIDEN       STRING,
      SSN          STRING,
      DRIVERS      STRING,
      PASSPORT     STRING,
      BIRTHPLACE   STRING,
      MARITAL      STRING,
      RACE         STRING,
      ETHNICITY    STRING,
      GENDER       STRING
    ) USING DELTA
    """
spark.sql(_SILVER_PATIENTS_DDL)

In [0]:
# 2. Define helper function and upsert logic
#    Deduplication uses row_number() ordered by _commit_version, a CDF metadata column that
#    is not part of the final schema. Projection to the Silver schema is therefore deferred
#    until inside foreachBatch, and the transform runs only after duplicates are removed.
#    This is why we don't apply transformations inline (as we do in silver_encounters_transform).

# Mask PHI columns and derive BIRTH_YEAR.
def _clean_patient_data(patients_df):
    """Project patient rows onto the Silver column set with PHI masked.

    Args:
        patients_df: DataFrame that provides `ID`, `BIRTHDATE`, `MARITAL`,
            `RACE`, `ETHNICITY` and `GENDER`. Any other columns (including
            existing PHI values) are discarded by the projection.

    Returns:
        DataFrame with columns `ID`, `BIRTH_YEAR`, every name in `PHI_COLUMNS`
        (each a NULL of type string), `MARITAL`, `RACE`, `ETHNICITY`, `GENDER`,
        in that order.
    """
    # Build all masked columns up front and emit them in one select().
    # Calling withColumn() once per column in a loop stacks a projection per
    # call, which the PySpark docs advise against.
    masked_phi = [F.lit(None).cast("string").alias(name) for name in PHI_COLUMNS]

    return patients_df.select(
        "ID",
        F.year("BIRTHDATE").alias("BIRTH_YEAR"),
        *masked_phi,
        "MARITAL",
        "RACE",
        "ETHNICITY",
        "GENDER",
    )

# Merge the latest version of each patient record into the Silver Patients table.
# `batch_df` is a static DF containing the latest new and updated rows from Bronze CDF.
# Use the `_commit_version` column from CDF to identify the most recent change per ID.
def upsert_to_silver(batch_df, _):
    """foreachBatch handler: merge the newest row per patient ID into Silver.

    Args:
        batch_df: Micro-batch DataFrame of Bronze CDF rows. It must contain
            `ID` and `_commit_version` plus the columns read by
            `_clean_patient_data`.
        _: Batch id supplied by foreachBatch; unused.
    """
    # Keep one row per ID: the one with the highest commit version.
    # NOTE(review): rows that share both ID and _commit_version are tie-broken
    # arbitrarily by row_number(); confirm Bronze cannot emit such duplicates.
    newest_first = Window.partitionBy("ID").orderBy(F.col("_commit_version").desc())
    latest_patient_rows = (
        batch_df.withColumn("row_num", F.row_number().over(newest_first))
                .filter("row_num = 1")
                .drop("row_num")
    )

    silver_ready_df = _clean_patient_data(latest_patient_rows)

    # Use the session attached to the micro-batch instead of the notebook-global
    # `spark`, so the handler does not depend on a global being visible in the
    # context where foreachBatch invokes it.
    session = batch_df.sparkSession

    (DeltaTable.forName(session, SILVER_TABLE)
               .alias("target")
               .merge(silver_ready_df.alias("source"), "target.ID = source.ID")
               .whenMatchedUpdateAll()
               .whenNotMatchedInsertAll()
               .execute())

In [0]:
# 3. Build source stream.

# 3a. Read new inserts and updates from the Bronze Patients table.
#     Only the change types listed in CHANGE_TYPES are kept; all other CDF
#     rows (e.g. update pre-images) are filtered out.
bronze_cdf = (
    spark.readStream
         .format("delta")
         .option("readChangeData", "true")
         .table(BRONZE_TABLE)
         .filter(F.col("_change_type").isin(*CHANGE_TYPES))
)

# 3b. Write incremental batch output.
#     Ideal for infrequent updates like patient data: availableNow processes
#     the data that is pending and then stops the query.
#     The checkpoint records progress so a rerun resumes where the last run
#     ended. foreachBatch may replay a batch after a failure; the merge keyed
#     on ID in upsert_to_silver makes such a replay harmless.
query = (bronze_cdf.writeStream
                   .foreachBatch(upsert_to_silver)
                   .option("checkpointLocation", CHECKPOINT_PATH)
                   .trigger(availableNow=True)
                   .start())

# start() returns immediately. Block until the availableNow run has finished
# so later cells see the fully merged Silver table and a scheduled job does
# not end while the stream is still processing.
query.awaitTermination()

In [0]:
# 4. Report the Silver row count and where the stream checkpoint lives.
silver_row_count = spark.table(SILVER_TABLE).count()
print(f"Silver patients row count: {silver_row_count}")
print(f"Checkpoint: {CHECKPOINT_PATH}")

# NOTE: A Delta Lake merge is not a join in the relational sense.
# It is a mutation command where the source table drives the operation.
#
# Delta CDF emits all changes since the last checkpoint.
# This may include multiple versions of the same ID across different commits.
# We deduplicate using row_number() to keep only the latest version per patient ID.