In [0]:
# 02_silver_patients_transform.ipynb
# Promote Bronze patient data to Silver, masking PHI columns.

from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Silver target: database name and fully qualified table name
SILVER_DB = "kardia_silver"
SILVER_PATIENTS_TABLE = f"{SILVER_DB}.silver_patients"

# Bronze source path, and the file that stores the last Bronze version processed
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_patients"
BOOKMARK_FILE = "dbfs:/kardia/_state/bronze_to_silver_patients.txt"

In [0]:
%sql
-- Ensure the Silver database exists before creating tables in it.
CREATE DATABASE IF NOT EXISTS kardia_silver;

-- Define the schema for silver_patients and apply the transformations in a
-- single CTAS statement, materializing the first version of the table from
-- Bronze with the required business logic built in.
-- IF NOT EXISTS means the statement does nothing once the table exists.
-- CTAS keeps schema, data, and privacy intent together in one readable statement.
CREATE TABLE IF NOT EXISTS kardia_silver.silver_patients AS
SELECT
    ID,
    -- Keep only the year of birth instead of the full birth date.
    year(BIRTHDATE) AS BIRTH_YEAR,
    -- Mask PHI columns: each one is replaced by a NULL typed as STRING, so the
    -- column stays in the schema but carries no value.
    CAST(NULL AS STRING) AS DEATHDATE,
    CAST(NULL AS STRING) AS FIRST,
    CAST(NULL AS STRING) AS LAST,
    CAST(NULL AS STRING) AS MAIDEN,
    CAST(NULL AS STRING) AS SSN,
    CAST(NULL AS STRING) AS DRIVERS,
    CAST(NULL AS STRING) AS PASSPORT,
    CAST(NULL AS STRING) AS BIRTHPLACE,
    -- Remaining attributes are selected from Bronze unchanged.
    MARITAL,
    RACE,
    ETHNICITY,
    GENDER
FROM kardia_bronze.bronze_patients;


In [0]:
# GOAL: Incrementally merge new and updated Bronze patient records into Silver
#       using the Delta Change Data Feed (CDF) and a version bookmark.
#       Mask PHI, derive BIRTH_YEAR, and merge only the latest change per ID (SCD Type 1).

# STEP 1: Determine whether Bronze has commits that Silver has not processed yet.
#         The bookmark file holds the last Bronze version that was merged into Silver.
try:
    last_processed = int(dbutils.fs.head(BOOKMARK_FILE, 1024))
except Exception as exc:
    # Best effort: a missing or unreadable bookmark means "start from version 0".
    # Report the reason so an unexpected failure is not mistaken for a first run.
    last_processed = -1
    print(f"Bookmark not usable ({type(exc).__name__}); reprocessing from version 0.")

# Latest version of the Bronze table. This value is used both as the upper bound
# of the CDF read below and as the value written to the bookmark, so the two
# always agree.
latest_bronze = (
    spark.sql(f"DESCRIBE HISTORY delta.`{BRONZE_PATH}` LIMIT 1").first().version
)

# STEP 2: If there are unprocessed commits, read exactly those from the CDF.
if last_processed < latest_bronze:
    bronze_updates = (
        spark.read
            .format("delta")
            .option("readChangeFeed", "true")
            .option("startingVersion", last_processed + 1)
            # Pin the upper bound. Without it the read extends to whatever the
            # newest version is at read time, which may be later than
            # latest_bronze if a commit lands after DESCRIBE HISTORY ran; the
            # bookmark would then not describe what was actually merged.
            .option("endingVersion", latest_bronze)
            .load(BRONZE_PATH)
            # Keep only rows that carry the new state of a record.
            .filter(F.col("_change_type").isin("insert", "update_postimage"))
    )
    # NOTE(review): "delete" change rows are dropped by the filter above, so a
    # patient deleted in Bronze is not removed from Silver. Confirm this is the
    # intended behavior for this pipeline.

    # The CDF may emit several rows for the same ID across versions.
    # Rank them per ID by commit version (newest first) and keep only rank 1.
    latest_commit_per_id = Window.partitionBy("ID").orderBy(F.col("_commit_version").desc())

    bronze_updates_latest = (
        bronze_updates
            .withColumn("rn", F.row_number().over(latest_commit_per_id))
            .filter(F.col("rn") == 1)
            .drop("rn")
    )

    PHI_COLUMNS = [
        "DEATHDATE", "FIRST", "LAST", "MAIDEN", "SSN",
        "DRIVERS", "PASSPORT", "BIRTHPLACE"
    ]

    # Build the Silver rows in a single projection: derive BIRTH_YEAR and
    # replace every PHI column with a NULL typed as string. Column order matches
    # the Silver table definition.
    silver_rows = bronze_updates_latest.select(
        "ID",
        F.year("BIRTHDATE").alias("BIRTH_YEAR"),
        *[F.lit(None).cast("string").alias(c) for c in PHI_COLUMNS],
        "MARITAL", "RACE", "ETHNICITY", "GENDER",
    )

    # Enable auto-merge so a future nullable column won't break the pipeline
    spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

    # STEP 3: Upsert the latest patient rows (silver_rows) into the existing
    #         Silver table SILVER_PATIENTS_TABLE using SCD-1 logic.
    # - If the record ID exists, overwrite it with the latest values.
    # - If the record does not exist, insert it as a new row.
    (
        DeltaTable.forName(spark, SILVER_PATIENTS_TABLE)
                .alias("t")
                .merge(silver_rows.alias("s"), "t.ID = s.ID")
                .whenMatchedUpdateAll()
                .whenNotMatchedInsertAll()
                .execute()
    )

    print("Silver patients table updated.")
else:
    print("No new commits. Silver patients is already current.")

# STEP 4: Write latest_bronze back to the bookmark file so the next run only
#         pulls changes committed after this version. This line is reached only
#         if the merge above did not raise.
dbutils.fs.put(BOOKMARK_FILE, str(latest_bronze), overwrite=True)
print(f"Saved latest processed version: {latest_bronze}")

# Sanity check
print(f"Row count: {spark.table(SILVER_PATIENTS_TABLE).count()}")

# NOTE

# A Delta Lake merge is not a join in the relational sense.
# It is a mutation command where the source DataFrame drives the operation.

# The CDF read covers every commit from `startingVersion` through `endingVersion`
# (both inclusive). So even if multiple commits occurred since the last run, all
# changes up to the bookmarked version are included in the read.

In [0]:
# Refresh the table first, then report its size and most recent commits.
# NOTE: Without a REFRESH, you might see incorrect results when querying the table.
spark.sql(f"REFRESH TABLE {SILVER_PATIENTS_TABLE}")

silver_row_count = spark.table(SILVER_PATIENTS_TABLE).count()
print(f"Row count: {silver_row_count}")

# Five most recent entries of the Silver table's Delta history, newest first
recent_history_query = f"""
    SELECT version, timestamp, operation
    FROM (DESCRIBE HISTORY {SILVER_PATIENTS_TABLE})
    ORDER BY version DESC
    LIMIT 5
    """
recent_history = spark.sql(recent_history_query)
recent_history.show(truncate=False)