In [0]:
# 02_silver_patients_transform.ipynb
# Promote Bronze patient data to Silver, masking PHI columns.

from delta.tables import DeltaTable
from pyspark.sql import Window
from pyspark.sql import functions as F

# Session tuning: the datasets used here are small test data, so a single
# shuffle partition keeps shuffle overhead low.
spark.conf.set("spark.sql.shuffle.partitions", "1")

# Source: Bronze patients Delta table, addressed by its storage path.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_patients"

# Target: Silver database and the fully qualified Silver patients table.
SILVER_DB = "kardia_silver"
SILVER_PATIENTS_TABLE = ".".join([SILVER_DB, "silver_patients"])

# Text file that stores the last Bronze commit version promoted to Silver.
BOOKMARK_FILE = "dbfs:/kardia/_state/bronze_to_silver_patients.txt"

In [0]:
%sql
-- One-time bootstrap: make sure the Silver database exists, then seed the
-- Silver patients table from Bronze. Both statements are no-ops when the
-- objects already exist (IF NOT EXISTS).
CREATE DATABASE IF NOT EXISTS kardia_silver;

CREATE TABLE IF NOT EXISTS kardia_silver.silver_patients AS
SELECT
      ID
    , year(BIRTHDATE)       AS BIRTH_YEAR   -- only the year is kept
    -- PHI columns are kept in the schema but always written as NULL strings
    , CAST(NULL AS STRING)  AS DEATHDATE
    , CAST(NULL AS STRING)  AS FIRST
    , CAST(NULL AS STRING)  AS LAST
    , CAST(NULL AS STRING)  AS MAIDEN
    , CAST(NULL AS STRING)  AS SSN
    , CAST(NULL AS STRING)  AS DRIVERS
    , CAST(NULL AS STRING)  AS PASSPORT
    , CAST(NULL AS STRING)  AS BIRTHPLACE
    -- demographic attributes pass through unchanged
    , MARITAL
    , RACE
    , ETHNICITY
    , GENDER
FROM kardia_bronze.bronze_patients;


In [0]:
# Incremental Bronze -> Silver promotion for patients.
#
#   1. Read the bookmark: the last Bronze commit version already promoted
#      (-1 when no bookmark file exists yet).
#   2. Look up the newest Bronze commit version.
#   3. If Bronze has newer commits, read their Change Data Feed rows, keep the
#      most recent image per patient ID, derive BIRTH_YEAR, null out the PHI
#      columns and upsert (SCD-1) the result into the Silver table.
#   4. Persist the new bookmark and print a row count.
#
# Raises RuntimeError when the bookmark exists but cannot be read or parsed.

# Read the last processed Bronze version (defaults to -1 on first run)
try:
    last_processed = int(dbutils.fs.head(BOOKMARK_FILE, 1024))
except FileNotFoundError:
    last_processed = -1
except Exception as e:
    # NOTE(review): on Databricks a missing file is typically reported as a
    # wrapped JVM java.io.FileNotFoundException rather than Python's
    # FileNotFoundError, so the message is inspected as well. Without this the
    # very first run would fail instead of starting from -1.
    if "FileNotFoundException" in str(e):
        last_processed = -1
    else:
        # Chain the original exception so its traceback is not lost.
        raise RuntimeError(f"Failure reading bookmark: {e}") from e

# DESCRIBE HISTORY lists the newest commit first, so LIMIT 1 is the latest version.
latest_bronze = (
    spark.sql(f"DESCRIBE HISTORY delta.`{BRONZE_PATH}` LIMIT 1").first().version
)

if last_processed < latest_bronze:

    # Rank each patient's change rows newest-first so that exactly the latest
    # image survives. dropDuplicates(["ID"]) would keep an arbitrary row when
    # the same patient changed in more than one commit.
    newest_first = Window.partitionBy("ID").orderBy(
        F.col("_commit_version").desc(),
        F.col("_commit_timestamp").desc(),
    )

    # Pull only the changed rows from Change Data Feed. The read is bounded by
    # endingVersion so it covers exactly the commits the bookmark will claim,
    # even if Bronze receives another commit while this cell is running.
    # Only inserts and post-update images are kept; delete and pre-update rows
    # are filtered out, so Bronze deletes are not propagated to Silver.
    bronze_updates = (
        spark.read
            .format("delta")
            .option("readChangeFeed", "true")
            .option("startingVersion", last_processed + 1)
            .option("endingVersion", latest_bronze)
            .load(BRONZE_PATH)
            .filter(F.col("_change_type").isin("insert", "update_postimage"))
            .withColumn("_row_rank", F.row_number().over(newest_first))
            .filter(F.col("_row_rank") == 1)
    )

    # ──────────────────────────────────────────────────────────
    # Build the Silver rows: derive BIRTH_YEAR + mask PHI
    # ──────────────────────────────────────────────────────────
    PHI_COLUMNS = [
        "DEATHDATE", "FIRST", "LAST", "MAIDEN", "SSN",
        "DRIVERS", "PASSPORT", "BIRTHPLACE"
    ]

    # A single projection: every PHI column is replaced by a NULL string and
    # the CDF metadata / ranking helper columns are left behind.
    silver_rows = bronze_updates.select(
        "ID",
        F.year("BIRTHDATE").alias("BIRTH_YEAR"),
        *[F.lit(None).cast("string").alias(c) for c in PHI_COLUMNS],
        "MARITAL", "RACE", "ETHNICITY", "GENDER",
    )

    # Enable auto-merge so a future nullable column won't break the demo
    spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

    # Run the SCD-1 merge: overwrite matching IDs, insert new ones.
    (
        DeltaTable.forName(spark, SILVER_PATIENTS_TABLE)
            .alias("t")
            .merge(silver_rows.alias("s"), "t.ID = s.ID")
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
    )

    print("Silver patients table updated.")
else:
    print("No new commits – Silver patients already current.")

# Update bookmark. This line is only reached when the merge above succeeded
# (or was not needed), so a failed merge is retried on the next run.
dbutils.fs.put(BOOKMARK_FILE, str(latest_bronze), overwrite=True)
print(f"Saved latest processed version: {latest_bronze}")

# Simple sanity check
print(f"Row count: {spark.table(SILVER_PATIENTS_TABLE).count()}")

In [0]:
# Invalidate any cached metadata for the Silver table, then report its size.
spark.sql(f"REFRESH TABLE {SILVER_PATIENTS_TABLE}")
silver_row_count = spark.table(SILVER_PATIENTS_TABLE).count()
print(f"Row count: {silver_row_count}")

# Show the five most recent commits on the Silver table (newest first).
history_query = f"""
    SELECT version, timestamp, operation
    FROM   (DESCRIBE HISTORY {SILVER_PATIENTS_TABLE})
    ORDER  BY version DESC
    LIMIT  5
    """
recent_commits = spark.sql(history_query)
recent_commits.show(truncate=False)