In [0]:
# 02_silver_patients_transform.ipynb
# Promote Bronze patient data to Silver, masking PHI columns.

from delta.tables import DeltaTable
from pyspark.sql import SparkSession, Window, functions as F

# Storage locations: the Bronze Delta source and the state file that
# remembers the last Bronze version already promoted.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_patients"
STATE_PATH = "dbfs:/kardia/_state/bronze_to_silver_patients.txt"

# Silver target: database name and fully qualified table name.
SILVER_DB = "kardia_silver"
SILVER_TBL = f"{SILVER_DB}.silver_patients"

# Use a single shuffle partition to minimize shuffle overhead; this is tuned
# for small test datasets.
# NOTE(review): the value is set on the active Spark session, so it affects
# every later cell — revisit before running against larger data.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
%sql
-- Bootstrap the Silver layer: create the database and, only if it is missing,
-- seed silver_patients from the current Bronze snapshot.
-- The format is stated explicitly (USING DELTA) because later cells call
-- DeltaTable.forName and DESCRIBE HISTORY on this table, which require Delta,
-- rather than relying on whatever default table format the session uses.
CREATE DATABASE IF NOT EXISTS kardia_silver;

CREATE TABLE IF NOT EXISTS kardia_silver.silver_patients
USING DELTA
AS
SELECT
    ID,
    -- keep only the year of birth, not the full date
    year(BIRTHDATE) AS BIRTH_YEAR,
    -- mask PHI columns: they stay in the schema but are always NULL strings
    CAST(NULL AS STRING) AS DEATHDATE,
    CAST(NULL AS STRING) AS FIRST,
    CAST(NULL AS STRING) AS LAST,
    CAST(NULL AS STRING) AS MAIDEN,
    CAST(NULL AS STRING) AS SSN,
    CAST(NULL AS STRING) AS DRIVERS,
    CAST(NULL AS STRING) AS PASSPORT,
    CAST(NULL AS STRING) AS BIRTHPLACE,
    -- demographic attributes are passed through unchanged
    MARITAL,
    RACE,
    ETHNICITY,
    GENDER
FROM delta.`dbfs:/kardia/bronze/bronze_patients`;


In [0]:
# Incremental Bronze -> Silver promotion driven by the Delta change data feed.
# The state file holds the last Bronze version already merged; only commits
# after it, up to the version captured below, are read and upserted by ID.

# Read the last processed Bronze version (defaults to -1 on first run)
try:
    last_processed = int(dbutils.fs.head(STATE_PATH, 1024))
except Exception as state_err:
    # Deliberately best-effort: a missing or unreadable state file means the
    # change feed is replayed from version 0, which the keyed upsert tolerates.
    print(f"No usable state file ({type(state_err).__name__}) - replaying from version 0")
    last_processed = -1

# Current highest Bronze version. It is also used as the upper bound of the
# change-feed read so the version later written to the state file matches
# exactly what was merged, even if Bronze receives a commit mid-run.
latest_bronze = spark.sql(
    f"SELECT MAX(version) AS v FROM (DESCRIBE HISTORY delta.`{BRONZE_PATH}`)"
).first().v

if last_processed < latest_bronze:
    # Order each patient's changes newest commit first so one row per ID survives.
    newest_first = Window.partitionBy("ID").orderBy(F.col("_commit_version").desc())

    # Load change data from Bronze (new inserts and updates only)
    bronze_updates = (
        spark.read.format("delta")
             .option("readChangeFeed", "true")
             .option("startingVersion", last_processed + 1)
             .option("endingVersion", latest_bronze)
             .load(BRONZE_PATH)
             .filter(F.col("_change_type").isin("insert", "update_postimage"))
             # dropDuplicates(["ID"]) would keep an arbitrary change per ID, so an
             # older value could overwrite a newer one; keep the latest commit instead.
             .withColumn("_recency_rank", F.row_number().over(newest_first))
             .filter(F.col("_recency_rank") == 1)
             .drop("_recency_rank")
    )

    # PHI columns stay in the Silver schema but are always written as NULL strings.
    masked_phi_cols = [
        "DEATHDATE", "FIRST", "LAST", "MAIDEN",
        "SSN", "DRIVERS", "PASSPORT", "BIRTHPLACE",
    ]

    # Transform: extract birth year and mask PHI. The select also drops the
    # change-feed metadata columns and fixes the column order for the merge.
    silver_rows = bronze_updates.select(
        "ID",
        F.year("BIRTHDATE").alias("BIRTH_YEAR"),
        *[F.lit(None).cast("string").alias(phi_col) for phi_col in masked_phi_cols],
        "MARITAL", "RACE", "ETHNICITY", "GENDER",
    )

    # Merge into Silver (SCD-1): overwrite rows with a matching ID, insert the rest.
    (
        DeltaTable.forName(spark, SILVER_TBL)
            .alias("t")
            .merge(silver_rows.alias("s"), "t.ID = s.ID")
            .whenMatchedUpdateAll()
            .whenNotMatchedInsertAll()
            .execute()
    )

    print("Silver table updated")
else:
    print("No new commits – Silver already current")

In [0]:
# Record the Bronze version this run caught up to, replacing any previous
# value, so the next run can start from the version after it.
dbutils.fs.put(
    STATE_PATH,
    str(latest_bronze),
    overwrite=True,
)
print(f"Saved latest processed version: {latest_bronze}")

In [0]:
# Refresh the Silver table in this session, then report its current row count.
spark.sql(f"REFRESH TABLE {SILVER_TBL}")
print(f"Row count: {spark.table(SILVER_TBL).count()}")

# Show the five most recent commits on the Silver table (newest first).
recent_commits = spark.sql(f"""
SELECT version, timestamp, operation
FROM   (DESCRIBE HISTORY {SILVER_TBL})
ORDER  BY version DESC
LIMIT  5
""")
recent_commits.show(truncate=False)