In [0]:
# 02_silver_transform_patients_cdf_mask_pii.ipynb
# -------------------------------------------------------
# Ingests bronze_patients -> silver_patients
# First run: take full snapshot -> initial Silver write
# Later runs: read only new CDF rows and MERGE
# SCD-1 (last-write-wins), masks PHI columns
# -------------------------------------------------------

import json
import sys

from delta.tables import DeltaTable
from pyspark.sql import SparkSession, Window, functions as F

# Path config
# Delta table locations on DBFS, plus the small text file in which this
# notebook records the Bronze commit version it has processed through.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_patients"  # source table (snapshot + change feed reads)
SILVER_PATH = "dbfs:/kardia/silver/silver_patients"  # target table (bootstrap write + MERGE)
STATE_PATH  = "dbfs:/kardia/_state/bronze_patients_lastver.txt"  # holds str(latest Bronze version)

In [0]:
def read_state(path, default_val):
    """Return the integer stored in the state file at ``path``.

    The state file holds a Bronze commit version. When the file is missing,
    unreadable, or does not contain an integer, ``default_val`` is returned.

    The fallback deliberately does NOT consult the Silver table's history:
    Silver commit numbers count Silver writes (merges, overwrites, maintenance)
    and are unrelated to Bronze commit numbers, so using one as the other can
    silently skip Bronze changes.

    Args:
        path: DBFS path of the state file.
        default_val: Value to return when no usable state exists.

    Returns:
        The stored version as ``int``, or ``default_val``.
    """
    try:
        # head() itself fails on a missing file, so no separate existence probe is needed.
        raw_state = dbutils.fs.head(path, 256)
    except Exception:
        # dbutils surfaces missing files as generic wrapped JVM errors.
        return default_val

    try:
        return int(raw_state)
    except (TypeError, ValueError):
        # Corrupt/empty state: fall back so the caller reprocesses rather than crashes.
        return default_val

In [0]:
# Session settings applied on top of the builder, in this order.
session_conf = {
    # dev-friendly; tune in prod
    "spark.sql.shuffle.partitions": "1",
    # Delta Lake SQL extension and catalog implementation
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("silver_patients_incremental")
for conf_key, conf_value in session_conf.items():
    builder = builder.config(conf_key, conf_value)

spark = builder.getOrCreate()

# Keep notebook output quiet: only ERROR-level Spark logs.
spark.sparkContext.setLogLevel("ERROR")

In [0]:
# ── 1. Bronze metadata & CDF status ─────────────────────
def _bronze_history():
    """Return (history DataFrame, highest commit version) for the Bronze table."""
    history = spark.sql(f"DESCRIBE HISTORY delta.`{BRONZE_PATH}`")
    newest = history.selectExpr("max(version) AS v").first().v
    return history, newest


hist_df, latest_ver = _bronze_history()

# 1A. Ensure CDF is ON (enable if missing)
detail_row = spark.sql(f"DESCRIBE DETAIL delta.`{BRONZE_PATH}`").first()

# Lower-case both keys and values so the lookup below is case-insensitive.
prop_map = {}
for prop_key, prop_val in detail_row["properties"].items():
    prop_map[prop_key.lower()] = prop_val.lower()

cdf_on_tbl = prop_map.get("delta.enablechangedatafeed", "false") == "true"

if not cdf_on_tbl:
    print("⚙ Enabling Change Data Feed on Bronze …")
    spark.sql(f"""
      ALTER TABLE delta.`{BRONZE_PATH}`
      SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """)
    # The ALTER TABLE above is itself a commit, so re-read history and latest version.
    hist_df, latest_ver = _bronze_history()
# 1B. Locate the commit that turned CDF on (robust JSON parse)
def find_cdf_meta_version(hist_df):
    """Return the version of the oldest commit that turned Change Data Feed on.

    Scans the ``SET TBLPROPERTIES`` commits of a ``DESCRIBE HISTORY`` result,
    oldest first, and returns the version of the first one whose ``properties``
    payload sets ``delta.enableChangeDataFeed`` to true.

    The property name and value are compared case-insensitively, and a JSON
    boolean ``true`` is accepted as well as the string ``"true"``. Payloads
    that are not valid JSON, or not a JSON object, are skipped.

    Args:
        hist_df: DataFrame with at least ``operation``, ``version`` and
            ``operationParameters`` columns.

    Returns:
        The matching commit version, or ``0`` when no such commit is found
        (assumes CDF was enabled at CREATE TABLE, i.e. v0).
    """
    set_prop_commits = (
        hist_df
        .filter("operation = 'SET TBLPROPERTIES'")
        .select("version", "operationParameters")
        .orderBy("version")            # oldest first
        .collect()
    )

    for commit in set_prop_commits:
        props_json = (commit.operationParameters or {}).get("properties")
        if not props_json:
            continue

        try:
            props = json.loads(props_json)
        except (TypeError, ValueError):   # json.JSONDecodeError is a ValueError
            continue
        if not isinstance(props, dict):
            continue

        for prop_key, prop_val in props.items():
            # str() guards against non-string JSON values such as a bare `true`.
            if (str(prop_key).lower() == "delta.enablechangedatafeed"
                    and str(prop_val).lower() == "true"):
                return commit.version

    return 0   # fall back: assume enabled at CREATE TABLE (v0)

first_cdf_meta_ver = find_cdf_meta_version(hist_df)
# The change feed is only read from the commit after the one that enabled CDF.
first_cdf_data_ver = first_cdf_meta_ver + 1

# If CDF was only just enabled and no new WRITE has happened yet, there is no
# change feed to read. Exit early ONLY when Silver already exists: the very
# first run must still fall through to the bootstrap step, which reads a plain
# Bronze snapshot and does not depend on CDF. Exiting unconditionally here
# would leave Silver uncreated until some later Bronze commit arrived.
if latest_ver <= first_cdf_meta_ver and DeltaTable.isDeltaTable(spark, SILVER_PATH):
    print("No data commits since CDF was enabled; skipping incremental step.")
    dbutils.fs.put(STATE_PATH, str(latest_ver), overwrite=True)
    dbutils.notebook.exit("cdf_enabled_waiting_for_next_commit")

In [0]:
# 2. Decide run mode: bootstrap vs incremental ────────────────────────
silver_exists = DeltaTable.isDeltaTable(spark, SILVER_PATH)

try:
    last_processed_ver = int(dbutils.fs.head(STATE_PATH, 256))
except Exception:
    # No readable state file: position just before the CDF-enabling commit so
    # the incremental step starts from the first version that has change data.
    last_processed_ver = first_cdf_meta_ver - 1

if not silver_exists:
    print("First run – bootstrapping Silver from full Bronze snapshot")

    # Read Bronze pinned to `latest_ver` so the snapshot written to Silver is
    # exactly the version recorded in the state file below, even if Bronze
    # receives new commits while this cell runs.
    base_df = (
        spark.read.format("delta")
            .option("versionAsOf", latest_ver)
            .load(BRONZE_PATH)
            # Keeps ONE row per ID. dropDuplicates does not define which one,
            # so this is not a true last-write-wins.
            # NOTE(review): if Bronze can hold several rows per ID, pick the
            # newest explicitly using an ingest-order column.
            .dropDuplicates(["ID"])
            .select(
                "ID",
                # Only the birth year is kept from BIRTHDATE.
                F.year("BIRTHDATE").cast("int").alias("BIRTH_YEAR"),
                # Masked columns: present in the schema but always NULL.
                F.lit(None).cast("string").alias("DEATHDATE"),
                F.lit(None).cast("string").alias("FIRST"),
                F.lit(None).cast("string").alias("LAST"),
                F.lit(None).cast("string").alias("MAIDEN"),
                F.lit(None).cast("string").alias("SSN"),
                F.lit(None).cast("string").alias("DRIVERS"),
                F.lit(None).cast("string").alias("PASSPORT"),
                F.lit(None).cast("string").alias("BIRTHPLACE"),
                "MARITAL", "RACE", "ETHNICITY", "GENDER"
            )
    )
    base_df.write.format("delta").mode("overwrite").save(SILVER_PATH)
    dbutils.fs.put(STATE_PATH, str(latest_ver), overwrite=True)
    dbutils.notebook.exit(f"bootstrap_complete_v{latest_ver}")

In [0]:
# 3. Incremental run via CDF ────────────────────────────────────────
start_ver = max(last_processed_ver + 1, first_cdf_data_ver)
if start_ver > latest_ver:
    print(f"No new Bronze commits (latest={latest_ver}, last_processed={last_processed_ver}).")
    dbutils.notebook.exit("no_new_commits")

print(f"Reading Bronze change data feed from version {start_ver} to {latest_ver}")

# Rank each ID's changes newest-commit-first so exactly the latest change is
# kept. dropDuplicates(["ID"]) would keep an arbitrary one, letting an older
# change overwrite a newer one in Silver.
# Ties inside a single commit (same ID twice) remain arbitrary.
newest_change_first = Window.partitionBy("ID").orderBy(F.col("_commit_version").desc())

cdf_df = (
    spark.read
         .format("delta")
         .option("readChangeFeed", "true")
         .option("startingVersion", start_ver)
         # Bound the read so it matches the version persisted as state; commits
         # landing after `latest_ver` are picked up by the next run.
         .option("endingVersion", latest_ver)
         .load(BRONZE_PATH)
         # NOTE(review): 'delete' changes are dropped here, so rows deleted in
         # Bronze stay in Silver — confirm this is intended.
         .filter(F.col("_change_type").isin("insert", "update_postimage"))
         .withColumn("_change_rank", F.row_number().over(newest_change_first))
         .filter(F.col("_change_rank") == 1)
         .select(
             "ID",
             # Only the birth year is kept from BIRTHDATE.
             F.year("BIRTHDATE").cast("int").alias("BIRTH_YEAR"),
             # Masked columns: present in the schema but always NULL.
             F.lit(None).cast("string").alias("DEATHDATE"),
             F.lit(None).cast("string").alias("FIRST"),
             F.lit(None).cast("string").alias("LAST"),
             F.lit(None).cast("string").alias("MAIDEN"),
             F.lit(None).cast("string").alias("SSN"),
             F.lit(None).cast("string").alias("DRIVERS"),
             F.lit(None).cast("string").alias("PASSPORT"),
             F.lit(None).cast("string").alias("BIRTHPLACE"),
             "MARITAL", "RACE", "ETHNICITY", "GENDER"
         )
)

# head(1) fetches at most one row and avoids the DataFrame -> RDD conversion
# (which is also unavailable on Spark Connect / shared-access clusters).
if not cdf_df.head(1):
    print("No row-level changes in this commit range.")
    dbutils.fs.put(STATE_PATH, str(latest_ver), overwrite=True)
    dbutils.notebook.exit("empty_cdf_batch")

In [0]:
# 4. Merge changes into Silver (null-safe for all fields)
# Every Silver column gets the same rule: take the incoming value unless it is
# NULL, in which case the existing Silver value is kept.
silver_columns = [
    "ID", "BIRTH_YEAR", "DEATHDATE", "FIRST", "LAST", "MAIDEN", "SSN",
    "DRIVERS", "PASSPORT", "BIRTHPLACE", "MARITAL", "RACE", "ETHNICITY", "GENDER",
]
null_safe_assignments = {
    column: f"coalesce(s.{column}, t.{column})" for column in silver_columns
}

tgt = DeltaTable.forPath(spark, SILVER_PATH)

# SCD-1 style upsert: matched IDs are updated in place, new IDs are inserted.
merge_builder = tgt.alias("t").merge(cdf_df.alias("s"), "t.ID = s.ID")
merge_builder = merge_builder.whenMatchedUpdate(set=null_safe_assignments)
merge_builder = merge_builder.whenNotMatchedInsertAll()
merge_builder.execute()


In [0]:
# 5. Persist state and final checks ─────────────────────────────────
def _load_silver():
    """Open the Silver Delta table as a fresh DataFrame."""
    return spark.read.format("delta").load(SILVER_PATH)


# Record the Bronze version this run processed through, then report.
dbutils.fs.put(STATE_PATH, str(latest_ver), overwrite=True)
print(f"✔ Silver upsert complete (processed through version {latest_ver})")

cnt = _load_silver().count()
print(f"Silver row count: {cnt}")

# Preview final rows
silver_preview = _load_silver().limit(5)
display(silver_preview)

In [0]:
## Silver complete  
## Proceed to gold view creation `03_gold/create_gold_views`