> # Sample SCD Type 1 (overwrite-on-key) code:

In [0]:
# PARAMETERS:

# Catalog: first part of the three-part table names built in this notebook
catalog = "dev"

# Schemas, one per layer: bronze (raw), silver, gold
bronze_db, silver_db, gold_db = (
    "db1_bronze_raw",
    "db1_silver",
    "db1_gold",
)

# Table names (without catalog/schema prefix)
trans_tbl, cleaned_tbl = "transactions", "clean_transactions"

In [0]:
from pyspark.sql import Row

# Demo rows for the SCD-1 walkthrough, written as (id, name, age) tuples
data = [
    Row(id=row_id, name=person, age=years)
    for row_id, person, years in (
        (1, "Raj", 30),
        (2, "Virat", 25),
        (3, "Mahesh", 35),
    )
]

# Table the demo MERGE writes to, and the column(s) used to match rows in it
target_table = "default.sample_target"
pk_cols = ["id"]

# Expose the rows to Spark SQL under the temp view name "src"
df = spark.createDataFrame(data)
df.createOrReplaceTempView("src")

# Preview what the view contains
display(spark.sql("SELECT * FROM src"))

In [0]:
# Ensure the target exists: "WHERE 1 = 0" selects no rows, so the statement
# only creates an empty Delta table with the columns of the "src" view.
create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {target_table}
    USING DELTA AS SELECT * FROM src WHERE 1 = 0
"""
print(create_table_query)
spark.sql(create_table_query)

# Match condition: target alias t equals source alias s on every key column
on_expr = " AND ".join(map("t.{0} = s.{0}".format, pk_cols))

print(on_expr)

# SCD-1 upsert: matched keys are overwritten, unmatched keys are inserted
merge_query = f"""
    MERGE INTO {target_table} t
    USING src s
    ON   {on_expr}
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
"""
print(merge_query)
spark.sql(merge_query)

# Inspect the target after the merge
merged_target = spark.table(target_table)
display(merged_target)

---

## Actual code:

In [0]:
# Databricks notebook: silver_to_gold_star_schema.py
# ==================================================
from datetime import datetime
from pyspark.sql.functions import (
    col, expr, sequence, to_date, explode, lit, date_format
)

from pyspark.sql import DataFrame

# ---------------------------------------------------------------------------
# 1. Minimal SCD-1 helper (Delta MERGE)
# ---------------------------------------------------------------------------
def upsert_scd1(src: DataFrame, target_table: str, pk_cols: list) -> None:
    """
    Overwrite-on-key (SCD-1) upsert of ``src`` into a Delta table.

    If the target doesn't exist, it's created empty with the src schema before
    the MERGE runs. Source rows whose key columns equal those of a target row
    overwrite that row; every other source row is inserted.

    Args:
        src: Rows to merge into the target.
        target_table: Name of the Delta table to merge into. It is interpolated
            verbatim into the SQL text, so it must be a trusted identifier.
        pk_cols: Names of the column(s) that identify a row. A single column
            name given as a plain string is also accepted.

    Raises:
        ValueError: If ``pk_cols`` is empty (the MERGE would have no ON
            condition).
    """
    import uuid  # local import: only needed to build a unique view name

    # A bare string would otherwise be iterated character by character.
    if isinstance(pk_cols, str):
        pk_cols = [pk_cols]
    if not pk_cols:
        raise ValueError("pk_cols must contain at least one key column")

    # Use a unique temp view name so this helper never replaces a view the
    # caller already registered in the session (e.g. one called "src").
    view_name = f"_scd1_src_{uuid.uuid4().hex}"
    src.createOrReplaceTempView(view_name)

    try:
        # Create target Delta table if it does not exist. "WHERE 1 = 0"
        # copies the column definitions without copying any rows.
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {target_table}
            USING DELTA AS SELECT * FROM {view_name} WHERE 1 = 0
        """)

        # Build ON clause for MERGE using primary key columns
        on_expr = " AND ".join(f"t.{c} = s.{c}" for c in pk_cols)

        # Perform SCD-1 upsert using Delta MERGE
        spark.sql(f"""
            MERGE INTO {target_table} t
            USING {view_name} s
            ON   {on_expr}
            WHEN MATCHED THEN UPDATE SET *
            WHEN NOT MATCHED THEN INSERT *
        """)
    finally:
        # Always remove the helper view, even when the MERGE fails.
        spark.catalog.dropTempView(view_name)

# ---------------------------------------------------------------------------
# 2. Build / refresh DATE dimension (dev.gold_db.dim_date)
# ---------------------------------------------------------------------------
def refresh_dim_date(start="2000-01-01", end="2035-12-31") -> None:
    """
    Rebuild the date dimension table ``{catalog}.{gold_db}.dim_date``.

    Generates one row per calendar day from ``start`` to ``end`` (both
    inclusive) and overwrites the table, including its schema.

    Columns written: ``Date_ID`` (the date itself), ``Year``, ``Month``,
    ``Day``, ``Week`` (Spark ``weekofyear``), ``Quarter`` (1-4) and
    ``Weekday`` (full day name).

    Args:
        start: First date of the range, as accepted by Spark ``to_date``.
        end: Last date of the range, as accepted by Spark ``to_date``.
    """
    # A single-row frame is only a carrier for the generated date array,
    # which explode() then turns into one row per day.
    calendar_df = (
        spark.range(0, 1)
        .select(sequence(to_date(lit(start)), to_date(lit(end))).alias("date_seq"))
        .select(explode("date_seq").alias("Date_ID"))
        .withColumn("Year", expr("year(Date_ID)"))
        .withColumn("Month", expr("month(Date_ID)"))
        .withColumn("Day", expr("day(Date_ID)"))
        .withColumn("Week", expr("weekofyear(Date_ID)"))
        # quarter() returns an integer 1-4. The previous "(month-1)/3 + 1"
        # used "/", which is fractional division in Spark SQL and produced
        # values such as 1.33 and 1.67.
        .withColumn("Quarter", expr("quarter(Date_ID)"))
        .withColumn("Weekday", date_format(col("Date_ID"), "EEEE"))
    )

    # Overwrite the dim_date table with the new date dimension data;
    # overwriteSchema lets the column types change between runs.
    (calendar_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(f"{catalog}.{gold_db}.dim_date"))

# ---------------------------------------------------------------------------
# 3. Distribute silver layer to gold star-schema
# ---------------------------------------------------------------------------
# The "-> None" in the function definition indicates that the function does not return any value.
# It is a type hint specifying that the function's return type is None.
def distribute_to_gold() -> None:
    """
    Load the gold star schema from the cleaned silver transactions table.

    Reads ``{catalog}.{silver_db}.{cleaned_tbl}``, upserts the customer,
    merchant, card and location dimensions with ``upsert_scd1`` and appends
    the transaction rows to ``{catalog}.{gold_db}.fact_transactions``.

    Each dimension is reduced to exactly one row per key before the upsert,
    taken from the key's most recent transaction. A plain ``distinct()`` over
    all attribute columns keeps several rows for a key whose attributes
    changed, which inserts duplicate dimension rows on the first load and
    makes later MERGE runs fail when several source rows match one target row.
    """
    # Local imports: only needed for the one-row-per-key selection below.
    from pyspark.sql import Window
    from pyspark.sql.functions import row_number

    # Read cleaned transactions from silver layer
    silver_df = spark.table(f"{catalog}.{silver_db}.{cleaned_tbl}")

    # ----- Dimensions -----
    # (gold table name, key column, attribute columns in output order)
    dimension_specs = [
        (
            "dim_customer",
            "Customer_ID",
            ["Customer_Name", "Gender", "DOB", "Email", "Phone", "Customer_City"],
        ),
        (
            "dim_merchant",
            "Merchant_ID",
            ["Merchant_Name", "Merchant_Category", "Merchant_Country"],
        ),
        (
            "dim_card",
            "Card_ID",
            ["Card_Type", "Issuer_Bank", "Card_Tier", "Expiry_Date"],
        ),
        (
            "dim_location",
            "Location_ID",
            ["City", "State", "Country"],
        ),
    ]

    for table_name, key_col, attr_cols in dimension_specs:
        # Rank each key's rows newest first; Transaction_ID breaks ties so
        # the chosen row does not vary between runs on the same data.
        newest_first = Window.partitionBy(key_col).orderBy(
            col("Transaction_Date").desc(),
            col("Transaction_ID").desc(),
        )
        dim_df = (
            silver_df
            .withColumn("_rn", row_number().over(newest_first))
            .filter(col("_rn") == 1)
            .select(key_col, *attr_cols)
        )
        upsert_scd1(dim_df, f"{catalog}.{gold_db}.{table_name}", [key_col])

    # ----- Fact -----
    # Prepare fact table DataFrame with foreign keys to dimensions
    fact_df = silver_df.select(
        "Transaction_ID",
        "Transaction_Date",
        "Transaction_Amount",
        "Transaction_Status",
        "Transaction_Type",
        "Customer_ID",
        "Card_ID",
        "Merchant_ID",
        "Location_ID",
        # FK -> dim_date. NOTE(review): this only joins to dim_date.Date_ID
        # if Transaction_Date is a DATE (not a timestamp/string) - confirm.
        col("Transaction_Date").alias("Transaction_Date_ID"),
    )

    # Append fact records to the fact_transactions table.
    # NOTE(review): the whole silver table is appended on every call, so
    # re-running over unchanged silver data would add the same transactions
    # again - confirm that the silver table only holds new rows per run.
    (fact_df.write
        .format("delta")
        .mode("append")
        .saveAsTable(f"{catalog}.{gold_db}.fact_transactions"))

# ---------------------------------------------------------------------------
# 4. Orchestration entry point
# ---------------------------------------------------------------------------
def run_star_schema_refresh() -> str:
    """
    Databricks Job entry point for the silver -> gold load.

    Runs, in order:
      1. refresh_dim_date    - rebuild the date dimension
      2. distribute_to_gold  - load dimensions and fact from silver

    Returns:
        A fixed completion message.
    """
    # Order matters only in that it mirrors the documented job sequence:
    # date dimension first, then the silver-driven dimensions and fact.
    pipeline_steps = (refresh_dim_date, distribute_to_gold)
    for step in pipeline_steps:
        step()

    return "Silver -> Gold star-schema refresh complete."

# Ad-hoc execution (e.g. running this cell in a notebook): run the full
# refresh and echo its completion message.
if __name__ == "__main__":
    completion_message = run_star_schema_refresh()
    print(completion_message)

In [0]:
%sql
-- Inspect the customer dimension in the gold schema
SELECT *
FROM dev.db1_gold.dim_customer;

In [0]:
%sql
-- Sample the first 50 rows of the transaction fact table
SELECT *
FROM dev.db1_gold.fact_transactions
LIMIT 50;