# Gold Modeling Notebook — Build Star Schema

This notebook constructs the Gold layer star schema by transforming refined Silver data into dimensional and fact tables optimized for analytical consumption. It applies deterministic modeling rules, generates surrogate keys, assembles conformed dimensions, and produces a clean fact table suitable for BI tools and semantic models.

The modeling process is intentionally stable and predictable: it standardizes dimensional attributes, resolves grain, derives analytical measures, and writes each table as an independent Delta dataset. All environment‑specific values are externalized through configuration, and the notebook mirrors the logic used by the pipeline‑ready script to ensure consistent execution across development and automated orchestration paths.

In [None]:
# ============================================================
# Gold Modeling Notebook
# Deterministic Silver → Gold star schema construction
# ============================================================

# ------------------------------------------------------------
# 1. Initialize Spark
# ------------------------------------------------------------
import json

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col, trim, lower, monotonically_increasing_id,
    year, month, dayofmonth, hour, row_number
)

# getOrCreate() hands back the session that is already active (for example
# one supplied by the notebook runtime) and only builds a new one otherwise.
spark = SparkSession.builder.getOrCreate()

# ------------------------------------------------------------
# 2. Load configuration
# ------------------------------------------------------------
# Location of the JSON file that holds every environment-specific setting.
config_path = "/Workspace/Repos/gold/gold-modeling-config.json"

# JSON text is UTF-8 by specification; naming the encoding explicitly keeps
# parsing independent of the platform's locale default.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Input and output locations all live under the "paths" section; a missing
# entry raises KeyError here, before any Spark work is started.
silver_path = config["paths"]["silver"]
gold_dim_customer_path = config["paths"]["gold_dim_customer"]
gold_dim_product_path = config["paths"]["gold_dim_product"]
gold_dim_date_path = config["paths"]["gold_dim_date"]
gold_fact_sales_path = config["paths"]["gold_fact_sales"]

# Save mode handed unchanged to every DataFrameWriter.mode() call below.
write_mode = config["write"]["mode"]

# ------------------------------------------------------------
# 3. Read Silver data
# ------------------------------------------------------------
# Load the Silver dataset, stored as Delta, from the configured location.
df_silver = spark.read.format("delta").load(silver_path)

# ------------------------------------------------------------
# 4. Build Dimensions
# ------------------------------------------------------------

# -------------------------
# Customer Dimension
# -------------------------
# One row per distinct customer_id found in Silver.
#
# The surrogate key is row_number() over the sorted natural key rather than
# monotonically_increasing_id(). The latter depends on how rows happen to be
# laid out across partitions, and this lazy DataFrame is evaluated more than
# once (when the dimension is written and again inside the fact join), so it
# could hand out different keys each time and leave fact rows pointing at
# the wrong customer. Ordering by the already-distinct natural key gives
# the same key on every evaluation. Keys start at 1.
#
# NOTE: a window without partitionBy sorts all rows in a single partition;
# this is acceptable for dimension-sized data.
df_dim_customer = (
    df_silver
        .select("customer_id")
        .dropDuplicates()
        .withColumn(
            "customer_key",
            # Cast keeps the column LongType, as it was with the previous
            # key generator.
            row_number().over(Window.orderBy("customer_id")).cast("long"),
        )
)

# -------------------------
# Product Dimension
# -------------------------
# One row per distinct product_id found in Silver.
#
# The surrogate key is row_number() over the sorted natural key rather than
# monotonically_increasing_id(). The latter depends on how rows happen to be
# laid out across partitions, and this lazy DataFrame is evaluated more than
# once (when the dimension is written and again inside the fact join), so it
# could hand out different keys each time and leave fact rows pointing at
# the wrong product. Ordering by the already-distinct natural key gives the
# same key on every evaluation. Keys start at 1.
#
# NOTE: a window without partitionBy sorts all rows in a single partition;
# this is acceptable for dimension-sized data.
df_dim_product = (
    df_silver
        .select("product_id")
        .dropDuplicates()
        .withColumn(
            "product_key",
            # Cast keeps the column LongType, as it was with the previous
            # key generator.
            row_number().over(Window.orderBy("product_id")).cast("long"),
        )
)

# -------------------------
# Date Dimension
# -------------------------
# One row per distinct order_timestamp, with its calendar parts broken out.
#
# dropDuplicates() is essential: the fact table joins to this dimension on
# order_timestamp, so a timestamp that appears N times here would multiply
# every matching fact row by N.
#
# The surrogate key is row_number() over the sorted timestamp rather than
# monotonically_increasing_id(), whose values depend on partition layout and
# may differ between the evaluation that writes this dimension and the one
# that feeds the fact join. Keys start at 1.
#
# NOTE: a window without partitionBy sorts all rows in a single partition;
# if the number of distinct timestamps grows very large, consider a coarser
# grain for this dimension.
df_dim_date = (
    df_silver
        .select("order_timestamp")
        .dropDuplicates()
        .withColumn(
            "date_key",
            # Cast keeps the column LongType, as it was with the previous
            # key generator.
            row_number().over(Window.orderBy("order_timestamp")).cast("long"),
        )
        .withColumn("year", year(col("order_timestamp")))
        .withColumn("month", month(col("order_timestamp")))
        .withColumn("day", dayofmonth(col("order_timestamp")))
        .withColumn("hour", hour(col("order_timestamp")))
)

# ------------------------------------------------------------
# 5. Build Fact Table
# ------------------------------------------------------------
# Each dimension is attached to the Silver rows through its natural key.
# Left joins keep every Silver row even when no dimension row matches; the
# corresponding surrogate key is then null.
_dimension_lookups = (
    (df_dim_customer, "customer_id"),
    (df_dim_product, "product_id"),
    (df_dim_date, "order_timestamp"),
)

df_fact_sales = df_silver
for _dim_df, _natural_key in _dimension_lookups:
    df_fact_sales = df_fact_sales.join(_dim_df, on=_natural_key, how="left")

# Keep only the surrogate keys and the measures; natural keys are dropped.
df_fact_sales = df_fact_sales.select(
    "customer_key",
    "product_key",
    "date_key",
    "quantity",
    "unit_price",
    "total_amount",
)

# ------------------------------------------------------------
# 6. Write Gold Tables
# ------------------------------------------------------------
# Every Gold table is written the same way: Delta format, the configured
# save mode, its own target path. Dimensions go first, the fact table last.
_gold_outputs = (
    (df_dim_customer, gold_dim_customer_path),
    (df_dim_product, gold_dim_product_path),
    (df_dim_date, gold_dim_date_path),
    (df_fact_sales, gold_fact_sales_path),
)

for _table_df, _target_path in _gold_outputs:
    _table_df.write.format("delta").mode(write_mode).save(_target_path)

# ------------------------------------------------------------
# 7. Confirmation
# ------------------------------------------------------------
# Reached only when all four writes above finished without raising.
print("Gold star schema modeling completed successfully.")