# 03 — Build Gold Layer (PySpark on Dataproc, No Hive Required)

In [None]:
import os
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window
# Obtain the notebook's Spark session (getOrCreate returns an existing one if present).
spark = (
    SparkSession.builder
    .appName("gold-dataproc")
    .getOrCreate()
)

# Root path under which the silver/ inputs and gold/ outputs live;
# taken from the GCS_BASE environment variable, defaulting to /user/nb.
GCS_BASE = os.environ.get("GCS_BASE", "/user/nb")
print("GCS_BASE:", GCS_BASE)

### View: `gold.dim_customers`

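**Modeling decisions and rationale**
- Surrogate key `customer_key` via `row_number`, ordered by `customer_id`.
- Enrich CRM customers with ERP demographics and location (left joins, so customers without an ERP match are kept).
- SCD‑1 attributes (the table is overwritten on each rebuild; no history is kept).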

**Columns:**

| Column | Type | Description |
|---|---|---|
| `customer_key` | `int` | Surrogate key (row_number) |
| `customer_id` | `int` | Natural key from Silver |
| `customer_number` | `string` | Business key |
| `first_name` | `string` | Customer first name |
| `last_name` | `string` | Customer last name |
| `country` | `string` | Country label |
| `marital_status` | `string` | SCD‑1 attribute |
| `gender` | `string` | SCD‑1 attribute, fallback to ERP when n/a |
| `birthdate` | `date` | From ERP demographics |
| `create_date` | `date` | Customer create date |

In [None]:
# Build gold/dim_customers: CRM customer master enriched with ERP demographics
# (erp_cust_az12) and ERP location (erp_loc_a101), plus a surrogate key.
ci = spark.read.parquet(f"{GCS_BASE}/silver/crm_cust_info")
ca = spark.read.parquet(f"{GCS_BASE}/silver/erp_cust_az12")
la = spark.read.parquet(f"{GCS_BASE}/silver/erp_loc_a101")

# Left joins keep every CRM customer even when no ERP row matches on `cid`.
customers_enriched = (
    ci.alias("ci")
    .join(ca.alias("ca"), F.col("ci.cst_key") == F.col("ca.cid"), "left")
    .join(la.alias("la"), F.col("ci.cst_key") == F.col("la.cid"), "left")
    .selectExpr(
        "ci.cst_id as customer_id",
        "ci.cst_key as customer_number",
        "ci.cst_firstname as first_name",
        "ci.cst_lastname as last_name",
        "la.cntry as country",
        "ci.cst_marital_status as marital_status",
        # CRM gender wins unless it is 'n/a' (or NULL, where `<>` is not true);
        # then fall back to the ERP value, and finally to 'n/a'.
        "case when ci.cst_gndr <> 'n/a' then ci.cst_gndr else coalesce(ca.gen, 'n/a') end as gender",
        "ca.bdate as birthdate",
        "ci.cst_create_date as create_date",
    )
)

# Surrogate key: 1-based row_number ordered by customer_id.
# NOTE: the window has no partitionBy, so Spark ranks all rows in a single
# partition; acceptable for a small dimension, but it does not scale out.
customer_key_window = Window.orderBy("customer_id")
dim_customers = customers_enriched.withColumn(
    "customer_key", F.row_number().over(customer_key_window)
).select(
    "customer_key",
    "customer_id",
    "customer_number",
    "first_name",
    "last_name",
    "country",
    "marital_status",
    "gender",
    "birthdate",
    "create_date",
)

out_path = f"{GCS_BASE}/gold/dim_customers"
dim_customers.write.mode("overwrite").parquet(out_path)

# Read the written output back once and reuse that DataFrame for both the
# temp view and the row count, instead of opening the Parquet path twice.
gold_dim_customers = spark.read.parquet(out_path)
gold_dim_customers.createOrReplaceTempView("gold_dim_customers")
print("Rows:", gold_dim_customers.count())
spark.sql("SELECT * FROM gold_dim_customers LIMIT 10").show(truncate=False)

### View: `gold.dim_products`

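**Modeling decisions and rationale**
- Surrogate key `product_key` via `row_number`, ordered by `start_date`, then `product_number`.
- Left join to the category table (`erp_px_cat_g1v2`) for readable category labels.
- Keep current products only (`prd_end_dt IS NULL`).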

**Columns:**

| Column | Type | Description |
|---|---|---|
| `product_key` | `int` | Surrogate key (row_number) |
| `product_id` | `int` | Natural key |
| `product_number` | `string` | Business key |
| `product_name` | `string` | Name |
| `category_id` | `string` | Derived from prd_key |
| `category` | `string` | Category label |
| `subcategory` | `string` | Subcategory label |
| `maintenance` | `string` | Maintenance attribute |
| `cost` | `int` | Unit cost |
| `product_line` | `string` | Decoded label |
| `start_date` | `date` | Start date |

In [None]:
# Build gold/dim_products: current CRM products labelled with their category
# attributes from erp_px_cat_g1v2, plus a surrogate key.
pn = spark.read.parquet(f"{GCS_BASE}/silver/crm_prd_info")
pc = spark.read.parquet(f"{GCS_BASE}/silver/erp_px_cat_g1v2")

# Left join keeps products whose cat_id has no category row; the filter then
# keeps only rows with no end date (the current version of each product).
current_products = (
    pn.alias("pn")
    .join(pc.alias("pc"), F.col("pn.cat_id") == F.col("pc.id"), "left")
    .where(F.col("pn.prd_end_dt").isNull())
    .selectExpr(
        "pn.prd_id as product_id",
        "pn.prd_key as product_number",
        "pn.prd_nm as product_name",
        "pn.cat_id as category_id",
        "pc.cat as category",
        "pc.subcat as subcategory",
        "pc.maintenance as maintenance",
        "pn.prd_cost as cost",
        "pn.prd_line as product_line",
        "pn.prd_start_dt as start_date",
    )
)

# Surrogate key: 1-based row_number ordered by start_date, then product_number.
# NOTE: the window has no partitionBy, so Spark ranks all rows in a single
# partition; acceptable for a small dimension, but it does not scale out.
product_key_window = Window.orderBy("start_date", "product_number")
dim_products = current_products.withColumn(
    "product_key", F.row_number().over(product_key_window)
).select(
    "product_key",
    "product_id",
    "product_number",
    "product_name",
    "category_id",
    "category",
    "subcategory",
    "maintenance",
    "cost",
    "product_line",
    "start_date",
)

out_path = f"{GCS_BASE}/gold/dim_products"
dim_products.write.mode("overwrite").parquet(out_path)

# Read the written output back once and reuse that DataFrame for both the
# temp view and the row count, instead of opening the Parquet path twice.
gold_dim_products = spark.read.parquet(out_path)
gold_dim_products.createOrReplaceTempView("gold_dim_products")
print("Rows:", gold_dim_products.count())
spark.sql("SELECT * FROM gold_dim_products LIMIT 10").show(truncate=False)

### View: `gold.fact_sales`

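**Modeling decisions and rationale**
- Grain: order line (order × product × customer).
- Measures come from Silver; dimensions are SCD‑1.
- Joins to the dimensions use natural keys (`sls_prd_key` → `product_number`, `sls_cust_id` → `customer_id`); the fact exposes the surrogate keys for BI.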

**Columns:**

| Column | Type | Description |
|---|---|---|
| `order_number` | `string` | Order identifier (line grain) |
| `product_key` | `int` | FK to dim_products |
| `customer_key` | `int` | FK to dim_customers |
| `order_date` | `date` | Order date |
| `shipping_date` | `date` | Ship date |
| `due_date` | `date` | Due date |
| `sales_amount` | `int` | Extended amount |
| `quantity` | `int` | Units sold |
| `price` | `int` | Unit price |

In [None]:
# Build gold/fact_sales: Silver sales lines with the dimension surrogate keys
# looked up from the gold dimension tables written to GCS_BASE/gold.
sd = spark.read.parquet(f"{GCS_BASE}/silver/crm_sales_details")
dim_products = spark.read.parquet(f"{GCS_BASE}/gold/dim_products")
dim_customers = spark.read.parquet(f"{GCS_BASE}/gold/dim_customers")

# Left joins on the natural keys keep every sales line; product_key or
# customer_key is NULL when the dimension has no matching row.
sales_with_keys = (
    sd.alias("sd")
    .join(
        dim_products.alias("pr"),
        F.col("sd.sls_prd_key") == F.col("pr.product_number"),
        "left",
    )
    .join(
        dim_customers.alias("cu"),
        F.col("sd.sls_cust_id") == F.col("cu.customer_id"),
        "left",
    )
)
fact_sales = sales_with_keys.selectExpr(
    "sd.sls_ord_num  as order_number",
    "pr.product_key  as product_key",
    "cu.customer_key as customer_key",
    "sd.sls_order_dt as order_date",
    "sd.sls_ship_dt  as shipping_date",
    "sd.sls_due_dt   as due_date",
    "sd.sls_sales    as sales_amount",
    "sd.sls_quantity as quantity",
    "sd.sls_price    as price",
)

out_path = f"{GCS_BASE}/gold/fact_sales"
fact_sales.write.mode("overwrite").parquet(out_path)

# Read the written output back once and reuse that DataFrame for both the
# temp view and the row count, instead of opening the Parquet path twice.
gold_fact_sales = spark.read.parquet(out_path)
gold_fact_sales.createOrReplaceTempView("gold_fact_sales")
print("Rows:", gold_fact_sales.count())
spark.sql("SELECT * FROM gold_fact_sales LIMIT 10").show(truncate=False)

## Summary
- Gold Parquet written to `GCS_BASE/gold/*`; temp views `gold_dim_customers`, `gold_dim_products`, and `gold_fact_sales` are registered in this Spark session for SQL/BI queries.