# 03 — Build Gold Layer (Spark SQL)

> **Colab-ready Spark SQL notebooks** following Medallion Architecture.
> Run notebooks in order: **01 Bronze → 02 Silver → 03 Gold → Analytics**.

### Conventions
- Databases (schemas): `bronze`, `silver`, `gold`
- Naming: `snake_case`
- Storage: managed tables under `/content/spark-warehouse` (created automatically)
- All code uses **Spark SQL** via `spark.sql(...)` and shows previews with `.show(10, truncate=False)`

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Hive-enabled Spark session whose warehouse directory is
# /content/spark-warehouse, the location the notebook conventions refer to.
spark = (
    SparkSession.builder.appName("Medallion-SparkSQL")
    .config("spark.sql.warehouse.dir", "/content/spark-warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

# Create the three medallion schemas; IF NOT EXISTS keeps this cell re-runnable.
for db in ["bronze", "silver", "gold"]:
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {db}")

# SHOW DATABASES returns a single column, but its name is not stable across
# Spark releases (newer versions call it `namespace` rather than
# `databaseName`), so read it by position instead of by attribute.
print("Databases ready:", [r[0] for r in spark.sql("SHOW DATABASES").collect()])

In [None]:
def preview(table_fqn, limit=10):
    """Print a banner, the row count, and the first ``limit`` rows of a table or view.

    Args:
        table_fqn: Qualified name such as ``"gold.dim_customers"``. It is
            interpolated directly into the SQL text, so pass trusted names only.
        limit: Maximum number of rows to select and display.
    """
    count_query = f"SELECT COUNT(*) AS row_count FROM {table_fqn}"
    sample_query = f"SELECT * FROM {table_fqn} LIMIT {limit}"

    print("\n=== Preview:", table_fqn, "===")
    spark.sql(count_query).show(truncate=False)
    # `limit` is also handed to show() so the display cap matches the SQL LIMIT.
    spark.sql(sample_query).show(limit, truncate=False)

### gold.dim_customers (VIEW)

In [None]:
# Customer dimension (view).
# - Base rows come from silver.crm_cust_info, left-joined to silver.erp_cust_az12
#   (gen, bdate) and silver.erp_loc_a101 (cntry) on cst_key = cid.
# - customer_key is a ROW_NUMBER() ordered by cst_id.
# - gender keeps the CRM value unless it is 'n/a' (or NULL); otherwise it falls
#   back to the ERP value, and to 'n/a' when that is missing too.
dim_customers_ddl = """CREATE VIEW gold.dim_customers AS
SELECT ROW_NUMBER() OVER (ORDER BY ci.cst_id) customer_key, ci.cst_id customer_id, ci.cst_key customer_number,
       ci.cst_firstname first_name, ci.cst_lastname last_name, la.cntry country, ci.cst_marital_status marital_status,
       CASE WHEN ci.cst_gndr <> 'n/a' THEN ci.cst_gndr ELSE COALESCE(ca.gen,'n/a') END gender,
       ca.bdate birthdate, ci.cst_create_date create_date
FROM silver.crm_cust_info ci
LEFT JOIN silver.erp_cust_az12 ca ON ci.cst_key = ca.cid
LEFT JOIN silver.erp_loc_a101 la ON ci.cst_key = la.cid"""

# Drop any existing view first so the cell can be re-run, then create and preview it.
spark.sql("DROP VIEW IF EXISTS gold.dim_customers")
spark.sql(dim_customers_ddl)
preview("gold.dim_customers")

### gold.dim_products (VIEW)

In [None]:
# Product dimension (view).
# - Base rows come from silver.crm_prd_info, left-joined to
#   silver.erp_px_cat_g1v2 on cat_id = id for category, subcategory and maintenance.
# - product_key is a ROW_NUMBER() ordered by (prd_start_dt, prd_key).
# - Only rows with prd_end_dt IS NULL are kept (presumably the currently
#   active version of each product; confirm against the silver layer).
dim_products_ddl = """CREATE VIEW gold.dim_products AS
SELECT ROW_NUMBER() OVER (ORDER BY pn.prd_start_dt, pn.prd_key) product_key, pn.prd_id product_id, pn.prd_key product_number,
       pn.prd_nm product_name, pn.cat_id category_id, pc.cat category, pc.subcat subcategory, pc.maintenance maintenance,
       pn.prd_cost cost, pn.prd_line product_line, pn.prd_start_dt start_date
FROM silver.crm_prd_info pn
LEFT JOIN silver.erp_px_cat_g1v2 pc ON pn.cat_id = pc.id
WHERE pn.prd_end_dt IS NULL"""

# Drop any existing view first so the cell can be re-run, then create and preview it.
spark.sql("DROP VIEW IF EXISTS gold.dim_products")
spark.sql(dim_products_ddl)
preview("gold.dim_products")

### gold.fact_sales (TABLE)

In [None]:
# Sales fact (table, created with CREATE TABLE ... AS SELECT).
# - Base rows come from silver.crm_sales_details.
# - product_key / customer_key are looked up from the gold dimension views by
#   product_number and customer_id; both are LEFT JOINs, so a sales row with
#   no matching dimension row is kept with a NULL key.
fact_sales_ddl = """CREATE TABLE gold.fact_sales AS
SELECT sd.sls_ord_num order_number, pr.product_key product_key, cu.customer_key customer_key,
       sd.sls_order_dt order_date, sd.sls_ship_dt shipping_date, sd.sls_due_dt due_date,
       sd.sls_sales sales_amount, sd.sls_quantity quantity, sd.sls_price price
FROM silver.crm_sales_details sd
LEFT JOIN gold.dim_products pr  ON sd.sls_prd_key = pr.product_number
LEFT JOIN gold.dim_customers cu ON sd.sls_cust_id = cu.customer_id"""

# Drop any existing table first so the cell can be re-run, then rebuild and preview it.
spark.sql("DROP TABLE IF EXISTS gold.fact_sales")
spark.sql(fact_sales_ddl)
preview("gold.fact_sales")