### EDA — **Gold Only**

Sources:
- `gold.fact_sales` (sale_id, customer_id, product_key, qty, price, amount, order_dt, ship_dt, due_dt)
- `gold.dim_customers` (customer_id, first_name, last_name, gender, marital_status)
- `gold.dim_products` (product_key, product_id, product_name, product_line, prd_cost)


#### Environment Setup

In [None]:

from pyspark.sql import SparkSession
# Build (or fetch, via getOrCreate) the Spark session used by every cell
# below, requesting this notebook's application name.
spark = (
    SparkSession.builder
    .appName("Gold-EDA-DirectJoins")
    .getOrCreate()
)

# Print the Spark version of the session so runs can be compared later.
print("Spark version:", spark.version)


#### Tables in `gold` schema

In [None]:
# List the tables registered in the `gold` schema; print up to 20 rows with
# full (untruncated) cell contents.
gold_tables = spark.sql("""SHOW TABLES IN gold""")
gold_tables.show(20, truncate=False)

#### Row counts — facts and dimensions

In [None]:
# One output row per gold object (the fact table and both dimensions), each
# carrying the object's name and its total row count.
row_counts_sql = """SELECT 'gold.fact_sales' AS object, COUNT(*) AS rows FROM gold.fact_sales
UNION ALL SELECT 'gold.dim_customers', COUNT(*) FROM gold.dim_customers
UNION ALL SELECT 'gold.dim_products', COUNT(*) FROM gold.dim_products"""

row_counts = spark.sql(row_counts_sql)
row_counts.show(20, truncate=False)

#### Null checks in the fact table (keys & measures)

In [None]:
# Count NULLs per column in gold.fact_sales for the two join keys
# (customer_id, product_key) and the three measures (qty, price, amount).
# Each SUM(CASE ...) adds 1 for every row where that column is NULL.
fact_null_checks_sql = """SELECT
  SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END) AS null_customer_id,
  SUM(CASE WHEN product_key IS NULL THEN 1 ELSE 0 END) AS null_product_key,
  SUM(CASE WHEN qty    IS NULL THEN 1 ELSE 0 END) AS null_qty,
  SUM(CASE WHEN price  IS NULL THEN 1 ELSE 0 END) AS null_price,
  SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END) AS null_amount
FROM gold.fact_sales"""

fact_null_checks = spark.sql(fact_null_checks_sql)
fact_null_checks.show(20, truncate=False)

#### Daily revenue trend

In [None]:
# Daily trend: per order date, the number of fact rows (`orders`), total
# quantity, and revenue cast to DECIMAL(18,2), sorted by date ascending.
# Rows with a NULL order_dt, qty, or amount are filtered out before
# aggregating, so all three metrics cover only fully populated rows.
daily_revenue_sql = """SELECT DATE(order_dt) AS order_date,
       COUNT(*) AS orders,
       SUM(qty) AS total_qty,
       CAST(SUM(amount) AS DECIMAL(18,2)) AS revenue
FROM gold.fact_sales
WHERE order_dt IS NOT NULL AND qty IS NOT NULL AND amount IS NOT NULL
GROUP BY DATE(order_dt)
ORDER BY order_date"""

daily_revenue = spark.sql(daily_revenue_sql)
# Only the first 20 days are printed.
daily_revenue.show(20, truncate=False)

#### Monthly revenue trend

In [None]:
# Monthly trend: order_dt is truncated to the start of its month, then
# revenue (DECIMAL(18,2)), total quantity, and fact-row count (`orders`)
# are aggregated per month and sorted chronologically.
# Rows with a NULL order_dt, qty, or amount are filtered out before
# aggregating, so all three metrics cover only fully populated rows.
monthly_revenue_sql = """SELECT DATE_TRUNC('month', order_dt) AS month_start,
       CAST(SUM(amount) AS DECIMAL(18,2)) AS revenue,
       SUM(qty) AS total_qty,
       COUNT(*) AS orders
FROM gold.fact_sales
WHERE order_dt IS NOT NULL AND qty IS NOT NULL AND amount IS NOT NULL
GROUP BY DATE_TRUNC('month', order_dt)
ORDER BY month_start"""

monthly_revenue = spark.sql(monthly_revenue_sql)
# Only the first 20 months are printed.
monthly_revenue.show(20, truncate=False)

#### Top 10 customers by revenue

In [None]:
# Top 10 customers by total revenue.
#
# - Fact rows with a NULL amount are excluded before aggregating.
# - LEFT JOIN keeps fact rows whose customer_id has no match in
#   gold.dim_customers; such customers appear with NULL names.
# - `orders` is COUNT(*), i.e. the number of contributing fact rows.
# - `f.customer_id` is a secondary sort key: with `ORDER BY revenue DESC`
#   alone, customers tied on revenue at the LIMIT boundary could be returned
#   in any order, so the top-10 set was not reproducible between runs.
top_customers_sql = """SELECT f.customer_id,
       dc.first_name,
       dc.last_name,
       CAST(SUM(f.amount) AS DECIMAL(18,2)) AS revenue,
       COUNT(*) AS orders
FROM gold.fact_sales f
LEFT JOIN gold.dim_customers dc
  ON f.customer_id = dc.customer_id
WHERE f.amount IS NOT NULL
GROUP BY f.customer_id, dc.first_name, dc.last_name
ORDER BY revenue DESC, f.customer_id
LIMIT 10"""

top_customers = spark.sql(top_customers_sql)
top_customers.show(20, truncate=False)