## Fact Sales
###### * Create a Fact_Sales table
###### * Derive sales measures and dimension surrogate keys from the enriched and denormalized orders_gold table
###### * Grain: One row per order line (row_id)

In [0]:
# import modules
from pyspark.sql.functions import col, concat_ws, coalesce, lit
from delta.tables import DeltaTable


#### Run Config Notebook

In [0]:
%run ../../configs/config_notebook

In [0]:
# Resolve every table name through the TABLES mapping
# (presumably provided by the config notebook executed via %run - verify there).

# Destination table written by this notebook
TARGET_TABLE = TABLES["fact_sales"]

# Inputs: the orders_gold table plus the dimension tables joined to it below.
# The names are listed in the same order as the variables they are unpacked into.
(
    df_orders_gold,
    df_customers,
    df_products,
    df_geography,
    df_date,
) = (
    spark.table(TABLES[table_alias])
    for table_alias in (
        "orders_gold",
        "dim_customers",
        "dim_products",
        "dim_geography",
        "dim_date",
    )
)

### Build Fact dataframe
###### Join with dimension tables using Left-Join

In [0]:
# Step 1: give orders_gold a location_key column so dim_geography can be joined by name.
# The key is "country|state|city|postal_code". A NULL part is replaced by an empty
# string first, so concat_ws keeps all four segments instead of skipping the NULL.
# NOTE(review): presumably dim_geography builds its location_key the same way - confirm.
location_parts = [
    coalesce(col(part), lit(""))
    for part in ("country", "state", "city", "postal_code")
]
df_orders_gold = df_orders_gold.withColumn(
    "location_key", concat_ws("|", *location_parts)
)

# Step 2: start from the orders (alias "o") and attach each dimension with a left join,
# so an order line is kept even when a dimension lookup finds no match.
df_fact = df_orders_gold.alias("o")

# customers dimension, matched on customer_id
df_fact = df_fact.join(df_customers.alias("c"), on="customer_id", how="left")

# products dimension, matched on product_id
df_fact = df_fact.join(df_products.alias("p"), on="product_id", how="left")

# geography dimension, matched on the derived location_key
df_fact = df_fact.join(df_geography.alias("g"), on="location_key", how="left")

# date dimension in its order-date role
df_fact = df_fact.join(
    df_date.alias("od"),
    on=col("o.order_date") == col("od.date"),
    how="left",
)

# date dimension again, this time in its ship-date role
df_fact = df_fact.join(
    df_date.alias("sd"),
    on=col("o.ship_date") == col("sd.date"),
    how="left",
)


### Select relevant fact table columns

In [0]:
# Project the joined dataframe down to the fact table layout.

# Columns copied unchanged from the orders side (alias "o"), in output order.
order_passthrough_columns = [
    # degenerate order attributes
    "order_date",
    "ship_date",
    "order_id",
    "ship_mode",
    # measures
    "sales",
    "quantity",
    "discount",
    "profit",
    "unit_price",
    "profit_per_unit",
    "profit_margin",
    "discount_bucket",
    # derived order attributes
    "delivery_days",
    "is_prime_delivery",
    "is_profitable",
    "order_size",
    "transaction_type",
]

df_fact = df_fact.select(
    # row_id is exposed as order_line_id, the column the MERGE step matches on
    col("row_id").alias("order_line_id"),
    # keys taken from the joined dimensions; the date key appears once per role
    col("c.customer_key"),
    col("p.product_key"),
    col("g.geography_key"),
    col("od.date_key").alias("order_date_key"),
    col("sd.date_key").alias("ship_date_key"),
    *[col(f"o.{column_name}") for column_name in order_passthrough_columns],
    # the gold-layer processing timestamp is stored as created_at_timestamp
    col("o.gold_processed_timestamp").alias("created_at_timestamp"),
)


### Save as Fact_Sales delta table

In [0]:
# Persist df_fact into TARGET_TABLE:
#   - table missing -> create it from df_fact
#   - table present -> MERGE df_fact into it, keyed on order_line_id
if not spark.catalog.tableExists(TARGET_TABLE):
    print(f"✅ Table {TARGET_TABLE} does not exist. Creating table...")  
    fact_writer = (
        df_fact.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    fact_writer.saveAsTable(TARGET_TABLE)
    print(f"✅ Table {TARGET_TABLE} created successfully.")

else:
    print(f"✅ Table {TARGET_TABLE} already exists. MERGING data...")

    # Rows are matched on order_line_id (the aliased row_id)
    merge_key = "target.order_line_id = source.order_line_id"
    # A matched row is overwritten only when the incoming row carries a later timestamp
    source_is_newer = "source.created_at_timestamp > target.created_at_timestamp"

    upsert = (
        DeltaTable.forName(spark, TARGET_TABLE).alias("target")
        .merge(df_fact.alias("source"), merge_key)
        .whenMatchedUpdateAll(condition=source_is_newer)
        # source rows without a match in the target are inserted as new rows
        .whenNotMatchedInsertAll()
    )
    upsert.execute()
    print(f"✅ MERGE operation completed successfully.")


#### Sanity check of '**fact_sales**' table

In [0]:
%sql
-- Validation: total row count plus distinct counts of the line, order and dimension keys.
-- With a grain of one row per order line, total_order_lines and unique_order_lines
-- are expected to be equal.
-- NOTE(review): the table name is hard-coded here; presumably it is the same table as
-- TABLES["fact_sales"] - confirm when switching environments.
SELECT
    COUNT(*)                        AS total_order_lines,
    COUNT(DISTINCT f.order_line_id) AS unique_order_lines,
    COUNT(DISTINCT f.order_id)      AS unique_orders,
    COUNT(DISTINCT f.customer_key)  AS unique_customers,
    COUNT(DISTINCT f.product_key)   AS unique_products,
    COUNT(DISTINCT f.geography_key) AS unique_locations
FROM gold_dev.global_mart_retail.fact_sales AS f;

In [0]:
%sql
-- Surrogate keys check: number of fact rows whose key is NULL (should be 0 for each).
-- The keys come from left joins, so a NULL means the dimension lookup found no match
-- (or the matching dimension row itself has a NULL key).
SELECT
    SUM(IF(f.customer_key   IS NULL, 1, 0)) AS null_customer_keys,
    SUM(IF(f.product_key    IS NULL, 1, 0)) AS null_product_keys,
    SUM(IF(f.geography_key  IS NULL, 1, 0)) AS null_geography_keys,
    SUM(IF(f.order_date_key IS NULL, 1, 0)) AS null_order_date_keys,
    SUM(IF(f.ship_date_key  IS NULL, 1, 0)) AS null_ship_date_keys
FROM gold_dev.global_mart_retail.fact_sales AS f;

In [0]:
%sql
-- Financial metrics check, restricted to rows with transaction_type = 'Sale'.
-- profitable_orders / unprofitable_orders count fact rows, i.e. order lines;
-- rows where is_profitable is NULL are counted in neither.
SELECT
    ROUND(SUM(f.sales), 2)         AS total_revenue,
    ROUND(SUM(f.profit), 2)        AS total_profit,
    ROUND(AVG(f.profit_margin), 2) AS avg_profit_margin,
    SUM(f.quantity)                AS total_units_sold,
    COUNT_IF(f.is_profitable)      AS profitable_orders,
    COUNT_IF(NOT f.is_profitable)  AS unprofitable_orders
FROM gold_dev.global_mart_retail.fact_sales AS f
WHERE f.transaction_type = 'Sale';