## Business Logic
#### Add calculations that most downstream users need
###### * Add columns such as: [Unit Price, Delivery Time, Profit Margin]
###### * Calculate Transaction Type [Sale or Return]
###### * Flag whether a delivery is prime (is_prime_delivery) [True/False]
###### * Add a flag for profitable transactions (a boolean column)
###### * Calculate Order Size Category [Small, Medium, Large, Very Large]

In [0]:
# import modules
from pyspark.sql.functions import (
    col, to_date, lit, current_timestamp, datediff, when, coalesce, concat_ws, trim, upper, lower,
    row_number, desc, array, round
)

from pyspark.sql.window import Window

from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, DecimalType, DoubleType, DateType, TimestampType
)

from delta.tables import DeltaTable


#### Run Config Notebook

In [0]:
%run ../../configs/config_notebook

In [0]:
# Use configuration variables
# NOTE(review): TABLES is not defined in this notebook; presumably it is
# provided by the config notebook executed via %run — confirm.

# Target table: the gold table this notebook writes to
TARGET_TABLE = TABLES["orders_gold"]

# Read the silver table that feeds the gold layer
df_silver = spark.table(TABLES["orders_silver"])

# Preview data. The count is taken from the silver DataFrame, so the label
# says "Silver" (it previously said "Bronze", which was misleading).
print(f"Silver table records: {df_silver.count()}")


#### Enrich data for analysis
###### * Add calculated columns

In [0]:
# Reusable validity checks so the division guards are not repeated.
# A denominator is usable only when it is present and non-zero.
qty_valid = col("quantity").isNotNull() & (col("quantity") != 0)
sales_valid = col("sales").isNotNull() & (col("sales") != 0)

# Calculated columns.
# Throughout this block a when() chain without otherwise() evaluates to NULL
# for rows that match no branch. This is used on purpose so that missing
# inputs produce NULL instead of being mislabelled by a catch-all branch.
df_enriched_data = (
    df_silver

    # Whole days between the order date and the ship date
    .withColumn("delivery_days", datediff(col("ship_date"), col("order_date")))

    # Unit price; NULL when quantity is missing or zero
    .withColumn(
        "unit_price",
        when(qty_valid, round(col("sales") / col("quantity"), 2)),
    )

    # Profit per unit sold; NULL when quantity is missing or zero
    .withColumn(
        "profit_per_unit",
        when(qty_valid, round(col("profit") / col("quantity"), 2)),
    )

    # Profit margin (profit / sales); NULL when sales is missing or zero
    .withColumn(
        "profit_margin",
        when(sales_valid, round(col("profit") / col("sales"), 2)),
    )

    # Discount bucket. The last branch is an explicit "> 0.5" test rather than
    # otherwise(): a NULL discount fails every comparison and previously fell
    # through to "Discount > 50%"; it now yields a NULL bucket.
    .withColumn(
        "discount_bucket",
        when(col("discount") == 0, "No Discount")
        .when(col("discount") <= 0.1, "0-10%")
        .when(col("discount") <= 0.2, "10-20%")
        .when(col("discount") <= 0.3, "20-30%")
        .when(col("discount") <= 0.4, "30-40%")
        .when(col("discount") <= 0.5, "40-50%")
        .when(col("discount") > 0.5, "Discount > 50%"),
    )

    # Transaction type: non-positive sales or quantity marks a return
    .withColumn(
        "transaction_type",
        when((col("sales") <= 0) | (col("quantity") <= 0), "Return")
        .otherwise("Sale"),
    )

    # Profitable-transaction flag (NULL when profit is NULL)
    .withColumn("is_profitable", col("profit") > 0)

    # Prime-delivery flag: ship date at most one day after the order date.
    # NOTE(review): negative delivery_days (ship date before order date) also
    # satisfy this condition — confirm whether that is intended.
    .withColumn("is_prime_delivery", col("delivery_days") <= 1)

    # Order size. Branches are evaluated in order, so each upper bound alone
    # is sufficient. The last branch is an explicit ">= 1000" test rather than
    # otherwise(): a non-return row with NULL sales previously fell through to
    # "Very Large"; it now yields a NULL size.
    .withColumn(
        "order_size",
        when(col("transaction_type") == "Return", "Return")
        .when(col("sales") < 100, "Small")
        .when(col("sales") < 500, "Medium")
        .when(col("sales") < 1000, "Large")
        .when(col("sales") >= 1000, "Very Large"),
    )

    # Audit columns: processing time and the layer the rows were read from
    .withColumn("gold_processed_timestamp", current_timestamp())
    .withColumn("source_layer", lit("silver"))
)


## Gold Table
###### * Contains enriched, denormalized, business-ready data
###### * Will be used to derive fact and dimension tables for analytics

In [0]:
# Upsert the enriched rows into the denormalized gold table:
# create the table on the first run, MERGE on row_id on every later run.
gold_table_exists = spark.catalog.tableExists(TARGET_TABLE)

if not gold_table_exists:
    print(f"Gold table does not exist. Creating orders_gold table...")

    # First load: write the whole enriched DataFrame as a new Delta table
    gold_writer = df_enriched_data.write.format("delta").mode("overwrite")
    gold_writer.option("overwriteSchema", "true").saveAsTable(TARGET_TABLE)

    print("✅ Gold table 'orders_gold' completed!")

else:
    print(f"Gold table exists. Performing MERGE...")

    delta_table = DeltaTable.forName(spark, TARGET_TABLE)

    # Match existing gold rows to incoming rows on row_id
    merge_builder = delta_table.alias("target").merge(
        df_enriched_data.alias("source"),
        "target.row_id = source.row_id",
    )

    # Matched rows are fully updated, unmatched rows are inserted.
    # In production I will explicitly control update columns by using
    # .whenMatchedUpdate(set={})
    merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

    print("✅ MERGE completed!")
