In [0]:
from pyspark.sql.functions import *
from delta.tables import DeltaTable

**Create the Delta Lake table for fact_sales**

In [0]:
# DDL for the gold fact table. orderdetails_key is an identity column that
# Delta generates (starting at 1, step 1), so loads never supply it.
# IF NOT EXISTS makes this cell safe to re-run.
fact_sales_ddl = """
          CREATE TABLE IF NOT EXISTS retail_cata.gold.fact_sales(
              orderdetails_key BIGINT GENERATED ALWAYS AS IDENTITY(START WITH 1 INCREMENT BY 1),
              orderdetailid INT,
              customer_key BIGINT,
              product_key  BIGINT,
              store_key BIGINT,
              channel_key INT,
              order_date_key BIGINT,
              quantity INT,
              unit_price DECIMAL(18,2),
              total_amount DECIMAL(29,2)
          )USING DELTA 
          LOCATION "abfss://gold@stretailenvdev.dfs.core.windows.net/fact_sales"
          """
spark.sql(fact_sales_ddl)

**Define paths**

In [0]:
# abfss locations used by this notebook: the two silver inputs and the gold
# fact_sales output. All three live on the same storage host.
_storage_host = "stretailenvdev.dfs.core.windows.net"
_silver_root = f"abfss://silver@{_storage_host}"
_gold_root = f"abfss://gold@{_storage_host}"

silver_ord = f"{_silver_root}/s_Orders"
silver_ordedetails = f"{_silver_root}/s_OrderDetails"
gold_path = f"{_gold_root}/fact_sales"

**Incremental loading**

In [0]:
# Watermark for the incremental load: the largest order_date_key already stored
# in the gold fact table. It stays None when gold_path is not a Delta table,
# and is also None when the aggregate comes back null (no rows loaded yet).
max_datekey = None
if DeltaTable.isDeltaTable(spark, gold_path):
    existing_fact = spark.read.format("delta").load(gold_path)
    watermark_row = existing_fact.agg(
        max(col("order_date_key")).alias("max_datekey")
    ).first()
    max_datekey = watermark_row["max_datekey"]

In [0]:
# Read the silver orders and order details that this run has to process.
#
# Outputs (consumed by later cells):
#   inc_orders       - silver orders plus a derived order_date_key (yyyyMMdd as BIGINT)
#   inc_orderdetails - silver order details belonging to those orders
#
# Both silver reads and the date-key derivation are identical for a full and an
# incremental load, so they are done once up front.
silver_orders_keyed = (
    spark.read.format("delta").load(silver_ord)
    .withColumn("order_date_key", date_format(col("OrderDate"), "yyyyMMdd").cast("bigint"))
)
silver_orderdetails_all = spark.read.format("delta").load(silver_ordedetails)

# Compare against None explicitly so the branch does not depend on the
# truthiness of the watermark value.
if max_datekey is not None:
    # Incremental load. The watermark has day granularity, so orders that land
    # in silver later on the watermark day share its order_date_key. A strict
    # ">" filter would skip them on every later run; ">=" re-reads that day.
    # Re-reading is safe because the fact table is loaded with a MERGE keyed on
    # orderdetailid, so rows already present are updated rather than duplicated.
    inc_orders = silver_orders_keyed.filter(col("order_date_key") >= lit(max_datekey))

    # Keep only the detail lines whose parent order is part of this increment.
    inc_orderdetails = silver_orderdetails_all.join(
        inc_orders.select("OrderID"), "OrderID", "inner"
    )
else:
    # No watermark yet (nothing loaded so far): process everything in silver.
    inc_orders = silver_orders_keyed
    inc_orderdetails = silver_orderdetails_all


**Load the dimension tables as DataFrames to build the fact_sales table**

In [0]:
def _load_gold_dimension(table_name):
    """Return every row and column of retail_cata.gold.<table_name> as a DataFrame."""
    return spark.sql(f"""SELECT * FROM retail_cata.gold.{table_name}""")


# Dimension tables joined to the order lines to look up the keys stored on the fact.
df_dimcustomer = _load_gold_dimension("dim_customer")
df_dimstore = _load_gold_dimension("dim_store")
df_dimchannel = _load_gold_dimension("dim_channel")
df_dimproduct = _load_gold_dimension("dim_product")


In [0]:
# Build the fact rows: one row per order-detail line, with the natural IDs
# replaced by the keys looked up in the dimension tables.
fact_lines = inc_orderdetails.alias("i")
fact_orders = inc_orders.alias("o")
dim_customer = df_dimcustomer.alias("c")
dim_product = df_dimproduct.alias("p")
dim_store = df_dimstore.alias("s")
dim_channel = df_dimchannel.alias("ch")

# Every join is an inner join, so a line is dropped when its order or any of
# its four dimension lookups has no match.
fact_joined = fact_lines.join(fact_orders, col("i.OrderID") == col("o.OrderID"), "inner")
fact_joined = fact_joined.join(dim_customer, col("o.CustomerID") == col("c.CustomerID"), "inner")
fact_joined = fact_joined.join(dim_product, col("i.ProductID") == col("p.ProductID"), "inner")
fact_joined = fact_joined.join(dim_store, col("o.StoreID") == col("s.StoreID"), "inner")
fact_joined = fact_joined.join(dim_channel, col("o.ChannelID") == col("ch.ChannelID"), "inner")

# Output columns, in the order they are projected.
fact_projection = [
    col("i.OrderDetailID").alias("orderdetailid"),
    col("c.customer_key").alias("customer_key"),
    col("p.product_key").alias("product_key"),
    col("s.store_key").alias("store_key"),
    col("ch.channel_key").alias("channel_key"),
    col("o.order_date_key").alias("order_date_key"),
    col("i.Quantity").alias("quantity"),
    col("i.UnitPrice").alias("unit_price"),
    # Line total = quantity x unit price.
    (col("i.Quantity") * col("i.UnitPrice")).alias("total_amount"),
]

df_fact_sales = fact_joined.select(*fact_projection)

In [0]:
# gold_ready is the DataFrame handed to the merge/write step below; at this
# point it is simply another name for df_fact_sales (no extra transformation).
gold_ready = df_fact_sales

In [0]:
# Print gold_ready's column names and types for a visual check before writing.
gold_ready.printSchema()


In [0]:
# Load gold_ready into the gold fact table.
#  - If gold_path is not yet a Delta table, write gold_ready there directly.
#  - Otherwise upsert on orderdetailid: matching rows get their keys and
#    measures refreshed, new rows are inserted.
if not DeltaTable.isDeltaTable(spark, gold_path):
    gold_ready.write.mode("overwrite").format("delta").save(gold_path)
else:
    # Columns refreshed on a match. orderdetailid is the merge key, so it is
    # only assigned on insert.
    refreshed_columns = [
        "customer_key",
        "product_key",
        "store_key",
        "channel_key",
        "order_date_key",
        "quantity",
        "unit_price",
        "total_amount",
    ]
    update_assignments = {name: f"s.{name}" for name in refreshed_columns}
    insert_assignments = {"orderdetailid": "s.orderdetailid", **update_assignments}

    gold_factsale = DeltaTable.forPath(spark, gold_path)

    upsert = gold_factsale.alias("t").merge(
        gold_ready.alias("s"),
        "t.orderdetailid = s.orderdetailid",
    )
    upsert = upsert.whenMatchedUpdate(set=update_assignments)
    upsert = upsert.whenNotMatchedInsert(values=insert_assignments)
    upsert.execute()


In [0]:
#df_fact_sales.write.mode("overwrite").format("delta").save(gold_path)

In [0]:
%sql
-- Spot check after the load: the ten fact rows with the highest orderdetails_key.
SELECT *
FROM  retail_cata.gold.fact_sales
ORDER BY orderdetails_key DESC
LIMIT 10;