#### **GOLD loader for lakehouse_sv  → lakehouse_gd**
 - Dimensions: DimDate, DimProduct, DimWarehouse, DimCustomer (opt), DimMill (opt), DimMachine (opt), DimEmployee (opt)
 - Facts: FactOrders, FactShipments, FactDailyFillRate, FactWarehouseDailyUtilization
 - Deterministic SurrogateKey = abs(xxhash64) of the natural key columns (DimDate uses the yyyymmdd integer instead)
 - MERGE auto-adds missing columns; joins to DimDate use aliases to avoid ambiguity

In [1]:
from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# -----------------------------
# CONFIG
# -----------------------------
SILVER_DB = "lakehouse_sv"
GOLD_DB   = "lakehouse_gd"

# -----------------------------
# HELPERS
# -----------------------------
def texists(db, name):
    """Return whether table ``name`` is registered in database ``db``."""
    qualified_name = f"{db}.{name}"
    return spark.catalog.tableExists(qualified_name)

def sk_hash(cols):
    """Build a deterministic 64-bit surrogate key expression.

    Every column named in ``cols`` is cast to string (NULL becomes an empty
    string), the parts are hashed together with xxhash64, and the absolute
    value of that hash is returned as a ``long`` column expression.
    """
    key_parts = []
    for name in cols:
        as_text = F.col(name).cast("string")
        key_parts.append(F.coalesce(as_text, F.lit("")))
    hashed = F.xxhash64(*key_parts)
    return F.abs(hashed).cast("long")

def _add_missing_columns(table_fqn, df):
    """Add any columns present in ``df`` but missing in the target table.

    New columns take their SQL type from the DataFrame schema. Column names
    are backtick-quoted so names containing spaces, dashes or reserved words
    do not break the DDL, and all additions go out as a single
    ``ALTER TABLE ... ADD COLUMNS`` statement instead of one per column.

    Args:
        table_fqn: Fully qualified name of the existing target table.
        df: DataFrame whose schema is compared against the target.
    """
    tgt_cols = set(spark.table(table_fqn).columns)
    missing = [f for f in df.schema.fields if f.name not in tgt_cols]
    if not missing:
        return  # schema already covers every df column

    col_defs = ", ".join(
        # simpleString() yields e.g. 'string', 'int', 'decimal(10,2)', 'date'
        "`{}` {}".format(f.name.replace("`", "``"), f.dataType.simpleString())
        for f in missing
    )
    spark.sql(f"ALTER TABLE {table_fqn} ADD COLUMNS ({col_defs})")

def merge_upsert(table_fqn, df, natural_key_cols, partition_by=None):
    """
    Create/merge managed Delta.
    - Creates table if missing (optionally partitioned by ``partition_by``)
      and returns.
    - If exists: auto-add new columns, then MERGE using only columns common
      to both; matched rows are updated, unmatched rows are inserted.
    - Key columns are compared with the null-safe operator ``<=>`` so a row
      whose natural key contains NULL matches its stored copy instead of
      being inserted again on every run.
    - SurrogateKey is provided by df (deterministic); safe to overwrite.

    Args:
        table_fqn: Fully qualified target table name.
        df: Source DataFrame to write.
        natural_key_cols: Column names that identify a row for the MERGE.
        partition_by: Optional partition columns, used only on creation.

    Raises:
        ValueError: if ``natural_key_cols`` is empty, because no MERGE
            condition could be built for later runs.
    """
    if not natural_key_cols:
        raise ValueError(
            f"merge_upsert({table_fqn}): natural_key_cols must not be empty"
        )

    if not spark.catalog.tableExists(table_fqn):
        w = (df.write.format("delta").mode("overwrite").option("overwriteSchema", "true"))
        if partition_by:
            w = w.partitionBy(*partition_by)
        w.saveAsTable(table_fqn)
        print(f"🆕 {table_fqn}: created")
        return

    _add_missing_columns(table_fqn, df)

    dt = DeltaTable.forName(spark, table_fqn)
    # `<=>` treats NULL = NULL as a match; plain `=` would never match NULL keys.
    cond = " AND ".join(f"t.`{c}` <=> s.`{c}`" for c in natural_key_cols)

    target_cols = set(spark.table(table_fqn).columns)
    common_cols = [c for c in df.columns if c in target_cols]

    # Update and insert write the same column set, so one mapping serves both.
    assignments = {c: f"s.`{c}`" for c in common_cols}

    (dt.alias("t")
       .merge(df.alias("s"), cond)
       .whenMatchedUpdate(set=assignments)
       .whenNotMatchedInsert(values=assignments)
       .execute())
    print(f"✅ {table_fqn}: merged on {natural_key_cols}")

# -----------------------------
# DIMENSIONS
# -----------------------------
# DimProduct: one row per (ProductID, Source); a missing pallet/ton ratio defaults to 1.0
if texists(SILVER_DB, "products"):
    product_keys = ["ProductID", "Source"]
    prod = spark.table(f"{SILVER_DB}.products").dropDuplicates(product_keys)
    pallets_to_ton = F.coalesce(F.col("PalletsToTon").cast("double"), F.lit(1.0))
    prod = prod.withColumn("PalletsToTon", pallets_to_ton)
    prod = prod.withColumn("SurrogateKey", sk_hash(product_keys))
    merge_upsert(
        f"{GOLD_DB}.DimProduct",
        prod,
        natural_key_cols=product_keys,
        partition_by=["Source"],
    )

# DimWarehouse: one row per (WarehouseID, Source) with capacity cast to double
if texists(SILVER_DB, "warehouses"):
    warehouse_keys = ["WarehouseID", "Source"]
    wh = spark.table(f"{SILVER_DB}.warehouses").dropDuplicates(warehouse_keys)
    wh = wh.withColumn("CapacityPallets", F.col("CapacityPallets").cast("double"))
    wh = wh.withColumn("SurrogateKey", sk_hash(warehouse_keys))
    merge_upsert(
        f"{GOLD_DB}.DimWarehouse",
        wh,
        natural_key_cols=warehouse_keys,
        partition_by=["Source"],
    )

# Optional dimensions: each is loaded only when its silver table exists.
# All four follow the same recipe: de-duplicate on (<ID>, Source), attach the
# hashed SurrogateKey, then upsert into Gold partitioned by Source.

# DimCustomer (optional)
if texists(SILVER_DB, "customers"):
    customer_keys = ["CustomerID", "Source"]
    cus = spark.table(f"{SILVER_DB}.customers").dropDuplicates(customer_keys)
    cus = cus.withColumn("SurrogateKey", sk_hash(customer_keys))
    merge_upsert(
        f"{GOLD_DB}.DimCustomer",
        cus,
        natural_key_cols=customer_keys,
        partition_by=["Source"],
    )

# DimMill (optional)
if texists(SILVER_DB, "mills"):
    mill_keys = ["MillID", "Source"]
    mill = spark.table(f"{SILVER_DB}.mills").dropDuplicates(mill_keys)
    mill = mill.withColumn("SurrogateKey", sk_hash(mill_keys))
    merge_upsert(
        f"{GOLD_DB}.DimMill",
        mill,
        natural_key_cols=mill_keys,
        partition_by=["Source"],
    )

# DimMachine (optional)
if texists(SILVER_DB, "machines"):
    machine_keys = ["MachineID", "Source"]
    mac = spark.table(f"{SILVER_DB}.machines").dropDuplicates(machine_keys)
    mac = mac.withColumn("SurrogateKey", sk_hash(machine_keys))
    merge_upsert(
        f"{GOLD_DB}.DimMachine",
        mac,
        natural_key_cols=machine_keys,
        partition_by=["Source"],
    )

# DimEmployee (optional)
if texists(SILVER_DB, "employees"):
    employee_keys = ["EmployeeID", "Source"]
    emp = spark.table(f"{SILVER_DB}.employees").dropDuplicates(employee_keys)
    emp = emp.withColumn("SurrogateKey", sk_hash(employee_keys))
    merge_upsert(
        f"{GOLD_DB}.DimEmployee",
        emp,
        natural_key_cols=employee_keys,
        partition_by=["Source"],
    )

# DimDate (union of dates)
# Every (silver table, column) pair that contributes calendar dates.
date_sources = [
    ("orders", "OrderDate"),
    ("shipments", "Date"),
    ("stockmovements", "Date"),
    ("plannedproductions", "ProdDate"),
    ("machinedowntime", "StartTime"),
    ("machinedowntime", "EndTime"),
    ("machinesensors", "Timestamp"),
]

# Each source is converted to a DATE *before* union/distinct. De-duplicating
# the raw values first would keep several rows for one calendar day whenever a
# source column is a timestamp or string, giving DimDate duplicate `date`
# (and DateSK) rows and fanning out every fact join against it.
date_frames = [
    spark.table(f"{SILVER_DB}.{tbl}").select(F.to_date(F.col(col)).alias("date"))
    for tbl, col in date_sources
    if texists(SILVER_DB, tbl)
]

if date_frames:
    dim_date = date_frames[0]
    for d in date_frames[1:]:
        dim_date = dim_date.unionByName(d)
    # Drop NULLs after conversion so unparseable values cannot create a NULL date row.
    dim_date = (dim_date.where(F.col("date").isNotNull())
                        .distinct()
                        .withColumn("Year",     F.year("date"))
                        .withColumn("Month",    F.month("date"))
                        .withColumn("Day",      F.dayofmonth("date"))
                        .withColumn("Quarter",  F.quarter("date"))
                        # DateSK is the yyyymmdd integer of the date
                        .withColumn("DateSK",   (F.year("date")*10000 + F.month("date")*100 + F.dayofmonth("date")).cast("int"))
                        .withColumn("SurrogateKey", F.col("DateSK").cast("long")))
    merge_upsert(f"{GOLD_DB}.DimDate", dim_date, natural_key_cols=["date"])

# -----------------------------
# DIM LOOKUPS FOR FACT JOINS (no ambiguity)
# -----------------------------
# Slim lookup frames for the fact joins: natural key columns plus the
# dimension's SurrogateKey exposed under a dimension-specific name.
# A lookup is None when its Gold dimension table does not exist.
if texists(GOLD_DB, "DimProduct"):
    DimProductLK = spark.table(f"{GOLD_DB}.DimProduct").select(
        "ProductID", "Source", "PalletsToTon",
        F.col("SurrogateKey").alias("ProductSK"),
    )
else:
    DimProductLK = None

if texists(GOLD_DB, "DimWarehouse"):
    DimWarehouseLK = spark.table(f"{GOLD_DB}.DimWarehouse").select(
        "WarehouseID", "Source",
        F.col("SurrogateKey").alias("WarehouseSK"),
    )
else:
    DimWarehouseLK = None

if texists(GOLD_DB, "DimCustomer"):
    DimCustomerLK = spark.table(f"{GOLD_DB}.DimCustomer").select(
        "CustomerID", "Source",
        F.col("SurrogateKey").alias("CustomerSK"),
    )
else:
    DimCustomerLK = None

if texists(GOLD_DB, "DimDate"):
    DimDateLK = spark.table(f"{GOLD_DB}.DimDate").select(
        "date",
        F.col("SurrogateKey").alias("DateSK"),
    )
else:
    DimDateLK = None

# -----------------------------
# FACTS
# -----------------------------

# FactOrders
# One row per silver order line, keyed by a hash of OrderID | ProductID | Source.
# Built only when silver orders and the product and date lookups are available;
# the customer key is added when a customer lookup exists and orders carry CustomerID.
if texists(SILVER_DB, "orders") and DimProductLK is not None and DimDateLK is not None:
    o = spark.table(f"{SILVER_DB}.orders").alias("o")
    p = DimProductLK.alias("p")
    d = DimDateLK.alias("d")
    c = DimCustomerLK.alias("c") if DimCustomerLK is not None else None

    # Left joins keep every order: product on (ProductID, Source), then the
    # date lookup on the order date truncated to a DATE.
    o1 = o.join(p, on=["ProductID","Source"], how="left") \
          .withColumn("OrderDate_d", F.to_date(F.col("o.OrderDate"))) \
          .join(d, F.col("OrderDate_d") == F.col("d.date"), how="left")

    # A ratio of 0 is replaced by 1.0 to avoid dividing by zero. A NULL ratio
    # (order with no product match) is passed through, so OrderedTons is NULL.
    ratio = F.when(F.col("p.PalletsToTon") == 0, F.lit(1.0)).otherwise(F.col("p.PalletsToTon"))
    o1 = (o1.withColumn("OrderedPallets", F.col("o.QuantityPallets").cast("double"))
            .withColumn("OrderedTons",    F.col("o.QuantityPallets").cast("double") / ratio))

    if c is not None and "CustomerID" in o1.columns:
        o1 = o1.join(c, on=["CustomerID","Source"], how="left")

    # NaturalKey = SHA-256 of "OrderID|ProductID|Source"; SurrogateKey hashes it again.
    # NOTE(review): concat_ws skips NULL inputs, so a NULL OrderID or ProductID
    # shifts the remaining parts — confirm those ids are never NULL in silver.
    o1 = (o1.withColumn("NaturalKey", F.sha2(F.concat_ws("|",
                     F.col("o.OrderID").cast("string"),
                     F.col("o.ProductID").cast("string"),
                     F.coalesce(F.col("o.Source").cast("string"), F.lit(""))), 256))
            .withColumn("SurrogateKey", sk_hash(["NaturalKey"])))

    fact_orders = o1.select(
        "SurrogateKey","NaturalKey",
        "d.DateSK","p.ProductSK",
        # CustomerSK is selected only if the customer join above actually ran
        *(["c.CustomerSK"] if c is not None and "CustomerSK" in o1.columns else []),
        F.col("o.Source").alias("Source"),
        "OrderedPallets","OrderedTons"
    )
    merge_upsert(f"{GOLD_DB}.FactOrders", fact_orders,
                 natural_key_cols=["NaturalKey"],
                 partition_by=["Source"])

# FactShipments
# One row per silver shipment line, keyed by a hash of
# ShipmentID | ProductID | WarehouseID | Source.
# Built only when silver shipments and the product, warehouse and date lookups exist.
if texists(SILVER_DB, "shipments") and DimProductLK is not None and DimWarehouseLK is not None and DimDateLK is not None:
    s = spark.table(f"{SILVER_DB}.shipments").alias("s")
    p = DimProductLK.alias("p")
    w = DimWarehouseLK.alias("w")
    d = DimDateLK.alias("d")

    # Left joins keep every shipment: product and warehouse on (<ID>, Source),
    # then the date lookup on the shipment date truncated to a DATE.
    s1 = s.join(p, on=["ProductID","Source"], how="left") \
          .join(w, on=["WarehouseID","Source"], how="left") \
          .withColumn("ShipDate", F.to_date(F.col("s.Date"))) \
          .join(d, F.col("ShipDate") == F.col("d.date"), how="left")

    # A ratio of 0 is replaced by 1.0 to avoid dividing by zero. A NULL ratio
    # (shipment with no product match) is passed through, so ShippedTons is NULL.
    ratio = F.when(F.col("p.PalletsToTon") == 0, F.lit(1.0)).otherwise(F.col("p.PalletsToTon"))
    s1 = (s1.withColumn("ShippedPallets", F.col("s.Pallets").cast("double"))
            .withColumn("ShippedTons",    F.col("s.Pallets").cast("double") / ratio))

    # NaturalKey = SHA-256 of "ShipmentID|ProductID|WarehouseID|Source";
    # SurrogateKey hashes it again.
    # NOTE(review): concat_ws skips NULL inputs, so a NULL id shifts the
    # remaining parts — confirm those ids are never NULL in silver.
    s1 = (s1.withColumn("NaturalKey", F.sha2(F.concat_ws("|",
                     F.col("s.ShipmentID").cast("string"),
                     F.col("s.ProductID").cast("string"),
                     F.col("s.WarehouseID").cast("string"),
                     F.coalesce(F.col("s.Source").cast("string"), F.lit(""))), 256))
            .withColumn("SurrogateKey", sk_hash(["NaturalKey"])))

    fact_ship = s1.select(
        "SurrogateKey","NaturalKey",
        "d.DateSK","p.ProductSK","w.WarehouseSK",
        F.col("s.Source").alias("Source"),
        "ShippedPallets","ShippedTons"
    )
    merge_upsert(f"{GOLD_DB}.FactShipments", fact_ship,
                 natural_key_cols=["NaturalKey"],
                 partition_by=["Source"])

# FactDailyFillRate (date grain)
# One row per calendar date: tons ordered vs tons shipped that day, summed
# across all products and sources, plus FillRate = ShippedTons / OrderedTons.
# The table is keyed by a hash of the date and is not partitioned.
if texists(SILVER_DB, "orders") and texists(SILVER_DB, "shipments") and DimProductLK is not None and DimDateLK is not None:
    p = DimProductLK.alias("p")
    d = DimDateLK.alias("d")

    # Ordered tons per order date. Pallets are divided by the product's
    # PalletsToTon ratio (0 replaced by 1.0); rows with no product match give
    # NULL tons, which SUM ignores.
    o = (spark.table(f"{SILVER_DB}.orders").alias("o")
            .select("OrderDate","ProductID","QuantityPallets","Source")
            .join(p, on=["ProductID","Source"], how="left")
            .withColumn("OrderedTons", F.col("o.QuantityPallets").cast("double") /
                                      F.when(F.col("p.PalletsToTon") == 0, F.lit(1.0)).otherwise(F.col("p.PalletsToTon")))
            .groupBy(F.to_date(F.col("o.OrderDate")).alias("date"))
            .agg(F.sum("OrderedTons").alias("OrderedTons")))

    # Shipped tons per shipment date, computed the same way.
    sh = (spark.table(f"{SILVER_DB}.shipments").alias("s")
            .select("Date","ProductID","Pallets","Source")
            .join(p, on=["ProductID","Source"], how="left")
            .withColumn("ShippedTons", F.col("s.Pallets").cast("double") /
                                       F.when(F.col("p.PalletsToTon") == 0, F.lit(1.0)).otherwise(F.col("p.PalletsToTon")))
            .groupBy(F.to_date(F.col("s.Date")).alias("date"))
            .agg(F.sum("ShippedTons").alias("ShippedTons")))

    # Full outer join keeps days that have only orders or only shipments; the
    # missing side becomes 0. FillRate is 0.0 on days with nothing ordered.
    fr1 = (o.join(sh, "date", "full")
             .na.fill({"OrderedTons":0.0, "ShippedTons":0.0})
             .withColumn("FillRate", F.when(F.col("OrderedTons") > 0,
                                            F.col("ShippedTons")/F.col("OrderedTons"))
                                   .otherwise(F.lit(0.0))))
    fr2 = fr1.alias("fr").join(d, F.col("fr.date") == F.col("d.date"), how="left")

    # NOTE(review): a NULL date (rows whose OrderDate/Date is NULL are grouped
    # together) yields a NULL NaturalKey, since sha2 of NULL is NULL — confirm
    # whether such rows should be filtered out before the upsert.
    fr2 = (fr2.withColumn("NaturalKey", F.sha2(F.col("fr.date").cast("string"), 256))
               .withColumn("SurrogateKey", sk_hash(["NaturalKey"])))

    fact_fr = fr2.select(
        "SurrogateKey","NaturalKey",
        "d.DateSK",
        "OrderedTons","ShippedTons","FillRate"
    )
    merge_upsert(f"{GOLD_DB}.FactDailyFillRate", fact_fr, natural_key_cols=["NaturalKey"])

# FactWarehouseDailyUtilization (date, warehouse)
# One row per (Date, WarehouseID, Source) that has at least one stock movement:
# the running pallet balance of the warehouse and its share of capacity.
# Days without any movement for a warehouse produce no row.
if texists(SILVER_DB, "stockmovements") and texists(SILVER_DB, "warehouses") and DimWarehouseLK is not None and DimDateLK is not None:
    mv = spark.table(f"{SILVER_DB}.stockmovements").select("Date","FromType","FromID","ToType","ToID","Pallets","Source").alias("m")
    wh = spark.table(f"{SILVER_DB}.warehouses").select("WarehouseID","CapacityPallets","Source").alias("w0")

    # Pallets moved INTO a warehouse (ToType == "Warehouse"), per day and warehouse.
    receipts = (mv.where(F.col("m.ToType") == F.lit("Warehouse"))
                  .groupBy(F.col("m.Date").alias("Date"), F.col("m.ToID").alias("WarehouseID"), F.col("m.Source").alias("Source"))
                  .agg(F.sum("Pallets").alias("receipts_pallets")))
    # Pallets moved OUT OF a warehouse (FromType == "Warehouse"), per day and warehouse.
    shipments = (mv.where(F.col("m.FromType") == F.lit("Warehouse"))
                   .groupBy(F.col("m.Date").alias("Date"), F.col("m.FromID").alias("WarehouseID"), F.col("m.Source").alias("Source"))
                   .agg(F.sum("Pallets").alias("shipments_pallets")))

    # Net daily change; a day with only receipts or only shipments gets 0 for the other side.
    daily = (receipts.join(shipments, ["Date","WarehouseID","Source"], "full")
                   .na.fill({"receipts_pallets":0.0, "shipments_pallets":0.0})
                   .withColumn("delta_pallets", F.col("receipts_pallets") - F.col("shipments_pallets")))

    # Running sum of the net change from the first movement up to the current
    # row, per warehouse and source. The balance is clamped to 0 only after the
    # running sum, so the clamp does not feed into later days.
    # Utilization is 0.0 when capacity is NULL or not positive.
    w = Window.partitionBy("WarehouseID","Source").orderBy("Date").rowsBetween(Window.unboundedPreceding, 0)
    eod = (daily.withColumn("pallets_occupied", F.sum("delta_pallets").over(w))
               .join(wh, on=["WarehouseID","Source"], how="left")
               .withColumn("pallets_occupied", F.when(F.col("pallets_occupied") < 0, F.lit(0.0)).otherwise(F.col("pallets_occupied")))
               .withColumn("Utilization", F.when(F.col("CapacityPallets") > 0,
                                                 F.col("pallets_occupied")/F.col("CapacityPallets"))
                                          .otherwise(F.lit(0.0))))

    # Join dims using aliases to avoid ambiguity
    d = DimDateLK.alias("d")
    wlk = DimWarehouseLK.alias("wlk")
    eod = (eod.withColumn("EODDate", F.to_date(F.col("Date")))
              .alias("x")
              .join(d, F.col("x.EODDate") == F.col("d.date"), how="left")
              .join(wlk, on=["WarehouseID","Source"], how="left"))

    # NaturalKey = SHA-256 of "Date|WarehouseID|Source".
    # NOTE(review): the key uses the raw Date while DateSK is resolved from
    # to_date(Date); if Date carries a time component the two grains differ — confirm.
    eod = (eod.withColumn("NaturalKey",
                          F.sha2(F.concat_ws("|",
                                             F.col("x.Date").cast("string"),
                                             F.col("x.WarehouseID").cast("string"),
                                             F.coalesce(F.col("x.Source").cast("string"), F.lit(""))), 256))
              .withColumn("SurrogateKey", sk_hash(["NaturalKey"])))

    fact_util = eod.select(
        "SurrogateKey","NaturalKey",
        "d.DateSK","wlk.WarehouseSK",
        F.col("x.Source").alias("Source"),
        F.col("pallets_occupied").alias("PalletsOccupied"),
        "CapacityPallets","Utilization"
    )
    merge_upsert(f"{GOLD_DB}.FactWarehouseDailyUtilization",
                 fact_util,
                 natural_key_cols=["NaturalKey"],
                 partition_by=["Source"])


StatementMeta(, b0b85492-c77f-4a14-ac16-ec6c7c8853ed, 3, Finished, Available, Finished)

🆕 lakehouse_gd.DimProduct: created
🆕 lakehouse_gd.DimWarehouse: created
🆕 lakehouse_gd.DimCustomer: created
🆕 lakehouse_gd.DimMill: created
🆕 lakehouse_gd.DimMachine: created
🆕 lakehouse_gd.DimEmployee: created
🆕 lakehouse_gd.DimDate: created
🆕 lakehouse_gd.FactOrders: created
🆕 lakehouse_gd.FactShipments: created
🆕 lakehouse_gd.FactDailyFillRate: created
🆕 lakehouse_gd.FactWarehouseDailyUtilization: created


In [2]:
%%sql
-- =========================================================
-- 0) BASIC ROW COUNTS (Gold)
-- =========================================================

-- One output row per Gold table: its name and current row count.
SELECT 'DimProduct' AS table_name, COUNT(*) AS rows
FROM lakehouse_gd.DimProduct
UNION ALL
SELECT 'DimWarehouse' AS table_name, COUNT(*) AS rows
FROM lakehouse_gd.DimWarehouse
UNION ALL
SELECT 'DimCustomer' AS table_name, COUNT(*) AS rows
FROM lakehouse_gd.DimCustomer
UNION ALL
SELECT 'DimDate' AS table_name, COUNT(*) AS rows
FROM lakehouse_gd.DimDate
UNION ALL
SELECT 'FactOrders' AS table_name, COUNT(*) AS rows
FROM lakehouse_gd.FactOrders
UNION ALL
SELECT 'FactShipments' AS table_name, COUNT(*) AS rows
FROM lakehouse_gd.FactShipments
UNION ALL
SELECT 'FactDailyFillRate' AS table_name, COUNT(*) AS rows
FROM lakehouse_gd.FactDailyFillRate
UNION ALL
SELECT 'FactWarehouseDailyUtilization' AS table_name, COUNT(*) AS rows
FROM lakehouse_gd.FactWarehouseDailyUtilization;


StatementMeta(, b0b85492-c77f-4a14-ac16-ec6c7c8853ed, 4, Finished, Available, Finished)

<Spark SQL result set with 8 rows and 2 fields>

In [3]:
%%sql

-- =========================================================
-- 3) FACTS: FK / REFERENTIAL INTEGRITY CHECKS (SKs present?)
-- =========================================================

-- Each check counts fact rows whose key has no matching dimension row.
-- Rows with a NULL key are counted as well, because NULL never equals a SurrogateKey.

-- Missing ProductSK on Orders
SELECT COUNT(*) AS missing_productsk
FROM lakehouse_gd.FactOrders AS fo
WHERE NOT EXISTS (
  SELECT 1
  FROM lakehouse_gd.DimProduct AS dp
  WHERE dp.SurrogateKey = fo.ProductSK
);

-- Missing DateSK on Orders
SELECT COUNT(*) AS missing_datesk
FROM lakehouse_gd.FactOrders AS fo
WHERE NOT EXISTS (
  SELECT 1
  FROM lakehouse_gd.DimDate AS dd
  WHERE dd.SurrogateKey = fo.DateSK
);

-- Missing WarehouseSK on Shipments
SELECT COUNT(*) AS missing_warehousesk
FROM lakehouse_gd.FactShipments AS fs
WHERE NOT EXISTS (
  SELECT 1
  FROM lakehouse_gd.DimWarehouse AS dw
  WHERE dw.SurrogateKey = fs.WarehouseSK
);

-- Null key counts on FactShipments (quick view)
-- FactShipments has no CustomerSK column, so the fourth figure is the count of
-- NULL NaturalKey values and is labelled as such.
-- To run the same check on FactOrders, change the table AND drop the
-- WarehouseSK line (FactOrders has no WarehouseSK; it has CustomerSK only
-- when the customer dimension was loaded).
SELECT 
  SUM(CASE WHEN DateSK      IS NULL THEN 1 ELSE 0 END) AS null_datesk,
  SUM(CASE WHEN ProductSK   IS NULL THEN 1 ELSE 0 END) AS null_productsk,
  SUM(CASE WHEN WarehouseSK IS NULL THEN 1 ELSE 0 END) AS null_warehousesk,
  SUM(CASE WHEN NaturalKey  IS NULL THEN 1 ELSE 0 END) AS null_naturalkey
FROM lakehouse_gd.FactShipments;


StatementMeta(, b0b85492-c77f-4a14-ac16-ec6c7c8853ed, 8, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 1 rows and 1 fields>

<Spark SQL result set with 1 rows and 4 fields>

In [4]:
%%sql

-- =========================================================
-- 5) MEASURE RECONCILIATION vs SILVER (ORDERS)
-- =========================================================
-- Gold FactOrders vs Silver orders (per day)
-- * silver LEFT JOINs the product dimension, like the Gold loader does, so
--   orders without a product match still count towards pallets.
--   Such orders have NULL tons in Gold but pallets/1.0 here, so they surface
--   as a tons_diff.
-- * The date is taken from whichever side has it, and a missing side counts
--   as 0 in the diffs, so days present on only one side show a visible
--   difference instead of NULLs.
WITH gold AS (
  SELECT fo.DateSK, SUM(fo.OrderedPallets) AS ordered_pallets_gold,
         SUM(fo.OrderedTons) AS ordered_tons_gold
  FROM lakehouse_gd.FactOrders fo
  GROUP BY fo.DateSK
),
dates AS (
  SELECT d.SurrogateKey AS DateSK, d.date
  FROM lakehouse_gd.DimDate d
),
silver AS (
  SELECT to_date(o.OrderDate) AS date,
         SUM(o.QuantityPallets) AS ordered_pallets_silver,
         SUM(CAST(o.QuantityPallets AS DOUBLE) / 
             CASE WHEN p.PalletsToTon IS NULL OR p.PalletsToTon = 0 THEN 1.0 ELSE p.PalletsToTon END) AS ordered_tons_silver
  FROM lakehouse_sv.orders o
  LEFT JOIN lakehouse_gd.DimProduct p
    ON o.ProductID = p.ProductID AND o.Source = p.Source
  GROUP BY to_date(o.OrderDate)
)
SELECT COALESCE(d.date, s.date) AS date,
       g.ordered_pallets_gold, s.ordered_pallets_silver,
       g.ordered_tons_gold,    s.ordered_tons_silver,
       (COALESCE(g.ordered_pallets_gold, 0) - COALESCE(s.ordered_pallets_silver, 0)) AS pallets_diff,
       (COALESCE(g.ordered_tons_gold, 0)    - COALESCE(s.ordered_tons_silver, 0))    AS tons_diff
FROM gold g
JOIN dates d ON g.DateSK = d.DateSK
FULL JOIN silver s ON s.date = d.date
ORDER BY COALESCE(d.date, s.date)
LIMIT 200;


StatementMeta(, b0b85492-c77f-4a14-ac16-ec6c7c8853ed, 9, Finished, Available, Finished)

<Spark SQL result set with 7 rows and 7 fields>

In [5]:
%%sql

-- =========================================================
-- 10) QUICK SPOT CHECKS (top dates / warehouses)
-- =========================================================
-- Most recent utilization by warehouse (newest dates first, 100 rows)
SELECT
  d.date,
  f.WarehouseSK,
  f.Source,
  f.PalletsOccupied,
  f.CapacityPallets,
  f.Utilization
FROM lakehouse_gd.FactWarehouseDailyUtilization AS f
INNER JOIN lakehouse_gd.DimDate AS d
  ON d.SurrogateKey = f.DateSK
ORDER BY d.date DESC, f.WarehouseSK ASC
LIMIT 100;

-- Daily totals orders vs shipments in Gold (pallets)
-- Each fact is summed per DateSK BEFORE joining to DimDate. Joining both fact
-- tables to the date first pairs every order row with every shipment row of
-- the same day, which multiplies both sums.
WITH o AS (
  SELECT DateSK, SUM(OrderedPallets) AS orders_pallets
  FROM lakehouse_gd.FactOrders
  GROUP BY DateSK
),
s AS (
  SELECT DateSK, SUM(ShippedPallets) AS shipments_pallets
  FROM lakehouse_gd.FactShipments
  GROUP BY DateSK
)
SELECT d.date,
       o.orders_pallets,
       s.shipments_pallets
FROM lakehouse_gd.DimDate d
LEFT JOIN o ON o.DateSK = d.SurrogateKey
LEFT JOIN s ON s.DateSK = d.SurrogateKey
ORDER BY d.date DESC
LIMIT 200;


StatementMeta(, b0b85492-c77f-4a14-ac16-ec6c7c8853ed, 11, Finished, Available, Finished)

<Spark SQL result set with 18 rows and 6 fields>

<Spark SQL result set with 8 rows and 3 fields>

In [6]:
from notebookutils import mssparkutils

# Safety switch: nothing is dropped unless this is flipped to True.
delete_tables = False  # <-- set True to drop the tables listed below

if delete_tables:
    # same list you used when creating
    tables = [
        "DimCustomer",
        "DimEmployee",
        "DimMachine",
        "DimMill",
        "DimProduct",
        "DimWarehouse",
        "DimDate",
        "FactDailyFillRate",
        "FactOrders",
        "FactWarehouseDailyUtilization",
        "FactShipments",
    ]

    # Tables are dropped from the session's current database.
    # NOTE(review): the loader writes to GOLD_DB explicitly — confirm the
    # current database is the Gold lakehouse before enabling this cell.
    db = spark.catalog.currentDatabase()
    print(f"Dropping from database: {db}")

    for t in tables:
        drop_stmt = f"DROP TABLE IF EXISTS {db}.{t.lower()}"
        spark.sql(drop_stmt)
        print(f"🗑️  Dropped: {db}.{t}")

StatementMeta(, b0b85492-c77f-4a14-ac16-ec6c7c8853ed, 12, Finished, Available, Finished)