In [0]:
from pyspark.sql import functions as F

# Fully qualified schema names (<catalog>.<schema>): silver is read, gold is written.
CATALOG = "fmucd_capstone"
SILVER, GOLD = f"{CATALOG}.silver", f"{CATALOG}.gold"

# The gold schema must exist before any table is saved into it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {GOLD}")


def _active_rows(table_name):
    """Return the rows of silver table *table_name* whose Active_Flag is true."""
    return spark.table(f"{SILVER}.{table_name}").filter("Active_Flag = true")


fact = spark.table(f"{SILVER}.fact_work_orders")
# NOTE(review): only rows flagged active are kept from the SCD2 dimensions, and the
# fact table is later LEFT-joined to them on the surrogate keys. Any fact row whose
# key is not among the active rows will therefore carry NULL dimension attributes.
# Confirm that fact keys always reference active dimension rows.
dim_b = _active_rows("dim_building_scd2")
dim_a = _active_rows("dim_asset_scd2")

# ------------------------------------------------------------
# GOLD 1: Enriched work orders (base for dashboards)
# ------------------------------------------------------------
# Output columns, grouped by the alias they are read from
# ("f" = fact, "b" = building dimension, "a" = asset dimension).
# The groups are concatenated below in the order the table should expose them.
_work_order_cols = [
    "f.wo_id", "f.wo_description", "f.wo_priority",
    "f.wo_start_date", "f.wo_end_date", "f.wo_duration_days", "f.maintenance_type",
]
_cost_cols = [
    "f.labor_cost", "f.material_cost", "f.other_cost", "f.total_cost", "f.labor_hours",
]
_weather_cols = [
    "f.min_temp_c", "f.max_temp_c", "f.atmospheric_pressure_hpa", "f.humidity_pct",
    "f.wind_speed_mps", "f.wind_degree", "f.precipitation_mm", "f.snow_mm", "f.cloudness_pct",
]
_building_cols = [
    "b.university_id", "b.country", "b.state_province",
    "b.building_id", "b.building_name", "b.building_type", "b.building_size", "b.built_year",
    "b.fci", "b.crv", "b.dmc",
]
_asset_cols = [
    "a.system_code", "a.system_description",
    "a.subsystem_code", "a.subsystem_description",
    "a.component_code", "a.component_description",
]

# Left joins keep every fact row even when no dimension row matches its key.
_fact_with_dims = (
    fact.alias("f")
    .join(dim_b.alias("b"), "building_sk", "left")
    .join(dim_a.alias("a"), "asset_sk", "left")
)

wo_enriched = _fact_with_dims.select(
    *_work_order_cols,
    *_cost_cols,
    *_weather_cols,
    *_building_cols,
    *_asset_cols,
    "f.Load_Date",
)

# Replace the table (data and schema) on every run.
(
    wo_enriched.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{GOLD}.work_orders_enriched")
)

# ------------------------------------------------------------
# Helper: month bucket
# ------------------------------------------------------------
# Read the enriched table back from gold and add a "month" column holding
# wo_start_date truncated to month precision; the monthly rollups group on it.
_month_bucket = F.date_trunc("month", F.col("wo_start_date"))
wo = spark.table(f"{GOLD}.work_orders_enriched").withColumn("month", _month_bucket)

# ------------------------------------------------------------
# GOLD 2: Monthly KPIs
# ------------------------------------------------------------
def _maintenance_type_rows(kind):
    """Return an aggregate counting the rows whose maintenance_type equals *kind*."""
    is_kind = F.col("maintenance_type") == kind
    return F.sum(F.when(is_kind, 1).otherwise(0))


# One output row per month. Note that "work_orders" counts distinct wo_id values,
# whereas "ppm_count" / "upm_count" count rows.
_monthly_metrics = [
    F.countDistinct("wo_id").alias("work_orders"),
    F.avg("wo_duration_days").alias("avg_duration_days"),
    F.expr("percentile_approx(wo_duration_days, 0.5)").alias("median_duration_days"),
    F.sum("total_cost").alias("total_cost"),
    F.avg("total_cost").alias("avg_cost"),
    _maintenance_type_rows("PPM").alias("ppm_count"),
    _maintenance_type_rows("UPM").alias("upm_count"),
]

kpi_monthly = wo.groupBy("month").agg(*_monthly_metrics).orderBy("month")

# Replace the table (data and schema) on every run.
(
    kpi_monthly.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{GOLD}.kpi_work_orders_monthly")
)

# ------------------------------------------------------------
# GOLD 3: Cost by system monthly
# ------------------------------------------------------------
# One output row per (month, system): work-order volume, cost and duration.
_system_keys = ["month", "system_code", "system_description"]
_system_metrics = [
    F.countDistinct("wo_id").alias("work_orders"),
    F.sum("total_cost").alias("total_cost"),
    F.avg("total_cost").alias("avg_cost"),
    F.avg("wo_duration_days").alias("avg_duration_days"),
]

cost_system_monthly = (
    wo.groupBy(*_system_keys)
    .agg(*_system_metrics)
    .orderBy(F.col("total_cost").desc())  # most expensive groups first
)

# Replace the table (data and schema) on every run.
(
    cost_system_monthly.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{GOLD}.cost_by_system_monthly")
)

# ------------------------------------------------------------
# GOLD 4: Building health snapshot (dim + work order rollup)
# ------------------------------------------------------------
# One output row per building, derived from the enriched work-order rows:
# building attributes (fci / crv / dmc) are collapsed with avg / max and
# combined with work-order volume, cost and duration.
_building_keys = [
    "university_id", "building_id", "building_name", "building_type", "state_province",
]
_building_metrics = [
    F.avg("fci").alias("avg_fci"),
    F.max("crv").alias("crv"),
    F.max("dmc").alias("dmc"),
    F.countDistinct("wo_id").alias("work_orders"),
    F.sum("total_cost").alias("total_cost"),
    F.avg("wo_duration_days").alias("avg_duration_days"),
]

building_health = (
    wo.groupBy(*_building_keys)
    .agg(*_building_metrics)
    .orderBy(F.col("total_cost").desc())  # most expensive buildings first
)

# Replace the table (data and schema) on every run.
(
    building_health.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{GOLD}.building_health")
)

# ------------------------------------------------------------
# GOLD 5: PPM vs UPM summary monthly (with cost)
# ------------------------------------------------------------
# One output row per (month, maintenance_type): volume, cost and duration.
_type_keys = ["month", "maintenance_type"]
_type_metrics = [
    F.countDistinct("wo_id").alias("work_orders"),
    F.sum("total_cost").alias("total_cost"),
    F.avg("wo_duration_days").alias("avg_duration_days"),
]

ppm_upm_monthly = wo.groupBy(*_type_keys).agg(*_type_metrics).orderBy(*_type_keys)

# Replace the table (data and schema) on every run.
(
    ppm_upm_monthly.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{GOLD}.ppm_upm_summary_monthly")
)

# Report the gold tables written above, one fully qualified name per line.
print("✅ Gold tables created:")
for _table_name in (
    "work_orders_enriched",
    "kpi_work_orders_monthly",
    "cost_by_system_monthly",
    "building_health",
    "ppm_upm_summary_monthly",
):
    print(f" - {GOLD}.{_table_name}")


In [0]:
%sql
-- Compact the enriched gold table and Z-order its data by the listed columns.
-- The table is overwritten by the Python cell above, so this runs after it.
-- NOTE(review): presumably these columns match the dashboards' most common filters; confirm.
OPTIMIZE fmucd_capstone.gold.work_orders_enriched
ZORDER BY (wo_start_date, building_id, system_code);
