In [0]:
import mlflow
import mlflow.spark
from pyspark.sql import functions as F
from pyspark.ml.functions import vector_to_array

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
CATALOG = "fmucd_capstone"
SOURCE_TABLE = f"{CATALOG}.gold.work_orders_enriched"
SCORED_TABLE = f"{CATALOG}.gold.high_duration_risk_queue"

MODEL_ARTIFACT = "high_duration_risk_model"
DFS_TMPDIR = "/Volumes/workspace/sor/fmucd/mlruns/tmp"

# Sentinel returned when no run id is available. taskValues.get() rejects None
# for both `default` and `debugValue`, so an empty string is used instead.
# For interactive debugging outside a job, replace the sentinel passed as
# `debugValue` with a concrete MLflow run id
# (e.g. "a927cdf563b340f48f60ff285561731b").
_MISSING_RUN_ID = ""

# Run id published by the upstream training task.
#   default    -> returned inside a job when the key was never set
#   debugValue -> returned when the notebook runs outside of a job
RUN_ID = dbutils.jobs.taskValues.get(
    taskKey="04_ml_training",
    key="run_id",
    default=_MISSING_RUN_ID,
    debugValue=_MISSING_RUN_ID,
)
if not RUN_ID:
    raise ValueError("run_id not found. Run 04_ml_training first.")

# Load the Spark model logged under the training run.
model_uri = f"runs:/{RUN_ID}/{MODEL_ARTIFACT}"
model = mlflow.spark.load_model(model_uri, dfs_tmpdir=DFS_TMPDIR)

# ------------------------------------------------------------
# Feature preparation
# IMPORTANT: this must reproduce the feature logic used at training time.
# ------------------------------------------------------------
df = spark.table(SOURCE_TABLE)

# Null-safe copies of system code and priority (priority is rendered as a string).
system_code_expr = F.coalesce(F.col("system_code"), F.lit("UNKNOWN"))
priority_expr = F.coalesce(F.col("wo_priority"), F.lit(-1)).cast("string")
df = (
    df.withColumn("system_code_filled", system_code_expr)
      .withColumn("wo_priority_filled", priority_expr)
)

# Duration used by the model: take the recorded duration when present,
# otherwise derive it from the start/end dates when both are available.
both_dates_present = (
    F.col("wo_start_date").isNotNull() & F.col("wo_end_date").isNotNull()
)
derived_duration = F.datediff(
    F.col("wo_end_date"), F.col("wo_start_date")
).cast("double")
df = df.withColumn(
    "duration_days_model",
    F.coalesce(
        F.col("wo_duration_days").cast("double"),
        F.when(both_dates_present, derived_duration),
    ),
)

# Categorical columns: null or blank values fall back to "UNKNOWN".
for cat_col in ("maintenance_type", "building_type", "state_province"):
    is_blank = F.col(cat_col).isNull() | (F.trim(F.col(cat_col)) == "")
    df = df.withColumn(
        cat_col,
        F.when(is_blank, F.lit("UNKNOWN")).otherwise(F.col(cat_col)),
    )

# Numeric candidates, cast to double with try_cast so unparseable values
# become null instead of failing. Not every column is necessarily a model
# input (presumably the pipeline only reads its own features — verify).
num_cols = [
    "labor_hours",
    "labor_cost", "material_cost", "other_cost", "total_cost",
    "building_size", "fci", "dmc",
    "min_temp_c", "max_temp_c", "atmospheric_pressure_hpa", "humidity_pct",
    "wind_speed_mps", "wind_degree", "precipitation_mm", "snow_mm", "cloudness_pct"
]
for num_col in num_cols:
    df = df.withColumn(num_col, F.expr(f"try_cast({num_col} as double)"))

# Replace missing numerics with 0.0 so the model's assembler stage does not
# receive null/NaN inputs.
df = df.fillna(dict.fromkeys(num_cols, 0.0))

# ------------------------------------------------------------
# Score
# ------------------------------------------------------------
scored = model.transform(df)

# risk_score = element 1 of the model's probability vector (class 1).
class1_probability = vector_to_array(F.col("probability")).getItem(1)
scored = scored.withColumn("risk_score", class1_probability)

# Bucket for the ops queue: >= 0.80 is HIGH, >= 0.50 is MEDIUM, anything
# else (including a null score) is LOW.
risk = F.col("risk_score")
risk_bucket_expr = (
    F.when(risk >= 0.80, F.lit("HIGH"))
     .when(risk >= 0.50, F.lit("MEDIUM"))
     .otherwise(F.lit("LOW"))
)
scored = scored.withColumn("risk_bucket", risk_bucket_expr)

# Curated output (keep it human-usable): identifying and descriptive work-order
# columns, the risk outputs, plus scoring metadata.
output_columns = [
    "wo_id", "wo_description", "wo_priority",
    "wo_start_date", "wo_end_date", "wo_duration_days",
    "maintenance_type",
    "system_code", "system_description",
    "subsystem_code", "subsystem_description",
    "component_code", "component_description",
    "building_id", "building_name", "state_province", "building_type",
    "total_cost", "labor_hours",
    "risk_score", "risk_bucket",
]
out = scored.select(
    *output_columns,
    F.current_timestamp().alias("scored_ts"),
    F.lit(RUN_ID).alias("model_run_id"),
)

# Replace the scored table in full, allowing the schema to change between runs.
(out.write.format("delta")
 .mode("overwrite")
 .option("overwriteSchema", "true")
 .saveAsTable(SCORED_TABLE)
)

# "\u2705" is the check-mark emoji; the escape keeps the source ASCII-safe so
# the message cannot be corrupted by a file-encoding round trip.
print("\u2705 Scored table written:", SCORED_TABLE)

# Preview the persisted rows. Reading the table back (instead of re-evaluating
# `out`) avoids running the model a second time and shows exactly what was
# written, including the stored scored_ts values.
display(spark.table(SCORED_TABLE).orderBy(F.desc("risk_score")).limit(50))


In [0]:
%sql
-- Number of scored work orders per risk bucket, largest bucket first.
SELECT
  q.risk_bucket,
  COUNT(1) AS cnt
FROM fmucd_capstone.gold.high_duration_risk_queue AS q
GROUP BY q.risk_bucket
ORDER BY cnt DESC;


In [0]:
%sql
-- Top 20 systems among HIGH-bucket work orders, ranked by average risk score,
-- with the work-order count and summed total cost for each system.
WITH high_risk AS (
  SELECT
    system_code,
    system_description,
    risk_score,
    total_cost
  FROM fmucd_capstone.gold.high_duration_risk_queue
  WHERE risk_bucket = 'HIGH'
)
SELECT
  system_code,
  system_description,
  COUNT(*)        AS work_orders,
  AVG(risk_score) AS avg_risk_score,
  SUM(total_cost) AS total_cost
FROM high_risk
GROUP BY system_code, system_description
ORDER BY avg_risk_score DESC
LIMIT 20;


In [0]:
%sql
-- Rebuild the ranked queue: every scored row plus its percent rank (pr) by
-- risk_score and a rank-based bucket (pr >= 0.99 HIGH, pr >= 0.95 MEDIUM,
-- otherwise LOW).
CREATE OR REPLACE TABLE fmucd_capstone.gold.high_duration_risk_queue_ranked AS
SELECT
  ranked.*,
  CASE
    WHEN ranked.pr >= 0.99 THEN 'HIGH'
    WHEN ranked.pr >= 0.95 THEN 'MEDIUM'
    ELSE 'LOW'
  END AS risk_bucket_ranked
FROM (
  SELECT
    q.*,
    PERCENT_RANK() OVER (ORDER BY q.risk_score ASC) AS pr
  FROM fmucd_capstone.gold.high_duration_risk_queue AS q
) AS ranked;
