In [0]:
from pyspark.sql import functions as F

SOURCE_TABLE = "fmucd_capstone.gold.work_orders_enriched"

# Fallback duration taken from the two date columns. The WHEN has no
# OTHERWISE branch, so it evaluates to NULL unless both dates are present.
both_dates_known = F.col("wo_start_date").isNotNull() & F.col("wo_end_date").isNotNull()
date_based_duration = F.when(
    both_dates_known,
    F.datediff(F.col("wo_end_date"), F.col("wo_start_date")).cast("double"),
)

# Prefer the recorded duration; use the date-derived value only when the
# recorded one is NULL.
duration_expr = F.coalesce(F.col("wo_duration_days").cast("double"), date_based_duration)

df = (
    spark.table(SOURCE_TABLE)
    # NULL system codes are replaced by "UNKNOWN" so they stay usable as a key.
    .withColumn("system_code_filled", F.coalesce(F.col("system_code"), F.lit("UNKNOWN")))
    .withColumn("duration_days_model", duration_expr)
    # Rows with neither a recorded nor a derivable duration are removed here.
    .filter(F.col("duration_days_model").isNotNull())
    # NULL priorities are replaced by the sentinel -1 instead of being dropped.
    .withColumn("wo_priority_filled", F.coalesce(F.col("wo_priority"), F.lit(-1)))
)

print("Rows after building duration_days_model:", df.count())


In [0]:
from pyspark.sql import functions as F

# Build the binary target `label_high_duration` and split into train/test.
#
# A row is labelled 1.0 when its duration is greater than or equal to the
# approximate 90th percentile of its peer group, where a peer group is every
# row sharing the same (system_code_filled, wo_priority_filled) pair.
#
# NOTE(review): the thresholds are computed on all rows *before* the
# train/test split, so test rows contribute to the label definition —
# confirm this is intended.
peer_keys = ["system_code_filled", "wo_priority_filled"]

# Make the cell safe to re-run: this cell overwrites `df` with the joined
# result, so on a second run `df` would already carry `p90_duration`. Joining
# the thresholds again would then yield two `p90_duration` columns and make
# F.col("p90_duration") ambiguous. DataFrame.drop ignores names that are not
# present, so this is a no-op on the first run.
base_df = df.drop("p90_duration", "label_high_duration")

thresholds = (
    base_df.groupBy(*peer_keys)
      .agg(F.expr("percentile_approx(duration_days_model, 0.90)").alias("p90_duration"))
)

df = (
    base_df.join(thresholds, on=peer_keys, how="left")
      .withColumn(
          "label_high_duration",
          # Inclusive comparison: a row sitting exactly on the threshold is positive.
          F.when(F.col("duration_days_model") >= F.col("p90_duration"), 1.0).otherwise(0.0)
      )
)

# Fixed seed so the 80/20 split uses the same random sequence on every run.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

print("train:", train_df.count(), "test:", test_df.count())
display(train_df.groupBy("label_high_duration").count())


In [0]:
from pyspark.sql import functions as F

SOURCE_TABLE = "fmucd_capstone.gold.work_orders_enriched"

# Columns that define a peer group for the duration threshold.
peer_group = ["system_code_filled", "wo_priority_filled"]

# Date-derived duration; NULL unless both the start and the end date exist.
start_col, end_col = F.col("wo_start_date"), F.col("wo_end_date")
fallback_duration = F.when(
    start_col.isNotNull() & end_col.isNotNull(),
    F.datediff(end_col, start_col).cast("double"),
)

# Start again from the source table: fill the two grouping keys, build the
# model duration (recorded value first, date-derived value second), and keep
# only rows for which a duration could be determined.
df = (
    spark.table(SOURCE_TABLE)
    .withColumn("system_code_filled", F.coalesce(F.col("system_code"), F.lit("UNKNOWN")))
    .withColumn("wo_priority_filled", F.coalesce(F.col("wo_priority"), F.lit(-1)))
    .withColumn(
        "duration_days_model",
        F.coalesce(F.col("wo_duration_days").cast("double"), fallback_duration),
    )
    .filter(F.col("duration_days_model").isNotNull())
)

# Approximate 90th-percentile duration for every peer group.
thresholds = df.groupBy(*peer_group).agg(
    F.expr("percentile_approx(duration_days_model, 0.90)").alias("p90_duration")
)

# Attach each row's group threshold and flag rows at or above it.
at_or_above_p90 = F.col("duration_days_model") >= F.col("p90_duration")
df = df.join(thresholds, on=peer_group, how="left").withColumn(
    "label_high_duration", F.when(at_or_above_p90, 1.0).otherwise(0.0)
)

# 80/20 split with a fixed seed.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

print("train:", train_df.count(), "test:", test_df.count())
display(train_df.groupBy("label_high_duration").count())


In [0]:
from pyspark.sql import functions as F

# Categorical features (the filled keys are used instead of the raw columns)
cat_cols = ["maintenance_type", "system_code_filled", "building_type", "state_province", "wo_priority_filled"]

# Candidate numeric features; any that are entirely NULL in train are dropped below
num_cols = [
    "labor_hours",
    "labor_cost", "material_cost", "other_cost", "total_cost",
    "building_size", "fci", "dmc",
    "min_temp_c", "max_temp_c", "atmospheric_pressure_hpa", "humidity_pct",
    "wind_speed_mps", "wind_degree", "precipitation_mm", "snow_mm", "cloudness_pct"
]

# Categoricals whose NULL / blank values are replaced with "UNKNOWN"
text_cat_cols = ["maintenance_type", "building_type", "state_province"]


def _prepare_features(frame):
    """Return `frame` with categorical and numeric feature columns normalised.

    Applies, in a single projection:
      * NULL or whitespace-only values in `text_cat_cols` -> "UNKNOWN"
      * `wo_priority_filled` cast to string so it is treated as a category
      * every column in `num_cols` converted with `try_cast(... as double)`

    The three column sets do not overlap, so building them in one
    `withColumns` call gives the same result as chaining one `withColumn` per
    column while avoiding the large query plans that repeated `withColumn`
    calls in a loop produce.
    """
    replacements = {}
    for name in text_cat_cols:
        is_blank = F.col(name).isNull() | (F.trim(F.col(name)) == "")
        replacements[name] = F.when(is_blank, F.lit("UNKNOWN")).otherwise(F.col(name))
    replacements["wo_priority_filled"] = F.col("wo_priority_filled").cast("string")
    for name in num_cols:
        replacements[name] = F.expr(f"try_cast({name} as double)")
    return frame.withColumns(replacements)


# Same transformation for both splits so they cannot drift apart
train_df = _prepare_features(train_df)
test_df = _prepare_features(test_df)

# Drop all-null numeric cols (decided on the training split only).
# F.count counts non-NULL values and yields 0 for an empty frame, whereas a
# SUM over zero rows yields NULL, which would make the `> 0` comparison below
# fail with a TypeError.
counts = train_df.select([F.count(F.col(c)).alias(c) for c in num_cols]).collect()[0].asDict()
kept_num_cols = [c for c in num_cols if counts.get(c, 0) > 0]
dropped_num_cols = [c for c in num_cols if c not in kept_num_cols]

print("Kept numeric cols:", kept_num_cols)
print("Dropped all-null numeric cols:", dropped_num_cols)

# Fill remaining NULLs in the kept numeric columns with 0.0; skipped when no
# numeric column survived because there is nothing to fill.
if kept_num_cols:
    fill_values = {c: 0.0 for c in kept_num_cols}
    train_df = train_df.fillna(fill_values)
    test_df = test_df.fillna(fill_values)


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import mlflow, os
import mlflow.spark


mlruns_path = "/Volumes/workspace/sor/fmucd/mlruns"

MODEL_NAME = "high_duration_risk_model"

# Derived column names, built once and shared by the indexers, the encoder
# and the assembler.
idx_cols = [f"{c}__idx" for c in cat_cols]
ohe_cols = [f"{c}__ohe" for c in cat_cols]

# One StringIndexer per categorical column, followed by a single encoder
# that one-hot encodes all indexed columns.
indexers = [
    StringIndexer(inputCol=raw_col, outputCol=idx_col, handleInvalid="keep")
    for raw_col, idx_col in zip(cat_cols, idx_cols)
]
encoder = OneHotEncoder(inputCols=idx_cols, outputCols=ohe_cols, handleInvalid="keep")

# Feature vector = surviving numeric columns + one-hot encoded categoricals.
assembler_inputs = kept_num_cols + ohe_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features", handleInvalid="keep")

lr = LogisticRegression(
    labelCol="label_high_duration",
    featuresCol="features",
    maxIter=50,
    regParam=0.05,
)

pipeline = Pipeline(stages=[*indexers, encoder, assembler, lr])

with mlflow.start_run(run_name="fmucd_high_duration_risk_lr") as run:
    # Fit on the training split, score the held-out split.
    model = pipeline.fit(train_df)
    preds = model.transform(test_df)

    evaluator = BinaryClassificationEvaluator(
        labelCol="label_high_duration",
        metricName="areaUnderROC",
    )
    auc = evaluator.evaluate(preds)
    mlflow.log_metric("auc", float(auc))

    # Record how the label and the feature set were defined for this run.
    run_params = {
        "label_definition": "P90 duration per (system_code_filled, wo_priority_filled)",
        "numeric_features_used": ",".join(kept_num_cols),
        "numeric_features_dropped_all_null": ",".join(dropped_num_cols),
        "categorical_features": ",".join(cat_cols),
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Log the fitted pipeline under MODEL_NAME inside this run's artifacts.
    mlflow.spark.log_model(
        model,
        artifact_path=MODEL_NAME,
        dfs_tmpdir="/Volumes/workspace/sor/fmucd/mlruns/tmp",
        pip_requirements=["pyspark==4.0.0", "mlflow"],
    )

    run_id = run.info.run_id
    print("✅ Training complete")
    print("Run ID:", run_id)
    print("AUC:", auc)
    print("Model URI:", f"runs:/{run_id}/{MODEL_NAME}")

    # Publish the run id as a job task value under the key "run_id".
    dbutils.jobs.taskValues.set(key="run_id", value=run_id)
