In [2]:
# ============================================================
# 06_demo_scoring.ipynb
# Demo: Score New Cases and Export Predictions
# ============================================================

import os, sys
import pandas as pd
import numpy as np
import joblib

# Resolve the project root as the parent of the current working directory, then
# make <root>/src importable. The path is prepended only when absent, so
# re-running this cell does not stack duplicate sys.path entries.
_parent_of_cwd = os.path.join(os.getcwd(), "..")
PROJECT_ROOT = os.path.abspath(_parent_of_cwd)
SRC_DIR = os.path.join(PROJECT_ROOT, "src")
if sys.path.count(SRC_DIR) == 0:
    sys.path.insert(0, SRC_DIR)

from config import RAW_PATH, MODEL_PATH

# Optional: if you want to score raw-format cases, we need the same feature engineering logic.
# For a simple demo, we’ll sample a few rows from RAW and transform them with the same approach
# used in your processed dataset (get_dummies alignment using feature_columns from the artifact).

# Echo the configured input/model locations so the reader can see which files
# this run is using.
for _label, _location in (("RAW_PATH:", RAW_PATH), ("MODEL_PATH:", MODEL_PATH)):
    print(_label, _location)


RAW_PATH: /Users/loictiemani/Documents/sla-risk-prediction/data/raw/sla_cases.csv
MODEL_PATH: /Users/loictiemani/Documents/sla-risk-prediction/models/sla_rf_model.pkl


In [3]:

# 1️⃣ Load model artifact
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code.
# Only point MODEL_PATH at artifacts produced by this project.
artifact = joblib.load(MODEL_PATH)

# The artifact must be a dict bundling the estimator with its scoring metadata.
if isinstance(artifact, dict):
    model = artifact["model"]
    # Decision cut-off applied to the predicted risk; 0.5 when none was stored.
    threshold = artifact.get("threshold", 0.5)
    # Column list used later to align the scoring matrix.
    feature_cols = artifact["feature_columns"]
else:
    raise ValueError("Expected MODEL_PATH to store an artifact dict with feature_columns + threshold.")

print("Threshold:", threshold)
print("Number of expected features:", len(feature_cols))


Threshold: 0.95
Number of expected features: 39


In [4]:
# 2️⃣ Load raw cases (synthetic)
# Read the full synthetic case table.
raw = pd.read_csv(RAW_PATH)

# Fixed-seed draw of 25 rows standing in for "new" cases. The outcome label is
# stripped when the file carries one, as if these were active cases whose
# result is not known yet. `drop` returns a new frame, so `raw` is untouched.
new_cases = (
    raw
    .sample(n=25, random_state=7)
    .drop(columns=["sla_breach"], errors="ignore")
)

new_cases.head()


Unnamed: 0,case_id,case_type,country,processing_stage,office_id,office_load,priority,document_complexity,documents_missing,client_response_delay_days,reassignment_count,days_in_stage,sla_target_days,total_processing_days
3406,3407,Visa Renewal,DE,Government Review,117,104,Normal,Low,1,17,2,29,30,74.6
757,758,Tax,AU,Government Review,111,44,Normal,Medium,2,11,0,31,21,69.1
3624,3625,Visa Renewal,AU,Government Review,103,111,Normal,Low,4,0,2,14,30,71.3
4544,4545,Work Permit,US,Submission,118,37,Normal,Low,2,21,0,24,45,58.4
3235,3236,Work Permit,UK,Documentation,115,98,Normal,Medium,0,17,2,7,45,67.9


In [5]:
# 3️⃣ Minimal feature preparation for scoring
# IMPORTANT: This must match what you did in 02_feature_engineering.
# We'll recreate the same engineered columns and one-hot encoding,
# then align to artifact.feature_columns.

df = new_cases.copy()

# Engineered ratio features (must match 02). A zero SLA target is mapped to NaN
# so the division produces NaN rather than inf; remaining NaNs are zero-filled
# when the scoring matrix is assembled below.
sla_days = df["sla_target_days"].replace(0, np.nan)
df["docs_per_sla"] = df["documents_missing"] / sla_days
df["client_delay_per_sla"] = df["client_response_delay_days"] / sla_days
df["load_x_reassign"] = df["office_load"] * (df["reassignment_count"] + 1)
df["stage_time_ratio"] = df["days_in_stage"] / sla_days

df["missing_docs_bucket"] = pd.cut(
    df["documents_missing"],
    bins=[-1, 0, 2, 4, 10],
    labels=["0", "1-2", "3-4", "5+"]
)

# Quartile edges must come from the reference population, not from the batch
# being scored: quartiles of a small batch shift with every batch (the same
# office_load would land in different buckets) and qcut fails outright on a
# single-row batch. The outer edges are opened to +/-inf so loads outside the
# reference range still fall into "low" / "very_high".
# NOTE(review): assumes 02 derived these quartiles from the full raw dataset;
# ideally persist the edges in the model artifact and read them from there.
_, office_load_edges = pd.qcut(raw["office_load"], q=4, retbins=True)
office_load_edges[0], office_load_edges[-1] = -np.inf, np.inf
df["office_load_bucket"] = pd.cut(
    df["office_load"],
    bins=office_load_edges,
    labels=["low", "mid", "high", "very_high"]
)

df["client_delay_bucket"] = pd.cut(
    df["client_response_delay_days"],
    bins=[-1, 0, 7, 14, 30, 365],
    labels=["0", "1-7", "8-14", "15-30", "30+"]
)

if "priority" in df.columns:
    df["is_urgent"] = (df["priority"].astype(str).str.lower() == "urgent").astype(int)

# Drop identifiers and leakage if present
df = df.drop(columns=["case_id", "total_processing_days"], errors="ignore")

# One-hot encode every level that is present, then keep exactly the training
# columns. drop_first=True must NOT be used here: on a batch it drops the first
# level present in the batch, which differs from the training baseline whenever
# the batch lacks that baseline level, and the wrongly dropped dummy would then
# be zero-filled by the alignment step. Reindexing to feature_cols discards the
# baseline dummies (they are not training features), zero-fills levels absent
# from the batch, and fixes the column order in a single step.
cat_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
df_enc = (
    pd.get_dummies(df, columns=cat_cols)
    .reindex(columns=feature_cols, fill_value=0)
    .fillna(0)
)

print("Scoring matrix shape:", df_enc.shape)
df_enc.head()


Scoring matrix shape: (25, 39)


Unnamed: 0,office_id,office_load,documents_missing,client_response_delay_days,reassignment_count,days_in_stage,sla_target_days,docs_per_sla,client_delay_per_sla,load_x_reassign,...,missing_docs_bucket_1-2,missing_docs_bucket_3-4,missing_docs_bucket_5+,office_load_bucket_mid,office_load_bucket_high,office_load_bucket_very_high,client_delay_bucket_1-7,client_delay_bucket_8-14,client_delay_bucket_15-30,client_delay_bucket_30+
3406,117,104,1,17,2,29,30,0.033333,0.566667,312,...,True,False,False,False,True,False,False,False,True,False
757,111,44,2,11,0,31,21,0.095238,0.52381,44,...,True,False,False,False,False,False,False,True,False,False
3624,103,111,4,0,2,14,30,0.133333,0.0,333,...,False,True,False,False,False,True,False,False,False,False
4544,118,37,2,21,0,24,45,0.044444,0.466667,37,...,True,False,False,False,False,False,False,False,True,False
3235,115,98,0,17,2,7,45,0.0,0.377778,294,...,False,False,False,False,True,False,False,False,True,False


In [6]:

# 4️⃣ Score
# Use class probabilities when the estimator exposes them; otherwise fall back
# to its raw predictions.
# NOTE(review): column 1 is taken as the breach class - confirm against model.classes_.
if hasattr(model, "predict_proba"):
    risk = model.predict_proba(df_enc)[:, 1]
else:
    risk = model.predict(df_enc)

# Binary flag: 1 when the score reaches the artifact's threshold.
pred = (risk >= threshold).astype(int)

# Attach both columns to a copy of the original (un-encoded) case rows.
results = new_cases.assign(predicted_risk=risk, predicted_breach=pred)

# Highest-risk cases first.
results_sorted = results.sort_values(by="predicted_risk", ascending=False)
results_sorted.head(10)


Unnamed: 0,case_id,case_type,country,processing_stage,office_id,office_load,priority,document_complexity,documents_missing,client_response_delay_days,reassignment_count,days_in_stage,sla_target_days,total_processing_days,predicted_risk,predicted_breach
3406,3407,Visa Renewal,DE,Government Review,117,104,Normal,Low,1,17,2,29,30,74.6,1.0,1
757,758,Tax,AU,Government Review,111,44,Normal,Medium,2,11,0,31,21,69.1,1.0,1
4760,4761,Payroll,CA,Government Review,107,108,Normal,Medium,5,16,1,7,14,64.8,1.0,1
2318,2319,Tax,UK,Government Review,107,40,Urgent,Low,2,6,1,4,14,40.6,1.0,1
2755,2756,Relocation,US,Government Review,104,72,Normal,Low,1,30,1,33,35,83.9,1.0,1
2366,2367,Visa Renewal,US,Submission,109,59,Normal,High,6,29,1,6,30,86.7,1.0,1
2916,2917,Payroll,US,Intake,100,105,Normal,Medium,1,21,0,24,14,58.5,1.0,1
3708,3709,Tax,US,Submission,115,61,Normal,Medium,2,5,1,28,21,52.5,1.0,1
640,641,Relocation,CA,Government Review,107,85,Normal,Low,4,22,2,4,35,86.3,1.0,1
3215,3216,Tax,US,Government Review,108,75,Normal,Low,4,15,3,19,21,75.8,1.0,1


In [7]:

# 5️⃣ Export predictions
# Destination: <project root>/outputs/predictions.csv
out_dir = os.path.join(PROJECT_ROOT, "outputs")
out_path = os.path.join(out_dir, "predictions.csv")

# Create the folder on first run; a no-op when it already exists.
os.makedirs(out_dir, exist_ok=True)

# The row index is not written to the file.
results_sorted.to_csv(out_path, index=False)

print("✅ Saved predictions to:", out_path)



✅ Saved predictions to: /Users/loictiemani/Documents/sla-risk-prediction/outputs/predictions.csv
