In [1]:
# ============================================================
# 03_modeling.ipynb
# Modeling for Mobility SLA Risk Prediction (Early Warning)
# ============================================================

# 0️⃣ Imports and paths
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    average_precision_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import joblib

# The project root is taken to be the parent of the current working directory.
# NOTE(review): this only resolves correctly when the kernel's working directory
# is a direct sub-folder of the project (presumably notebooks/) - confirm.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
SRC_DIR = os.path.join(PROJECT_ROOT, "src")

# Make <project>/src importable. The membership guard keeps repeated runs of
# this cell from piling duplicate entries onto sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Project-local settings module (presumably located in src/, hence the
# sys.path tweak above has to run before this import).
from config import PROCESSED_PATH, MODEL_PATH

# Plot styling: set_theme() is seaborn's preferred entry point; sns.set() is
# merely an alias for it that the seaborn docs flag for possible removal.
sns.set_theme(style="whitegrid")
# Allow up to 200 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 200)

# Echo the resolved locations so a wrong working directory is noticed right away.
print("PROCESSED_PATH:", PROCESSED_PATH)
print("MODEL_PATH:", MODEL_PATH)


PROCESSED_PATH: /Users/loictiemani/Documents/sla-risk-prediction/data/processed/sla_cases_clean.csv
MODEL_PATH: /Users/loictiemani/Documents/sla-risk-prediction/models/sla_rf_model.pkl


In [2]:
# 1️⃣ Load processed feature dataset
# Read the processed dataset from the location supplied by the config module.
df = pd.read_csv(PROCESSED_PATH)

# Name of the label column; stop immediately if the file does not contain it.
TARGET_COL = "sla_breach"
assert TARGET_COL in df.columns, f"Target column '{TARGET_COL}' not found."

print("Dataset shape:", df.shape)
print("Target distribution:")

# Class balance of the label: absolute counts first, then the shares
# rounded to three decimals.
target_series = df[TARGET_COL]
display(
    target_series.value_counts(),
    target_series.value_counts(normalize=True).round(3),
)

# Kept as the last expression so the notebook renders a preview of the frame.
df.head()


Dataset shape: (5000, 40)
Target distribution:


sla_breach
1    4629
0     371
Name: count, dtype: int64

sla_breach
1    0.926
0    0.074
Name: proportion, dtype: float64

Unnamed: 0,office_id,office_load,documents_missing,client_response_delay_days,reassignment_count,days_in_stage,sla_target_days,sla_breach,docs_per_sla,client_delay_per_sla,load_x_reassign,stage_time_ratio,is_urgent,case_type_Payroll,case_type_Relocation,case_type_Tax,case_type_Visa Renewal,case_type_Work Permit,country_CA,country_DE,country_UK,country_US,processing_stage_Decision,processing_stage_Documentation,processing_stage_Government Review,processing_stage_Intake,processing_stage_Submission,priority_Urgent,document_complexity_Low,document_complexity_Medium,missing_docs_bucket_1-2,missing_docs_bucket_3-4,missing_docs_bucket_5+,office_load_bucket_mid,office_load_bucket_high,office_load_bucket_very_high,client_delay_bucket_1-7,client_delay_bucket_8-14,client_delay_bucket_15-30,client_delay_bucket_30+
0,106,51,1,18,1,3,21,1,0.047619,0.857143,102,0.142857,0,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,True,False,False,True,False,False,False,False,True,False
1,119,34,6,4,2,4,35,1,0.171429,0.114286,102,0.114286,0,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False
2,114,121,1,25,0,40,28,1,0.035714,0.892857,121,1.428571,1,False,True,False,False,False,False,True,False,False,False,False,True,False,False,True,True,False,True,False,False,False,False,True,False,False,True,False
3,110,94,2,4,0,33,30,1,0.066667,0.133333,94,1.1,0,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,True,False,True,False,False,False,True,False,True,False,False,False
4,107,85,0,7,1,20,21,1,0.0,0.333333,170,0.952381,0,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False


In [3]:
# 2️⃣ Train-test split (stratified)
# The label is cast to int; every remaining column is used as a feature.
y = df[TARGET_COL].astype(int)
X = df.drop(columns=[TARGET_COL])

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so that re-running the cell reproduces the same partition.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Mean of the integer label in each partition, rounded to three decimals.
train_breach_rate = round(y_train.mean(), 3)
test_breach_rate = round(y_test.mean(), 3)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)
print("Train breach rate:", train_breach_rate, " Test breach rate:", test_breach_rate)


Train shape: (4000, 39)  Test shape: (1000, 39)
Train breach rate: 0.926  Test breach rate: 0.926
