In [19]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer



In [20]:
# Load the preprocessed classification dataset (path is relative to the
# notebook's working directory).
data_path = "../data/processed/classification_data.csv"
df = pd.read_csv(data_path)

# Quick sanity check: report (rows, columns), then preview the first rows.
print("Shape:", df.shape)
df.head()


Shape: (109593, 30)


Unnamed: 0,specialty,appointment_time,gender,no_show,disability,place,appointment_shift,age,under_12_years_old,over_60_years_old,...,Hipertension,Diabetes,Alcoholism,Handcap,Scholarship,SMS_received,no_show_binary,day,month,weekday
0,6,17,0,yes,2,9917,0,9.0,1,0,...,0,0,0,0,0,0,1,1,1,2
1,0,7,2,no,2,7338,1,11.0,1,0,...,0,0,0,0,0,0,0,1,1,2
2,8,16,2,no,2,7337,0,8.0,1,0,...,0,0,0,0,0,0,0,1,1,2
3,8,14,2,yes,2,20037,0,9.0,1,0,...,0,0,0,0,0,1,1,1,1,2
4,5,8,2,no,3,7337,1,12.0,0,0,...,0,0,0,0,0,0,0,1,1,2


In [21]:
# Target vector: the binary no-show flag.
y = df["no_show_binary"]

# Feature matrix. Columns removed:
#   - "no_show" / "no_show_binary": both are encodings of the target and would
#     leak the label into the features.
#   - "Unnamed: 0": judging by the preview (0, 1, 2, ...) this is a row index
#     that was saved into the CSV. It is a row counter rather than an
#     appointment attribute, so it must not be learned as a feature.
# errors="ignore" keeps the cell working if any of these columns is absent.
X = df.drop(
    columns=["Unnamed: 0", "no_show", "no_show_binary"],
    errors="ignore",
)


In [22]:
# Integer codes for ordered text levels.
ORDINAL_MAP = {"low": 0, "moderate": 1, "high": 2}

# Encode every object-typed column whose non-null values all belong to the
# known ordinal levels; any other object column is left untouched here.
ordinal_levels = set(ORDINAL_MAP)
for name in X.select_dtypes(include="object").columns:
    observed = set(X[name].dropna().unique())
    if observed <= ordinal_levels:
        X[name] = X[name].map(ORDINAL_MAP)


In [23]:
# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each encoded column so the dummies are not redundant.
X = pd.get_dummies(data=X, drop_first=True)


In [24]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [25]:
# Give the test frame exactly the training frame's columns (left join on the
# column axis); columns that have to be introduced are filled with 0.
aligned_frames = X_train.align(X_test, join="left", axis=1, fill_value=0)
X_train, X_test = aligned_frames


In [26]:
# Fill missing feature values with per-column medians. The imputer is fitted
# on the training split only and then applied to the test split.
imputer = SimpleImputer(strategy="median")

# The imputer's output is wrapped back into DataFrames. Passing the original
# index (not just the column names) keeps the row labels produced by the
# split, so X_train / X_test stay label-aligned with y_train / y_test instead
# of being silently renumbered 0..n-1.
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)


In [27]:
# Sanity check: count the missing cells remaining in each split.
train_nan_total = X_train.isna().sum().sum()
test_nan_total = X_test.isna().sum().sum()

print("NaNs in X_train:", train_nan_total)
print("NaNs in X_test:", test_nan_total)


NaNs in X_train: 0
NaNs in X_test: 0


In [28]:
# Random forest configuration: 200 trees, fixed seed for repeatability,
# class_weight="balanced" to reweight the classes, all CPU cores (n_jobs=-1).
rf_params = dict(
    n_estimators=200,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1,
)
model = RandomForestClassifier(**rf_params)

# Train on the imputed training split.
model.fit(X_train, y_train)


In [29]:
# Score the held-out split: hard class labels for the per-class report and
# the probability of class 1 for the ROC-AUC.
probs = model.predict_proba(X_test)[:, 1]
preds = model.predict(X_test)

report_text = classification_report(y_test, preds)
auc_value = roc_auc_score(y_test, probs)

print(report_text)
print("ROC-AUC:", auc_value)


              precision    recall  f1-score   support

           0       0.74      0.90      0.81     14952
           1       0.60      0.33      0.43      6967

    accuracy                           0.72     21919
   macro avg       0.67      0.61      0.62     21919
weighted avg       0.70      0.72      0.69     21919

ROC-AUC: 0.7721588994835624


In [30]:
# Persist the artifacts needed to score new data with this model.
os.makedirs("../models", exist_ok=True)

# Trained classifier and the exact column order it was fitted on.
joblib.dump(model, "../models/noshow_model.pkl")
joblib.dump(list(X_train.columns), "../models/noshow_features.pkl")

# The fitted imputer is part of the preprocessing: without it, inference code
# cannot fill missing values with the same training-set medians.
joblib.dump(imputer, "../models/noshow_imputer.pkl")

print("✅ Model & features saved successfully")


✅ Model & features saved successfully
