In [59]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score


In [60]:
df = pd.read_csv("../data/processed/classification_data.csv")

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())


Dataset shape: (109593, 30)
Columns: ['specialty', 'appointment_time', 'gender', 'no_show', 'disability', 'place', 'appointment_shift', 'age', 'under_12_years_old', 'over_60_years_old', 'patient_needs_companion', 'average_temp_day', 'average_rain_day', 'max_temp_day', 'max_rain_day', 'rainy_day_before', 'storm_day_before', 'rain_intensity', 'heat_intensity', 'appointment_date_continuous', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'Scholarship', 'SMS_received', 'no_show_binary', 'day', 'month', 'weekday']


In [61]:
TARGET = [c for c in df.columns if "show" in c.lower()][0]
print("Target column detected:", TARGET)

X = df.drop(columns=[TARGET])
y = df[TARGET]

Target column detected: no_show


In [62]:
if y.dtype == "object":
    y = (
        y.astype(str)
         .str.lower()
         .map({
             "yes": 0,
             "show": 0,
             "no": 1,
             "no-show": 1,
             "noshow": 1
         })
    )

y = y.fillna(0).astype(int)

print("Target value counts:\n", y.value_counts())

Target value counts:
 no_show
1    74761
0    34832
Name: count, dtype: int64


In [63]:
for col in X.columns:
    if X[col].dtype == "object":
        print(f"Encoding categorical column: {col}")
        X[col] = X[col].astype(str).fillna("missing")
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])

Encoding categorical column: rain_intensity
Encoding categorical column: heat_intensity
Encoding categorical column: appointment_date_continuous


In [64]:
X = X.fillna(0)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (87674, 29)
Test shape: (21919, 29)


In [67]:
model = RandomForestClassifier(
    n_estimators=50,        # reduced
    max_depth=10,           # limits tree size
    min_samples_leaf=20,
    min_samples_split=20,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

In [68]:
proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
print(f"ROC-AUC Score: {auc:.4f}")


ROC-AUC Score: 1.0000


In [69]:
joblib.dump(model, "../models/noshow_model.pkl", compress=3)
joblib.dump(list(X.columns), "../models/noshow_features.pkl")

print("✅ Model saved: models/noshow_model.pkl")
print("✅ Features saved: models/noshow_features.pkl")
print("✅ Ready for Streamlit deployment")

✅ Model saved: models/noshow_model.pkl
✅ Features saved: models/noshow_features.pkl
✅ Ready for Streamlit deployment
