In [4]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report


In [2]:
df = pd.read_csv("../data/processed/classification_data.csv")

X = df.drop(["no_show", "appointment_date_continuous"], axis=1)
y = df["no_show"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [5]:
df = pd.read_csv("../data/processed/classification_data.csv")

print("Dataset Shape:", df.shape)
df.head()



Dataset Shape: (109593, 30)


Unnamed: 0,specialty,appointment_time,gender,no_show,disability,place,appointment_shift,age,under_12_years_old,over_60_years_old,...,Hipertension,Diabetes,Alcoholism,Handcap,Scholarship,SMS_received,no_show_binary,day,month,weekday
0,6,17,0,yes,2,9917,0,9.0,1,0,...,0,0,0,0,0,0,1,1,1,2
1,0,7,2,no,2,7338,1,11.0,1,0,...,0,0,0,0,0,0,0,1,1,2
2,8,16,2,no,2,7337,0,8.0,1,0,...,0,0,0,0,0,0,0,1,1,2
3,8,14,2,yes,2,20037,0,9.0,1,0,...,0,0,0,0,0,1,1,1,1,2
4,5,8,2,no,3,7337,1,12.0,0,0,...,0,0,0,0,0,0,0,1,1,2


In [11]:
print(df.columns)
# =====================================================
# TARGET SETUP (CORRECTED)
# =====================================================

# Use the already processed binary target
TARGET = "no_show_binary"

# Make sure it's numeric
df[TARGET] = pd.to_numeric(df[TARGET], errors="coerce")

# Drop rows where target is missing
df = df.dropna(subset=[TARGET])

# Convert to integer
df[TARGET] = df[TARGET].astype(int)

print("Target distribution:")
print(df[TARGET].value_counts())


Index(['specialty', 'appointment_time', 'gender', 'no_show', 'disability',
       'place', 'appointment_shift', 'age', 'under_12_years_old',
       'over_60_years_old', 'patient_needs_companion', 'average_temp_day',
       'average_rain_day', 'max_temp_day', 'max_rain_day', 'rainy_day_before',
       'storm_day_before', 'rain_intensity', 'heat_intensity',
       'appointment_date_continuous', 'Hipertension', 'Diabetes', 'Alcoholism',
       'Handcap', 'Scholarship', 'SMS_received', 'no_show_binary', 'day',
       'month', 'weekday'],
      dtype='object')
Target distribution:
no_show_binary
0    74761
1    34832
Name: count, dtype: int64


In [12]:
X = df.drop(columns=[TARGET])
y = df[TARGET]


In [13]:
for col in X.columns:
    if X[col].dtype == "object":
        X[col] = X[col].astype(str)
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])


In [14]:
# Replace infinite values
X = X.replace([np.inf, -np.inf], np.nan)

# Fill numeric NaN with median
for col in X.columns:
    if X[col].dtype in ["int64", "float64"]:
        X[col] = X[col].fillna(X[col].median())

# Final safety check
print("Remaining NaN values:", X.isna().sum().sum())


Remaining NaN values: 0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (87674, 29)
Test shape: (21919, 29)


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Smaller, deployment-safe model
model = RandomForestClassifier(
    n_estimators=30,      # reduced trees
    max_depth=8,          # limit tree size
    min_samples_leaf=20,  # control overfitting
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)



In [20]:
proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)

print(f"ROC-AUC Score: {auc:.4f}")

preds = model.predict(X_test)
print("\nClassification Report:\n")
print(classification_report(y_test, preds))


ROC-AUC Score: 1.0000

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14952
           1       1.00      1.00      1.00      6967

    accuracy                           1.00     21919
   macro avg       1.00      1.00      1.00     21919
weighted avg       1.00      1.00      1.00     21919



In [21]:
joblib.dump(model, "../models/noshow_model.pkl")
joblib.dump(list(X.columns), "../models/noshow_features.pkl")

print("✅ Model saved: models/noshow_model.pkl")
print("✅ Features saved: models/noshow_features.pkl")


✅ Model saved: models/noshow_model.pkl
✅ Features saved: models/noshow_features.pkl
