In [41]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score



In [42]:

df = pd.read_csv("../data/processed/classification_data.csv")

print("Initial shape:", df.shape)


Initial shape: (109593, 30)


In [43]:
# Try common target names
possible_targets = ["No-show", "No Show", "noshow", "no_show", "target"]

TARGET = None
for col in possible_targets:
    if col in df.columns:
        TARGET = col
        break

if TARGET is None:
    raise ValueError("❌ Target column not found in dataset")

print("Using target column:", TARGET)

Using target column: no_show


In [44]:
X = df.drop(columns=[TARGET])
y = df[TARGET]

In [45]:
# Convert everything to clean lowercase strings
y = y.astype(str).str.strip().str.lower()

# Map all known variants safely
y = y.replace({
    "yes": 0,
    "show": 0,
    "0": 0,

    "no": 1,
    "no-show": 1,
    "noshow": 1,
    "no show": 1,
    "1": 1
})

  y = y.replace({


In [46]:
# Remove invalid rows
valid_mask = y.isin([0, 1])
X = X.loc[valid_mask]
y = y.loc[valid_mask]

# Convert to int (SAFE now)
y = y.astype(int)

print("Target distribution:")
print(y.value_counts())

Target distribution:
no_show
1    74761
0    34832
Name: count, dtype: int64


In [47]:
# Numeric columns → median
num_cols = X.select_dtypes(include=[np.number]).columns
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# Categorical columns → most frequent
cat_cols = X.select_dtypes(exclude=[np.number]).columns
for col in cat_cols:
    X[col] = X[col].fillna(X[col].mode()[0])

# One-hot encode categoricals
X = pd.get_dummies(X, drop_first=True)

print("Final feature count:", X.shape[1])

Final feature count: 530


In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [49]:
model = RandomForestClassifier(
    n_estimators=40,          # ↓ reduced size
    max_depth=12,             # ↓ limits tree growth
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

In [50]:
proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)

print(f"ROC-AUC Score: {auc:.4f}")


ROC-AUC Score: 1.0000


In [51]:
joblib.dump(model, "../models/noshow_model.pkl")
joblib.dump(list(X.columns), "../models/noshow_features.pkl")

print("✅ Model saved: models/noshow_model.pkl")
print("✅ Features saved: models/noshow_features.pkl")

✅ Model saved: models/noshow_model.pkl
✅ Features saved: models/noshow_features.pkl
