In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score
import joblib
import os

# --- Load the processed dataset and normalise the target -------------------
# Rows without a `no_show` label are discarded, then the label is cast to int.
df = (
    pd.read_csv("../data/processed/cleaned_data.csv")
    .dropna(subset=["no_show"])
    .assign(no_show=lambda frame: frame["no_show"].astype(int))
)

# --- Split into target and feature matrix ----------------------------------
# The two appointment-date columns are excluded from the features.
y = df["no_show"]
X = df.drop(columns=["no_show", "appointment_date", "appointment_date_continuous"])

# Defensive re-check: keep only rows whose target is present.
mask = y.notna()
X, y = X.loc[mask], y.loc[mask]

# --- Hold out 20% of rows, stratified on the target ------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# --- Fit the random forest --------------------------------------------------
rf_params = {
    "n_estimators": 300,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,  # use every available core
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)  # the estimator must be fitted before it is persisted

# --- Score on the held-out rows ---------------------------------------------
# Column index 1 of predict_proba is the second entry of model.classes_
# (presumably the no-show label 1 -- confirm against model.classes_).
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

for metric_label, metric_value in (
    ("F1 Score:", f1_score(y_test, y_pred)),
    ("ROC AUC:", roc_auc_score(y_test, y_prob)),
):
    print(metric_label, metric_value)

# --- Persist the fitted model ------------------------------------------------
os.makedirs("../models", exist_ok=True)  # create the output folder if missing
joblib.dump(model, "../models/no_show_model.pkl")

print("✅ Trained model saved successfully")


F1 Score: 0.4689572031344183
ROC AUC: 0.7821148290768918
✅ Trained model saved successfully


In [3]:
import joblib

# Sanity check: read the persisted model back from disk and confirm it carries
# an `estimators_` attribute (i.e. the saved object is a fitted forest).
# NOTE: joblib files are pickle-based; only load artifacts you trust.
loaded_model = joblib.load("../models/no_show_model.pkl")
has_fitted_trees = hasattr(loaded_model, "estimators_")
print(has_fitted_trees)


True


In [None]:
import joblib

# Persist the feature-matrix column names, in order, next to the model file.
# Relies on `X` built in the training cell, so that cell must run first.
feature_names = X.columns.to_list()
joblib.dump(feature_names, "../models/feature_names.pkl")

n_saved = len(feature_names)
print("Feature names saved:", n_saved)
