# Model Baseline - Patient Appointment Prediction

This notebook establishes baseline models and compares several algorithms for predicting whether a patient will miss a scheduled appointment (a "no-show").

## Table of Contents
1. [Data Loading and Preparation](#data-loading)
2. [Baseline Models](#baseline-models)
3. [Model Comparison](#model-comparison)
4. [Hyperparameter Tuning](#hyperparameter-tuning)
5. [Cross-Validation Analysis](#cross-validation)
6. [Model Selection](#model-selection)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    average_precision_score, accuracy_score, precision_score, 
    recall_score, f1_score, roc_curve, precision_recall_curve
)
import xgboost as xgb
import warnings
# Silence only deprecation-style noise. A blanket filterwarnings('ignore')
# would also hide actionable warnings (e.g. scikit-learn's ConvergenceWarning
# when an optimizer stops before converging), which matter when comparing models.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")


<a id="data-loading"></a>
## 1. Data Loading and Preparation


In [None]:
# Load and prepare data
df = pd.read_csv('../data/raw/MedicalCentre.csv')

print("Dataset Overview:")
print(f"Shape: {df.shape}")

# Basic preprocessing
# Drop identifier columns if present (errors='ignore' tolerates their absence).
df = df.drop(columns=['PatientID', 'AppointmentID'], errors='ignore')
# Non-numeric ages are coerced to NaN and then imputed with the median age.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = df['Age'].fillna(df['Age'].median())
df = df.drop_duplicates()
# .copy() makes the filtered frame independent, so the column assignments
# below write to a real frame rather than a possible view of the original.
df = df[df['Age'] >= 0].copy()

# Create target variable: 1 when the 'No-show' column equals 'Yes', else 0
df['NoShow'] = (df['No-show'] == 'Yes').astype(int)

# Create basic features
df['ScheduledDay'] = pd.to_datetime(df['ScheduledDay'])
df['AppointmentDay'] = pd.to_datetime(df['AppointmentDay'])
# Compare calendar dates rather than raw timestamps. If ScheduledDay carries a
# time of day while AppointmentDay is at midnight (TODO confirm against the
# CSV), a same-day booking gives a negative timedelta whose .dt.days is -1,
# and .abs() would then report a 1-day wait instead of 0.
# NOTE(review): .abs() also turns "appointment before scheduling" rows into
# positive waits; consider dropping such rows instead if they are data errors.
df['AwaitingDays'] = (
    df['AppointmentDay'].dt.normalize() - df['ScheduledDay'].dt.normalize()
).dt.days.abs()

# Age groups
# include_lowest=True keeps Age == 0 in the first bin, and the open-ended top
# edge keeps Age > 100 in '65+'. With the default right-closed bins ending at
# 100, both cases became NaN and were label-encoded as a spurious 'nan' class.
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[0, 18, 35, 50, 65, np.inf],
    labels=['0-18', '19-35', '36-50', '51-65', '65+'],
    include_lowest=True,
)

# Prepare features
feature_cols = ['Age', 'Gender', 'Scholarship', 'Hypertension', 'Diabetes',
                'Alcoholism', 'Handicap', 'SMS_received', 'AwaitingDays', 'AgeGroup']
X = df[feature_cols].copy()

# Handle categorical variables
# Each object/category column is mapped to integer codes. The fitted encoders
# are kept in `label_encoders` so codes can be mapped back to labels later.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

y = df['NoShow']

print(f"\nFeatures shape: {X.shape}")
print(f"Target distribution: {y.value_counts()}")
print(f"No-show rate: {y.mean():.1%}")


<a id="baseline-models"></a>
## 2. Baseline Models


In [None]:
# Train-test split (stratified so both splits keep the same no-show rate)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

# Negative-to-positive ratio of the training split, used to give XGBoost the
# same kind of imbalance correction the class_weight='balanced' models get.
# max(..., 1) guards the division if the training split had no positives.
neg_pos_ratio = float((y_train == 0).sum()) / max(int((y_train == 1).sum()), 1)

# Define baseline models
# Every entry exposes the usual fit / predict / predict_proba estimator API.
models = {
    # max_iter raised from the default 100: on unscaled features the default
    # solver can stop before converging. Kept as a bare estimator so that
    # attributes such as .coef_ remain directly accessible.
    'Logistic Regression': LogisticRegression(
        random_state=42, class_weight='balanced', max_iter=1000
    ),
    'Decision Tree': DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    # GradientBoostingClassifier has no class_weight option, so it is left
    # unweighted here; keep that in mind when comparing recall/F1.
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(
        random_state=42, eval_metric='logloss', scale_pos_weight=neg_pos_ratio
    ),
    # SVMs are not scale invariant, so the features are standardized inside a
    # pipeline (fitted on training data only).
    # NOTE(review): SVC with probability=True runs an internal cross-validation
    # and scales poorly with sample count; expect this model to be by far the
    # slowest to train.
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(random_state=42, class_weight='balanced', probability=True)),
    ]),
    'Naive Bayes': GaussianNB()
}

print(f"\nTraining {len(models)} baseline models...")
