In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Step 1: Reading the Dataset
# Location of the appointments CSV on the author's machine.
CSV_PATH = r"C:\Users\marca\Downloads\KaggleV2-May-2016.csv"

# Load the CSV into a Pandas DataFrame.
df = pd.read_csv(CSV_PATH)

# Report per-column null counts, drop every row that has at least one
# missing value, then report again to confirm nothing is left.
print("Missing values before dropping:\n", df.isnull().sum())
df = df.dropna(axis=0, how="any")
print("Missing values after dropping:\n", df.isnull().sum())

# Step 2: Feature Extraction
# Columns used as model inputs (spelled exactly as in the CSV header).
features = [
    'Gender',
    'Age',
    'Scholarship',
    'Hipertension',
    'Diabetes',
    'Alcoholism',
    'Handcap',
    'SMS_received',
]

# Independent copy of the input columns, so later column assignments do not
# touch `df` or raise SettingWithCopyWarning.
X = df.loc[:, features].copy()

# Target column; per the original author: 'Yes' = no-show, 'No' = showed up.
y = df['No-show']

# Steps 3-4: Preprocessing and Splitting the Data
#
# Encoding is stateless with respect to the split, so it is done up front.
# Scaling is NOT: the scaler's mean/std must be learned from the training
# rows only, otherwise validation/test statistics leak into preprocessing.
# Hence the order here is: encode -> split -> fit scaler on train -> transform.

# a) Encode the categorical input feature.
# LabelEncoder assigns integer codes in sorted class order, so with the two
# labels 'F' and 'M' the mapping is 'F' -> 0, 'M' -> 1 (see `le.classes_`).
# The target `y` is left as strings; the classifiers and metrics used below
# accept string class labels directly.
le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])

# b) Split into 80% train, 10% validation, 10% test.
# First hold out 10% for test, then take 0.1111 of the remaining 90%
# (~10% of the total) as the validation set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.1111, random_state=42
)

# c) Scale Age (the only continuous numeric feature).
# Fit on the training split only, then apply the same transform everywhere.
# `assign` returns new DataFrames, which avoids SettingWithCopyWarning on the
# frames returned by train_test_split. StandardScaler needs 2D input, hence
# the double brackets; `.ravel()` flattens the (n, 1) result back to a column.
# NOTE: `X` itself keeps the raw (unscaled) Age; only the splits are scaled.
scaler = StandardScaler()
scaler.fit(X_train[['Age']])
X_train_full, X_train, X_val, X_test = (
    part.assign(Age=scaler.transform(part[['Age']]).ravel())
    for part in (X_train_full, X_train, X_val, X_test)
)

# d) The remaining features (Scholarship, Hipertension, etc.) are treated as
#    0/1 flags and are neither scaled nor encoded.
#    TODO confirm that every one of them (e.g. Handcap) is strictly binary.
# e) NaN values were already handled by dropna() in Step 1.

# Step 5: Training Tree-based Classifiers (Decision Tree)
# Tune the split criterion: fit one tree per candidate on the training split
# and score each on the validation split.
criteria = ['gini', 'entropy']
val_scores = [
    accuracy_score(
        y_val,
        DecisionTreeClassifier(criterion=candidate, random_state=42)
        .fit(X_train, y_train)
        .predict(X_val),
    )
    for candidate in criteria
]

# Keep the criterion with the highest validation accuracy
# (np.argmax picks the first one on ties).
best_index = np.argmax(val_scores)
best_criterion = criteria[best_index]
print(f"Best criterion: {best_criterion} with validation accuracy: {max(val_scores):.4f}")

# Refit a tree with the winning criterion; `fit` returns the estimator itself.
dt_final = DecisionTreeClassifier(criterion=best_criterion, random_state=42).fit(X_train, y_train)

# Held-out test metrics for the final Decision Tree.
y_test_pred_dt = dt_final.predict(X_test)
dt_confusion = confusion_matrix(y_test, y_test_pred_dt)
print(f"Decision Tree Test Accuracy: {accuracy_score(y_test, y_test_pred_dt):.4f}")
print("Decision Tree Confusion Matrix:\n", dt_confusion)

# Step 6: Random Forest
# Train one forest per ensemble size and report its test-set metrics so the
# effect of `n_estimators` can be compared side by side.
# NOTE(review): this only *reports* metrics per setting. If a "best"
# n_estimators is ever selected from these numbers, score on (X_val, y_val)
# instead so the test set stays untouched by model selection.
n_estimators_list = [10, 50, 100]  # Numbers of trees to compare
rf_scores = []  # Test accuracy per entry of n_estimators_list, same order

for n in n_estimators_list:
    rf = RandomForestClassifier(n_estimators=n, random_state=42)
    rf.fit(X_train, y_train)
    y_test_pred_rf = rf.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred_rf)
    rf_scores.append(accuracy)
    print(f"\nRandom Forest with {n} estimators:")
    print(f"Test Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_rf))

# Comment on the difference in classification metrics
print("\nObservations on Random Forest Performance:")
# Iterate the two parallel lists together instead of indexing by position.
for n, score in zip(n_estimators_list, rf_scores):
    print(f"- With {n} estimators, accuracy was {score:.4f}")
# Adding trees to a random forest reduces variance and does not by itself
# cause overfitting, so the summary attributes late fluctuations to noise.
print("Typically, increasing the number of estimators improves accuracy up to a point, then stabilizes; adding more trees does not cause overfitting, so small changes beyond that point are noise.")


Missing values before dropping:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64
Missing values after dropping:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64
Best criterion: gini with validation accuracy: 0.7946
Decision Tree Test Accuracy: 0.7974
Decision Tree Confusion Matrix:
 [[8775   53]
 [2186   39]]

Random Forest with 10 estimators:
Test Accuracy: 0.7957
Confusion Matrix:
 [[8752   76]
 [2182   43]]

Random Forest with 50 estimators:
Test Accuracy: 0.7971
Confusion Matrix:
 [[8766   62]
 [2181 