# In [None]:


"""
Medical appointments are time commitments doctors make with patients. However, some people do not show up, for various reasons, which costs the doctor time and money. Let's build models that predict whether the next appointment is a show or a no-show!
License: the dataset is CC4.0: BY-NC-SA, and it is publicly available online.
You will need to use GitHub to complete this mini project. Find Guidelines of Using GitHub Here.
Expected Output
By the end of this mini project, you are supposed to deliver within your code:
Multiple accuracy measures resembling different criteria used for training your decision tree classifiers.
Multiple accuracy measures resembling a different number of estimators used for your random forest classifiers.
One printed confusion matrix for the best model.

"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.calibration import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# STEP 1: Download the Dataset
# STEP 2: Read the dataset. Does it include any missing values? If so, drop them.
dataset = pd.read_csv('KaggleV2-May-2016.csv')
print("DATASET", dataset)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own line; wrapping it in print() would append a stray "None".
print("DATASET INFO")
dataset.info()

# Step 2a: Boolean mask that is True wherever a cell is missing.
missing_mask = dataset.isna()
print("BOOLEAN MISSING VALUE MASK:")
print(missing_mask.head())  # Shows True where data is missing

# Step 2b: Count total missing values per column
missing_counts = missing_mask.sum()
print("\nCOUNT OF MISSING VALUES PER COLUMN:")
print(missing_counts)

# Step 2c: Count total missing values in entire dataset
total_missing = missing_mask.values.sum()
print("\nTOTAL MISSING VALUES IN DATASET:", total_missing)

# Author's note from the original run: no missing values were found, and
# eliminating them was considered optional. The rows are still dropped here
# when any are present, because the integer cast applied to the encoded
# features later in this script cannot represent NaN.
if total_missing > 0:
    dataset = dataset.dropna()
    print("DROPPED ROWS WITH MISSING VALUES")

print('SHAPE', dataset.shape)
print("NUMBER OF TUPLES", dataset.shape[0])


# Choose the columns used as model inputs. The note below records how the raw
# columns split into numerical, categorical and date/time groups.

"""
numerical_cols = ['PatientId','AppointmentID', 'Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']
categorical_cols = ['Gender','Neighbourhood','No-show']
date_time_column = ['ScheduledDay', 'AppointmentDay']

"""

# Column names are spelled exactly as they appear in the CSV header.
selected_features = [
    "Gender", "Age", "Scholarship", "Hipertension",
    "Diabetes", "Alcoholism", "Handcap", "SMS_received",
]

# Restrict the frame to the chosen columns (all rows kept).
df_subset = dataset.loc[:, selected_features]

# Quick sanity check of the reduced frame.
print(df_subset.head())
print(f"New DataFrame shape: {df_subset.shape}")

# Step 4: Preprocessing
# Perform any needed pre-processing on the chosen features, which may include:
#   - Scaling
#   - Encoding
#   - Dealing with NaN values
# Note: use only the preprocessing steps you think are useful.

# Encode the target column 'No-show': 'No' -> 0, 'Yes' -> 1.
# An explicit mapping is used instead of LabelEncoder so that the meaning of
# 0 and 1 is stated in the code rather than depending on the encoder's sorted
# label order.
# The identity entries (0 -> 0, 1 -> 1) make this step safe to re-run on an
# already-encoded column (e.g. when a notebook cell is executed twice); without
# them a second run would map every value to NaN.
no_show_codes = {'No': 0, 'Yes': 1, 0: 0, 1: 1}
encoded_target = dataset['No-show'].map(no_show_codes)
if encoded_target.isna().any():
    # Series.map yields NaN for any value missing from the mapping; fail
    # loudly here instead of passing NaN labels on to the classifiers.
    raise ValueError("Unexpected values in 'No-show' column; expected only 'No' or 'Yes'.")
dataset['No-show'] = encoded_target.astype(int)

numerical_cols = ['Age', 'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received']
categorical_cols = ['Gender']

# Separate features (X) and target (y)
X = dataset[selected_features]
y = dataset['No-show']
print("label..", y)

# One-hot encode the categorical features into binary indicator columns.
# drop_first=True drops the first category level of each encoded column.
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X_encoded = X_encoded.astype(int)  # store indicators as 0/1 rather than True/False
print("features_encoded:", X_encoded)

"""
Decision trees don't really care about scaling: the magnitude of the numbers has far less influence on the model. For KNN and SVM, scaling is needed.

"""


# Scaling is intentionally NOT applied: only tree-based models are trained in
# this script. If a scale-sensitive model is added later, apply
# sklearn.preprocessing.StandardScaler to X_encoded[numerical_cols].

# Final Data Assembly and Inspection
# Combine processed features and target into one model-ready DataFrame.
df_processed_final = pd.concat([X_encoded, y], axis=1)

print("\n--- Final Preprocessing Summary ---")
print(f"Final DataFrame shape: {df_processed_final.shape}")
# The last column is the target, hence the "- 1". The messages below mention
# only encoding because no scaling step is performed.
print(f"Number of features after One-Hot Encoding: {df_processed_final.shape[1] - 1}")
print("\nFirst 5 rows of the fully processed DataFrame (Note the encoded values; no scaling applied):")
print(df_processed_final.head())
print("\nFinal DataFrame Info:")
df_processed_final.info()

# Step 5: Splitting the Data
#
# Split the data as follows:
#   80% training set
#   10% validation set
#   10% test set
# The 80/10/10 split provides a dedicated validation set for fine-tuning
# without touching the final test set.
#
# train_test_split is already imported at the top of the file.

# First split: 80% train, 20% held out. stratify=y keeps the proportion of
# each target class the same on both sides of the split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_encoded, y, test_size=0.2, random_state=1, stratify=y
)

# Second split: halve the held-out 20% into 10% validation and 10% test,
# again stratified so both small sets keep the same class ratio.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1, stratify=y_temp
)

# Baseline decision tree: entropy criterion, capped at 10 leaf nodes.
# random_state is pinned (matching the random_state=1 used for the data splits)
# so repeated runs build the same tree and print the same metrics.
model_tree = DecisionTreeClassifier(criterion="entropy",
                                    max_leaf_nodes=10,
                                    random_state=1)
# Train the model
model_tree.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred_tree = model_tree.predict(X_test)
print("\n==== Baseline Decision Tree (entropy, max_leaf_nodes=10) ====")
print(confusion_matrix(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

# ----------------------------
# TRY DIFFERENT DECISION TREE CRITERIA
# ----------------------------
# One unconstrained tree is trained per split criterion and scored on the
# validation set. random_state is pinned so that differences in accuracy come
# from the criterion, not from random tie-breaking between runs.
# NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
# Shannon information gain, so those two are expected to score identically.

criteria = ["gini", "entropy", "log_loss"]
val_scores = {}
fitted_trees = {}  # keep each fitted model so the winner need not be retrained

for c in criteria:
    model = DecisionTreeClassifier(criterion=c, random_state=1)
    model.fit(X_train, y_train)

    y_val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_val_pred)

    val_scores[c] = acc
    fitted_trees[c] = model
    print(f"Criterion = {c:8s} | Validation Accuracy = {acc:.4f}")

# ----------------------------
# SELECT BEST CRITERION
# ----------------------------
# max() returns the first key with the highest score, so ties resolve in the
# order the criteria are listed above.

best_criterion = max(val_scores, key=val_scores.get)
print("\nBest Criterion:", best_criterion)

# ----------------------------
# EVALUATE THE BEST TREE ON THE TEST SET
# ----------------------------
# The validation set was used to choose the criterion; the held-out test set
# gives the final accuracy and confusion matrix for the chosen tree.

best_tree = fitted_trees[best_criterion]
y_test_pred_best = best_tree.predict(X_test)
best_test_acc = accuracy_score(y_test, y_test_pred_best)
print(f"Best Decision Tree ({best_criterion}) | Test Accuracy = {best_test_acc:.4f}")
print("Confusion Matrix (test set):\n", confusion_matrix(y_test, y_test_pred_best))

"""
Random Forest
Repeat step 6.
Increase/decrease the number of estimators in random forest and comment on the difference of the classification metrics.
"""

from sklearn.ensemble import RandomForestClassifier

# --- Random forest #1: 10 trees, otherwise default settings ---------------
# random_state is pinned (as for the data splits) so the bootstrap samples and
# feature draws, and therefore the printed metrics, are reproducible.
model_RF = RandomForestClassifier(n_estimators=10,  # library default is 100
                                  bootstrap=True,  # False would fit every tree on the whole training set
                                  random_state=1)

model_RF.fit(X_train, y_train)
y_pred_RF = model_RF.predict(X_test)
# score() only returns the mean accuracy; it must be printed to appear in the
# output when this file runs as a script.
print("Random Forest (10 estimators) | Test Accuracy:", model_RF.score(X_test, y_test))

print(confusion_matrix(y_test,      y_pred_RF))
print(classification_report(y_test, y_pred_RF))

# --- Random forest #2: same size, with class weighting ---------------------
model_RF = RandomForestClassifier(n_estimators=10,  # library default is 100
                                  criterion="gini",  # can use {"gini", "entropy", "log_loss"}
                                  bootstrap=True,  # True = each tree is fit on a bootstrap sample
                                  max_features='sqrt',  # number of randomly chosen features considered at each split
                                  class_weight="balanced",  # or a dict of the form {class_label: weight}
                                  random_state=1)

model_RF.fit(X_train, y_train)
print("Random Forest (10 estimators, class_weight='balanced') | Test Accuracy:",
      model_RF.score(X_test, y_test))

# Forest sizes to compare; each one is scored on the validation split.
n_estimators_list = [10, 50, 100, 200]

for n in n_estimators_list:
    # fit() returns the estimator itself, so construction and training chain.
    model_rf = RandomForestClassifier(n_estimators=n, random_state=1).fit(X_train, y_train)

    # Predict once on the validation set and derive both metrics from it.
    y_pred = model_rf.predict(X_val)
    cm = confusion_matrix(y_val, y_pred)
    acc = accuracy_score(y_val, y_pred)

    # Report this forest size.
    print(f"\n==== Random Forest with {n} estimators ====")
    print("Accuracy:", acc)
    print("Confusion Matrix:\n", cm)
