# Decision trees

In this notebook, we train decision trees that try to predict whether or not a patient should receive an Alzheimer's diagnosis. Because a fitted tree can be drawn as a flowchart of simple yes/no checks, it is a more visual and interpretable approach to prediction. In theory, a doctor could follow such a tree by hand in real life.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import data_reader
from sklearn.model_selection import cross_val_score

In [None]:
# Load the dataset through the project-local data_reader helper. The result is
# used as a mapping of column name -> values (it is iterated with .items()).
data = data_reader.get_data_dict('./data/alzheimers_disease_data.csv')

# Feature groups: each one is modelled separately so their predictive power
# can be compared.
biological_features = [
    'Age', 'Ethnicity', 'Gender', 'BMI', 'FamilyHistoryAlzheimers',
    'CardiovascularDisease', 'Diabetes', 'Hypertension',
    'SystolicBP', 'DiastolicBP', 'CholesterolTotal',
    'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides']

cognitive_features = [
    'MMSE', 'FunctionalAssessment', 'MemoryComplaints',
    'BehavioralProblems', 'ADL', 'Confusion', 'Disorientation',
    'PersonalityChanges', 'DifficultyCompletingTasks', 'Forgetfulness']

lifestyle_features = [
    'Smoking', 'AlcoholConsumption', 'PhysicalActivity',
    'DietQuality', 'SleepQuality', 'Depression', 'HeadInjury',
    'EducationLevel']

feature_sets = {
    'Biological Features': biological_features,
    'Cognitive Features': cognitive_features,
    'Lifestyle Features': lifestyle_features,
}

# Build the modelling frame from every column except 'DoctorInCharge'.
df = pd.DataFrame({key: value for key, value in data.items() if key != 'DoctorInCharge'})

# The target column is identical for every feature set, so select it once.
y = df['Diagnosis']

for feature_set_name, features in feature_sets.items():
    print(f"\nUsing {feature_set_name}:")

    X = df[features]

    # Fixed random_state so every feature set is evaluated on the same
    # train/test row split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    # Baseline: an unconstrained tree. Its test accuracy is reported so the
    # simplified tree below has something to be compared against.
    # random_state is fixed because scikit-learn permutes the features at each
    # split, so an unseeded tree can differ between runs.
    model = DecisionTreeClassifier(random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    full_accuracy = accuracy_score(y_test, y_pred)
    print(f"Unconstrained Tree Accuracy: {full_accuracy:.2f}")

    # Simplify the tree: split on the entropy criterion (a measure of
    # uncertainty) and cap the depth at 3 so the tree stays readable.
    model_entropy = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)
    model_entropy.fit(X_train, y_train)
    y_pred_entropy = model_entropy.predict(X_test)
    entropy_accuracy = accuracy_score(y_test, y_pred_entropy)
    print(f"Entropy-Based Tree Accuracy: {entropy_accuracy:.2f}")

    # Visualize the simpler tree of depth 3.
    fig, ax = plt.subplots(figsize=(18, 12))
    # NOTE(review): class_names are matched to the classes in ascending order,
    # so this assumes 'Diagnosis' is encoded as 0 = no diagnosis, 1 = diagnosis
    # -- confirm against the dataset.
    annotations = plot_tree(
        model_entropy,
        feature_names=features,
        class_names=['No Diagnosis', 'Diagnosis'],
        filled=True,
        rounded=True,
        fontsize=10,
        proportion=True,
        ax=ax
    )
    # Thin out the connecting arrows. plot_tree draws them as part of each
    # node annotation rather than as standalone axes patches, so they are
    # reached through the returned annotations instead of ax.patches.
    for annotation in annotations:
        arrow = getattr(annotation, 'arrow_patch', None)
        if arrow is not None:
            arrow.set_linewidth(0.55)

    # Reuse the accuracy computed above instead of scoring a second time.
    ax.set_title(f"Decision Tree Visualization for {feature_set_name}\nModel Accuracy: {entropy_accuracy:.2f}", fontsize=16)
    plt.show()
    # Release the figure; one large figure is created per feature set.
    plt.close(fig)


In [None]:
# Cross-validation: iterating over various depths to check for overfitting.
#
# The training data is rebuilt explicitly for each feature set (same
# test_size / random_state as the split used for the trees above) instead of
# relying on whichever X_train / y_train an earlier loop happened to leave
# behind, which would silently cross-validate only the last feature set.

depths = range(1, 21)
scores_by_feature_set = {}

for cv_feature_set_name, cv_features in feature_sets.items():
    # Only the training portion is used, so the held-out test rows stay
    # untouched while the depth is being selected.
    cv_X_train, _, cv_y_train, _ = train_test_split(
        df[cv_features], df['Diagnosis'], test_size=0.3, random_state=1)

    # Mean 5-fold accuracy for each candidate depth.
    scores = []
    for depth in depths:
        model = DecisionTreeClassifier(max_depth=depth, random_state=1)
        cv_scores = cross_val_score(model, cv_X_train, cv_y_train, cv=5)
        scores.append(cv_scores.mean())
    scores_by_feature_set[cv_feature_set_name] = scores

# One curve per feature set, so it is clear which data each line refers to.
fig, ax = plt.subplots()
for cv_feature_set_name, cv_mean_scores in scores_by_feature_set.items():
    ax.plot(depths, cv_mean_scores, marker='o', label=cv_feature_set_name)
ax.set_xlabel('Max Depth')
ax.set_ylabel('Cross-Validation Accuracy')
ax.set_title('Selecting Optimal Tree Depth')
ax.legend()
plt.show()