# Heart Disease Cleveland
A quick decision-tree evaluation of the Heart Disease Cleveland dataset from Kaggle.

In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import tree

import matplotlib.pyplot as pyplot
# Use the same resolution (150 dpi) for figures shown inline and figures saved to disk.
for dpi_setting in ('figure.dpi', 'savefig.dpi'):
    pyplot.rcParams[dpi_setting] = 150

The ultimate aim is to predict whether a patient has heart disease or not. First
we load the available data, which contains the information for each patient,
including a coding of whether they have heart disease (condition == 1) or not
(condition == 0).

In [None]:
# Read the dataset from the CSV file in the working directory.
heart_cleveland = pd.read_csv('heart_cleveland_upload.csv')

# Show the first rows followed by the (rows, columns) shape as a sanity check.
for summary in (heart_cleveland.head(), heart_cleveland.shape):
    print(summary)

Next we extract the X (predictors) and Y (condition) values and split them into
test and train sets.

In [None]:
# Separate the target column ('condition') from the predictor columns.
Y = heart_cleveland['condition']
X = heart_cleveland.drop(columns=['condition'])

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=.2, random_state=42
)

# Print the shape of every partition to confirm the split sizes.
for partition in (train_x, test_x, train_y, test_y):
    print(partition.shape)

As a first step towards identifying the most powerful predictors, we fit basic
classification trees of varying size (maximum number of leaf nodes) and keep the
most accurate one.

In [None]:
# Sweep the tree size: fit one decision tree per `max_leaf_nodes` limit, going
# from one leaf per training sample down to 3 leaves, score each tree on the
# test set and remember the best one.
#
# NOTE(review): the winning model is chosen on the same test set that is used
# to report its accuracy, so the reported accuracy is optimistic. Consider a
# separate validation split or cross-validation for model selection.
accuracies = {}  # test accuracy -> list of max_leaf_nodes limits that reached it
best_accuracy = 0

for max_leaf_nodes in range(train_y.size, 2, -1):
    # Fixed seed so that repeated runs build the same trees (the classifier
    # breaks ties between equally good splits randomly otherwise).
    heart_tree = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes,
                                             random_state=42)
    heart_tree = heart_tree.fit(train_x, train_y)
    pred_y = heart_tree.predict(test_x)

    # Score each model once and reuse the value below.
    accuracy = accuracy_score(test_y, pred_y)
    accuracies.setdefault(accuracy, []).append(max_leaf_nodes)

    # The sweep runs from large to small limits, so ">=" resolves ties in
    # favour of the tree fitted with the smaller leaf limit.
    if accuracy >= best_accuracy:
        best_accuracy = accuracy
        best_model = heart_tree

# get_n_leaves() is the number of leaves the fitted tree actually has, which
# may be lower than the limit it was fitted with.
print(f"Most accurate model has {best_model.get_n_leaves()} leaf nodes.")

best_pred_y = best_model.predict(test_x)
print("The accuracy of the model is", accuracy_score(test_y, best_pred_y))

# Plot only the top levels of the tree. The column names are passed as a plain
# list, which plot_tree accepts regardless of scikit-learn version (some
# releases appear to reject a pandas Index here).
tree.plot_tree(best_model, max_depth=2, feature_names=list(train_x.columns))