# Visualizing Performance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
# import graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, validation_curve, learning_curve, train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
# Read the heart dataset from its CSV file and preview the first rows.
heart = pd.read_csv(filepath_or_buffer='../data/heart.csv')
heart.head()

In [None]:
# Count the missing entries in each column.
heart.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
heart = heart.dropna(axis=0, how='any')

In [None]:
# Feature matrix: every column except 'AHD'. Target array: the 'AHD' column.
X = heart.drop(columns=['AHD'])
y = heart['AHD']

In [None]:
# Turn the target labels into integer codes; the printed classes list the
# original labels known to the encoder.
lb = LabelEncoder().fit(y)
print(lb.classes_)
y = lb.transform(y)

In [None]:
# Replace the ChestPain categories with integer codes.
# NOTE: this refits the shared encoder `lb`, overwriting its previous fit.
X['ChestPain'] = lb.fit_transform(X['ChestPain'])
print(lb.classes_)

In [None]:
# Replace the Thal categories with integer codes.
# The column is cast to str once and that same series is used for both fitting
# and transforming, so the encoder never sees values at transform time that it
# was not fitted on (the original fitted on the raw column but transformed the
# str-cast one).
# NOTE: this refits the shared encoder `lb`, overwriting its previous fit.
X['Thal'] = lb.fit_transform(X['Thal'].astype('str'))
print(lb.classes_)

In [None]:
# Fit a decision tree with default hyper-parameters on the whole dataset.
model = DecisionTreeClassifier()
model.fit(X=X, y=y)

In [None]:
# Cross-validate the tree with cv=10 and display the resulting scores.
cross_val_score(estimator=model, X=X, y=y, cv=10)

In [None]:
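# visualize tree (disabled: to enable, uncomment the lines below and the `import graphviz` line in the imports cell)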
# dot_data = export_graphviz(model, out_file=None, feature_names=X.columns)
# graph = graphviz.Source(dot_data)
# graph.render("heart")
# graph

## Validation Curve
A validation curve shows how the training and validation scores change as a single hyper-parameter that controls model complexity (here, the maximum depth of the tree) is varied.

In [None]:
# Score the tree at every max_depth value in depth_range (1 through 9), cv=10.
depth_range = np.arange(1, 10)
train_scores, test_scores = validation_curve(
    model, X, y,
    param_name='max_depth',
    param_range=depth_range,
    cv=10,
)

In [None]:
# Average the scores along axis 1, leaving one mean score per depth value.
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)
test_mean

In [None]:
# Plot the mean train/test scores against the depths that were evaluated.
# depth_range (the array handed to validation_curve) is reused for the x-axis
# so it cannot drift out of sync with the score arrays if the range is edited.
plt.plot(depth_range, train_mean, label='Train')
plt.plot(depth_range, test_mean, label='Test')
plt.xlabel('Maximum Depth')
plt.ylabel('Score')
plt.title('Decision Tree Validation Curve')
plt.legend()
plt.show()

## Learning Curve
A learning curve shows how the training and validation scores change as the size of the training set grows.

In [None]:
# Evaluate a depth-3 tree on training subsets of 10, 35, ..., 260 samples
# (cv=10), then average the scores along axis 1 to get one mean per size.
sizes = np.arange(10, 276, 25)
print('Sizes: ', sizes)
train_sizes, train_scores, test_scores = learning_curve(
    DecisionTreeClassifier(max_depth=3),
    X, y,
    train_sizes=sizes,
    cv=10,
)
train_mean = train_scores.mean(axis=1)
test_mean = test_scores.mean(axis=1)
test_mean

In [None]:
# Plot the mean train/test scores against the training-set sizes.
# train_sizes is the array returned by learning_curve alongside the scores, so
# using it (rather than the requested `sizes`) keeps the x-axis aligned with
# the rows of the score arrays.
plt.plot(train_sizes, train_mean, label='Train')
plt.plot(train_sizes, test_mean, label='Test')
plt.xlabel('Training Set Size')
plt.ylabel('Score')
plt.title('Decision Tree Learning Curve')
plt.legend()
plt.show()

## ROC Curve
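A receiver operating characteristic (ROC) curve plots the true positive rate against the false positive rate as the decision threshold varies, and compares the model to the baseline of a random classifier (the diagonal line).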

In [None]:
# split training and test
# Use `training_size` samples for training and all remaining rows for testing.
# NOTE(review): no random_state is passed to train_test_split, so the split
# (and every number derived from it) may change between runs; confirm whether
# reproducibility is wanted.
training_size = 135
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=training_size,
                                                test_size=X.shape[0]-training_size)
tree_model = DecisionTreeClassifier(max_depth=3)
tree_model.fit(Xtrain, ytrain)
# The score is computed on the held-out test split, so it is labelled as such
# (the original label said "Training score").
print('Test score: ', tree_model.score(Xtest, ytest))
y_pred = tree_model.predict(Xtest)

In [None]:
# true positives and false positives
# roc_curve needs a continuous score per sample to sweep the decision
# threshold. Hard class predictions collapse the curve to a single operating
# point, so the predicted probability of the positive class (label 1) is used.
positive_column = list(tree_model.classes_).index(1)
y_score = tree_model.predict_proba(Xtest)[:, positive_column]
false_positive_rate, true_positive_rate, thresholds = roc_curve(ytest, y_score, pos_label=1)
rates = pd.DataFrame(dict(fpr=false_positive_rate, tpr=true_positive_rate))
# Area under the ROC curve, computed from the (fpr, tpr) points.
roc_auc = auc(rates['fpr'], rates['tpr'])
print('AUC: ', roc_auc)

In [None]:
# Draw the ROC curve (blue) together with the diagonal reference line (red,
# dashed), with both axes limited to the range 0 to 1.
auc_label = 'AUC = %0.2f' % roc_auc
plt.plot(rates.fpr, rates.tpr, 'b', label=auc_label)
plt.plot([0, 1], [0, 1], 'r--')
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()