#### Classification trees are a useful machine learning method when you need to know how the decisions are being made

### Importing libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for graphs

from sklearn.tree import DecisionTreeClassifier # classifier
from sklearn.tree import plot_tree # to draw classifiction tree

from sklearn.model_selection import train_test_split # splitting data into training and test sets
from sklearn.model_selection import cross_val_score # crossvalidaton

from sklearn.metrics import confusion_matrix # create confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay # plotting confusion matrix (scikit-learn >= 1.0)
try:
    from sklearn.metrics import plot_confusion_matrix # plotting confusion matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2; keep the name bound
    # so the import cell still runs on current releases.
    plot_confusion_matrix = None

### Loading data

In [None]:
# Path of the heart-disease CSV, relative to the notebook's working directory.
heart_csv_path = "../input/heart-disease-uci/heart.csv"
df = pd.read_csv(heart_csv_path)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

### Identifying and handling missing data

In [None]:
# Show the dtype pandas assigned to each column of the frame.
df.dtypes

In [None]:
# Number of distinct (non-null) values in each column; a small count hints at
# a categorical feature. `DataFrame.nunique` does this directly, without
# transposing the frame and applying a lambda row by row.
df.nunique()

In [None]:
# Per column: True if at least one value is missing.
df.isna().any()

##### Luckily, there were no missing data

### Splitting data into dependent and independent variables

In [None]:
# Independent variables: every column except the label column.
X = df.drop(columns='target')
# Dependent variable: the 'target' column on its own.
Y = df['target']
X.head()

### Format the data: one-hot encoding


In [None]:
# Columns that get expanded into one indicator column per distinct value.
categorical_columns = ['cp', 'restecg', 'slope', 'thal']
X_encoded = pd.get_dummies(X, columns=categorical_columns)
X_encoded.head()

In [None]:
# Distinct label values present in the target column.
pd.unique(Y)

In [None]:
# Hold out a test split, then fit an unpruned decision tree on the training part.
# NOTE(review): the split uses `X` (integer-coded categoricals); the one-hot
# frame `X_encoded` is never passed to a model. Confirm which was intended --
# switching also requires the feature names given to plot_tree to match.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
clf_dt = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)

In [None]:
# Confusion matrix of `clf_dt` on the held-out test split.
# ConfusionMatrixDisplay.from_estimator is the supported replacement for
# plot_confusion_matrix, which was deprecated in scikit-learn 1.0 and removed in 1.2.
ConfusionMatrixDisplay.from_estimator(
    clf_dt,
    X_test,
    Y_test,
    display_labels=["Does not have heart disease", "Have heart disease"],
);
# The class labels are long, so turn the x tick labels vertical.
plt.xticks(rotation=90);

## Pruning the tree
##### Tackling overfitting and improving the performance of the decision tree classifier

In [None]:
# Candidate pruning strengths (effective alphas) computed from the training split.
path = clf_dt.cost_complexity_pruning_path(X_train, Y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# Leave out the last (largest) alpha; in scikit-learn's pruning path that value
# collapses the tree to a single node.
ccp_alphas = ccp_alphas[:-1]

# Fit one tree per candidate alpha. The comprehension keeps its own loop
# variable, so the baseline `clf_dt` is not overwritten and cells that use it
# can still be re-run afterwards.
clf_dts = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, Y_train)
    for alpha in ccp_alphas
]

In [None]:
# Accuracy of every candidate tree on the training and on the test split.
train_scores = [tree.score(X_train, Y_train) for tree in clf_dts]
test_scores = [tree.score(X_test, Y_test) for tree in clf_dts]

# Step plot of both accuracy curves against alpha.
fig, ax = plt.subplots()
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
step_style = dict(marker='o', drawstyle="steps-post")
ax.plot(ccp_alphas, train_scores, label="train", **step_style)
ax.plot(ccp_alphas, test_scores, label="test", **step_style)
ax.legend()
plt.show()

In the graph above, we see that the accuracy for the **Testing Dataset** hits its maximum value when `alpha` is about **0.016**. After this value for `alpha`, the accuracy of the **Training Dataset** drops off, which suggests we should set `ccp_alpha=0.016`.

However, since there are many ways we could have divided the original dataset into **Training** and **Testing** datasets, how do we know we used the best **Training Dataset** and how do we know we used the best **Testing Dataset**? Typically, we answer this question with **k-Fold Cross Validation** (10 folds is a common choice; this notebook uses 5). So that's what we're going to do now, and we'll do it with the `cross_val_score()` function.

### Finding the best alpha
The graphs we just drew suggest one value for alpha, **0.016**, but another
set of data might suggest another optimal value. 

<!-- **Terminology Alert!!!** Since, ultimately, we have to decide on one value for `alpha`, and
the **Decision Tree** algorithm will not do this for us, `alpha` is called a **Hyperparameter** to differentiate it from the parameters that the **Decision Tree** algorithm can take care of on its own. -->

First, let's demonstrate that different training and testing datasets result in trees with different accuracies:

In [None]:
# Cross-validate one pruned tree to show how accuracy varies between folds.
N_FOLDS = 5
# Own name for the estimator: cross_val_score never fits the object it is
# given, so binding it to `clf_dt` would leave that name unfitted.
cv_tree = DecisionTreeClassifier(random_state=42, ccp_alpha=0.03)
scores = cross_val_score(cv_tree, X_train, Y_train, cv=N_FOLDS)
# Kept separate from `df` so the dataset stays available when earlier cells
# are re-run.
fold_accuracy = pd.DataFrame(data={'tree': range(N_FOLDS), 'accuracy': scores})

fold_accuracy.plot(x='tree', y='accuracy', marker='o', linestyle='--');

So we see that using different **Training** and **Testing** data results in different accuracies.

Now let's use **cross validation** to find the optimal value for `ccp_alpha`.

In [None]:
# For every candidate alpha, record the mean and standard deviation of the
# 5-fold cross-validated accuracy on the training split.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    # Own name for the estimator: cross_val_score never fits the object it is
    # given, so binding it to `clf_dt` would leave that name unfitted.
    candidate_tree = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(candidate_tree, X_train, Y_train, cv=5)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

alpha_results = pd.DataFrame(alpha_loop_values,
                             columns=['alpha', 'mean_accuracy', 'std'])

# Mean accuracy per alpha, with the fold-to-fold spread as error bars.
alpha_results.plot(x='alpha',
                   y='mean_accuracy',
                   yerr='std',
                   marker='o',
                   linestyle='--');

Using cross validation, we can see that, overall, instead of setting `ccp_alpha=0.016`, we need to set it to something closer to **0.014**. We can find the exact value with:

In [None]:
# Rows whose alpha lies strictly between 0.01 and 0.015.
in_window = (alpha_results['alpha'] > 0.01) & (alpha_results['alpha'] < 0.015)
alpha_results.loc[in_window]

In [None]:
# Alpha value(s) strictly between 0.01 and 0.015, returned as a Series.
alpha_column = alpha_results['alpha']
ideal_ccp_alpha = alpha_column[(alpha_column > 0.01) & (alpha_column < 0.015)]
ideal_ccp_alpha

In [None]:
# Turn the selection into a plain Python float. Calling float() directly on a
# Series is deprecated in pandas 2.1+, so take the single element explicitly.
# np.ravel accepts both a Series and an already-converted scalar, which keeps
# this cell safe to re-run.
alpha_candidates = np.ravel(ideal_ccp_alpha)
assert alpha_candidates.size == 1, (
    f"expected exactly one candidate alpha, got {alpha_candidates.size}"
)
ideal_ccp_alpha = float(alpha_candidates[0])
ideal_ccp_alpha

In [None]:
# Final model: a tree pruned with the selected alpha, fitted on the training split.
clf_dt_pruned = DecisionTreeClassifier(
    ccp_alpha=ideal_ccp_alpha,
    random_state=42,
).fit(X_train, Y_train)

In [None]:
# Confusion matrix of the pruned tree on the held-out test split.
# ConfusionMatrixDisplay.from_estimator is the supported replacement for
# plot_confusion_matrix, which was deprecated in scikit-learn 1.0 and removed in 1.2.
ConfusionMatrixDisplay.from_estimator(
    clf_dt_pruned,
    X_test,
    Y_test,
    display_labels=["Does not have HD", "Has HD"],
);

In [None]:
# Draw the pruned tree.
plt.figure(figsize=(15, 7.5))
plot_tree(
    clf_dt_pruned,
    filled=True,
    rounded=True,
    class_names=["No HD", "Yes HD"],
    # Use the columns of the frame the tree was fitted on, so the node labels
    # always match the fitted features. Converted to a plain list because some
    # scikit-learn releases reject a pandas Index for this parameter.
    feature_names=list(X_train.columns),
);