# Decision Tree Classification

Let's use a cross-validated grid search to find the best decision tree classifier for the Wisconsin Breast Cancer data set, and then evaluate it.

In [1]:
import pandas as pd
from sklearn import tree
from treeviz import tree_print
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

# Load the breast cancer data from the local CSV file.
df = pd.read_csv("assets/wdbc.csv")

# Create our sklearn data: every column except 'ID' and 'Diagnosis' is used
# as a feature; 'Diagnosis' is the target.
X  = df.drop(['ID','Diagnosis'],axis=1)
y = df['Diagnosis']

# Set up the grid search over tree depths 1 through 5 and both split
# criteria, scored with 5-fold cross-validation.
# random_state is pinned because DecisionTreeClassifier randomly permutes the
# features at each split; without a seed, ties between equally good splits
# can be resolved differently from run to run, so the selected tree would not
# be reproducible under Restart & Run All.
model = tree.DecisionTreeClassifier(random_state=42)
param_grid = {'max_depth': list(range(1,6)),
              'criterion': ['entropy', 'gini']
              }
grid = GridSearchCV(model, param_grid, cv=5)

# Perform the grid search.
grid.fit(X,y)

# Print out what we found.
# NOTE: best_estimator_ is scored here on the same X, y it was fit on, so the
# reported accuracy is training accuracy, not a held-out estimate. The mean
# cross-validated score of the winning parameters is in grid.best_score_.
print("Accuracy: {:.0f}%".format(grid.best_estimator_.score(X,y)*100))
print("Best parameters: {}".format(grid.best_params_))
print("Best tree:")
tree_print(grid.best_estimator_,X)

Accuracy: 98%
Best parameters: {'criterion': 'entropy', 'max_depth': 4}
Best tree:
if perimeter3 =< 105.94999694824219: 
  |then if concave_points3 =< 0.1350499987602234: 
  |  |then if area2 =< 48.974998474121094: 
  |  |  |then if texture3 =< 30.145000457763672: 
  |  |  |  |then B
  |  |  |  |else B
  |  |  |else if fractal_dimension2 =< 0.004316499922424555: 
  |  |  |  |then M
  |  |  |  |else B
  |  |else if texture3 =< 27.575000762939453: 
  |  |  |then if symmetry3 =< 0.35785001516342163: 
  |  |  |  |then B
  |  |  |  |else M
  |  |  |else M
  |else if perimeter3 =< 117.44999694824219: 
  |  |then if smoothness3 =< 0.13609999418258667: 
  |  |  |then if texture3 =< 25.670000076293945: 
  |  |  |  |then B
  |  |  |  |else M
  |  |  |else if texture1 =< 13.420000076293945: 
  |  |  |  |then B
  |  |  |  |else M
  |  |else if concave_points3 =< 0.08586500585079193: 
  |  |  |then if area3 =< 1214.0: 
  |  |  |  |then B
  |  |  |  |else M
  |  |  |else M
<---------->
Tree Depth:  4

In [2]:
# Confusion matrix for the grid-search winner, predicted on the same X, y
# that the grid search was fit on.
labels = ['M','B']
actual_y = y
predict_y = grid.best_estimator_.predict(X)

# Rows are the actual classes and columns the predicted classes, both in the
# order given by `labels`.
cm = confusion_matrix(y_true=actual_y, y_pred=predict_y, labels=labels)
cm_df = pd.DataFrame(data=cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Confusion Matrix:
     M    B
M  210    2
B    7  350


In [3]:
# Confidence interval for the grid-search winner.
# `bootstrap` is a project-local helper; presumably it returns the lower and
# upper bounds as fractions (they are scaled by 100 for display) -- TODO confirm.
from bootstrap import bootstrap
lb, ub = bootstrap(
    grid.best_estimator_,
    df.drop(columns=['ID']),  # keep the features and the 'Diagnosis' column
    'Diagnosis',
)
print("Confidence interval: ({0:.0f}%, {1:.0f}%)".format(lb*100,ub*100))

Confidence interval: (91%, 99%)


### Restricting the model complexity to a depth of 2.

In [4]:
# Fit a tree capped at depth 2 directly on all of the data (no grid search).
# random_state is pinned because DecisionTreeClassifier randomly permutes the
# features at each split; without a seed, ties between equally good splits
# can be resolved differently from run to run.
model = tree.DecisionTreeClassifier(max_depth=2, random_state=42)
model.fit(X, y)
# NOTE: scored on the same X, y the tree was fit on, so this is training
# accuracy rather than a held-out estimate.
print("Accuracy: {:.0f}%".format(model.score(X,y)*100))
tree_print(model, X)

Accuracy: 94%
if radius3 =< 16.795000076293945: 
  |then if concave_points3 =< 0.13580000400543213: 
  |  |then B
  |  |else M
  |else if texture1 =< 16.110000610351562: 
  |  |then B
  |  |else M
<---->
Tree Depth:  2


In [5]:
# Confusion matrix for the current `model`, predicted on the same X, y it was
# fit on.
labels = ['M','B']
actual_y = y
predict_y = model.predict(X)

# Rows are the actual classes and columns the predicted classes, both in the
# order given by `labels`.
cm = confusion_matrix(y_true=actual_y, y_pred=predict_y, labels=labels)
cm_df = pd.DataFrame(data=cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Confusion Matrix:
     M    B
M  199   13
B   20  337


In [6]:
# Confidence interval for the current `model`.
# `bootstrap` is a project-local helper; presumably it returns the lower and
# upper bounds as fractions (they are scaled by 100 for display) -- TODO confirm.
from bootstrap import bootstrap
lb, ub = bootstrap(
    model,
    df.drop(columns=['ID']),  # keep the features and the 'Diagnosis' column
    'Diagnosis',
)
print("Confidence interval: ({0:.0f}%, {1:.0f}%)".format(lb*100,ub*100))

Confidence interval: (89%, 98%)
