In [1]:
% matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd

from sklearn import tree

from L5.validation import confusion_matrix, print_scores

In [2]:
# header=None: every row of the file is treated as data, so the columns get
# integer labels (0, 1, 2, ...) instead of names.
data = pd.read_csv(
    filepath_or_buffer="L5/data/agaricus-lepiota.data",
    header=None,
)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
def _to_category_codes(column):
    """Return the integer category codes of a single column."""
    return pd.Categorical(column).codes


# Replace every column of single-letter symbols by its integer category codes.
data = data.apply(_to_category_codes)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [4]:
# Column 0 is used as the target; all remaining columns are the features.
# Both are converted to plain NumPy arrays.
mushroom_labels = data.loc[:, 0].values
mushroom_data = data.drop(labels=[0], axis="columns").values

In [5]:
# Sweep the leaf budget (10, 12, ..., 20) using the classifier's default split
# criterion.
#
# random_state is pinned because scikit-learn randomly permutes the features
# at each split, so an unseeded tree may differ from run to run. Any recorded
# output produced without the seed should be refreshed by re-running the cell.
# NOTE(review): print_scores comes from L5.validation; if it shuffles or
# splits the data itself, it needs its own seed too -- confirm in its source.
for max_leaf_nodes in range(10, 21, 2):
    classifier = tree.DecisionTreeClassifier(
        max_leaf_nodes=max_leaf_nodes,
        random_state=0,
    )
    # The last argument appears to be used as the label of the printed scores.
    print_scores(classifier, mushroom_data, mushroom_labels, max_leaf_nodes)

10: 0.971935007386
[1.0, 1.0, 0.96063960639606394, 0.97660098522167482, 0.85854858548585489, 0.98891625615763545, 0.89655172413793105, 0.96186961869618692, 1.0, 0.96678966789667897]
0.960991644399


12: 0.988675529296


[1.0, 1.0, 0.99876998769987702, 0.97660098522167482, 0.92004920049200489, 0.9926108374384236, 0.89655172413793105, 0.97662976629766296, 1.0, 0.96678966789667897]
0.972800216918


14: 0.992614475628
[1.0, 1.0, 1.0, 0.99137931034482762, 0.9372693726937269, 0.99384236453201968, 0.94211822660098521, 0.97662976629766296, 1.0, 0.99261992619926198]
0.983385896667


16: 0.994583948794


[1.0, 1.0, 1.0, 1.0, 0.9372693726937269, 0.99630541871921185, 0.95320197044334976, 0.97785977859778594, 1.0, 0.99261992619926198]
0.985725646665


18: 0.996061053668
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.96182266009852213, 0.98646986469864695, 1.0, 1.0]
0.99482925248


20: 1.0


[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.96798029556650245, 1.0, 1.0, 1.0]
0.996798029557




In [6]:
# Sweep the leaf budget (10, 12, ..., 20) with the entropy split criterion.
#
# random_state is pinned because scikit-learn randomly permutes the features
# at each split, so an unseeded tree may differ from run to run. Any recorded
# output produced without the seed should be refreshed by re-running the cell.
# NOTE(review): print_scores comes from L5.validation; if it shuffles or
# splits the data itself, it needs its own seed too -- confirm in its source.
for max_leaf_nodes in range(10, 21, 2):
    classifier = tree.DecisionTreeClassifier(
        max_leaf_nodes=max_leaf_nodes,
        criterion="entropy",
        random_state=0,
    )
    # The last argument appears to be used as the label of the printed scores.
    print_scores(classifier, mushroom_data, mushroom_labels, max_leaf_nodes)

10: 0.982274741507
[1.0, 1.0, 0.99384993849938497, 1.0, 1.0, 1.0, 0.93103448275862066, 0.955719557195572, 1.0, 0.96678966789667897]
0.984739364635


12: 0.999015263417
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.97047970479704793, 1.0, 0.95202952029520294]
0.992250922509


14: 

1.0
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99261992619926198, 1.0, 0.95940959409594095]
0.99520295203


16: 1.0


[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99876998769987702, 1.0, 0.95940959409594095]
0.99581795818


18: 1.0
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99507995079950795]
0.99950799508


20: 1.0


[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
1.0




In [7]:
# Fit one entropy tree with 20 leaves on the full data set and export it in
# Graphviz .dot format.
#
# random_state is pinned so that the fitted (and therefore exported) tree is
# the same on every run; scikit-learn randomly permutes the features at each
# split, so an unseeded tree may differ between runs.
mushroom_tree = tree.DecisionTreeClassifier(
    max_leaf_nodes=20,
    criterion="entropy",
    random_state=0,
)
mushroom_tree.fit(mushroom_data, mushroom_labels)

# The target directory must already exist; open() does not create it.
with open("L5/trees/mushroom.dot", "w") as dot_file:
    tree.export_graphviz(mushroom_tree, out_file=dot_file)

In [9]:
# Confusion matrix for the 20-leaf entropy tree.
#
# random_state is pinned because scikit-learn randomly permutes the features
# at each split, so an unseeded tree may differ from run to run.
# NOTE(review): confusion_matrix here is the project helper from
# L5.validation (it takes an unfitted estimator plus data and labels); how it
# splits the data is not visible in this notebook -- confirm in its source.
confusion_matrix(
    tree.DecisionTreeClassifier(
        max_leaf_nodes=20,
        criterion="entropy",
        random_state=0,
    ),
    mushroom_data,
    mushroom_labels
)

array([[2775,    0],
       [   0, 2641]])