# Use sk.learn decisiontree on digits dataset

### Import statements

In [40]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, tree, metrics
import graphviz

In [8]:
digits = datasets.load_digits()

### Split into 70/30 train and test

In [10]:
num_split = int(0.7*len(digits.data))
train_features = digits.data[:num_split]
train_labels =  digits.target[:num_split]
test_features = digits.data[num_split:]
test_labels = digits.target[num_split:]

print("Number of training examples: ",len(train_features))
print("Number of test examples: ",len(test_features))
print("Number of total examples:", len(train_features)+len(test_features))

Number of training examples:  1257
Number of test examples:  540
Number of total examples: 1797


### Set up classifier and fit to training data

In [42]:
classifier = tree.DecisionTreeClassifier(min_samples_leaf = 2, min_samples_split = 4) #Default params

classifier.fit(train_features,train_labels)   # Fit to training data

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### Plot tree with graphviz

In [43]:
dot_data = tree.export_graphviz(classifier, out_file=None)  # Export a decision tree in dot format
graph = graphviz.Source(dot_data) 
graph.render("digits_train_graphviz_2_4", "gz_output")

'gz_output\\digits_train_graphviz_2_4.pdf'

### Predict the test data

In [32]:
predicted_labels = classifier.predict(test_features)

### Evaluate classifier 

In [38]:
print(metrics.classification_report(test_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86        53
           1       0.79      0.72      0.75        53
           2       0.88      0.70      0.78        53
           3       0.72      0.74      0.73        53
           4       0.81      0.88      0.84        57
           5       0.86      0.86      0.86        56
           6       0.82      0.87      0.85        54
           7       0.95      0.96      0.95        54
           8       0.73      0.63      0.68        52
           9       0.66      0.80      0.72        55

    accuracy                           0.80       540
   macro avg       0.81      0.80      0.80       540
weighted avg       0.81      0.80      0.80       540



In [39]:
print(metrics.confusion_matrix(test_labels, predicted_labels))

[[46  0  0  0  3  0  0  0  1  3]
 [ 0 38  0  2  1  0  1  0  3  8]
 [ 2  1 37  4  1  0  3  0  4  1]
 [ 0  1  3 39  1  3  0  1  2  3]
 [ 3  0  0  0 50  0  2  1  0  1]
 [ 2  0  0  2  0 48  4  0  0  0]
 [ 0  1  0  2  2  0 47  0  0  2]
 [ 0  0  0  0  0  1  0 52  0  1]
 [ 0  6  2  2  2  2  0  1 33  4]
 [ 1  1  0  3  2  2  0  0  2 44]]
