# Use sk.learn decisiontree on digits dataset

### Import statements

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, tree, metrics
import graphviz

In [2]:
digits = datasets.load_digits()

### Split into 70/30 train and test

In [3]:
num_split = int(0.7*len(digits.data))
train_features = digits.data[:num_split]
train_labels =  digits.target[:num_split]
test_features = digits.data[num_split:]
test_labels = digits.target[num_split:]

print("Number of training examples: ",len(train_features))
print("Number of test examples: ",len(test_features))
print("Number of total examples:", len(train_features)+len(test_features))

Number of training examples:  1257
Number of test examples:  540
Number of total examples: 1797


### Set up classifier and fit to training data

In [4]:
classifier = tree.DecisionTreeClassifier(min_samples_split = 2) 

classifier.fit(train_features,train_labels)   # Fit to training data

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### Plot tree with graphviz

In [5]:
dot_data = tree.export_graphviz(classifier, out_file=None)  # Export a decision tree in dot format
graph = graphviz.Source(dot_data) 
graph.render("digits_train_graphviz_10", "gz_output")

'gz_output\\digits_train_graphviz_10.pdf'

### Predict the test data

In [6]:
predicted_labels = classifier.predict(test_features)

### Evaluate classifier 

In [7]:
print(metrics.classification_report(test_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87        53
           1       0.80      0.68      0.73        53
           2       0.78      0.66      0.71        53
           3       0.77      0.68      0.72        53
           4       0.75      0.84      0.79        57
           5       0.86      0.86      0.86        56
           6       0.83      0.83      0.83        54
           7       0.83      0.98      0.90        54
           8       0.72      0.60      0.65        52
           9       0.67      0.84      0.74        55

    accuracy                           0.79       540
   macro avg       0.79      0.78      0.78       540
weighted avg       0.79      0.79      0.78       540



In [8]:
print(metrics.confusion_matrix(test_labels, predicted_labels))

[[46  0  1  0  3  0  0  0  0  3]
 [ 0 36  3  1  0  0  1  2  0 10]
 [ 2  2 35  6  0  0  2  0  5  1]
 [ 0  1  4 36  0  4  0  1  3  4]
 [ 2  0  0  0 48  0  2  4  0  1]
 [ 2  0  0  0  2 48  3  1  0  0]
 [ 0  1  0  0  8  0 45  0  0  0]
 [ 0  0  0  0  0  0  0 53  0  1]
 [ 0  5  2  2  2  3  1  3 31  3]
 [ 1  0  0  2  1  1  0  0  4 46]]
