# Use sk.learn decisiontree on digits dataset

### Import statements

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, tree, metrics
import graphviz

In [2]:
digits = datasets.load_digits()

### Split into 70/30 train and test

In [3]:
num_split = int(0.7*len(digits.data))
train_features = digits.data[:num_split]
train_labels =  digits.target[:num_split]
test_features = digits.data[num_split:]
test_labels = digits.target[num_split:]

print("Number of training examples: ",len(train_features))
print("Number of test examples: ",len(test_features))
print("Number of total examples:", len(train_features)+len(test_features))

Number of training examples:  1257
Number of test examples:  540
Number of total examples: 1797


### Set up classifier and fit to training data

In [9]:
classifier = tree.DecisionTreeClassifier(min_samples_split = 10) 

classifier.fit(train_features,train_labels)   # Fit to training data

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### Plot tree with graphviz

In [10]:
dot_data = tree.export_graphviz(classifier, out_file=None)  # Export a decision tree in dot format
graph = graphviz.Source(dot_data) 
graph.render("digits_train_graphviz_10", "gz_output")

'gz_output\\digits_train_graphviz_10.pdf'

### Predict the test data

In [11]:
predicted_labels = classifier.predict(test_features)

### Evaluate classifier 

In [12]:
print(metrics.classification_report(test_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84        53
           1       0.67      0.72      0.69        53
           2       0.66      0.72      0.68        53
           3       0.75      0.72      0.73        53
           4       0.83      0.79      0.81        57
           5       0.94      0.84      0.89        56
           6       0.85      0.81      0.83        54
           7       0.86      0.94      0.90        54
           8       0.81      0.56      0.66        52
           9       0.70      0.84      0.76        55

    accuracy                           0.78       540
   macro avg       0.79      0.78      0.78       540
weighted avg       0.79      0.78      0.78       540



In [8]:
print(metrics.confusion_matrix(test_labels, predicted_labels))

[[45  0  4  0  3  0  0  0  1  0]
 [ 0 37  3  0  1  1  1  1  1  8]
 [ 2  1 39  4  0  0  2  0  4  1]
 [ 0  1  3 38  0  5  0  1  1  4]
 [ 3  0  1  0 48  0  2  1  1  1]
 [ 2  0  0  2  0 48  2  1  1  0]
 [ 0  1  0  6  2  0 44  0  1  0]
 [ 0  0  0  0  0  1  0 51  0  2]
 [ 0  5  1  2  3  2  0  1 34  4]
 [ 1  1  0  2  2  4  0  0  3 42]]
