In [91]:
from sklearn.datasets import load_digits
from sklearn import cross_validation
import numpy as np
import pylab as plt
from copy import deepcopy
from queue import PriorityQueue
%matplotlib inline

##Data loading and preparation

In [92]:
#plotting 64, array as (8,8) gray picture
def image_show(data, title):
    data.shape = (8,8)
    plt.gray()
    plt.imshow(data)
    plt.title(title)
    plt.show()
    return

#loading data set
digits = load_digits()

data = digits['data']
images = digits['images']
target = digits['target']
target_names = digits['target_names']

x_all = data
y_all = target

#splitting data
x_train, x_test, y_train, y_test = \
    cross_validation.train_test_split(data, target, \
                                     test_size=0.4, random_state=0)

#extract threes from training set
x_threes = x_train[np.where(y_train == 3)]

## Tree

In [100]:
class Node:
    
    threshold_number = 10
    
    def __init__(self, data, domain, split_dim, total_score, tree):
        self.left_child = None
        self.right_child = None
        self.pot_left_child = None
        self.pot_right_child = None
        self.features = data
        self.split_dim = split_dim
        self.total_score = total_score
        self.domain = domain
        self.tree = tree
        self.volume = np.prod([self.domain[i, 1] - self.domain[i, 0] \
                               for i in range(len(self.domain))])
        self.thresholds = self.thresholds()
        
        self.best_score = float('inf')
        self.left_domain = deepcopy(self.domain)
        self.right_domain = deepcopy(self.domain)
        self.left_features = None
        self.right_features = None
        self.left_score = None
        self.right_score = None
        
        self.score()
        
        tree.queue.put(self, self.best_score)
        
    def thresholds(self):
        lower = self.domain[self.split_dim, 0]
        upper = self.domain[self.split_dim, 1]
        dist = upper - lower
        step = dist / (self.threshold_number + 1)
        thresholds = []
        t = lower
        for i in range(self.threshold_number):
            t += step
            thresholds.append(t)
        return thresholds
            
    def score(self):
        next_split = tree.next_split(self.split_dim)
        self.left_child = Node(self.left_features, self.left_domain, next_split, self.left_score, \
                               self.tree)
        
        for t in self.thresholds:
            print('\nthreshold', t)
            left = self.features[np.where(self.features[:, self.split_dim] <= t)]
            print('left.shape:', left.shape)
            right = self.features[np.where(self.features[:, self.split_dim] > t)]
            print('right.shape:', right.shape)
            left_fraction = (t - self.domain[self.split_dim, 0]) / \
                               (self.domain[self.split_dim, 1] - self.domain[self.split_dim, 0])
            print('left fraction:', left_fraction)
            right_fraction = (self.domain[self.split_dim, 1] - t) / \
                              (self.domain[self.split_dim, 1] - self.domain[self.split_dim, 0])
            print('right fraction', right_fraction)
            p_left = len(left) / self.tree.num_instances / self.volume / left_fraction
            print('p_left', p_left)
            p_right = len(right) / self.tree.num_instances / self.volume / right_fraction
            print('p_right', p_right)
            left_score = p_left**2 * left_fraction * self.volume
            right_score = p_right**2 * right_fraction * self.volume
            score = left_score + right_score - self.total_score
            print('threshold:', t, 'gives score:', score)
            if score < self.best_score:
                self.best_score = score
                self.left_domain[self.split_dim, 1] = t
                self.right_domain[self.split_dim, 0] = t
                self.left_features = left
                self.right_features = right    
        return
    
    def split(self):
        next_split = tree.next_split(self.split_dim)
        self.left_child = Node(self.left_features, self.left_domain, next_split, self.left_score, \
                               self.tree)
        self.right_child = Node(self.right_features, self.right_domain, next_split, self.right_score, \
                               self.tree)
        self.tree.leaf_count += 1
        return

class Tree():
     
    def __init__(self, data, domain, num_leaves):
        self.data = data
        self.domain = domain
        self.num_leaves = num_leaves
        self.num_instances = len(data)
        self.volume = np.prod([self.domain[i, 1] - self.domain[i, 0] \
                               for i in range(len(self.domain))])
        self.initial_score = 1 / self.volume
        self.root = Node(self.data, self.domain, 0, self.initial_score, self)
        self.queue = PriorityQueue()
        self.leaf_count = 1
        
         
    def next_split(self, current):
        if current == data.shape[1] - 1:
            return 0
        else:
            return current + 1
        
    def build(self):
        while self.leaf_count < self.num_leaves:
            print('\nnumber of leaves now:', self.leaf_count)
            leaf_to_split = self.queue.get()
            leaf_to_split.split()
        return 

In [101]:
domain = np.empty((64, 2))
domain[:, 0] = 0
domain[:, 1] = 16

tree = Tree(x_threes, domain, 10)
root = Node(x_threes, domain, 50, 0, tree)
tree.build()

<class 'numpy.ndarray'> (113, 64)


RuntimeError: maximum recursion depth exceeded in comparison

115792089237316195423570985008687907853269984665640564039457584007913129639936