In [1]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from scipy.spatial import distance_matrix

In [2]:
from sklearn.metrics import accuracy_score

In [3]:
data_train = pd.read_csv('data/spam/spam.train', delimiter=' ', header=None).values
data_train, target_train = data_train[:, 1:], data_train[:, 0]
data_test = pd.read_csv('data/spam/spam.test', delimiter=' ', header=None).values
data_test, target_test = data_test[:, 1:], data_test[:, 0]

In [11]:
clf = DecisionTreeClassifier()

In [12]:
clf.fit(data_train, target_train)
# clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [15]:
clf.tree_.max_depth

22

In [17]:
# accuracy_score(clf.predict(data_test), target_test)
accuracy_score(target_test, clf.predict(data_test))

0.79266109785202865

# CART with kNN on leaves

In [3]:
X = pd.read_csv('data/iris/iris.data', header=None).values

In [4]:
y = X[:, -1]
X = X[:, :-1]

In [6]:
import time

In [4]:
class Tree(object):
    
    class Node(object):
        def __init__(self, index, splitter):
            self.index = index
            self.splitter = splitter
            self.left = None
            self.right = None

    def __init__(self):
        self.root = self.Node(None, None)
        self.depth = 0

    def __str__(self):
        if self.root is not None:
            self._print(self.root)

        return ''

    def _print(self, node):
        self.depth += 1
        if node is not None:
            print('\t' * (self.depth - 1), node.index.size, node.splitter)
            self._print(node.left)
            self._print(node.right)
        
        self.depth -= 1

    def insert(self, data):
        if self.root is None:
            self.root = self.Node(data)

        else:
            self._insert(self.root, data)

    def _insert(self, node, data):
        if node.data < data:
            if node.right is None:
                node.right = self.Node(data)
            else:
                self._insert(node.right, data)
                
        else:
            if node.left is None:
                node.left = self.Node(data)
            else:
                self._insert(node.left, data)

In [5]:
class DecisionTree(Tree):
    
    def __init__(self, criterion='gini', max_depth=100, min_samples_leaf=1):
        """
        criterion : string, optional (default="gini")
            The function to measure the quality of a split. Supported criteria are
            "gini" for the Gini impurity and "entropy" for the information gain.
        """
        super().__init__()
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
    
    def fit(self, X, y):
        self.X = X
        self.y = y
        if self.criterion == 'gini':
            self._fit(self.root, np.arange(X.shape[0]), self.gini)
            
        elif self.criterion == 'entropy':
            self._fit(self.root, np.arange(X.shape[0]), self.entropy)
            
        else:
            return "Error"
        
    def _fit(self, node, index, F, imp=1):
        self.depth += 1
        print('\t' * (self.depth - 1) + "DEPTH:", self.depth, imp)
        print('\t' * (self.depth - 1), np.unique(self.y[index], return_counts=True))
        node.index = index
        if index.size > self.min_samples_leaf and \
            self.depth < self.max_depth and \
            imp > 1e-5:
            min_info = 100000
#             TIME = []
            for i, j in enumerate(self.X[index].T):
                for s in set(j):
                    indL = index[j <= s]
                    indR = index[j > s]
#                     _time = time.clock()
                    if indL.size > 0 and indR.size > 0: #True
                        crit = self.criterion_func(indL, indR, F)
                        if np.sum(crit) < np.sum(min_info):
                            min_info = crit
                            splitter = (i, s)
                            indxs = indL, indR
#                     TIME += [time.clock() - _time]
                    
#             print(sum(TIME), np.average(TIME))
                        
            
            print('\t' * (self.depth - 1), splitter, index.size)
            node.splitter = splitter

#             print('left', indxs[0])
            node.left = self.Node(None, None)
            self._fit(node.left, indxs[0], F, min_info[0])

#             print('right', indxs[1])
            node.right = self.Node(None, None)
            self._fit(node.right, indxs[1], F, min_info[1])
        
        self.depth -= 1

    def predict(self, X):
        y = []
        for x in X:
            y += [self._predict(x, self.root)]
            
        return np.array(y)
    
    def _predict(self, x, node):
        if node.splitter is None:
#             return self.maxClass(node.index)
            return self.kNN(node.index, x)
        
        if x[node.splitter[0]] <= node.splitter[1]:
            return self._predict(x, node.left)

        else:
            return self._predict(x, node.right)
    
    def maxClass(index):
        ys = np.unique(self.y[index], return_counts=True)
        return ys[0][ys[1].argmax()]
    
    def kNN(self, index, x, neighborsNum = 5):
        """
            index: numpy.ndarray - Nx1, leaf data index
            x: numpy.ndarray - LxM, predict data
            neighborsNum: number of neighbors
        """
        if type(x) is not 'numpy.ndarray':
            x = np.array([x])
        dist = distance_matrix(x, self.X[index])
            
        knn = self.y[index][dist.argsort(axis=1)[:, :neighborsNum]]
        ret = []
        for n in knn:
            tmp = np.unique(n, return_counts=True)
            ret += [tmp[0][tmp[1].argmax()]]
            
        return ret

    def gini(self, index):
        p = np.unique(self.y[index], return_counts=True)[1] / index.size
        return 1 - np.sum(p ** 2)
    
    def entropy(self, index):
        p = np.unique(self.y[index], return_counts=True)[1] / index.size
        return -np.sum(p * np.log(p))
    
    def criterion_func(self, indL, indR, F):
        return np.array([indL.size * F(indL), indR.size * F(indR)]) / (indL.size + indR.size)

In [6]:
clf = DecisionTree()

In [7]:
%time clf.fit(data_train, target_train)

DEPTH: 1 1
 (array([ 0.,  1.]), array([2970, 4123]))
 (48, 0.18047121725464502) 7093
	DEPTH: 2 0.0574611382095
	 (array([ 0.,  1.]), array([1788,  230]))
	 (16, 0.54206486629448603) 2018
		DEPTH: 3 0.14372467862
		 (array([ 0.,  1.]), array([1765,  158]))
		 (35, 0.30219566852109103) 1923
			DEPTH: 4 0.0526037570428
			 (array([ 0.,  1.]), array([170,  72]))
			 (4, 0.70873673642436208) 242
				DEPTH: 5 0.337726155908
				 (array([ 0.,  1.]), array([168,  54]))
				 (52, 0.49087009732179104) 222
					DEPTH: 6 0.150188898576
					 (array([ 0.,  1.]), array([136,  19]))
					 (9, 0.43039642475294498) 155
						DEPTH: 7 0.0645161290323
						 (array([ 0.,  1.]), array([10, 10]))
						 (18, 0.46465942216244704) 20
							DEPTH: 8 0.0
							 (array([ 0.]), array([8]))
							DEPTH: 8 0.166666666667
							 (array([ 0.,  1.]), array([ 2, 10]))
							 (7, 0.499876035841536) 12
								DEPTH: 9 0.0
								 (array([ 1.]), array([10]))
								DEPTH: 9 0.0
								 (array([ 0.]), array([2]))

In [8]:
print(clf)

 7093 (48, 0.18047121725464502)
	 2018 (16, 0.54206486629448603)
		 1923 (35, 0.30219566852109103)
			 242 (4, 0.70873673642436208)
				 222 (52, 0.49087009732179104)
					 155 (9, 0.43039642475294498)
						 20 (18, 0.46465942216244704)
							 8 None
							 12 (7, 0.499876035841536)
								 10 None
								 2 None
						 135 (53, 0.18372650920247099)
							 9 (8, 0.43438369976739599)
								 6 None
								 3 None
							 126 (32, 0.99226632190185204)
								 125 (47, 0.89796988042193904)
									 124 (9, 0.71732505949140801)
										 122 None
										 2 (0, 0.41180754395510294)
											 1 None
											 1 None
									 1 None
								 1 None
					 67 (43, 0.33127550654438498)
						 50 (53, 0.57300742849742192)
							 39 (23, 0.78118076498675892)
								 35 (42, 0.18885959401935898)
									 3 (0, 0.32035649327145199)
										 1 None
										 2 None
									 32 (0, 0.32035649327145199)
										 1 None
										 31 None
								 4 (0, 0.44408005430866493)
	

In [9]:
pred = clf.predict(data_test)

In [10]:
accuracy_score(target_test, pred)

0.41268894192521877