## Classification Tree

### Terms

* _root node_ is the starting point of the classification tree
* _node_ is a subset of the set of variables; it can be either a terminal or a nonterminal node.
* _nonterminal (parent) node_ is a node that splits into two children nodes
* _terminal node_ is a node that does not split

### Node impurity
Consider $K$ classes with corresponding probabilities $(p_1,\dots,p_K)$.

$n$ is the total number of nodes, and $n_R, n_L$ are the numbers of right and left nodes.

* Entropy impurity
    \begin{equation}
        i(\tau) = -\sum^K_{k=1}p(k|\tau)\,\text{log}\,p(k|\tau),
    \end{equation}

    with binary classification
    \begin{equation}
        i(\tau) = -p\,\text{log}\,p - (1-p)\,\text{log}\,(1-p),
    \end{equation}

    where we set $p = p(1|\tau)$

* Gini impurity
    \begin{equation}
        i(\tau) = \sum_{k \neq k'} p(k|\tau) p(k'|\tau) = 1 - \sum_k \{p(k|\tau) \}^2,
    \end{equation}

    with binary classification
    \begin{equation}
        i(\tau) = 2p(1-p),
    \end{equation}
    
    where $p = \frac{\text{number of samples with } k=1}{\text{total number of samples}}$

In [47]:
from abc import ABCMeta
from ..externals import six

# General Tree interface
class Tree:
    """Plain record holding the description of a tree.

    Every constructor argument is stored, unchanged, on the attribute of the
    same name: ``n_features``, ``n_classes``, ``max_depth``, ``node_count``,
    ``nodes`` and ``value``. No validation or copying is performed.
    """

    def __init__(self, n_features, n_classes,
                 max_depth, node_count, nodes, value):
        # Paired assignments; each attribute mirrors its argument one-to-one.
        self.n_features, self.n_classes = n_features, n_classes
        self.max_depth, self.node_count = max_depth, node_count
        self.nodes, self.value = nodes, value

class DepthFirstTreeBuilder():
    """Builder intended to grow a tree depth-first.

    Only the configuration is stored for now; the building algorithm itself
    has not been written yet.

    Parameters
    ----------
    max_depth
        Depth setting kept on ``self.max_depth`` for use by ``build``.
    """

    def __init__(self, max_depth):
        self.max_depth = max_depth

    def build(self, Tree):
        """Build the given tree (not implemented yet).

        Parameters
        ----------
        Tree
            The tree object to populate. The parameter name is kept as-is for
            keyword-argument compatibility, although it shadows the ``Tree``
            class inside this method.

        Raises
        ------
        NotImplementedError
            Always. The method previously had no body at all, which made the
            whole module fail to compile.
        """
        raise NotImplementedError()
    
class Criterion(six.with_metaclass(ABCMeta, object)):
    """Base class for impurity criteria.

    Stores the data set on ``self.X`` and ``self.y`` and declares
    ``impurity_score``, which subclasses are expected to override.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def impurity_score(self):
        """Return the impurity score; the base class always raises.

        Raises
        ------
        NotImplementedError
            Always, because the base class provides no scoring rule.
        """
        raise NotImplementedError()
        
class Splitter():
    """Binary partitioner of a labelled data set on one feature threshold.

    Parameters
    ----------
    X
        Sequence of samples; each sample is indexable by feature position.
    y
        Sequence of labels, aligned with ``X``.
    feature_idx
        Index of the feature that ``split`` compares.
    split_value
        Threshold used by ``split``; samples whose feature value is strictly
        below it go to the left group.
    """

    def __init__(self, X, y, feature_idx, split_value):
        self.X = X
        self.y = y
        self.feature_idx = feature_idx
        self.split_value = split_value

    def _partition(self, feature_idx, split_value):
        """Partition the samples on ``row[feature_idx] < split_value``."""
        left = []
        right = []
        for row, label in zip(self.X, self.y):
            # Keep the label as the last item of each entry so that a
            # criterion can read it back as ``entry[-1]``.
            target = left if row[feature_idx] < split_value else right
            target.append((row, label))
        return left, right

    def split(self):
        """Split the data on ``self.split_value`` of feature ``self.feature_idx``.

        Returns
        -------
        (left, right)
            Two lists of ``(row, label)`` pairs. ``left`` holds the samples
            whose feature value is strictly less than the threshold, ``right``
            holds all the others.
        """
        return self._partition(self.feature_idx, self.split_value)

    def best_split(self, criterion):
        """Try every observed value of every feature and keep the purest split.

        Parameters
        ----------
        criterion
            Callable invoked as ``criterion(X, y, groups)`` that returns an
            object exposing ``impurity_score()``. Lower scores are treated as
            better.

        Returns
        -------
        tuple or None
            ``(feature_idx, split_value, impurity_score, groups)`` for the
            lowest-scoring candidate, where ``groups`` is the ``(left, right)``
            pair produced by that candidate. ``None`` when ``X`` is empty.
            On ties the first candidate encountered wins. The instance's own
            ``feature_idx`` / ``split_value`` are left untouched.
        """
        if len(self.X) == 0:
            return None

        best = None
        # The number of features comes from the samples, not from the labels.
        n_features = len(self.X[0])
        for feature_idx in range(n_features):
            for row in self.X:
                split_value = row[feature_idx]
                groups = self._partition(feature_idx, split_value)
                score = criterion(self.X, self.y, groups).impurity_score()
                if best is None or score < best[2]:
                    best = (feature_idx, split_value, score, groups)
        return best
                
class Gini(Criterion):
    """Gini impurity of a candidate split.

    Parameters
    ----------
    X
        Sequence of all samples under consideration; only its length is used.
    y
        Sequence of all labels; its distinct values define the classes.
    groups
        Iterable of groups (for a binary split: left and right). Each group is
        a sequence of entries whose last item, ``entry[-1]``, is the label.
    """

    def __init__(self, X, y, groups):
        super(Gini, self).__init__(X, y)
        self.groups = groups

    def impurity_score(self):
        """Return the size-weighted Gini impurity over all groups.

        For each non-empty group the impurity is ``1 - sum_k p_k ** 2``, where
        ``p_k`` is the fraction of the group's entries labelled ``k``. Each
        group's impurity is weighted by ``len(group) / len(X)`` and the
        weighted values are summed. ``0.0`` means every group is pure.
        """
        n_samples = len(self.X)
        classes = set(self.y)
        weighted_total = 0.0
        for group in self.groups:
            size = len(group)
            if size == 0:
                # An empty group contributes nothing; skipping it also
                # avoids a division by zero below.
                continue
            labels = [entry[-1] for entry in group]
            # float() keeps the ratios fractional under Python 2 as well.
            purity = sum((labels.count(k) / float(size)) ** 2 for k in classes)
            weighted_total += (1.0 - purity) * (size / float(n_samples))
        return weighted_total
                    
class DecisionTree:
    """Skeleton of a decision-tree estimator.

    Parameters
    ----------
    criterion
        Identifier of the impurity criterion; stored on ``self.criterion``.
        Defaults to ``'Gini'``.
    max_depth
        Depth setting stored on ``self.max_depth``. It now defaults to
        ``None`` because a parameter without a default may not follow one
        that has a default. Nothing in this class reads the value yet.
    """

    def __init__(self, criterion='Gini', max_depth=None):
        self.criterion = criterion
        self.max_depth = max_depth

    def fit(self, X, y):
        """Record the shape of the training data.

        Parameters
        ----------
        X
            Two-dimensional array-like exposing ``.shape`` as
            ``(n_samples, n_features)``.
        y
            Labels; currently unused because tree construction is not
            implemented yet.

        After the call ``self.n_features_`` holds the number of columns of
        ``X``.
        """
        _, self.n_features_ = X.shape

    def apply(self, X):
        """Map samples to leaves (not implemented yet).

        Raises
        ------
        NotImplementedError
            Always. The method previously had no body, which made the module
            fail to compile.
        """
        raise NotImplementedError()

# References
[1] http://avesbiodiv.mncn.csic.es/estadistica/curso2011/regm38.pdf

[2] https://machinelearningmastery.com/implement-decision-tree-algorithm-scratch-python/

[3] https://github.com/scikit-learn/scikit-learn/tree/master/sklearn/tree

[4] http://scikit-learn.org/stable/modules/tree.html#tree-algorithms-id3-c4-5-c5-0-and-cart