# 第 4 章 DecisionTree
## By LiuGang - 2018/11/15
## Reference Book - statistical learning method (Chinese)
### 1:  Create some data

In [17]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Toy training set: one row per sample.
# Columns: age group, has a job, owns a house, credit rating.
X = np.array([
    # age group 0
    [0, 0, 0, 0],
    [0, 0, 0, 1],
    [0, 1, 0, 1],
    [0, 1, 1, 0],
    [0, 0, 0, 0],
    # age group 1
    [1, 0, 0, 0],
    [1, 0, 0, 1],
    [1, 1, 1, 1],
    [1, 0, 1, 2],
    [1, 0, 1, 2],
    # age group 2
    [2, 0, 1, 2],
    [2, 0, 1, 1],
    [2, 1, 0, 1],
    [2, 1, 0, 2],
    [2, 0, 0, 0],
])

# Labels, one per row of X: 0 - no loan default, 1 - loan default.
y = np.array([
    0, 0, 1, 1, 0,
    0, 0, 1, 1, 1,
    1, 1, 1, 1, 0,
])

### 2: DecisionTree Class

In [47]:
class TreeNode:
    """One node of a binary tree, holding the data that reached it.

    A freshly created node has no children, no labels, no split
    information, and is not marked as a leaf; the code that builds the
    tree fills those attributes in afterwards.
    """

    def __init__(self, nx):
        """Create a detached node whose payload ``val`` is ``nx``."""
        # Children: both start out absent.
        self.left = self.right = None
        # Payload handed in by the caller, and the labels attached later.
        self.val = nx
        self.y = None
        # Set to True once the node is known to be terminal.
        self.is_leaf = False
        # Split description: feature (column) index and threshold value.
        self.feature = self.splitpoint = None
         
class DecisionTree:
    """Binary decision tree for integer-coded features and labels.

    At every node the feature with the highest information gain ratio is
    selected, and the threshold on that feature is the candidate value
    with the lowest weighted Gini index.  Samples with
    ``feature <= threshold`` go to the left child, the others to the
    right child.

    ``predict`` takes the majority label of a leaf with ``np.bincount``,
    so labels must be non-negative integers.
    """

    def __init__(self, X, y):
        """Store the training samples ``X`` and labels ``y`` on the instance."""
        self.X = X
        self.y = y

    def H(self, y):
        """Return the empirical entropy (base 2) of the values in ``y``.

        An empty ``y`` has entropy ``0.0``.
        """
        y = np.asarray(y)
        if y.size == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / y.size
        # ``+ 0.0`` turns the IEEE negative zero produced for a pure set
        # into a plain 0.0.
        return float(-np.sum(probs * np.log2(probs))) + 0.0

    def H1(self, x, y):
        """Return the conditional entropy of ``y`` given the feature ``x``.

        ``x`` and ``y`` are parallel 1-D sequences.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if x.size == 0:
            return 0.0
        total = 0.0
        for value in np.unique(x):
            mask = x == value
            total += (np.count_nonzero(mask) / x.size) * self.H(y[mask])
        return total

    def g(self, x, y):
        """Return the information gain of feature ``x`` about labels ``y``."""
        return self.H(y) - self.H1(x, y)

    def gR(self, x, y):
        """Return the information gain ratio of feature ``x`` for ``y``.

        The gain is normalised by the entropy of the feature itself (the
        split information), which is what penalises many-valued features.
        A feature with a single distinct value (or empty input) carries no
        information, so ``0.0`` is returned instead of dividing by zero.
        """
        split_info = self.H(x)
        if split_info == 0:
            return 0.0
        return self.g(x, y) / split_info

    def Gini(self, y):
        """Return the Gini index of the labels ``y``.

        An empty ``y`` yields ``1.0`` (no squared class probabilities to
        subtract), matching the historical behaviour of this method.
        """
        y = np.asarray(y)
        if y.size == 0:
            return 1.0
        _, counts = np.unique(y, return_counts=True)
        probs = counts / y.size
        return 1 - float(np.sum(probs ** 2))

    def Gini1(self, x, y):
        """Return the threshold on ``x`` with the lowest weighted Gini index.

        Candidates are the distinct values of ``x`` in ascending order;
        a candidate ``t`` splits the samples into ``x <= t`` and ``x > t``.
        The largest value is not a candidate because its right side would
        be empty.  Ties are resolved in favour of the smallest threshold.
        When ``x`` has a single distinct value, that value is returned.

        Raises:
            ValueError: if ``x`` is empty.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        candidates = np.unique(x)  # sorted distinct values
        if candidates.size == 0:
            raise ValueError("cannot choose a split point from an empty feature")
        if candidates.size == 1:
            return candidates[0]

        n_samples = x.size
        best_point = candidates[0]
        best_gini = float("inf")
        for point in candidates[:-1]:
            goes_left = x <= point
            n_left = np.count_nonzero(goes_left)
            weighted = ((n_left / n_samples) * self.Gini(y[goes_left])
                        + ((n_samples - n_left) / n_samples) * self.Gini(y[~goes_left]))
            # Strict comparison keeps the first (smallest) best threshold.
            if weighted < best_gini:
                best_point, best_gini = point, weighted
        return best_point

    def select_feature(self, X, y):
        """Return ``(column index, gain ratio)`` of the best feature in ``X``.

        ``X`` is a 2-D array with one column per feature; on ties the
        lowest column index wins.
        """
        X = np.asarray(X)
        ratios = [self.gR(X[:, col], y) for col in range(X.shape[1])]
        return np.argmax(ratios), np.max(ratios)

    def build_tree(self, root, X, y):
        """Recursively build the tree for ``X``/``y`` and return its root node.

        Args:
            root: ignored; kept so existing calls such as
                ``build_tree(None, X, y)`` keep working.
            X: 2-D array of samples (rows) by features (columns).
            y: 1-D array of labels, one per row of ``X``.

        Every node stores its samples in ``val`` and its labels in ``y``.
        A node becomes a leaf when no feature has a positive gain ratio.

        Raises:
            ValueError: if ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("cannot build a decision tree from an empty data set")

        node = TreeNode(X)
        node.y = y
        col_num, max_gR = self.select_feature(X, y)
        node.feature = col_num
        # No feature separates the labels any further: stop here.
        if np.isnan(max_gR) or max_gR <= 0:
            node.is_leaf = True
            return node

        column = X[:, col_num]
        splitpoint = self.Gini1(column, y)
        goes_left = column <= splitpoint
        # Defensive: a split that leaves one side empty cannot be recursed on.
        if goes_left.all():
            node.is_leaf = True
            return node

        node.splitpoint = splitpoint
        node.left = self.build_tree(None, X[goes_left], y[goes_left])
        node.right = self.build_tree(None, X[~goes_left], y[~goes_left])
        return node

    def predict(self, head, xdata):
        """Return the predicted label for every sample in ``xdata``.

        Args:
            head: root node returned by ``build_tree``.
            xdata: iterable of samples, each indexable by feature column.

        Each sample is routed from ``head`` down to a leaf, and the most
        frequent label stored on that leaf is its prediction.
        """
        predictions = []
        for sample in xdata:
            node = head
            while not node.is_leaf:
                if sample[node.feature] <= node.splitpoint:
                    node = node.left
                else:
                    node = node.right
            predictions.append(np.argmax(np.bincount(node.y)))
        return predictions
         

In [48]:
# Fit a tree on the training set and keep a handle on its root node.
mydt = DecisionTree(X, y)
root = mydt.build_tree(None, X, y)

# Classify the 15 training rows themselves: the samples to predict are
# exactly the rows of X, so X is passed directly.
training_predictions = mydt.predict(root, X)
print(training_predictions)

[0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]


In [36]:
# Inspect the root split: the samples stored on the left child, followed by
# the root's selected feature index and its split threshold.
print(root.left.val,root.feature,root.splitpoint)
# The samples stored on the right child and the labels kept on that node.
print(root.right.val,root.right.y)
# NOTE(review): no `error` function is defined in the visible code, so this
# call raises NameError (see the recorded output below). Presumably a
# deliberate way to halt the notebook at this point -- confirm or remove.
error(10)

[[0 0 0 0]
 [0 0 0 1]
 [0 1 0 1]
 [0 0 0 0]
 [1 0 0 0]
 [1 0 0 1]
 [2 1 0 1]
 [2 1 0 2]
 [2 0 0 0]] 2 0
[[0 1 1 0]
 [1 1 1 1]
 [1 0 1 2]
 [1 0 1 2]
 [2 0 1 2]
 [2 0 1 1]] [1 1 1 1 1 1]


NameError: name 'error' is not defined