Decision Tree Constructor
--------------------------
This notebook works through the construction of the DecisionTree and node classes created to solve problem 1. (It will eventually be imported into its own python module)

In [1]:
import numpy as np

In [34]:
class DecisionTree:
    """Build and store a decision tree, based on supplied training data. 
       Use this tree to predict classifications."""

    def __init__(self,treedepth=5,params=None):
        self.depth = treedepth
        self.tree = None
        
    def entropy(self,C,D,c,d):
        """Calculate entropy based on classifications above and below the splitrule"""
        if C != 0:
            Cfactor = -(C/(C+D))*np.log2(C/(C+D))
        else:
            Cfactor = 0
        if D != 0:
            Dfactor = -(D/(C+D))*np.log2(D/(C+D))
        else:
            Dfactor = 0
        if c != 0:
            cfactor = -(c/(c+d))*np.log2(c/(c+d))
        else:
            cfactor = 0
        if d != 0:
            dfactor = -(d/(c+d))*np.log2(d/(c+d))
        else:
            dfactor = 0
        H_left = Cfactor + Dfactor
        H_right = cfactor + dfactor
        H = ((C+D)*H_left + (c+d)*H_right)/(C+D+c+d)
        
        return H
    
    
    def segment(self,data,labels):
        totals = np.bincount(labels)
        if len(totals)==1:
            np.append(totals,[0])
        print(totals)
        # Quick safety check
        print(data,labels)
        if len(labels) != len(data):
            print('ERROR: There must be the same number of labels as datapoints.')
        
        # Calculate the initial entropy, used to find info gain
        C,D = 0,0                      # C = in class left of split; D = not in class left of split
        c,d = totals[1],totals[0]      # c = in class right of split; d = not in class right of split
        H_i = self.entropy(C,D,c,d) # the initial entropy, before any splitting
        
        # Initialize objects to store optimal split rules for iterative comparison
        maxinfogain = 0
        splitrule = []   
        
        for feature_i in range(len(data)):
            # Order the data for determining ideal splits
            lbldat = np.concatenate(([data[:,feature_i]],[labels]),axis=0)
            fv = np.sort(lbldat.T,axis=0)
            lastfeature = np.array(['',''])
            
            C,D = 0,0                      # Reset the counters
            c,d = totals[1],totals[0]
            
            for point_i in range(len(fv)-1):
                
                # Update C,D,c,d to minmize runtime of entrop calc (keep at O(1) time)
                if fv[point_i,1] == 1:
                    C += 1
                    c -= 1
                elif fv[point_i,1] == 0:
                    D += 1
                    d -= 1
                else:
                    print("ERROR: Classifications can only be 0 or 1.")
                
                # Skip splitting values that are not separable
                if fv[point_i,0] == fv[point_i+1,0]:
                    continue
                else:
                    H_f = self.entropy(C,D,c,d)
                    infogain = H_i-H_f
                    if infogain > maxinfogain:
                        maxinfogain = infogain
                        splitrule = [feature_i,fv[point_i,0]]
        return splitrule
            
    def train(self,data,labels):
        # Grow decision tree
        depthlim = self.depth
        splitrule = self.segment(data,labels)
        node = self.Node(data,labels,splitrule)
        if node.nodetype == 'LeafNode':
            print('You made a leaf node!')
            self.tree = node
        elif node.nodetype == 'SplitNode':
            print('Splitting node left and right')
            node.left = self.train(node.leftdata,node.leftlabels)
            node.right = self.train(node.rightdata,node.rightlabels)
            #restrict occurences based on depthlim
        else:
            print('ERROR: The node type could not be identified!')
          
    #def predict(self,data):
        
        #return predictions
    
    
    class Node:
        """Store a decision tree node, coupled in series to construct tree;
        includes a left branch, right branch, and splitrule"""
    
        def __init__(self,data,labels,splitrule):
            # Determine if this is a leaf node
            if splitrule:
                indsabove = self.datainds_above_split(data,splitrule)
                
                self.rule = splitrule
                self.left = None        # To be defined on next call to DecisionTree.train(.)
                self.leftdata,self.leftlabels = self.leftDL(data,labels,indsabove)
                self.right = None       # To be defined on next call to DecisionTree.train(.)
                self.rightdata,self.rightlabels = self.rightDL(data,labels,indsabove)
                
                self.nodetype = 'SplitNode'
            elif not splitrule:
                self.nodetype = 'LeafNode'
            #   self.label = <majority voting>
        
        def datainds_above_split(self,data,splitrule):
            # Collect indices of points with values of the splitting feature greater than the split rule
            indsabove = []
            fv = data[:,splitrule[0]]
            for point_i in range(len(fv)):
                if fv[point_i] > splitrule[1]:
                    indsabove.append(point_i)
            return indsabove
        
        def leftDL(self,data,labels,indsabove):
            # Return arrays of only left data and labels
            leftdata = np.delete(data,indsabove,axis=0)
            leftlabels = np.delete(labels,indsabove,axis=0)
            return leftdata,leftlabels
        
        def rightDL(self,data,labels,indsabove):
            # Return arrays of only right data and labels
            rightdata = data[indsabove]
            rightlabels = labels[indsabove]
            return rightdata,rightlabels
        

In [35]:
classifier = DecisionTree()

In [36]:
data = np.array([[2,2,2,5],[1,1,1,2],[2,1,5,3],[8,1,2,1]])
labels = np.array([1,1,0,1])

In [37]:
classifier.train(data,labels)

[1 3]
[[2 2 2 5]
 [1 1 1 2]
 [2 1 5 3]
 [8 1 2 1]] [1 1 0 1]
Splitting node left and right
[0 1]
[[1 1 1 2]] [1]
You made a leaf node!
[1 2]
[[2 2 2 5]
 [2 1 5 3]
 [8 1 2 1]] [1 0 1]
Splitting node left and right
[1 1]
[[2 2 2 5]
 [2 1 5 3]] [1 0]
Splitting node left and right
[1]
[[2 1 5 3]] [0]


IndexError: index 1 is out of bounds for axis 0 with size 1

In [38]:
x=np.array([1,2])
print(x)
print(np.append(x,[0]))

[1 2]
[1 2 0]
