# Preliminaries

In [None]:
# import modules
import numpy as np
... # your code here

In [4]:
# base classes

class Node:
    '''
    Plain attribute container for the nodes of a tree.

    A node has no predefined fields; the training code attaches whatever it
    needs (e.g. 'data', 'feature', 'threshold', 'left', 'right') directly
    as attributes.
    '''

class Tree:
    '''
    Common base of the trees: owns the root node and implements the descent
    from the root to the leaf an instance belongs to.
    '''
    def __init__(self):
        # every tree starts out as a single node without any attributes
        self.root = Node()

    def find_leaf(self, x):
        '''
        x: the feature vector of a single instance (indexed by feature index)

        Returns the leaf node that 'x' ends up in.
        '''
        current = self.root
        # Only split nodes carry a 'feature' attribute, so the first node
        # without one is the leaf we are looking for.
        while hasattr(current, "feature"):
            goes_left = x[current.feature] <= current.threshold
            current = current.left if goes_left else current.right
        return current

# Density Tree

In [17]:
class DensityTree(Tree):
    '''
    Density tree for the instances of one class (digit).

    'train()' prepares the root node and runs the stack-based construction
    loop; 'predict()' looks up the leaf an instance falls into. The split
    and leaf construction as well as the returned response are still
    placeholders ('... # your code here').
    '''
    def __init__(self):
        super(DensityTree, self).__init__()
        
    def train(self, data, prior):
        '''
        data: the feature matrix for the digit under consideration
              (2-dimensional, unpacked below as N instances x D features)
        prior: the prior probability of this digit (stored in 'self.prior';
               this is the p(y) factor mentioned in 'predict()')
        '''
        self.prior = prior
        N, D = data.shape
        D_try = int(np.sqrt(D)) # number of features to consider for each split decision
        # NOTE(review): 'D_try' is derived from all D features, while the split
        # candidates are to be drawn from 'valid_features' only. If fewer than
        # 'D_try' features are valid, drawing without replacement would fail --
        # keep this in mind when filling in the split branch below.

        # filter features and initialize bounding box
        # (If m[j] == M[j] for some j, the bounding box has zero volume, 
        #  causing divide-by-zero later on. We must ignore these features
        #  and adjust the bounding box accordingly.)
        m, M = np.min(data, axis=0), np.max(data, axis=0) # per-feature minimum / maximum
        valid_features   = np.where(m != M)[0]
        invalid_features = np.where(m == M)[0]
        # give constant features an extent of 1 so that the box volume stays non-zero
        M[invalid_features] = m[invalid_features] + 1

        # initialize the root node
        self.root.data = data
        self.root.box = m.copy(), M.copy() # (lower, upper) corner of the bounding box
        stack = [self.root] # nodes that still have to be processed

        n_min = 20 # termination criterion: don't split if node contains fewer instances
        while len(stack):
            node = stack.pop() # takes the most recently added node
            n = node.data.shape[0] # number of instances in present node
            if n >= n_min:
                # Call 'make_density_split_node()' with 'D_try' randomly selected 
                # indices from 'valid_features'. This turns 'node' into a split node
                # and returns the two children, which must be placed on the 'stack'.
                ... # your code here
            else:
                # Call 'make_density_leaf_node()' to turn 'node' into a leaf node.
                ... # your code here

    def predict(self, x):
        '''
        x: the feature vector of a single instance

        Intended to return p(x | y) * p(y) for the digit y this tree was
        trained on; as long as the placeholder below is not replaced, the
        '...' object is returned instead.
        '''
        leaf = self.find_leaf(x)
        # compute p(x | y) * p(y)
        return ... # your code here

In [10]:
def make_density_split_node(node, N, feature_indices):
    '''
    Turn 'node' into a split node of a density tree and create its children.

    node: the node to be split
    N:    the total number of training instances for the current class
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split

    returns: the tuple (left, right) of newly created child nodes
             (as long as the placeholders below are not filled in, the
             children are returned without any attributes)
    '''
    n, D = node.data.shape # instances in this node, number of features
    m, M = node.box # bounding box of this node (stored as a pair by 'DensityTree.train()')

    # find best feature j (among 'feature_indices') and best threshold t for the split
    # Hint: For each feature considered, first remove duplicate feature values using 
    # 'np.unique()'. Describe here why this is necessary.
    ... # your code here

    # create children
    left = Node()
    right = Node()
    
    # initialize 'left' and 'right' with the data subsets and bounding boxes
    # according to the optimal split found above
    # (the training loop reads 'data' from every node it pops off the stack)
    ... # your code here

    # turn the current 'node' into a split node
    # (store children and split condition)
    # ('Tree.find_leaf()' reads 'feature', 'threshold', 'left' and 'right'
    #  from every split node)
    ... # your code here

    # return the children (to be placed on the stack)
    return left, right

In [18]:
def make_density_leaf_node(node, N):
    '''
    Turn 'node' into a leaf node of a density tree.

    node: the node to become a leaf
    N:    the total number of training instances for the current class

    The body is still a placeholder, so at present the node is left unchanged
    and nothing is returned.
    '''
    # compute and store leaf response
    # (the node must not receive a 'feature' attribute, because
    #  'Tree.find_leaf()' treats every node having one as a split node)
    ... # your code here

# Decision Tree

In [None]:
class DecisionTree(Tree):
    '''
    Decision tree trained on the instances of all digits together.

    'train()' prepares the root node and runs the stack-based construction
    loop; 'predict()' looks up the leaf an instance falls into. The split
    and leaf construction as well as the returned response are still
    placeholders ('... # your code here').
    '''
    def __init__(self):
        super(DecisionTree, self).__init__()
        
    def train(self, data, labels):
        '''
        data: the feature matrix for all digits
              (2-dimensional, unpacked below as N instances x D features)
        labels: the corresponding ground-truth responses
        '''
        N, D = data.shape
        D_try = int(np.sqrt(D)) # how many features to consider for each split decision

        # initialize the root node
        self.root.data = data
        self.root.labels = labels
        stack = [self.root] # nodes that still have to be processed

        n_min = 20 # termination criterion: don't split if node contains fewer instances
        while len(stack):
            node = stack.pop() # takes the most recently added node
            n = node.data.shape[0] # number of instances in present node
            # split only nodes that are large enough and still mix several digits
            if n >= n_min and not node_is_pure(node):
                # Call 'make_decision_split_node()' with 'D_try' randomly selected 
                # feature indices. This turns 'node' into a split node
                # and returns the two children, which must be placed on the 'stack'.
                ... # your code here
            else:
                # Call 'make_decision_leaf_node()' to turn 'node' into a leaf node.
                ... # your code here
                
    def predict(self, x):
        '''
        x: the feature vector of a single instance

        Intended to return p(y | x); as long as the placeholder below is not
        replaced, the '...' object is returned instead.
        '''
        leaf = self.find_leaf(x)
        # compute p(y | x)
        return ... # your code here

In [22]:
def make_decision_split_node(node, feature_indices):
    '''
    Turn 'node' into a split node of a decision tree and create its children.

    node: the node to be split
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split

    returns: the tuple (left, right) of newly created child nodes
             (as long as the placeholders below are not filled in, the
             children are returned without any attributes)
    '''
    n, D = node.data.shape # instances in this node, number of features

    # find best feature j (among 'feature_indices') and best threshold t for the split
    ... # your code here

    # create children
    left = Node()
    right = Node()
    
    # initialize 'left' and 'right' with the data subsets and labels
    # according to the optimal split found above
    # (the training loop reads 'data' from every node it pops off the stack)
    ... # your code here

    # turn the current 'node' into a split node
    # (store children and split condition)
    # ('Tree.find_leaf()' reads 'feature', 'threshold', 'left' and 'right'
    #  from every split node)
    ... # your code here

    # return the children (to be placed on the stack)
    return left, right    

In [32]:
def make_decision_leaf_node(node):
    '''
    Turn 'node' into a leaf node of a decision tree.

    node: the node to become a leaf

    The body is still a placeholder, so at present the node is left unchanged
    and nothing is returned.
    '''
    # compute and store leaf response
    # (the node must not receive a 'feature' attribute, because
    #  'Tree.find_leaf()' treats every node having one as a split node)
    ... # your code here

In [25]:
def node_is_pure(node):
    '''
    check if 'node' contains only instances of the same digit

    node: a node whose 'labels' attribute holds the ground-truth digits of
          the instances assigned to it

    returns: True if at most one distinct label occurs in 'node.labels',
             False otherwise
    '''
    # 'np.unique()' collapses repeated labels, so a pure node is left with a
    # single entry. A node without any instances has no conflicting labels
    # and therefore counts as pure as well.
    # (The previous placeholder returned '...', which is always truthy, so
    #  every node was reported as pure.)
    return bool(np.unique(node.labels).size <= 1)

# Evaluation of Density and Decision Tree

In [None]:
# read and prepare the digits data
# (the classes above expect a 2-dimensional feature matrix -- instances in
#  rows, features in columns -- and, for the decision tree, the ground-truth
#  label of every instance)
... # your code here

In [None]:
# train trees, plot training error confusion matrices, and comment on your results
# ('DensityTree.train()' takes the data of a single digit plus its prior,
#  'DecisionTree.train()' takes the data of all digits plus their labels)
... # your code here

# Density and Decision Forest

In [7]:
class DensityForest():
    '''
    Ensemble of 'n_trees' density trees for one digit.

    Training and prediction are still placeholders ('... # your code here').
    '''
    def __init__(self, n_trees):
        '''
        n_trees: the number of 'DensityTree' instances in the ensemble
        '''
        # create ensemble
        self.trees = [DensityTree() for i in range(n_trees)]
    
    def train(self, data, prior):
        '''
        data: the feature matrix for the digit under consideration
        prior: the prior probability of this digit
        (same arguments as 'DensityTree.train()')
        '''
        for tree in self.trees:
            # train each tree, using a bootstrap sample of the data
            ... # your code here

    def predict(self, x):
        '''
        x: the feature vector of a single instance

        Intended to combine the predictions of all trees; as long as the
        placeholder below is not replaced, the '...' object is returned.
        '''
        # compute the ensemble prediction
        return ... # your code here

In [35]:
class DecisionForest():
    '''
    Ensemble of 'n_trees' decision trees.

    Training and prediction are still placeholders ('... # your code here').
    '''
    def __init__(self, n_trees):
        '''
        n_trees: the number of 'DecisionTree' instances in the ensemble
        '''
        # create ensemble
        self.trees = [DecisionTree() for i in range(n_trees)]
    
    def train(self, data, labels):
        '''
        data: the feature matrix for all digits
        labels: the corresponding ground-truth responses
        (same arguments as 'DecisionTree.train()')
        '''
        for tree in self.trees:
            # train each tree, using a bootstrap sample of the data
            # (data rows and labels have to be resampled with the same indices
            #  so that every instance keeps its label)
            ... # your code here

    def predict(self, x):
        '''
        x: the feature vector of a single instance

        Intended to combine the predictions of all trees; as long as the
        placeholder below is not replaced, the '...' object is returned.
        '''
        # compute the ensemble prediction
        return ... # your code here

# Evaluation of Density and Decision Forest

In [None]:
# train forests (with 20 trees per forest), plot training error confusion matrices, and comment on your results
# (the number of trees is the 'n_trees' constructor argument of both
#  'DensityForest' and 'DecisionForest')
... # your code here