# Exercise 6 - Lucas Oswald, Maximilian Stucke, Milon Miah

In [1]:
# import modules
import numpy as np
from sklearn.datasets import load_digits
from abc import abstractmethod
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Base Classes

In [2]:
class Node:
    '''
      this class will later get the following attributes
      all nodes:
          features
          responses
      split nodes additionally:
          left
          right
          split_index
          threshold
      leaf nodes additionally
          prediction
    '''
        

class Tree:
    '''
      base class for RegressionTree and ClassificationTree
    '''
    def __init__(self, n_min=10):
        '''n_min: minimum required number of instances in leaf nodes
        '''
        self.n_min = n_min 
    
    def predict(self, x):
        ''' return the prediction for the given 1-D feature vector x
        '''
        # first find the leaf containing the 1-D feature vector x
        node = self.root
        while not hasattr(node, "prediction"):
            j = node.split_index
            if x[j] <= node.threshold:
                node = node.left
            else:
                node = node.right
        # finally, return the leaf's prediction
        return node.prediction
        
    def train(self, features, responses, D_try=None):
        '''
        features: the feature matrix of the training set
        response: the vector of responses
        '''
        N, D = features.shape
        assert(responses.shape[0] == N)

        if D_try is None:
            D_try = int(np.sqrt(D)) # number of features to consider for each split decision
        
        # initialize the root node
        self.root = Node()
        #Only here the attributes of nodes are defined!
        self.root.features  = features
        self.root.responses = responses

        # build the tree
        stack = [self.root]
        while len(stack):
            node = stack.pop()
            active_indices = self.select_active_indices(D, D_try)
            left, right = self.make_split_node(node, active_indices)
            if left is None: # no split found
                self.make_leaf_node(node)
            else:
                stack.append(left)
                stack.append(right)
    
    def make_split_node(self, node, indices):
        '''
        node: the node to be split
        indices: a numpy array of length 'D_try', containing the feature 
                         indices to be considered for the present split
                         
        return: None, None -- if no suitable split has been found, or
                left, right -- the children of the split
        '''
        # all responses equal => no improvement possible by any split
        if np.unique(node.responses).shape[0] == 1:
            return None, None
        
        # find best feature j_min (among 'indices') and best threshold t_min for the split
        l_min = float('inf')  # upper bound for the loss, later the loss of the best split
        j_min, t_min = None, None

        for j in indices:
            thresholds = self.find_thresholds(node, j)

            # compute loss for each threshold
            for t in thresholds:
                loss = self.compute_loss_for_split(node, j, t)

                # remember the best split so far 
                # (the condition is never True when loss = float('inf') )
                if loss < l_min:
                    l_min = loss
                    j_min = j
                    t_min = t

        if j_min is None: # no split found
            return None, None

        # create children for the best split
        left, right = self.make_children(node, j_min, t_min)

        # turn the current 'node' into a split node
        # (store children and split condition)
        node.left = left
        node.right = right
        node.split_index = j_min
        node.threshold = t_min
        
        #raise NotImplementedError("make_split_node(): remove this exception after adding your code above.")
        
        # return the children (to be placed on the stack)
        return left, right
    
    def select_active_indices(self, D, D_try):
        ''' return a 1-D array with D_try randomly selected indices from 0...(D-1).
        '''
        return np.random.randint(low = 0, high = D-1, size = D_try) # your code here
        #raise NotImplementedError("select_active_indices(): remove this exception after adding your code above.")
        
    def find_thresholds(self, node, j):
        ''' return: a 1-D array with all possible thresholds along feature j
        '''
        #Get all values for feature j
        feature_values = node.features[:, j]
        
        #Sort feature values in ascending order
        sorted_feature_values = np.sort(feature_values)
        
        #Find unique values to ensure distinct thresholds
        unique_feature_values = np.unique(sorted_feature_values)
        
        #Calculate the midpoints between adjacent unique values as possible thresholds
        # [:-1], all values except for the last 
        thresholds = (unique_feature_values[:-1] + unique_feature_values[1:]) / 2
        
        return thresholds
        #raise NotImplementedError("find_thresholds(): remove this exception after adding your code above.")
        
    def make_children(self, node, j, t):
        ''' execute the split in feature j at threshold t
        
            return: left, right -- the children of the split, with features and responses
                                   properly assigned according to the split
        '''
        left = Node()
        right = Node()

        indices_left = np.where(node.features[:,j] < t)[0]
        indices_right = np.where(node.features[:,j] > t)[0]
        left.features = node.features[indices_left]
        right.features = node.features[indices_right]
        left.responses = node.responses[indices_left]
        right.responses = node.responses[indices_right]
        
        #raise NotImplementedError("make_children(): remove this exception after adding your code above.")
        
        return left, right
        
    @abstractmethod
    def make_leaf_node(self, node):
        ''' Turn node into a leaf by computing and setting `node.prediction`
        
            (must be implemented in a subclass)
        '''
        raise NotImplementedError("make_leaf_node() must be implemented in a subclass.")
        
    @abstractmethod
    def compute_loss_for_split(self, node, j, t):
        ''' Return the resulting loss when the data are split along feature j at threshold t.
            If the split is not admissible, return float('inf').
        
            (must be implemented in a subclass)
        '''
        raise NotImplementedError("compute_loss_for_split() must be implemented in a subclass.")

# Regression Tree

In [3]:
class RegressionTree(Tree):
    def __init__(self, n_min=10):
        super(RegressionTree, self).__init__(n_min)
        
    def compute_loss_for_split(self, node, j, t):
        # return the loss if we would split the instance along feature j at threshold t
        # or float('inf') if there is no feasible split
        
        indices_left = np.where(node.features[:,j] < t)[0]
        n_left = indices_left.size
        indices_right = np.where(node.features[:,j] > t)[0]
        n_right = indices_right.size
        
        if n_left >= self.n_min and n_right >= self.n_min:
            mean_left = 1/n_left*np.sum(node.responses[indices_left])
            mean_right = 1/n_right*np.sum(node.responses[indices_right])
            loss = np.sum((node.responses[indices_left] - mean_left)**2) + np.sum((node.responses[indices_right] - mean_right)**2)
            return loss
        
        else:
            return float('inf')
        
        raise NotImplementedError("compute_loss_for_split(): remove this exception after adding your code above.")
        
    def make_leaf_node(self, node):
        # turn node into a leaf node by computing `node.prediction`
        # (note: the prediction of a regression tree is a real number)
        n_node = node.responses.size
        node.prediction = 1/n_node * np.sum(node.responses) 
        #raise NotImplementedError("make_leaf_node(): remove this exception after adding your code above.")

# Classification Tree

In [4]:
class ClassificationTree(Tree):
    '''implement classification tree so that it can handle arbitrary many classes
    '''
    
    def __init__(self, classes, n_min=10):
        ''' classes: a 1-D array with the permitted class labels
            n_min: minimum required number of instances in leaf nodes
        '''
        super(ClassificationTree, self).__init__(n_min)
        self.classes = classes
        
    def compute_loss_for_split(self, node, j, t):
        # return the loss if we would split the instance along feature j at threshold t
        # or float('inf') if there is no feasible split
        
        # Calculate probabilities of each class 
        indices_left = np.where(node.features[:,j] < t)[0]
        n_left = indices_left.size
        indices_right = np.where(node.features[:,j] > t)[0]
        n_right = indices_right.size
        
        if n_left >= self.n_min and n_right >= self.n_min:
            
            responses_left = node.responses[indices_left]
            responses_right = node.responses[indices_right]
            
            p_left = np.zeros(self.classes.size)
            p_right = np.zeros(self.classes.size)
            for clas in range(p_left.size):
                
                #Use one-hot encoding for each class
                class_one_hot_left = np.where(responses_left == self.classes[clas], 1, 0)
                p_left[clas] = np.mean(class_one_hot_left)
                class_one_hot_right = np.where(responses_right == self.classes[clas], 1, 0)
                p_right[clas] = np.mean(class_one_hot_right)
                
            # Compute Gini loss
            loss = n_left*(1-np.sum(p_left**2)) + n_right*(1-np.sum(p_right**2))
            return loss
        
        else:
            return float('inf')
        #raise NotImplementedError("compute_loss_for_split(): remove this exception after adding your code above.")
        
    def make_leaf_node(self, node):
        # turn node into a leaf node by computing `node.prediction`
        # (note: the prediction of a classification tree is a class label)
        #Use hard response
        labels, label_counts = np.unique(node.responses, return_counts = True)
        majority_idx = np.argmax(label_counts)
        majority = labels[majority_idx]
        node.prediction = majority
        #raise NotImplementedError("make_leaf_node(): remove this exception after adding your code above.")

# Evaluation of Regression and Classification Tree

In [5]:
# read and prepare the digits data and extract 3s and 9s
digits = load_digits()
print(digits.data.shape, digits.target.shape)

instances = (digits.target == 3) | (digits.target == 9)
features = digits.data[instances, :]
labels = digits.target[instances]

# for regression, we use labels +1 and -1
responses = np.array([1 if l == 3 else -1 for l in labels])

assert(features.shape[0] == labels.shape[0] == responses.shape[0])

(1797, 64) (1797,)


In [6]:
# perform 5-fold cross-validation (see ex01) with responses +1 and -1 (for 3s and 9s)
# using RegressionTree()
# and comment on your results
k_folds = 5
X_train_folds = np.zeros((k_folds, 326, 64))
X_test_folds = np.zeros((k_folds, 37, 64))
y_train_folds = np.zeros((k_folds, 326))
y_test_folds = np.zeros((k_folds, 37))

error_test_rate = np.zeros(k_folds)

predictions = np.zeros((k_folds, 37))

for i in range(k_folds):
    
    X_train_folds[i], X_test_folds[i], y_train_folds[i], y_test_folds[i] = model_selection.train_test_split(
        features, responses, test_size = 0.1, random_state = i)
    
    #Create RegressionTree instance
    regTree = RegressionTree()
    
    #Train tree with data
    regTree.train(X_train_folds[i], y_train_folds[i])
    
    #Predict labels of test data
    for j in range(X_test_folds[i].shape[0]):
        predictions[i][j] = regTree.predict(X_test_folds[i][j])
    
    #Compute error for each fold
    error_test_rate[i] = np.mean(abs(predictions[i]-y_test_folds[i])) *100
    
    #Print errors for each fold
    print('Error rate of the Regression Tree prediction of the {}th fold is {:.2f} %'.format(i+1, error_test_rate[i]))
    
#Print averaged errors for both results
print('\nAveraged error of the Regression Tree prediction is {:.2f} +/- {:.2f} %'.format(error_test_rate.mean(), np.sqrt(error_test_rate.var()/len(error_test_rate))))

Error rate of the Regression Tree prediction of the 1th fold is 29.85 %
Error rate of the Regression Tree prediction of the 2th fold is 42.85 %
Error rate of the Regression Tree prediction of the 3th fold is 22.12 %
Error rate of the Regression Tree prediction of the 4th fold is 14.94 %
Error rate of the Regression Tree prediction of the 5th fold is 31.01 %

Averaged error of the Regression Tree prediction is 28.15 +/- 4.19 %


### The given task of the exercise sheet is usually addressed to classification algorithms. The output of the regression tree, though, returns the likelihood for each digit to occur. The errors are calculated as the mean differences to the true labels (-1, 1) and, thus, do not vanish even though the tendency of the regression tree to predict the right outcome might be right. This is why the errors seem to be quite large.

In [7]:
# perform 5-fold cross-validation with labels 3 and 9
# using ClassificationTree(classes=np.unique(labels))
# and comment on your results
k_folds = 5
X_train_folds = np.zeros((k_folds, 326, 64))
X_test_folds = np.zeros((k_folds, 37, 64))
y_train_folds = np.zeros((k_folds, 326))
y_test_folds = np.zeros((k_folds, 37))

error_test_rate = np.zeros(k_folds)

predictions = np.zeros((k_folds, 37))

for i in range(k_folds):
    
    X_train_folds[i], X_test_folds[i], y_train_folds[i], y_test_folds[i] = model_selection.train_test_split(
        features, labels, test_size = 0.1, random_state = i)
    
    #Create RegressionTree instance
    claTree = ClassificationTree(classes = np.array([3,9]), n_min = 10)
    
    #Train tree with data
    claTree.train(X_train_folds[i], y_train_folds[i])
    
    #Predict labels of test data
    for j in range(X_test_folds[i].shape[0]):
        predictions[i][j] = claTree.predict(X_test_folds[i][j])
    
    #Compute error for each fold
    error_test_rate[i] = np.mean(predictions[i] != y_test_folds[i])*100
    
    #Print errors for each fold
    print('Error rate of the Classification Tree prediction of the {}th fold is {:.2f} %'.format(i+1, error_test_rate[i]))
    
#Print averaged errors for both results
print('\nAveraged error of the Classification Tree prediction is {:.2f} +/- {:.2f} %'.format(error_test_rate.mean(), np.sqrt(error_test_rate.var()/len(error_test_rate))))

Error rate of the Classification Tree prediction of the 1th fold is 16.22 %
Error rate of the Classification Tree prediction of the 2th fold is 18.92 %
Error rate of the Classification Tree prediction of the 3th fold is 10.81 %
Error rate of the Classification Tree prediction of the 4th fold is 10.81 %
Error rate of the Classification Tree prediction of the 5th fold is 21.62 %

Averaged error of the Classification Tree prediction is 15.68 +/- 1.93 %


### For the single classification tree, the errors still seem to be quite large. Over-fitting is a major problem as features of each concrete training set are weighted very strongly in leave nodes. A forest of classification trees will lower the error by strengthening the underlying statistics of the predictions and reducing over-fitting.

# Regression and Classification Forest

In [8]:
def bootstrap_sampling(features, responses):
    '''return a bootstrap sample of features and responses
    '''
    
    #Create random indices with replacement via numpy
    bootstrap_idx = np.random.choice(len(responses), len(responses), replace = True)
    
    bootstrap_fea = features[bootstrap_idx]
    bootstrap_res = responses[bootstrap_idx]
    
    return bootstrap_fea, bootstrap_res
    
    #raise NotImplementedError("bootstrap_sampling(): remove this exception after adding your code above.")

#Define own function for one-against-the-rest classifier, as some extra work is necessary to allow for balanced +/-1 output of responses
def bootstrap_sampling_reg(features, responses):
    '''return a bootstrap sample of features and responses
    '''
    
    #For one-against-the-rest regression implement probability vector in order to balance bootstrapped array equally
    
    N_p1 = responses[responses == 1].size
    N_m1 = responses[responses == -1].size
    N = responses.size
    assert(N_p1 + N_m1 == N)
    
    #Set hyperparameter to equally distributed data for training
    p = 0.5
    P = np.where(responses == 1, p/N_p1, p/N_m1)
    
    #Create random indices with replacement via numpy
    bootstrap_idx = np.random.choice(len(responses), len(responses), replace = True, p = P)
    
    bootstrap_fea = features[bootstrap_idx]
    bootstrap_res = responses[bootstrap_idx]
    
    return bootstrap_fea, bootstrap_res
    
    #raise NotImplementedError("bootstrap_sampling(): remove this exception after adding your code above.")


In [9]:
class RegressionForest():
    def __init__(self, n_trees, n_min=10):
        # create ensemble
        self.trees = [RegressionTree(n_min) for i in range(n_trees)]
    
    def train(self, features, responses):
        for tree in self.trees:
            bootstrap_features, bootstrap_responses = bootstrap_sampling_reg(features, responses)
            tree.train(bootstrap_features, bootstrap_responses)

    def predict(self, x):
        # compute the response of the ensemble from the individual responses and return it
        prediction = 0
        for tree in self.trees:
            prediction += tree.predict(x)
            
        prediction = prediction/len(self.trees)
            
        return prediction
        #raise NotImplementedError("predict(): remove this exception after adding your code above.")

In [10]:
class ClassificationForest():
    def __init__(self, n_trees, classes, n_min=1):
        self.trees = [ClassificationTree(classes, n_min) for i in range(n_trees)]
        self.classes = classes
    
    def train(self, features, responses):
        for tree in self.trees:
            boostrap_features, bootstrap_responses = bootstrap_sampling(features, responses)
            tree.train(boostrap_features, bootstrap_responses)

    def predict(self, x):
        # compute the response of the ensemble from the individual responses and return it
        # Use hard classification
        predictions = []
        for tree in self.trees:
            predictions.append(tree.predict(x))
            
        predictions = np.array(predictions)
        labels, label_counts = np.unique(predictions, return_counts = True)
        majority_idx = np.argmax(label_counts)
        majority = labels[majority_idx]
        return majority 
        #raise NotImplementedError("predict(): remove this exception after adding your code above.")


# Evaluation of Regression and Decision Forest

In [11]:
# perform 5-fold cross-validation (see ex01) with responses +1 and -1 (for 3s and 9s)
# using RegressionForest(n_trees=10)
# and comment on your results
k_folds = 5
X_train_folds = np.zeros((k_folds, 326, 64))
X_test_folds = np.zeros((k_folds, 37, 64))
y_train_folds = np.zeros((k_folds, 326))
y_test_folds = np.zeros((k_folds, 37))

error_test_rate = np.zeros(k_folds)

predictions = np.zeros((k_folds, 37))

for i in range(k_folds):
    
    X_train_folds[i], X_test_folds[i], y_train_folds[i], y_test_folds[i] = model_selection.train_test_split(
        features, responses, test_size = 0.1, random_state = i)
    
    #Create RegressionTree instance
    regForest = RegressionForest(n_trees = 10)
    
    #Train tree with data
    regForest.train(X_train_folds[i], y_train_folds[i])
    
    #Predict labels of test data
    for j in range(X_test_folds[i].shape[0]):
        predictions[i][j] = regForest.predict(X_test_folds[i][j])
    
    #Compute error for each fold
    error_test_rate[i] = np.mean(abs(predictions[i]-y_test_folds[i])) *100
    
    #Print errors for each fold
    print('Error rate of the Regression Forest prediction of the {}th fold is {:.2f} %'.format(i+1, error_test_rate[i]))
    
#Print averaged errors for both results
print('\nAveraged error of the Regression Forest prediction is {:.2f} +/- {:.2f} %'.format(error_test_rate.mean(), np.sqrt(error_test_rate.var()/len(error_test_rate))))

Error rate of the Regression Forest prediction of the 1th fold is 39.92 %
Error rate of the Regression Forest prediction of the 2th fold is 40.73 %
Error rate of the Regression Forest prediction of the 3th fold is 28.99 %
Error rate of the Regression Forest prediction of the 4th fold is 34.07 %
Error rate of the Regression Forest prediction of the 5th fold is 32.22 %

Averaged error of the Regression Forest prediction is 35.18 +/- 2.02 %


### The errors remain also for the regression forest quite large, however, this is still because of the method used to compute the errors. The actual likelihoods, though, are more precise when using multiple trees and overfitting is surpressed. 

In [12]:
# perform 5-fold cross-validation with labels 3 and 9
# using DecisionForest(n_trees=10, classes=np.unique(labels))
# and comment on your results
k_folds = 5
X_train_folds = np.zeros((k_folds, 326, 64))
X_test_folds = np.zeros((k_folds, 37, 64))
y_train_folds = np.zeros((k_folds, 326))
y_test_folds = np.zeros((k_folds, 37))

error_test_rate = np.zeros(k_folds)

predictions = np.zeros((k_folds, 37))

for i in range(k_folds):
    
    X_train_folds[i], X_test_folds[i], y_train_folds[i], y_test_folds[i] = model_selection.train_test_split(
        features, labels, test_size = 0.1, random_state = i)
    
    #Create ClassificationTree instance
    claForest = ClassificationForest(classes = np.array([3,9]), n_trees = 10)
    
    #Train tree with data
    claForest.train(X_train_folds[i], y_train_folds[i])
    
    #Predict labels of test data
    for j in range(X_test_folds[i].shape[0]):
        predictions[i][j] = claForest.predict(X_test_folds[i][j])
    
    #Compute error for each fold
    error_test_rate[i] = np.mean(predictions[i] != y_test_folds[i])*100
    
    #Print errors for each fold
    print('Error rate of the Classification Forest prediction of the {}th fold is {:.2f} %'.format(i+1, error_test_rate[i]))
    
#Print averaged errors for both results
print('\nAveraged error of the Classification Forest prediction is {:.2f} +/- {:.2f} %'.format(error_test_rate.mean(), np.sqrt(error_test_rate.var()/len(error_test_rate))))

Error rate of the Classification Forest prediction of the 1th fold is 2.70 %
Error rate of the Classification Forest prediction of the 2th fold is 8.11 %
Error rate of the Classification Forest prediction of the 3th fold is 5.41 %
Error rate of the Classification Forest prediction of the 4th fold is 5.41 %
Error rate of the Classification Forest prediction of the 5th fold is 5.41 %

Averaged error of the Classification Forest prediction is 5.41 +/- 0.76 %


### Using a Classification Forest yields very good results. Using multiple trees and the underlying statistics causes the errors to significantly decrease and prevent overfitting. 

# Multi-class Classification Forest

In [None]:
# Train DecisionForest(n_trees=10, classes=np.unique(digits.target))
# for all 10 digits simultaneously.
# Compute and plot the confusion matrix after 5-fold cross-validation and comment on your results.
features_all = digits.data
labels_all = digits.target

k_folds = 5
X_train_folds = np.zeros((k_folds, 1617, 64))
X_test_folds = np.zeros((k_folds, 180, 64))
y_train_folds = np.zeros((k_folds, 1617))
y_test_folds = np.zeros((k_folds, 180))

error_test_rate = np.zeros(k_folds)

predictions = np.zeros((k_folds, 180))

for i in range(k_folds):
    
    X_train_folds[i], X_test_folds[i], y_train_folds[i], y_test_folds[i] = model_selection.train_test_split(
        features_all, labels_all, test_size = 0.1, random_state = i)
    
    #Create ClassificationTree instance
    claForest = ClassificationForest(classes = np.unique(digits.target), n_trees = 10)
    
    #Train tree with data
    claForest.train(X_train_folds[i], y_train_folds[i])
    
    #Predict labels of test data
    for j in range(X_test_folds[i].shape[0]):
        predictions[i][j] = claForest.predict(X_test_folds[i][j])
    
    #Compute error for each fold
    error_test_rate[i] = np.mean(predictions[i] != y_test_folds[i])*100
    
    #Print errors for each fold
    print('Error rate of the Classification Forest prediction (entire dataset) of the {}th fold is {:.2f} %'.format(i+1, error_test_rate[i]))
    
#Print averaged errors for both results
print('\nAveraged error of the Classification Forest prediction (entire dataset) is {:.2f} +/- {:.2f} %'.format(error_test_rate.mean(), np.sqrt(error_test_rate.var()/len(error_test_rate))))

#Plot Confusion matrix for each fold
for i in range(k_folds):
    
    cm = confusion_matrix(y_test_folds[i], predictions[i])
    fig, ax = plt.subplots()
    plt.title('Multi-class Classification Forest Confusion Matrix, Fold {}'.format(i+1))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    cbar = ax.figure.colorbar(im, ax=ax)

Error rate of the Classification Forest prediction (entire dataset) of the 1th fold is 6.11 %
Error rate of the Classification Forest prediction (entire dataset) of the 2th fold is 3.89 %
Error rate of the Classification Forest prediction (entire dataset) of the 3th fold is 6.67 %


### Also for a multi-class classification problem, the results with the classification forest are very satisfying. The non-diagonal elements of the confusion matrix are significantly lower than its diagonal for each fold of the cross-validation. The error remains in the same range as in the previous task.

# One-against-the-rest classification with RegressionForest

In [None]:
# Train ten one-against-the-rest regression forests for the 10 digits.
# Make sure that all training sets are balanced between the current digit and the rest.
# Assign test instances to the digit with highest score, 
# or to "unknown" if all scores are negative.
# Compute and plot the confusion matrix after 5-fold cross-validation and comment on your results.

features_all = digits.data
labels_all = digits.target

k_folds = 5
X_train_folds = np.zeros((k_folds, 1617, 64))
X_test_folds = np.zeros((k_folds, 180, 64))
y_train_folds = np.zeros((k_folds, 1617))
y_test_folds = np.zeros((k_folds, 180))

error_test_rate = np.zeros(k_folds)

predictions = np.zeros((k_folds, 10, 180))

for i in range(k_folds):
    
    X_train_folds[i], X_test_folds[i], y_train_folds[i], y_test_folds[i] = model_selection.train_test_split(
        features_all, labels_all, test_size = 0.1, random_state = i)

    resp_digit = np.zeros((len(np.unique(digits.target)), 1617))
    regForests_OAR = []
    for digit in np.unique(digits.target):
        
        #Convert responses with current digit to 1 and -1 otherwise
        resp_digit[digit] = np.array([1 if l == digit else -1 for l in y_train_folds[i]])
        
        regForests_OAR.append(RegressionForest(n_trees = 10))
        
        #Train tree with data
        regForests_OAR[digit].train(X_train_folds[i], resp_digit[digit])
        
        for j in range(X_test_folds[i].shape[0]):
                predictions[i][digit][j] = regForests_OAR[digit].predict(X_test_folds[i][j])
                
#Find digit with highest score 
    
# Set all negative scores to nan
idx, maxi = np.argmax(predictions, axis = 1), np.max(predictions, axis = 1)
predictions_final = np.where(maxi < 0, np.nan, idx)

#Compute error and plot confusion matrix
for i in range(k_folds):
    error_test_rate[i] = np.mean(predictions_final[i] != y_test_folds[i])*100
    
    #Print errors for each fold
    print('Error rate for the one-against-the-rest RegressionForest prediction (entire dataset) of the {}th fold is {:.2f} %'.format(i+1, error_test_rate[i]))
    predictions_final[i][np.isnan(predictions_final[i])] = -1
    cm = confusion_matrix(y_test_folds[i], predictions_final[i])
    fig, ax = plt.subplots()
    plt.title('One-against-the-rest classification via RegressionForest - Confusion Matrix, Fold {}'.format(i+1))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    cbar = ax.figure.colorbar(im, ax=ax)
    
#Print averaged errors for both results
print('\nAveraged error for the one-against-the-rest RegressionForest prediction (entire dataset) is {:.2f} +/- {:.2f} %'.format(error_test_rate.mean(), np.sqrt(error_test_rate.var()/len(error_test_rate))))

### One-against-the-rest classification with RegressionForest also delivers more or less equally reliable results. For each fold the confusion matrix is almost diagonal (with a few less significant exceptions). Considering the complexity of the code though, it was more straight-forward to use ClassificationForest and its corresponding loss function instead of computing the results for each digit individually and putting the results together at the end.