In [4]:
from lifelong_forests import *
import matplotlib.pyplot as plt

n_tasks = 10 # should divide 100 evenly
# K = int(len(class_idx)/n_tasks)

import pickle
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

def homogenize_labels(a):
    u = np.unique(a)
    return np.array([np.where(u == i)[0][0] for i in a])

In [99]:
class LifelongForest:
    """
    Lifelong Forest class.
    """
    def __init__(self, acorn=None):
        """
        Two major things the Forest Class needs access to:
            1) the realized random forest model (self.models_ is a list of forests, 1 for each task)
            2) old data (to update posteriors when a new task is introduced)
        """
        self.models_ = []
        self.X_ = []
        self.y_ = []
        self.n_tasks = 0
        self.n_classes = None
        
        if acorn is not None:
            np.random.seed(acorn)
    
    def new_forest(self, X, y, n_estimators=200, max_samples=0.32,
                        bootstrap=True, max_depth=30, min_samples_leaf=1,
                        acorn=None):
        """
        Input
        X: an array-like object of features; X.shape == (n_samples, n_features)
        y: an array-like object of class labels; len(y) == n_samples
        n_estimators: int; number of trees to construct (default = 200)
        max_samples: float in (0, 1]: number of samples to consider when 
            constructing a new tree (default = 0.32)
        bootstrap: bool; If True then the samples are sampled with replacement
        max_depth: int; maximum depth of a tree
        min_samples_leaf: int; minimum number of samples in a leaf node
        
        Return
        model: a BaggingClassifier fit to X, y
        """
        
        if X.ndim == 1:
            raise ValueError('1d data will cause headaches down the road')
            
        if acorn is not None:
            np.random.seed(acorn)
            
        self.X_.append(X)
        self.y_.append(y.astype(int))
            
        n = X.shape[0]
        K = len(np.unique(y))
        
        if self.n_classes is None:
            self.n_classes = K
        
        max_features = int(np.ceil(np.sqrt(X.shape[1])))

        model=BaggingClassifier(DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                                                         max_features = max_features),
                                  n_estimators=n_estimators,
                                  max_samples=max_samples,
                                  bootstrap=bootstrap)

        model.fit(X, y)
        self.models_.append(model)
        self.n_tasks += 1
        self.n_classes = len(np.unique(y))
        
        return model
    
    
    def _get_leaves(self, estimator):
        """
        Internal function to get leaf node ids of estimator.
        
        Input
        estimator: a fit DecisionTreeClassifier
        
        Return
        leaf_ids: numpy array; an array of leaf node ids
        
        Usage
        _estimate_posteriors(..)
        """
        
        # adapted from https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
        n_nodes = estimator.tree_.node_count
        children_left = estimator.tree_.children_left
        children_right = estimator.tree_.children_right
        feature = estimator.tree_.feature
        threshold = estimator.tree_.threshold

        leaf_ids = []
        stack = [(0, -1)] 
        while len(stack) > 0:
            node_id, parent_depth = stack.pop()

            # If we have a test node
            if (children_left[node_id] != children_right[node_id]):
                stack.append((children_left[node_id], parent_depth + 1))
                stack.append((children_right[node_id], parent_depth + 1))
            else:
                leaf_ids.append(node_id)

        return np.array(leaf_ids)
    
    
    def _finite_sample_correction(self, class_probs, row_sums):
        """
        An internal function for finite sample correction of posterior estimation.
        
        Input
        class_probs: numpy array; array of posteriors to correct
        row_sums: numpy array; array of partition counts
        
        Output
        class_probs: numpy array; finite sample corrected posteriors
        
        Usage
        _estimate_posteriors(..)
        
        """
    
        where_0 = np.argwhere(class_probs == 0)
        for elem in where_0:
            class_probs[elem[0], elem[1]] = 1 / (2 * row_sums[elem[0], None])
        where_1 = np.argwhere(class_probs == 1)
        for elem in where_1:
            class_probs[elem[0], elem[1]] = 1 - 1 / (2 * row_sums[elem[0], None])
    
        return class_probs
    
    
    def _estimate_posteriors(self, test, representation=0, decider=0, subsample=1, acorn=None):
        """
        An internal function to estimate the posteriors.
        
        Input
        task_number: int; indicates which model in self.model_ to use
        test: array-like; test observation
        in_task: bool; True if test is an in-task observation(s)
        subsample: float in (0, 1]; proportion of out-of-task samples to use to
            estimate posteriors
            
        Return
        probs: numpy array; probs[i, k] is the probability of observation i
            being class k
            
        Usage
        predict(..)
        """
        
        if acorn is not None:
            acorn = np.random.seed(acorn)
            
        if representation==decider:
            in_task=True
        else:
            in_task=False
            
        train = self.X_[decider]
        y = self.y_[decider]
            
        model = self.models_[representation]

        n, d = train.shape
        
        if test.ndim > 1:
            m, d_ = test.shape
        else:
            m = len(test)
            d_ = 1

        class_counts = np.zeros((m, model.n_classes_))
        for idx, tree in enumerate(model):
            # get out of bag indicies
            
           
            if in_task:
                sampled_indices = model.estimators_samples_[idx]
                prob_indices = np.delete(range(n), sampled_indices)
            else:
                prob_indices = np.random.choice(range(n), size=int(subsample*n), replace=False)

            leaf_nodes = self._get_leaves(tree)
            unique_leaf_nodes = np.unique(leaf_nodes)

            # get all node counts
            node_counts = tree.tree_.n_node_samples
            # get probs for eval samples
            posterior_class_counts = np.zeros((len(unique_leaf_nodes), model.n_classes_))

            for prob_index in prob_indices:
                temp_node = tree.apply(train[prob_index].reshape(1, -1)).item()
                posterior_class_counts[np.where(unique_leaf_nodes == temp_node)[0][0], y[prob_index]] += 1

            # total number of points in a node
            row_sums = posterior_class_counts.sum(axis=1)

            # no divide by zero
            row_sums[row_sums == 0] = 1

            # posteriors
            class_probs = (posterior_class_counts / row_sums[:, None])
            # posteriors with finite sampling correction

            class_probs = self._finite_sample_correction(class_probs, row_sums)

            # posteriors as a list
            class_probs.tolist()

            partition_counts = np.asarray([node_counts[np.where(unique_leaf_nodes == x)[0][0]] for x in tree.apply(test)])
            # get probability for out of bag samples
            eval_class_probs = [class_probs[np.where(unique_leaf_nodes == x)[0][0]] for x in tree.apply(test)]
            eval_class_probs = np.array(eval_class_probs)
            # find total elements for out of bag samples
            elems = np.multiply(eval_class_probs, partition_counts[:, np.newaxis])
            # store counts for each x (repeat fhis for each tree)
            class_counts += elems
        # calculate p(y|X = x) for all x's
        probs = class_counts / class_counts.sum(axis=1, keepdims=True)

        return probs


    def predict(self, test, representation=0, decider='all', subsample=1, acorn=None):
        """
        Predicts the class labels for each sample in test.
        
        Input
        test: array-like; either a 1d array of length n_features
            or a 2d array of shape (m, n_features) 
        task_number: int; task number 
        """
        
        sum_posteriors = np.zeros((test.shape[0], self.n_classes))
        
        if representation is 'all':
            representation = np.arange(self.n_tasks)
        elif isinstance(representation, int):
            representation = np.array([representation])
        elif isinstance(representation, list):
            representation = np.array(representation)
            
        if not isinstance(representation, np.ndarray):
            raise ValueError('bad representation type %s: int, list of ints or numpy arrays only'%(str(type(representation)))
                            )
        else:
            representation = representation.astype(int)
            
        for i, rep in enumerate(representation):
            sum_posteriors += self._estimate_posteriors(test,
                                                        i,
                                                        decider,
                                                        subsample,
                                                        acorn)            
                
        return np.argmax(sum_posteriors, axis=1)

In [100]:
train_file = 'cifar-100-python/train'
unpickled_train = unpickle(train_file)
train_keys = list(unpickled_train.keys())
fine_labels = np.array(unpickled_train[train_keys[2]])

train_data = unpickled_train[list(train_keys)[-1]]
class_idx = [np.where(fine_labels == u)[0] for u in np.unique(fine_labels)]

train_by_task = [np.concatenate(class_idx[i*n_tasks: (i+1)*n_tasks]) for i in range(n_tasks)]

K = int(len(class_idx)/n_tasks)

n_trees = int(np.sqrt(len(class_idx[0])))

test_file = 'cifar-100-python/test'
unpickled_test = unpickle(test_file)
test_keys = list(unpickled_test.keys())
test_labels = np.array(unpickled_test[test_keys[2]])

test_data = unpickled_test[test_keys[-1]]
test_class_idx = [np.where(test_labels == u)[0] for u in np.unique(test_labels)]
test_by_task = [np.concatenate(test_class_idx[i*n_tasks: (i+1)*n_tasks]) for i in range(n_tasks)]

In [101]:
np.random.seed(1)

lifelong_forest = LifelongForest()

n=100
temp_n_tasks=2


for i in range(n_tasks):
    X = train_data[np.concatenate(class_idx[i*n_tasks: (i+1)*n_tasks])]
    labels = homogenize_labels(np.concatenate([n_tasks*i*np.ones(500) + j for j in range(n_tasks*i, n_tasks*(i+1))]))
    if i > 0:
        np.random.shuffle(labels)
    lifelong_forest.new_forest(X, labels)

In [103]:
stl_errors = np.zeros(n_tasks)
homogenized_labels = [homogenize_labels(test_labels[t]) for t in test_by_task[:n_tasks]]
llf_errors = [np.zeros((n_tasks-i)) for i in range(n_tasks)]

In [104]:
subsample=1

for i, test_set in enumerate(tqdm(test_by_task)):
    if i != 0:
        np.random.shuffle(homogenized_labels[i])
    for j in tqdm(range(i, n_tasks)):
        if i == j:
            stl_temp_pred = lifelong_forest.predict(test_data[test_set],
                                                representation=i,
                                                decider=i,
                                                subsample=subsample
                                                )
            stl_errors[i] = np.mean(stl_temp_pred == homogenized_labels[i])
            
            llf_temp_pred = lifelong_forest.predict(test_data[test_set],
                                                   representation=np.arange(i+1),
                                                   decider=i,
                                                   subsample=subsample
                                                   )
            llf_errors[i][i] = np.mean(stl_temp_pred == homogenized_labels[i])
        else:
            llf_temp_pred = lifelong_forest.predict(test_data[test_set],
                                                   representation=np.arange(j+1),
                                                   decider=i,
                                                   subsample=subsample
                                                   )
            llf_errors[i][j] = np.mean(llf_temp_pred == homogenized_labels[i])













  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A












  0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A












 10%|█         | 1/10 [04:22<39:26, 262.99s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












 20%|██        | 2/10 [09:03<35:46, 268.28s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












 30%|███       | 3/10 [16:11<36:53, 316.17s/it][A[A[A[A[A[A[A[A[A[A[A[A[A












 40%|████      | 4/10 [25:55<39:38, 396.42s/it][A[A[A[A[A[A[A[A[A[A[A[A[A

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

fig, ax = plt.subplots(1,1, figsize=(6,6))
c = sns.color_palette('Paired', n_colors=10)
for i in range(n_tasks - 1):
    ns = np.arange(i + 1, n_tasks + 1)
    ax.plot(ns,(stl_errors[i]) /(np.array(llf_errors[i])) , label = 'task %i'%(i + 1), c=c[i])
    
ax.scatter(10, (stl_errors[-1]) / (llf_errors[-1]), c = c[9], label='task 10', s = 5)
    
ax.set_title('Lifelong Forests on "Permuted" CIFAR-10x10', fontsize=20)
ax.set_xlabel('Number of tasks seen', fontsize=18)
ax.set_ylabel('Transfer Efficiency', fontsize=18)
# ax.set_ylim(0.05 - 0.01, 0.5 + 0.01)
# box = ax.get_position()
# ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
# ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.legend(loc='upper left', fontsize=14)
ax.set_yticks([1, 1.1, 1.2, 1.3])
ax.set_xticks(np.arange(1,11))
ax.tick_params(labelsize=14)
ax.grid(axis='x')
plt.tight_layout()