# Label Shuffle Experiment for Progressive Learning

Most state of the art algorithms are unable to transfer knowledge forward, and none are able to transfer knowledge backward, both key capabilities in progressive learning. This inability to transfer has been identified as one of the key obstacles limiting the capabilities of artificial intelligence. 

Representation ensembling algorithms sequentially learn a representation for each task, and ensemble both old and new representations for all future decisions. Two algorithms for progressive learning are proposed. Lifelong Learning Forest `L2F` uses decision forests as the transformers, specifically a variant of decision forests called ‘Uncertainty Forest’ `UF`. To obtain consistent estimates of the posteriors, each tree is ‘honest’, meaning that it uses each data point for either learning the transformer or voter, but not both. Lifelong Learning Network `L2N` uses deep networks as the transformers. 

### Import necessary packages and modules

In [None]:
import sys
import random
import numpy as np
import tensorflow as tf
import keras
from keras import layers
from joblib import Parallel, delayed
from multiprocessing import Pool
import time
from itertools import product
import pandas as pd
import pickle
from sklearn.model_selection import StratifiedKFold
from math import log2, ceil 
import seaborn as sns
import matplotlib.pyplot as plt

### Import models from proglearn
Append the path to your local copy of proglearn to `sys.path`

Important: Change internal calculation of k in voter.py to `16 * int(np.log2(len(X)))`

In [None]:
sys.path.append("../proglearn/")
from forest import LifelongClassificationForest 
from network import LifelongClassificationNetwork

### Load CIFAR100 data 
First we load the CIFAR100 data from keras.datasets and store it in `data_x`

In [None]:
# Load the CIFAR-100 train/test arrays shipped with keras.datasets.
(X_train, y_train), (X_test, y_test) = keras.datasets.cifar100.load_data()
# Pool the predefined train and test images into one array; the experiment
# builds its own train/test splits from the pooled data in cross_val_data.
data_x = np.concatenate([X_train, X_test])

### Functions
We define the functions for training the model

`cross_val_data`: splits the data and class labels to training and test 

`run_parallel_exp`: is a wrapper function for LF_experiment, and configures GPUs for training

`LF_experiment`: Function that creates the progressive learner model and trains it. In `file_to_save`, specify the directory for the pickle files. 

For the `DNN`, `network` stores the keras sequential model for the neural network. The architecture consists of 5 convolutional layers of kernel size 3x3, with relu activation. Each activation is followed by a batch normalization layer. After the convolutional layers, there are 3 fully connected layers, and the last layer has a softmax activation. 

In [None]:
def unpickle(file):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    file : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was pickled into ``file``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while loading, so only open
    files that come from a trusted source (e.g. results written by this
    notebook).
    """
    with open(file, 'rb') as fo:
        # encoding='bytes' only matters for pickles written by Python 2.
        contents = pickle.load(fo, encoding='bytes')
    return contents

In [None]:
def cross_val_data(data_x, data_y, num_points_per_task, total_task=10, shift=1):
    """Build the label-shuffled train/test arrays for one cross-validation fold.

    Classes are grouped ten per task (task ``t`` uses the classes at positions
    ``10*t .. 10*t+9`` of ``np.unique(data_y)``). Task 0 keeps its true
    labels; every later task gets labels drawn uniformly at random from
    ``[0, total_task)``.

    Parameters
    ----------
    data_x : numpy.ndarray
        Samples, indexed along the first axis.
    data_y : numpy.ndarray
        1-D array of class labels aligned with ``data_x``.
    num_points_per_task : int
        Number of training points per task and batch; must be in 1..5000.
    total_task : int, optional
        Number of tasks to build.
    shift : int, optional
        1-based fold number; the per-class sample order is rotated by
        ``(shift - 1) * 100`` positions before slicing.

    Returns
    -------
    train_x, train_y, test_x, test_y : numpy.ndarray
        Concatenated in task -> batch -> class order.

    Raises
    ------
    ValueError
        If ``num_points_per_task`` is outside 1..5000, so no batch could be
        built.
    """
    class_indices = [np.where(data_y == label)[0] for label in np.unique(data_y)]

    batch_per_task = 5000 // num_points_per_task
    if batch_per_task < 1:
        raise ValueError(
            "num_points_per_task must be between 1 and 5000, got {}".format(
                num_points_per_task
            )
        )
    sample_per_class = num_points_per_task // total_task
    roll_by = (shift - 1) * 100

    # Collect the pieces and concatenate once at the end instead of growing
    # four arrays chunk by chunk (which re-copies everything on every step).
    train_x_parts, train_y_parts = [], []
    test_x_parts, test_y_parts = [], []

    for task in range(total_task):
        # The rotation depends only on the class, so do it once per task.
        rolled_by_class = [
            np.roll(class_indices[class_no], roll_by)
            for class_no in range(task * 10, (task + 1) * 10)
        ]
        for batch in range(batch_per_task):
            train_slice = slice(batch * sample_per_class, (batch + 1) * sample_per_class)
            # Test samples are taken after the first num_points_per_task
            # positions of each class, total_task samples per batch.
            test_start = batch * total_task + num_points_per_task
            test_slice = slice(test_start, test_start + total_task)

            for rolled in rolled_by_class:
                train_idx = rolled[train_slice]
                test_idx = rolled[test_slice]

                # Integer-array indexing already returns copies.
                train_x_parts.append(data_x[train_idx])
                test_x_parts.append(data_x[test_idx])

                if task == 0:
                    # Every chunk of the first task keeps its true labels
                    # (previously the very first chunk was given random ones).
                    train_y_parts.append(data_y[train_idx])
                    test_y_parts.append(data_y[test_idx])
                else:
                    # Draw exactly as many random labels as there are samples
                    # so x and y stay aligned even if a class runs short.
                    train_y_parts.append(
                        np.random.randint(low=0, high=total_task, size=len(train_idx))
                    )
                    test_y_parts.append(
                        np.random.randint(low=0, high=total_task, size=len(test_idx))
                    )

    train_x = np.concatenate(train_x_parts, axis=0)
    train_y = np.concatenate(train_y_parts, axis=0)
    test_x = np.concatenate(test_x_parts, axis=0)
    test_y = np.concatenate(test_y_parts, axis=0)
    return train_x, train_y, test_x, test_y

In [None]:
def run_parallel_exp(data_x, data_y, n_trees, model, num_points_per_task, slot=0, shift=1):
    """Build the data for one fold and run ``LF_experiment`` on it.

    For ``model == "dnn"`` a TensorFlow session with GPU memory growth
    enabled is created first and the experiment runs under the device
    ``'/gpu:<shift % 4>'``; any other model runs the experiment directly.
    The experiment is always seeded with ``acorn=12345``. Returns ``None``.
    """
    train_x, train_y, test_x, test_y = cross_val_data(
        data_x, data_y, num_points_per_task, shift=shift
    )

    def launch():
        # Single place that forwards the fold to the experiment.
        LF_experiment(
            train_x, train_y, test_x, test_y,
            n_trees, shift, slot, model, num_points_per_task,
            acorn=12345,
        )

    if model != "dnn":
        launch()
        return

    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    # The session is not used directly below, but it stays bound to a local
    # name for the duration of the experiment.
    sess = tf.Session(config=gpu_config)
    with tf.device('/gpu:' + str(shift % 4)):
        launch()

In [1]:
def _build_dnn(input_shape):
    """Build the untrained Keras CNN used as the transformer for the "dnn" model.

    Five 3x3 ReLU convolutions (the last four with stride 2 and "same"
    padding) feed two 2000-unit ReLU dense layers and a 10-way softmax
    output, with batch normalization between most layers.
    """
    network = keras.Sequential()
    network.add(layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
    network.add(layers.BatchNormalization())
    for n_filters in (32, 64, 128):
        network.add(layers.Conv2D(filters=n_filters, kernel_size=(3, 3), strides=2, padding="same", activation='relu'))
        network.add(layers.BatchNormalization())
    # NOTE(review): 254 looks like a typo for 256 (16/32/64/128/...); it is
    # left unchanged so results stay comparable with earlier runs.
    network.add(layers.Conv2D(filters=254, kernel_size=(3, 3), strides=2, padding="same", activation='relu'))

    network.add(layers.Flatten())
    network.add(layers.BatchNormalization())
    network.add(layers.Dense(2000, activation='relu'))
    network.add(layers.BatchNormalization())
    network.add(layers.Dense(2000, activation='relu'))
    network.add(layers.BatchNormalization())
    network.add(layers.Dense(units=10, activation='softmax'))
    return network


def LF_experiment(train_x, train_y, test_x, test_y, ntrees, shift, slot, model, num_points_per_task, acorn=None):
    """Train a progressive learner on 10 tasks and record task-1 accuracy.

    After each new task is added, the learner predicts the first 1000 test
    samples with ``task_id=0`` and the accuracy against ``test_y[:1000]`` is
    stored together with the training and inference wall-clock times. The
    results are pickled as a DataFrame to
    ``./result/<model><ntrees>_<shift>_<slot>.pickle``.

    Parameters
    ----------
    train_x, train_y : array-like
        Training data; task ``i`` is read from rows
        ``i*5000 + slot*num_points_per_task`` onwards.
    test_x, test_y : array-like
        Test data; only the first 1000 rows are used.
    ntrees : int
        Number of estimators for the "uf" model; also part of the output
        file name for both models.
    shift : int
        Fold number, stored in the results and used in the file name.
    slot : int
        Which block of ``num_points_per_task`` rows to use within each task.
    model : {"dnn", "uf"}
        Type of progressive learner to build.
    num_points_per_task : int
        Number of training rows given to the learner per task.
    acorn : int, optional
        If given, NumPy's global random state is re-seeded with this value
        before every task is added.

    Raises
    ------
    ValueError
        If ``model`` is neither "dnn" nor "uf".
    """
    import os

    # Fail before any work is done instead of hitting an AttributeError on
    # a None learner inside the training loop.
    if model not in ("dnn", "uf"):
        raise ValueError('model must be "dnn" or "uf", got {!r}'.format(model))

    n_tasks = 10
    rows_per_task = 5000   # offset between consecutive tasks in train_x / train_y
    n_test_rows = 1000     # leading test rows used to score the first task

    # Make sure the output directory exists up front so a long training run
    # is not lost when the results are written at the very end.
    file_to_save = './result/'+model+str(ntrees)+'_'+str(shift)+'_'+str(slot)+'.pickle'
    os.makedirs(os.path.dirname(file_to_save), exist_ok=True)

    if model == "dnn":
        network = _build_dnn(np.shape(train_x)[1:])
        progressive_learner = LifelongClassificationNetwork(network=network)
    else:
        progressive_learner = LifelongClassificationForest(n_estimators=ntrees)

    shifts = []
    accuracies_across_tasks = []
    train_times_across_tasks = []
    inference_times_across_tasks = []

    for task_ii in range(n_tasks):
        print("Starting Task {} For Fold {} For Slot {}".format(task_ii, shift, slot))
        if acorn is not None:
            np.random.seed(acorn)

        start = task_ii * rows_per_task + slot * num_points_per_task
        stop = start + num_points_per_task

        train_start_time = time.time()
        progressive_learner.add_task(X=train_x[start:stop], y=train_y[start:stop])
        train_end_time = time.time()

        # Always evaluate on the first task to track backward transfer.
        inference_start_time = time.time()
        llf_task = progressive_learner.predict(test_x[:n_test_rows], task_id=0)
        inference_end_time = time.time()

        accuracies_across_tasks.append(np.mean(llf_task == test_y[:n_test_rows]))
        shifts.append(shift)
        train_times_across_tasks.append(train_end_time - train_start_time)
        inference_times_across_tasks.append(inference_end_time - inference_start_time)

        print("Accuracy Across Tasks: {}".format(accuracies_across_tasks))
        print("Train Times Across Tasks: {}".format(train_times_across_tasks))
        print("Inference Times Across Tasks: {}".format(inference_times_across_tasks))

    # One row per number of tasks seen so far.
    df = pd.DataFrame({
        'data_fold': shifts,
        'task': list(range(1, n_tasks + 1)),
        'task_1_accuracy': accuracies_across_tasks,
        'train_times': train_times_across_tasks,
        'inference_times': inference_times_across_tasks,
    })

    with open(file_to_save, 'wb') as f:
        pickle.dump(df, f)

### Define hyperparameters for the model and run model
For `model`, choose between dnn (deep neural network) and uf (uncertainty forest) 

`UF`: The default parameters are `n_trees=10`, `shift=7`.

`DNN`: The default parameter is `shift=7`.

`n_trees` specifies the number of transformers in the uncertainty forest. Running the cell below will train the model, based on the parameters specified.

In [None]:
#Define which type of model here
model = "dnn"
num_points_per_task = 500

#Data preprocessing
if model == "uf":
    # The forest gets each sample as one flat vector (axes 1-3 collapsed).
    data_x = data_x.reshape((data_x.shape[0], data_x.shape[1] * data_x.shape[2] * data_x.shape[3]))
data_y = np.concatenate([y_train, y_test])
# Keep only the first column so the labels become a 1-D array.
data_y = data_y[:, 0]

# One slot per block of num_points_per_task rows inside a 5000-row task.
slot_fold = range(int(5000 // num_points_per_task))

if model == "uf":
    # Run every (n_trees, shift, slot) combination for shifts 1..6 through joblib.
    shift_fold = range(1,7,1)
    n_trees=[10]
    iterable = product(n_trees,shift_fold,slot_fold)
    Parallel(n_jobs=-2,verbose=1)(
        delayed(run_parallel_exp)(
                data_x, data_y, ntree, model, num_points_per_task, slot=slot, shift=shift
                ) for ntree,shift,slot in iterable
                )
elif model == "dnn":
    
    for slot in slot_fold:
        # Wrapper taking only `shift` so it can be handed to Pool.map; it reads
        # the current `slot` from the enclosing loop. n_trees is passed as 0
        # for the network (it only shows up in the result file name).
        def perform_shift(shift):
            return run_parallel_exp(data_x, data_y, 0, model, num_points_per_task, slot=slot, shift=shift)
        
        # Shifts 1..6 are run in two stages of at most four parallel workers;
        # run_parallel_exp places each shift on device '/gpu:<shift % 4>'.
        # NOTE(review): presumably sized for a 4-GPU machine -- confirm.
        stage_1_shifts = range(1, 5)
        with Pool(4) as p:
            p.map(perform_shift, stage_1_shifts) 
            
        stage_2_shifts = range(5, 7)
        with Pool(4) as p:
            p.map(perform_shift, stage_2_shifts) 

### Functions
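Functions for calculating the backward transfer efficiency. The backward transfer efficiency of $f_n$ for task $t$ given $n$ samples is $BTE^t(f_n) := \mathbb{E}[R^t(f_n^{<t})/R^t(f_n)]$, where $R^t$ is the risk (generalization error) on task $t$, $f_n^{<t}$ is the learner trained only on the data available up to and including task $t$, and $f_n$ is the learner trained on all $n$ samples. In the code below this is the task-1 error after seeing only task 1 divided by the task-1 error after seeing each later task.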

We say an algorithm (positive) backward transfers for task $t$ if and only if $BTE^t(f_n) > 1$, or equivalently if $\log BTE^t(f_n) > 0$. In other words, if $BTE^t(f_n) > 1$, then the algorithm has used data associated with new tasks to improve performance on previous tasks.

In [None]:
def get_bte(err, total_task=10):
    """Compute the backward transfer efficiency after each task.

    Parameters
    ----------
    err : sequence of float
        Error on the first task, where ``err[i]`` is measured after
        ``i + 1`` tasks have been seen. Must hold at least ``total_task``
        entries.
    total_task : int, optional
        Number of leading entries of ``err`` to evaluate.

    Returns
    -------
    list of float
        ``err[0] / err[i]`` for ``i`` in ``range(total_task)``; values
        above 1 mean later tasks reduced the error on the first task.

    Raises
    ------
    IndexError
        If ``err`` has fewer than ``total_task`` entries.
    """
    return [err[0] / err[i] for i in range(total_task)]

### Plotting
Run cell to generate plot of transfer efficiency

In `filename`, point to the directory where the pickle files were written during training

In [None]:
# Number of slots and shifts (folds) whose result files are averaged.
slots = 1
shifts = 6
alg_name = ['L2N','L2F']

reps = slots*shifts
# One row per algorithm, one column per number of tasks seen (1..10).
btes = np.zeros((len(alg_name),10),dtype=float)

for alg_no,alg in enumerate(alg_name):
    # One BTE curve per (slot, shift) repetition.
    bte_tmp = [[] for _ in range(reps)]

    count = 0   
    for slot in range(slots):
        for shift in range(shifts):
            # File names follow LF_experiment's pattern
            # <model><ntrees>_<shift>_<slot>.pickle, with 1-based shifts.
            if alg_no==0:
                filename = './result/dnn0_'+str(shift+1)+'_'+str(slot)+'.pickle'
            elif alg_no==1:
                filename = './result/uf10_'+str(shift+1)+'_'+str(slot)+'.pickle'
            else:
                filename = 'benchmarking_algorthms_result/'+alg+'_'+str(shift+1)+'_'+str(slot)+'.pickle'

            multitask_df = unpickle(filename)

            # Task-1 error (1 - accuracy) after each of the 10 tasks, in task order.
            err = []

            for ii in range(10):
                err.extend(
                1 - np.array(
                    multitask_df[multitask_df['task']==ii+1]['task_1_accuracy']
                )
                )
            bte = get_bte(err)
        
            bte_tmp[count].extend(bte)
            count+=1
    
    # Average the curves over all repetitions.
    btes[alg_no] = np.mean(bte_tmp, axis = 0)
    
clr = ["#00008B", "#e41a1c", "#a65628", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#CCCC00"]
c = sns.color_palette(clr, n_colors=len(clr))
fig, ax = plt.subplots(1,1, figsize=(10,8))

# The first two algorithms are drawn with a thicker line than any others.
for alg_no,alg in enumerate(alg_name):
    if alg_no<2:
        ax.plot(np.arange(1,11),btes[alg_no], c=c[alg_no], label=alg_name[alg_no], linewidth=3)
    else:
        ax.plot(np.arange(1,11),btes[alg_no], c=c[alg_no], label=alg_name[alg_no])

ax.set_yticks([.9,.95, 1, 1.05,1.1,1.15,1.2])
ax.set_xticks(np.arange(1,11))
ax.tick_params(labelsize=20)
ax.set_xlabel('Number of tasks seen', fontsize=24)
ax.set_ylabel('Transfer Efficiency', fontsize=24)
ax.set_title("Label Shuffled CIFAR", fontsize = 24)
# Dashed reference line at 1: above it the later tasks lowered the task-1 error.
ax.hlines(1,1,10, colors='grey', linestyles='dashed',linewidth=1.5)
right_side = ax.spines["right"]
right_side.set_visible(False)
top_side = ax.spines["top"]
top_side.set_visible(False)
plt.tight_layout()
# Legend is placed outside the axes, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1,0.5), fontsize=22)