Widen jupyter notebook 


In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from Context_Layer import Context as Ntask

# Graphing
import matplotlib.pyplot as plt
%matplotlib inline

# Seeding
from random import randrange 
import random
random.seed(5)
tf.set_random_seed(5)
tf.enable_eager_execution()

# Datasets
import dataset_8_logic_gates as data
import logic_gate_test

# General
import numpy as np
from tqdm.notebook import tqdm

### Turn off GPU


In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

### Datasets

8 logic gates


In [4]:
all_data = data.all_data

### Neural Network Architecture

In [5]:
# context layer index in neural network
NTASK_LAYER_IDX = 2

# We want a context for each task we want the network to learn
num_task_contexts=len(all_data)

In [6]:
inp = Input(2,)
x = Dense(20, activation='relu')(inp)
x = Ntask(num_task_contexts, hardcoded_contexts=False)(x)
x = Dense(1, activation="sigmoid")(x)

### Model

In [7]:
model = Model(inputs=inp, outputs=x)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 2)]               0         
_________________________________________________________________
dense (Dense)                (None, 20)                60        
_________________________________________________________________
context (Context)            (None, 20)                160       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 21        
Total params: 241
Trainable params: 81
Non-trainable params: 160
_________________________________________________________________


### Loss Function is Binary Crossentropy

In [8]:
def loss_fn(labels, predictions):
    return tf.keras.losses.binary_crossentropy(y_true=labels, y_pred=predictions)

### Optimizer is Adam

In [9]:
optimizer = tf.keras.optimizers.Adam(1e-4)

# Context Switching Functions

In [10]:
def switch_to_better_fitting_context(next_context_idx, model, NTASK_LAYER_IDX, epoch_grads, cur_epoch_context_loss):

    """
    Swith Ntask layer to better fitting context, when provided better fitting context.
    Resets several variables.
    """
    
    # hot_context_idx is the index of the context that is currently "on" within the context layer
    hot_context_idx = next_context_idx
    
    # switch the context within the context layer to be that associated with hot_context_idx
    model.layers[NTASK_LAYER_IDX].set_hot_context( hot_context_idx )
    
    # Reset vars
    epoch_grads.clear()
    cur_epoch_context_loss[hot_context_idx] = 0
    
    return hot_context_idx


In [11]:

def switch_to_next_context(next_context_idx, model, context_idx, hot_context_idx):
    
    """
    Switch Ntask layer hot context to the next context & update hot_context_idx.
    ex: If hot_context_idx := 2 
            func call results in:
                hot_context_idx := 3
                the hot context is now the context at idx 3
    """
    # hot_context_idx of the index of the context that is currently "on" within the context layer
    hot_context_idx = next_context_idx
    
    # switch the context within the context layer to be that associated with hot_context_idx
    model.layers[NTASK_LAYER_IDX].set_hot_context( hot_context_idx )

    return hot_context_idx


In [12]:

def calc_context_loss(gradients, model, ntask_layer_idx_in_model, idx_of_next_layer_bias_gradient, idx_of_next_layer_weights_in_get_weights_call=0):
    """
    IMPORTANT: 
    1) Assumes no use of activation function on Ntask layer
    2) Assumes that the layer following the Ntask layer:
        a) Is a Dense layer
        b) Is using bias 
           — ex: Dense(20, ... , use_bias=True) 
           — note Keras Dense layer uses bias by default if no value is given for use_bias param
    3) Assumes index of the next layer's gradient is known within the gradients list returned from gradient tape in a tape.gradient call
    4) If the above points aren't met, things will break and it may be hard to locate the bugs
    """
    # from the delta rule in neural network math
    delta_at_next_layer = gradients[idx_of_next_layer_bias_gradient]
    transpose_of_weights_at_next_layer = tf.transpose(model.layers[ntask_layer_idx_in_model+1].get_weights()[idx_of_next_layer_weights_in_get_weights_call])
      
    # Calculate delta at ntask layer
    context_delta = np.dot( delta_at_next_layer, transpose_of_weights_at_next_layer ).astype(np.float)
    
    # Calculate Context Error
    # Keras MSE must have both args be arrs of floats, if one or both are arrs of ints, the output will be rounded to an int
    # This is how responsible the context layer was for the loss
    context_loss = tf.keras.losses.mean_squared_error(np.zeros(len(context_delta)), context_delta)

    return context_loss


### Training Functions

In [13]:

def custom_forward_pass(dataset, model, epoch_grads, context_idx, cur_epoch_context_loss, all_epoch_losses):

    """
    This is the training forward pass for an entire epoch
    
    !!!!! Read through this code as it is a nonstandard training forward pass ( different than model.fit() )
    & NOTE that this does not apply the gradients ie. this does not do a weight update/learn
    
    """

    sum_loss = 0
    
    # Tensorflow 2 style training -- info can be found here: https://www.tensorflow.org/guide/effective_tf2 
    # This is similar to model.fit(), however this is a custom training loop -- ie. it does things differently than model.fit()
    # look at each input and label (there are 4 for the logic gates)
    for x, y in dataset:   

        # Get the prediction and loss for the current sample
        # This needs to be under the GradientTape call to extract the gradients a couple of lines below...
        with tf.GradientTape(persistent=True) as tape:            
            predictions = model(x, training=True) # forward pass
            pred_loss = loss_fn(y, predictions)   # get loss

        
        # We will be using a sum of the loss for the epoch before updating weights
        sum_loss += pred_loss
        
        # Extract the gradients for the loss of the current sample
        gradients = tape.gradient(pred_loss, model.trainable_variables)
        
        # We collect the gradients from each sample in the dataset for the epoch
        epoch_grads.append(gradients)

        # How responsbile the context layer was for the loss
        context_loss = calc_context_loss(gradients, 
                                           model, 
                                           context_idx, 
                                           idx_of_next_layer_bias_gradient=3, 
                                           idx_of_next_layer_weights_in_get_weights_call=0)
        
        # Accumulating the context loss
        cur_epoch_context_loss[hot_context_idx] += context_loss

    # avg loss for epoch
    avg_loss_for_epoch = sum_loss / len(dataset)

    # Save the epoch losses
    all_epoch_losses.append(avg_loss_for_epoch)

    
    
    return  


In [14]:

def apply_grads_and_update_conditional_vars_etc(moving_avg_context_loss, hot_context_idx, cur_epoch_context_loss, epoch_grads, optimizer, model):


    """
    This is where a weight update/learning/applying gradients occurs
    
    Some conditional variables are also updated
    
    This is called when it has been decided that the current context is good enough to learn on
    
    """
    
    # Update moving avg context loss
    moving_avg_context_loss[hot_context_idx] = (moving_avg_context_loss[hot_context_idx] + cur_epoch_context_loss[hot_context_idx]) / 2.0
    
    # Reset the diff_errs
    diff_errs = [0 for x in range(num_task_contexts)]

    
    # Backprop
    # We apply all the gradients from the epoch 
    # I think there are gradients from each sample which is ( !!!! nonstandard  )
    # As opposed to like average gradients over a batch
    for grads in epoch_grads:
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Reset counter
    num_epochs_without_learning = 0

    return diff_errs, num_epochs_without_learning 


### Training Hyperparameters

In [15]:
# for context switching behavior
thresh = 1.0
moving_avg_context_loss = [thresh for x in range(num_task_contexts)]

# init vals before training
hot_context_idx = 0
diff_errs = [thresh for x in range(num_task_contexts)]
cur_epoch_context_loss = [0 for x in range(num_task_contexts)]
all_epoch_losses = []
context_switch_threshold = 0


# Train

In [16]:
num_epochs_without_learning = 0

def train(model, dataset, n_epochs, debug, plotting_debug, num_task_contexts):
    
    
    # Global variables are used for sharing things
    # Some of these values are mutated in functions and not seen as return values
    global all_epoch_losses
    global hot_context_idx
    global cur_epoch_context_loss
    global moving_avg_context_loss
    global diff_errs
    global context_switch_threshold
    global num_epochs_without_learning


    for epoch in range(n_epochs):

        # init
        epoch_grads = []
        cur_epoch_context_loss[hot_context_idx] = 0
        
        
        #=====================================================================#
        # Nonstandard Forward Pass -- this is for every sample in the dataset #      
        #---------------------------------------------------------------------#
        custom_forward_pass(dataset, model, epoch_grads, NTASK_LAYER_IDX, cur_epoch_context_loss, all_epoch_losses)

        
        # If gone num_task_contexts epochs without learning on a context
        # And No Context Fits Well, Need To Pick Best Fit
        if num_epochs_without_learning >= num_task_contexts:
            
            
            # Find Best Fitting Context For Current Task
            # because went over all the contexts on the current task
            # the diff_errs tells us which context fits best for this task
            next_context_idx = diff_errs.index(max(diff_errs))
            
            # Best fitting context is the one that just had a forward pass performed on it
            # So -> Apply the Gradients
            # Continue to next epoch 
            if next_context_idx == hot_context_idx:
                diff_errs, num_epochs_without_learning = apply_grads_and_update_conditional_vars_etc(moving_avg_context_loss, hot_context_idx, cur_epoch_context_loss, epoch_grads, optimizer, model)
                continue
            
            # Current Context does not have the best fit, so don't apply its grads
            # Now that the best fitting context has been found, train on it
            else:
                
                # Switch to best fitting Context
                #hot_context_idx, epoch_loss = switch_to_better_fitting_context(next_context_idx, model, NTASK_LAYER_IDX, epoch_grads, cur_epoch_context_error, epoch_loss)
                hot_context_idx = switch_to_better_fitting_context(next_context_idx, model, NTASK_LAYER_IDX, epoch_grads, cur_epoch_context_loss)

                #=====================================================================#
                # Nonstandard Forward Pass -- this is for every sample in the dataset #      
                #---------------------------------------------------------------------#
                custom_forward_pass(dataset, model, epoch_grads, NTASK_LAYER_IDX, cur_epoch_context_loss, all_epoch_losses)

                #Apply the Gradients
                #continue
                diff_errs, num_epochs_without_learning = apply_grads_and_update_conditional_vars_etc(moving_avg_context_loss, hot_context_idx, cur_epoch_context_loss, epoch_grads, optimizer, model)
                continue
        
        # get the next context index in line: ie if at 0 get 1 
        next_context_idx = (hot_context_idx + 1) % len(moving_avg_context_loss)
                
        # very important for switching behavior
        # a kind of dynamic threshold
        diff_errs[hot_context_idx] = moving_avg_context_loss[hot_context_idx] - cur_epoch_context_loss[hot_context_idx]
        

        # our dynamic threshold hasn't passed the point where we switch
        if diff_errs[hot_context_idx] < context_switch_threshold:       
            
            # we want to count how many epochs we don't learn on
            # we use this to check that we haven't gone through every context without learning
            # otherwise we could keep performing epochs that never learn forever
            num_epochs_without_learning += 1
            
                      
            # go to the next context in line; if at context 0 go to context 1
            hot_context_idx = switch_to_next_context(next_context_idx, model, NTASK_LAYER_IDX, hot_context_idx)
            
            

            
        #--------------------------    
        # didnt switch, so must have been on correct contest already, so apply grads
        else:
            #update moving avg err
            moving_avg_context_loss[hot_context_idx] = (moving_avg_context_loss[hot_context_idx] + cur_epoch_context_loss[hot_context_idx]) / 2.0
            
            # learn on the gradients (nonstandard approach)
            for grads in epoch_grads:
                optimizer.apply_gradients(zip(grads, model.trainable_variables))


### Train Randomly on tasks for N cycles

We want to randomly throw tasks at the model while it is dynamically training.
SO the model has to switch its context on the fly while training before any contexts have fully learned a task mapped to a context.

In [17]:
def random_training_in_cycles(all_data, model, num_tasks, num_cycles, num_epochs):
    
    prev_task_data_idx = num_tasks-1    # init first choice as last task
    order_of_tasks_learned_on = []

    #FYI this is correct:
    # same as for c in range(cycle): for t in range num_tasks:
    for i in tqdm(range( num_cycles * num_tasks )):

        cur_task_data_idx = randrange(num_tasks)

        # Don't learn on the same task as the previous time
        while cur_task_data_idx == prev_task_data_idx:
            cur_task_data_idx = randrange(num_tasks)

        # get current task data
        cur_task_data = all_data[cur_task_data_idx]

        # so we can see the order of tasks after training
        order_of_tasks_learned_on.append(cur_task_data_idx)


        # train on the current task for specified number of epochs 
        train(model, cur_task_data, n_epochs=num_epochs, debug=False, plotting_debug=False, num_task_contexts=num_tasks)

        prev_task_data_idx = cur_task_data_idx
        
    return order_of_tasks_learned_on

# ****Training Loop

In [18]:
task_order = random_training_in_cycles(all_data, model, num_tasks=num_task_contexts, num_cycles=20, num_epochs=500)


HBox(children=(FloatProgress(value=0.0, max=160.0), HTML(value='')))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



# Has the modeled learned the 8 logic gates mapped to its 8 contexts?

In [19]:
def test(num_contexts, model, cont_idx):
    for i in range(num_contexts):
        model.layers[cont_idx].set_hot_context(i)
        a, b = logic_gate_test.test(model)
        
        print(b)
        #return a, b
        print()

# Yes it has
#### each line is the rounded output for a task
#### XOR looks like: 0 1 1 0 

In [20]:
test(num_task_contexts, model, NTASK_LAYER_IDX)

[array([[1.]], dtype=float32), array([[1.]], dtype=float32), array([[1.]], dtype=float32), array([[0.]], dtype=float32)]

[array([[1.]], dtype=float32), array([[0.]], dtype=float32), array([[0.]], dtype=float32), array([[1.]], dtype=float32)]

[array([[1.]], dtype=float32), array([[0.]], dtype=float32), array([[1.]], dtype=float32), array([[0.]], dtype=float32)]

[array([[0.]], dtype=float32), array([[1.]], dtype=float32), array([[1.]], dtype=float32), array([[1.]], dtype=float32)]

[array([[0.]], dtype=float32), array([[1.]], dtype=float32), array([[0.]], dtype=float32), array([[1.]], dtype=float32)]

[array([[0.]], dtype=float32), array([[1.]], dtype=float32), array([[1.]], dtype=float32), array([[0.]], dtype=float32)]

[array([[0.]], dtype=float32), array([[0.]], dtype=float32), array([[0.]], dtype=float32), array([[1.]], dtype=float32)]

[array([[1.]], dtype=float32), array([[0.]], dtype=float32), array([[0.]], dtype=float32), array([[0.]], dtype=float32)]



# Here's another way of looking to see if the network has learned to map the 8 tasks to its 8 different contexts

In [21]:
def consolidate_task_preds(preds):
    
    return [ preds[idx][0][0] for idx, x in enumerate(preds) ]

In [22]:
def get_all_raw_and_rounded_preds(num_tasks, cont_idx, model):
    all_raw_preds = []
    all_rounded_preds = []

    for i in range(num_tasks):
        model.layers[cont_idx].set_hot_context(i)

        raw_preds, rounded_preds = logic_gate_test.test(model)

        raw_preds = consolidate_task_preds(raw_preds)
        rounded_preds = consolidate_task_preds(rounded_preds)

        all_raw_preds.append( raw_preds )
        all_rounded_preds.append( rounded_preds )

    return all_raw_preds, all_rounded_preds



In [23]:
def remove_duplicates(all_rounded_preds):
    set_rounded_preds = set(tuple(x) for x in all_rounded_preds)
    dups_removed_rounded_preds = [ list(x) for x in set_rounded_preds ]
    dups_removed_rounded_preds.sort(key = lambda x: all_rounded_preds.index(x) )
    
    return dups_removed_rounded_preds

In [24]:
def get_accuracy_over_all_tasks(all_rounded_preds, dups_removed_rounded_preds, labels, num_tasks):

    """
    Purpose:
        Find accuracy over all tasks.
        For a task to be considered correct, it must produce the EXACT right output.
        Producing the EXACT right output more than once only counts correct once.
        If labels are: [ [a, b], [a, c], [b, c] ] and model produces: [ [a, b], [a, b], [a, a] ] -> 33% accurate
        If labels are: [ [a, b], [a, c], [b, c] ] and model produces: [ [a, b], [a, c], [a, a] ] -> 66% accurate
    
    """
    
    
    num_correct_duplicates = len(all_rounded_preds) - len(dups_removed_rounded_preds)

    num_wrong = 0

    for i in range(len(dups_removed_rounded_preds)):
        if dups_removed_rounded_preds[i] not in labels:
            num_wrong += 1

    num_wrong += num_correct_duplicates

    num_correct = num_tasks - num_wrong

    acc = ( num_correct / num_tasks ) * 100

    return acc

In [25]:
def get_labels(all_data):

    labels = []

    for i in range(len(all_data)):

        label = []
        for inp in all_data[i]:
            label.append(  inp[-1] )

        labels.append(label)
    
    return labels

In [26]:
all_raw_preds, all_rounded_preds = get_all_raw_and_rounded_preds(num_task_contexts, NTASK_LAYER_IDX, model)

In [27]:
no_duplicates_all_rounded_preds = remove_duplicates(all_rounded_preds)


In [28]:
labels = get_labels(all_data)


# 100% accurate - ie. learned the 8 tasks mapped to its 8 different contexts

In [29]:
acc = get_accuracy_over_all_tasks(all_rounded_preds, no_duplicates_all_rounded_preds, labels, num_task_contexts)
acc

100.0

# The above type of testing could be thought of as STATIC testing

# However, we want to have dynamic testing

# This is where things start to break down...

# ------------------------------------------

# Dynamic testing is basically the above dynamic training code with all of the learning parts removed

# Dynamic testing functions derived from training functions

In [30]:
def update_conditional_vars_etc(moving_avg_context_loss, hot_context_idx, cur_epoch_context_loss, epoch_grads, optimizer, model):

    moving_avg_context_loss[hot_context_idx] = (moving_avg_context_loss[hot_context_idx] + cur_epoch_context_loss[hot_context_idx]) / 2.0
    
    diff_errs = [0 for x in range(num_task_contexts)]

    #Backprop
    #for grads in epoch_grads:
    #    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    num_epochs_without_learning = 0

    return diff_errs, num_epochs_without_learning 

In [31]:
epoch_accuracies = []
double_epoch_count = 0

In [32]:
global test_switch_epoch_counter

test_switch_epoch_counter = np.nan

In [33]:
def test_forward_pass(dataset, model, epoch_grads, context_idx, cur_epoch_context_loss, all_epoch_losses):
 
    global epoch_accuracies
    
    global test_switch_epoch_counter
    
    if  ( not np.isnan(test_switch_epoch_counter) ) and ( test_switch_epoch_counter < num_task_contexts-1 ):
        test_switch_epoch_counter += 1
        #append switch
        epoch_accuracies.append("switch")
        
        return
    
    if test_switch_epoch_counter >= num_task_contexts-1:
        test_switch_epoch_counter = np.nan
    
    
    sum_loss = 0
    
    acc = 0.0
    #print("acc at top of loop:", acc)
    #print()
    for x, y in dataset:   

        with tf.GradientTape(persistent=True) as tape:            
            predictions = model(x, training=True) # forward pass
            pred_loss = loss_fn(y, predictions)   # get loss

        rounded_pred = int(tf.math.round(predictions).numpy()[0][0])
            
        #print("predictions:", rounded_pred )   
        #print("label:", y)
        
        #print()
        
        if rounded_pred == y:
            acc += 1.0
            
        sum_loss += pred_loss
        
        gradients = tape.gradient(pred_loss, model.trainable_variables)
        epoch_grads.append(gradients)

        context_loss = calc_context_loss(gradients, 
                                           model, 
                                           context_idx, 
                                           idx_of_next_layer_bias_gradient=3, 
                                           idx_of_next_layer_weights_in_get_weights_call=0)
        
        
        cur_epoch_context_loss[hot_context_idx] += context_loss
    
    acc /= len(dataset)
    #print("acc at end of epoch:", acc)

    epoch_accuracies.append(acc)
    
    avg_loss_for_epoch = sum_loss / len(dataset)

    all_epoch_losses.append(avg_loss_for_epoch)

    return  

# General dynamic testing derived from training

In [34]:
num_epochs_without_learning = 0

def test(model, dataset, n_epochs, debug, plotting_debug, num_task_contexts):

    #global weights_before
    #global weights_after
    
    global double_epoch_count
    
    #global a
    #global b
    #global c
    #global d
    
    global all_epoch_losses

    global hot_context_idx
    
    global cur_epoch_context_loss
    global moving_avg_context_loss
    global diff_errs
    global context_switch_threshold
    
    global num_epochs_without_learning


    
    #weights_before.append(model.get_weights())

    
    for epoch in range(n_epochs):
        
        #if epoch == 0:
        #    weights_before.append([model.get_weights()])
        
        #if a and b and c and d:
            
        #    weights_after.append(model.get_weights())
            
        #    break
        
        #avg_loss_for_epoch = 0
        epoch_grads = []
        cur_epoch_context_loss[hot_context_idx] = 0
        
        
        #======================#
        # General Forward Pass #      
        #----------------------#
        test_forward_pass(dataset, model, epoch_grads, NTASK_LAYER_IDX, cur_epoch_context_loss, all_epoch_losses)

        
        # If gone num_task_contexts epochs without learning on a context
        # And No Context Fits Well, Need To Pick Best Fit
        if num_epochs_without_learning >= num_task_contexts:
            
            
            # Find Best Fitting Context For Current Task
            # bc went over all the contexts on the current task
            # the diff_errs accurately tells us which context fits best for this task
            next_context_idx = diff_errs.index(max(diff_errs))
            
            # Best fitting context is the one that just had a forward pass performed on it
            # So -> Apply the Gradients
            # Continue to next epoch 
            if next_context_idx == hot_context_idx:
                diff_errs, num_epochs_without_learning = update_conditional_vars_etc(moving_avg_context_loss, hot_context_idx, cur_epoch_context_loss, epoch_grads, optimizer, model)
                
                #a=True
                
                continue
            
            # Current Context does not have the best fit, so don't apply its grads
            # Now that the best fitting context has been found, train on it
            else:
                
                # Switch to best fitting Context
                #hot_context_idx, epoch_loss = switch_to_better_fitting_context(next_context_idx, model, NTASK_LAYER_IDX, epoch_grads, cur_epoch_context_error, epoch_loss)
                hot_context_idx = switch_to_better_fitting_context(next_context_idx, model, NTASK_LAYER_IDX, epoch_grads, cur_epoch_context_loss)

                #==================#
                # General Training #
                #==================#
                test_forward_pass(dataset, model, epoch_grads, NTASK_LAYER_IDX, cur_epoch_context_loss, all_epoch_losses)

                #this is running an extra epoch, we need to keep track
                double_epoch_count += 1
            
                diff_errs, num_epochs_without_learning = update_conditional_vars_etc(moving_avg_context_loss, hot_context_idx, cur_epoch_context_loss, epoch_grads, optimizer, model)
                
                # b = True
                continue
        
        next_context_idx = (hot_context_idx + 1) % len(moving_avg_context_loss)
                
        
        diff_errs[hot_context_idx] = moving_avg_context_loss[hot_context_idx] - cur_epoch_context_loss[hot_context_idx]
        

        if diff_errs[hot_context_idx] < context_switch_threshold:            
            num_epochs_without_learning += 1
            
            
        
            
            hot_context_idx = switch_to_next_context(next_context_idx, model, NTASK_LAYER_IDX, hot_context_idx)
            
            #c = True
            
            

            
        #--------------------------    
        # didnt switch, so apply grads
        else:
            #update moving avg err
            moving_avg_context_loss[hot_context_idx] = (moving_avg_context_loss[hot_context_idx] + cur_epoch_context_loss[hot_context_idx]) / 2.0
            
            #d = True
            
            #for grads in epoch_grads:
            #    optimizer.apply_gradients(zip(grads, model.trainable_variables))


# Testing loop derived from training loop

In [35]:
def random_testing_in_cycles(all_data, model, num_tasks, num_cycles, num_epochs):
    
    global epoch_accuracies
    global double_epoch_count
    global weights_after
    global weights_before
    
    
    global a
    global b
    global c
    global d
    
    double_epoch_count = 0
    
    epoch_accuracies = []
    
    prev_task_data_idx = num_tasks-1    # init first choice as last task
    order_of_tasks_learned_on = []

    weights_before = []
    weights_after = []
    
    #FYI this is correct:
    # same as for c in range(cycle): for t in range num_tasks:
    for i in tqdm(range( num_cycles * num_tasks )):
    #while len(weights_after) == 0:
    
        cur_task_data_idx = randrange(num_tasks)

        while cur_task_data_idx == prev_task_data_idx:
            cur_task_data_idx = randrange(num_tasks)

        cur_task_data = all_data[cur_task_data_idx]
        #print("Currently training on data from all_data[ ", cur_task_data_idx)
        order_of_tasks_learned_on.append(cur_task_data_idx)

        
        global test_switch_epoch_counter
        test_switch_epoch_counter = 0
        
        test(model, cur_task_data, n_epochs=num_epochs, debug=False, plotting_debug=False, num_task_contexts=num_tasks)
        
     #   print("Len of weights before:", len(weights_before))
        
      #  print(a, b, c, d)

        prev_task_data_idx = cur_task_data_idx
        
    return order_of_tasks_learned_on

# For checking percent of correct epochs during testing

In [36]:
def percent_correct_epochs(epoch_acc_list):
    
    
    print( "number of epochs:", len( epoch_acc_list ) )
    
    correct_indices = []
    
    num_switches = 0
    
    num_correct = 0
    for idx, acc in enumerate(epoch_acc_list):
        if acc == 1.0:
            num_correct += 1
            correct_indices.append(idx)
            
        elif acc == "switch":
            num_switches += 1
    
    
    print("num_correct:",num_correct)
    print(len(correct_indices))
    
    print("num switches", num_switches)
    
    return num_correct / ( float(len( epoch_acc_list )-num_switches) )
            
    

# Run the dynamic testing after we trained earlier

In [37]:
task_order = random_testing_in_cycles(all_data, model, num_tasks=num_task_contexts, num_cycles=10, num_epochs=100)


HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))




# Get the percent of correct epochs from the dynamic testing

### 33% acc on dynamic testing, even though we saw above that the model has learned to map the 8 tasks to its 8 contexts. We would expecte MUCH higher dynamic testing accuracy

In [38]:
post_train_percent_correct_epochs = percent_correct_epochs( epoch_accuracies )
post_train_percent_correct_epochs

number of epochs: 8043
num_correct: 2526
2526
num switches 560


0.33756514766804757

#### Note that we expected 8000 epochs, but there were actually 8043 with our double epochs

In [39]:
# The number of epochs should be num_cycles*num_epochs, however one of the branches in the training/testing adds extra epochs on the fly
double_epoch_count

43

# Manually inspect to see how the model did during testing
# A perfect model (post_train_percent_correct_epochs := 1.0 ) would look like :
"switch"
"switch"
"switch"
"switch"
"switch"
"switch"
"switch"
1.0
...
1.0
"switch"
"switch"
"switch"
"switch"
"switch"
"switch"
"switch"
1.0
...
1.0
...

In [40]:
# The accuracies for each epoch
# 0 -> none of the predictions for the 4 logic gates were correct
# 0.25 -> one of the predictions for the 4 logic gates were correct
# 0.5 -> two of the predictions for the 4 logic gates were correct
# 0.75 -> three of the predictions for the 4 logic gates were correct
# 1.0 -> all of the predictions for the 4 logic gates were correct

# When a task has switched, we append "switch" for num_tasks - 1 epochs
# The desired behavior is that after the n-1 switches, the model is now on the correct context and doesn't switch its context until the task switches
# We don't count the n-1 "switch" epochs in the computation of accuracy here
# This doesn't seem to happen as desired...
epoch_accuracies

['switch',
 'switch',
 'switch',
 'switch',
 'switch',
 'switch',
 'switch',
 1.0,
 0.0,
 0.25,
 0.25,
 0.5,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 'switch',
 'switch',
 'switch',
 'switch',
 'switch',
 'switch',
 'switch',
 0.5,
 0.0,
 0.75,
 0.25,
 0.5,
 0.5,
 0.0,
 0.75,
 0.25,
 0.5,
 0.75,
 0.25,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

# *** Where might errors lie?
- maybe in the diff_errs and moving_avg_context_loss
    - this is a kind of thresholding system, which may work for training but not work for testing
    - potentially, with the existing code, if the model was trained better, it could be overfit to each of the 8 tasks which would hopefully allow it to switch faster and stay on the correct context
    
- maybe an issue with sharing or not sharing global variables between training and testing (ex: diff_errs, moving_avg_context_loss, etc.)
    - these maybe could be thought of as "learned weights" and are getting shared incorrectly or reset etc before testing.
    
- maybe the model needs to be trained better

- maybe there is a better way to trigger task switching and perseverating
   
- maybe dynamic testing was implemented incorrectly