In [1]:
from __future__ import absolute_import, print_function, division
from builtins import *
import time
import math
from collections import OrderedDict
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
import os

os.environ['CUDA_DEVICE'] = '0' # pick your device

# -- initialize the device
import pycuda.autoinit

In [2]:

import re
import pandas as pd

# Directory holding paired '<name>.npy' (frame array) and '<name>.csv'
# (label table) files.
datapath = '/media/sdb/green_ball/'
filelist = os.listdir(datapath)

# Raw strings so that '\.' is a regex escape and not an (invalid) string escape.
nparrayfile = re.compile(r'(.+?)\.npy')
csvfile = re.compile(r'(.+?)\.csv')
# A name counts as a test file when 'test' appears with at least one
# character on either side of it.
testfile_exp = re.compile('.+?test.+?')

test_data = {}   # name without extension -> (array, DataFrame)
train_data = {}  # name without extension -> (array, DataFrame)
did_it = []      # names (without extension) that were loaded, in load order

available = set(filelist)
seen = set()     # O(1) membership test instead of scanning `did_it`
for filename in filelist:
    filename_wo_ext = filename.rsplit('.', 1)[0]
    if filename_wo_ext in seen:
        continue
    seen.add(filename_wo_ext)

    npy_name = filename_wo_ext + '.npy'
    csv_name = filename_wo_ext + '.csv'
    if npy_name not in available or csv_name not in available:
        # Not a complete (frames, labels) pair, e.g. a stray file or a
        # sub-directory: skip it instead of crashing inside np.load/read_csv.
        continue

    arr = np.load(os.path.join(datapath, npy_name))
    df = pd.read_csv(os.path.join(datapath, csv_name))

    # The split is decided on the first file name seen for this pair
    # (extension included), exactly as before.
    if testfile_exp.match(filename) is None:
        train_data[filename_wo_ext] = (arr, df)
    else:
        test_data[filename_wo_ext] = (arr, df)
    did_it.append(filename_wo_ext)

In [3]:
# --- network / image geometry -------------------------------------------
input_edge = 6
n_color = 3
input_size = input_edge * input_edge * n_color
hidden_size = 49
output_sizes = [1, 2*2, 4*4, 8*8, 6*6, 16*16]
heat_map_size = 256
structure = [16, 8, 4, 3, 2, 1]

edge_n_pixels = input_edge*structure[0]
heat_map_edge = int(math.sqrt(heat_map_size))

# Factor that maps label coordinates (in image pixels) onto heat-map cells.
rescale_factor = heat_map_edge / edge_n_pixels

# --- training data: scaled frames + flattened ground-truth heat maps -----
train_data_reformat = {}
for key, (data_arr, df) in train_data.items():
    n_frames = len(df)

    # pixel values are divided by 255
    rescale_arr = data_arr/255.

    ground_truth_heat_map_list = []
    for idx in range(n_frames):
        # Keep in mind that this only works because the negative values
        # become fractions after scaling down and int() truncates them to 0.
        # If the image were kept at its original size, the frames with no
        # ball would still give x, y, x_w, y_w = -1.
        x, y, x_w, y_w = (int(v) for v in rescale_factor * df.iloc[idx])

        ground_truth_heat_map = np.zeros((heat_map_edge, heat_map_edge))
        # mark the (scaled) bounding box: rows are y, columns are x
        ground_truth_heat_map[y:y+y_w, x:x+x_w] = 1
        ground_truth_heat_map_list.append(
            ground_truth_heat_map.reshape(heat_map_size))

    train_data_reformat[key] = (rescale_arr, ground_truth_heat_map_list)

In [4]:
from PVM_PyCUDA import PVM_CUDA, PVMtracker
from RectangularGridConstructor import make_connections

# resultpath = './GreenBallTraining/'
# filename = resultpath + 'green_ball_learning_rate_0.01_hidden_49_3000000steps'
# dir_list = os.listdir('./')
# if resultpath.split('/')[1] not in dir_list:
#     os.mkdir(resultpath)

In [5]:
connect_dict = make_connections(structure, output_sizes)

In [5]:
tracker = PVMtracker(structure, output_sizes, heat_map_size, input_size, hidden_size, context_from_top_0_0=True)

In [None]:
tracker.reset_state()

In [6]:
resultpath = './GreenBallTraining/'
filename = resultpath + 'green_ball_learning_rate_0.01_hidden_49_5000000steps'
tracker.load_parameters(filename)

In [6]:
resultpath = './GreenBallTraining/'
filename = resultpath + 'green_ball_learning_rate_0.01_hidden_49_5000000steps'
# Make sure the output directory exists. Checking the path itself (rather
# than looking for its name in a listing of the working directory) also
# rejects a regular file that merely has the same name.
if not os.path.isdir(resultpath):
    os.makedirs(resultpath)
# One learning rate per training step (5,000,000 steps at 0.01).
learning_rate_list = [0.01]*5000000 #[0.0002]*1000000 + [0.00005]*1500000 + [0.00001]*39000000
# NOTE(review): the printed log below shows one line per 100000 frames, which
# presumably corresponds to print_every/interval - confirm against PVMtracker.
tracker.train(train_data_reformat, learning_rate_list, print_every=100000,
              save_every_print=False, filename=filename, interval=100000)
tracker.save_mse(filename)

    100000 frames: 0.00741105504244
    200000 frames: 0.00516860081658
    300000 frames: 0.00473766347278
    400000 frames: 0.00449034257444
    500000 frames: 0.00441235010534
    600000 frames: 0.00429010623325
    700000 frames: 0.00427266457775
    800000 frames: 0.0041830899829
    900000 frames: 0.00419109633875
   1000000 frames: 0.00410702779253
   1100000 frames: 0.00414249707359
   1200000 frames: 0.00406286576883
   1300000 frames: 0.00409605852761
   1400000 frames: 0.00403729559867
   1500000 frames: 0.00405114192711
   1600000 frames: 0.00402921508202
   1700000 frames: 0.00400914999289
   1800000 frames: 0.00402255315661
   1900000 frames: 0.00399040191548
   2000000 frames: 0.0040089247076
   2100000 frames: 0.00396746342339
   2200000 frames: 0.00400348520676
   2300000 frames: 0.00396162847154
   2400000 frames: 0.00399585937487
   2500000 frames: 0.00394929680966
   2600000 frames: 0.00398794913605
   2700000 frames: 0.00393103658187
   2800000 frames: 0.003982703

In [7]:
from FormattingFiles import norm_and_heatmap, unflatten_image

In [8]:
test_data_reformat = {}
for key, data in test_data.items():
    data_arr, df = data
    
    test_data_reformat[key] = norm_and_heatmap(data_arr, df, heat_map_edge, edge_n_pixels)
    

In [12]:
success_threshold = 0.6
precision_threshold = 20 * 16 / 96
success_list, precision_list, accuracy_list = tracker.test(test_data_reformat, 
                                                           (heat_map_edge, heat_map_edge),
                                                           success_threshold,
                                                           precision_threshold,
                                                           accuracy_threshold=1.0)

In [13]:
np.mean(success_list), np.mean(precision_list), np.mean(accuracy_list)

(0.71367510363828313, 0.87831191904627026, 0.88549579009707235)

In [9]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation

In [10]:
# free memory that is no longer needed (training data and the raw test data)
del train_data, train_data_reformat, test_data

In [11]:
# saving animation works better with ArtistAnimation
# %matplotlib tk

heat_map_edges = (heat_map_edge, heat_map_edge)

# 2x2 grid: input frame | prediction of the input
#           ground truth | averaged heat map from the tracker
fig = plt.figure(figsize=(12, 9))
ax1, ax2, ax3, ax4 = [fig.add_subplot(2, 2, panel) for panel in (1, 2, 3, 4)]

# arguments shared by every unflatten_image call
full_image_shape = (edge_n_pixels, edge_n_pixels, n_color)
unit_patch_shape = (input_edge, input_edge)

ims = []
for key, (rescale_arr, ground_truth_heat_map_list) in test_data_reformat.items():
    # every sequence starts from a freshly reset tracker state
    tracker.reset_state()
    for i, gt_heat_map in enumerate(ground_truth_heat_map_list):
        flat_image = rescale_arr[i,:]#(256 - (255 * (rescale_arr[i,:])).astype(np.int8))
        tracker.forward(flat_image)

        im1 = ax1.imshow(
            unflatten_image(flat_image, full_image_shape, unit_patch_shape),
            animated=True)

        # The first L_input entries of tracker.pred are shown as an image.
        # (An alternative is to show abs(flat_image - prediction) instead.)
        predicted_input = tracker.pred[:tracker.L_input].get()
        im2 = ax2.imshow(
            unflatten_image(predicted_input, full_image_shape, unit_patch_shape),
            animated=True)

        im3 = ax3.imshow(gt_heat_map.reshape(heat_map_edges),
                         cmap='gray', animated=True)

        im4 = ax4.imshow(tracker.avg_heatmap.get().reshape(heat_map_edges),
                         vmin=0, vmax=1, cmap='gray', animated=True)

        # one list of artists per animation frame
        ims.append([im1, im2, im3, im4])

# saving animation works better with ArtistAnimation
ani = animation.ArtistAnimation(fig, ims, interval=1000/60, blit=True)
ani.save(resultpath + 'green_ball_learning_rate_0.01_hidden_49_5000000steps.mp4',
         writer='ffmpeg', fps=60)

In [None]:
# realtime animation
%matplotlib tk

heat_map_edges = (heat_map_edge, heat_map_edge)

fig = plt.figure(figsize=(12, 9))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

def gen_func():
    """
    Generator feeding the animation: runs the tracker over every test
    sequence and yields, per frame, the tuple
    (flat input frame, prediction of the input, ground truth heat map,
    averaged heat map).
    The tracker state is reset at the start of every sequence.
    """
    # test_data_reformat and tracker are only read here, so no `global`
    # declaration is needed.
    for key, data in test_data_reformat.items():
        rescale_arr, ground_truth_heat_map_list = data
        n_frame = len(ground_truth_heat_map_list)
        tracker.reset_state()
        for i in range(n_frame):
            flat_image = rescale_arr[i,:]#(256 - (255 * (rescale_arr[i,:])).astype(np.int8))
            tracker.forward(flat_image)
            yield flat_image, tracker.pred[:tracker.L_input].get(),\
                ground_truth_heat_map_list[i], tracker.avg_heatmap.get()

def update(vals):
    """
    Draw one frame produced by gen_func and return the four image artists
    (input, absolute prediction error, ground truth, heat map) for blitting.
    """
    flat_image, pred, gt_heat_map, heatmap = vals
    im1 = ax1.imshow(unflatten_image(flat_image,
                                     (edge_n_pixels,edge_n_pixels, n_color),
                                     (input_edge, input_edge)), animated=True)

    # during the animation the second panel shows the absolute error
    # between the input and its prediction
    im2 = ax2.imshow(unflatten_image(abs(flat_image - pred),
                                     (edge_n_pixels, edge_n_pixels, n_color),
                                     (input_edge, input_edge)), animated=True)

    im3 = ax3.imshow(gt_heat_map.reshape(heat_map_edges),
                     cmap='gray', animated=True)

    im4 = ax4.imshow(heatmap.reshape(heat_map_edges),
                     vmin=0, vmax=1, cmap='gray', animated=True)
    return im1, im2, im3, im4

# Draw an initial frame. The built-in next() works on Python 2 and 3, whereas
# the generator method `.next()` only exists on Python 2. This throw-away
# generator advances the tracker by one frame; the generator created by
# FuncAnimation below starts over and resets the tracker state again.
vals = next(gen_func())
flat_image, pred, gt_heat_map, heatmap = vals
im1 = ax1.imshow(unflatten_image(flat_image,
                                 (edge_n_pixels,edge_n_pixels, n_color),
                                 (input_edge, input_edge)), animated=True)


# the initial frame shows the raw prediction in the second panel
im2 = ax2.imshow(unflatten_image(pred,
                                 (edge_n_pixels, edge_n_pixels, n_color),
                                 (input_edge, input_edge)), animated=True)

im3 = ax3.imshow(gt_heat_map.reshape(heat_map_edges),
                 cmap='gray', animated=True)

im4 = ax4.imshow(heatmap.reshape(heat_map_edges),
                 vmin=0, vmax=1, cmap='gray', animated=True)
ani = animation.FuncAnimation(fig, update, frames=gen_func, interval=16, blit=True)
plt.show()

In [17]:
tracker.grad_bias_h2op, tracker.grad_bias_i2h, tracker.delta_tracker

(array([ 0.00070687,  0.02102356,  0.01186935, ...,  0.03868137,
         0.03213972, -0.00101318]),
 array([  1.13520786e-03,   1.80932772e-02,   1.28967698e-02, ...,
         -3.56679307e-02,  -1.66260265e-05,  -4.96235665e-02]),
 array([  3.68078829e-07,   2.85900087e-07,   3.56774643e-07, ...,
          1.10296916e-06,   3.12090251e-07,   1.54280066e-07]))

In [18]:
tracker.grad_weight_h2op, tracker.grad_weight_i2h, tracker.grad_weight_tracker

(array([  1.58924490e-06,   6.82992561e-04,   3.16866442e-04, ...,
         -5.20864310e-04,  -1.01311242e-03,  -3.77799398e-04]),
 array([ 0.00059209,  0.00030272,  0.00019588, ..., -0.03295553,
        -0.04961997, -0.01737747]),
 array([  1.71262927e-07,   6.71469537e-08,   6.14081263e-08, ...,
          1.20036118e-07,   1.15290952e-07,   1.88227648e-08]))

In [None]:
tracker2 = PVMtracker(structure, output_sizes, heat_map_size, input_size, hidden_size, context_from_top_0_0=True)

In [None]:
tracker2.load_parameters(filename)

In [None]:
tracker2.h2op_bias.get() - tracker.h2op_bias.get()

In [None]:
#tracker.save_parameters('green_ball_training_experiment1')

In [None]:
# tracker.i2h_weights

In [None]:
# tracker.load_parameters('./test_save_parameters')

In [None]:
# tracker.i2h_weights

In [None]:
# while True:
#     n = raw_input("Please enter 'hello':")
#     if n.strip() == 'hello':
#         break

In [None]:
train_data_reformat['green_ball_01_small']

In [None]:
def feeds2(structure, layer, x, y):
    """
    Specifies which PVM-units the unit at (layer, x, y) feeds to in the
    next layer, assuming the structure is rectangular.

    Along each edge the two layers must either have the same length,
    differ by exactly one unit, or the lower edge must be an even integer
    multiple of the upper edge (e.g. double).

    Parameters
    ----------
    structure : sequence
        One entry per layer: an int (square layer) or a sequence of
        length one or two giving the edge lengths (L_x, L_y).
    layer : int
        Index of the layer the unit lives in; must not be the top layer.
    x, y : int
        Position of the unit inside its layer.

    Returns
    -------
    list of (x, y) tuples
        Positions in layer ``layer + 1`` that receive input from the unit.

    Raises
    ------
    AssertionError
        If ``layer`` is the last layer (there is nothing to feed to).
    ValueError
        If a shape entry is malformed or the two layers violate the size
        assumptions above.
    """
    assert layer < len(structure) - 1

    def shape_check(shape):
        # Normalise a shape entry to the pair (L_x, L_y).
        if isinstance(shape, int):
            return shape, shape
        if len(shape) == 2:
            L_x, L_y = shape
            return L_x, L_y
        if len(shape) == 1:
            return shape[0], shape[0]
        raise ValueError('Not a valid entry must be a single integer\n' +
                         'or a tuple of length one or two.')

    def find_edge_pos_upper_layer(pos, L, L_n):
        # Positions along one edge of the upper layer (length L_n) fed by
        # position `pos` of the lower layer (length L).
        if (L - L_n) == 1:
            # overlapping windows of two: interior units feed two parents,
            # the units on the border feed only one
            if pos == 0:
                return [0]
            if pos == L - 1:
                return [L_n - 1]
            return [pos - 1, pos]

        ratio, remainder = divmod(L, L_n) if L_n else (0, 1)
        if remainder == 0 and ratio >= 2 and ratio % 2 == 0:
            # `ratio` consecutive lower units share one parent. Dividing by
            # the actual ratio (instead of always halving) keeps the result
            # inside the upper layer for ratios larger than 2.
            return [pos // ratio]
        if L == L_n:
            return [pos]
        raise ValueError(
            'Underlying assumption about layer structure violated.\n'+
            'Edge sizes must differ by at most 1 or a mutliple of 2')

    L_x, L_y = shape_check(structure[layer])
    L_x_n, L_y_n = shape_check(structure[layer+1])

    p_x_list = find_edge_pos_upper_layer(x, L_x, L_x_n)
    p_y_list = find_edge_pos_upper_layer(y, L_y, L_y_n)

    # cartesian product, x varying slowest
    return [(p_x, p_y) for p_x in p_x_list for p_y in p_y_list]

In [None]:
def fedfrom(structure, layer, x, y):
    """
    Specifies which PVM-units the unit at (layer, x, y) is fed from in the
    layer directly below, assuming the structure is rectangular.

    Along each edge the two layers must either have the same length,
    differ by exactly one unit, or the lower edge must be an even integer
    multiple of the upper edge (e.g. double).

    Parameters
    ----------
    structure : sequence
        One entry per layer: an int (square layer) or a sequence of
        length one or two giving the edge lengths (L_x, L_y).
    layer : int
        Index of the layer the unit lives in; must not be the bottom layer.
    x, y : int
        Position of the unit inside its layer.

    Returns
    -------
    list of (x, y) tuples
        Positions in layer ``layer - 1`` that feed into the unit.

    Raises
    ------
    AssertionError
        If ``layer`` is 0 (there is no layer below).
    ValueError
        If a shape entry is malformed or the two layers violate the size
        assumptions above.
    """
    assert layer > 0

    def shape_check(shape):
        # Normalise a shape entry to the pair (L_x, L_y).
        if isinstance(shape, int):
            return shape, shape
        if len(shape) == 2:
            L_x, L_y = shape
            return L_x, L_y
        if len(shape) == 1:
            return shape[0], shape[0]
        # A ValueError (not an AssertionError) so that callers which treat
        # AssertionError as "no layer below" cannot swallow a bad shape.
        raise ValueError('Not a valid entry must be a single integer\n' +
                         'or a tuple of length one or two.')

    def find_edge_pos_lower_layer(pos, L, L_b):
        # Positions along one edge of the lower layer (length L_b) that
        # feed position `pos` of this layer (length L).
        if (L_b - L) == 1:
            # overlapping windows of two
            return [pos, pos + 1]

        ratio, remainder = divmod(L_b, L) if L else (0, 1)
        if remainder == 0 and ratio >= 2 and ratio % 2 == 0:
            # non-overlapping windows of `ratio` lower units; using the
            # actual ratio covers every lower unit for ratios above 2
            return list(range(ratio * pos, ratio * (pos + 1)))
        if L == L_b:
            return [pos]
        raise ValueError(
            'Underlying assumption about layer structure violated.\n'+
            'Edge sizes must differ by at most 1 or a mutliple of 2')

    L_x, L_y = shape_check(structure[layer])
    L_x_b, L_y_b = shape_check(structure[layer-1])

    p_x_list = find_edge_pos_lower_layer(x, L_x, L_x_b)
    p_y_list = find_edge_pos_lower_layer(y, L_y, L_y_b)

    # cartesian product, x varying slowest
    return [(p_x, p_y) for p_x in p_x_list for p_y in p_y_list]

In [None]:
def make_connections(structure, output_sizes):
    """
    A helper function to make the dictionary characterizing
    the connections of a PVM hierarchy.

    Parameters
    ----------
    structure : sequence
        One entry per layer: an int (square layer) or a sequence of
        length one or two giving the edge lengths (L_x, L_y).
    output_sizes : sequence of int
        Output size of the units of each layer; same length as structure.

    Returns
    -------
    OrderedDict
        Keys are unit names '_{layer}_{x}_{y}' in creation order (layer by
        layer, x outer loop, y inner loop). Values are tuples
        (unit_count, output_size, fedfrom_names, latsup_names) where
        unit_count is the running index of the unit, fedfrom_names are the
        names of the units in the layer below that feed into it and
        latsup_names are the names of its lateral neighbours, of the units
        it feeds to in the next layer and of the top unit
        '_{N_layers-1}_0_0'. Units of the last layer get an empty
        latsup_names list.

    Raises
    ------
    AssertionError
        If structure and output_sizes differ in length.
    ValueError
        If a shape entry is malformed (errors raised by feeds2/fedfrom
        propagate as well).
    """
    N_layers = len(structure)

    assert N_layers == len(output_sizes), \
    "must specify the output sizes such that they match the number of layers"

    def shape_check(shape):
        # Normalise a shape entry to the pair (L_x, L_y).
        if isinstance(shape, int):
            return shape, shape
        if len(shape) == 2:
            L_x, L_y = shape
            return L_x, L_y
        if len(shape) == 1:
            return shape[0], shape[0]
        raise ValueError('Not a valid entry must be a single integer\n' +
                         'or a tuple of length one or two.')

    def ret_name(ls):
        # (layer, x, y) triples -> unit names
        return ['_{0}_{1}_{2}'.format(l, x_, y_) for l, x_, y_ in ls]

    top_unit = (N_layers-1, 0, 0)

    connect_dict = OrderedDict()
    Unit_count = 0
    for layer, shape in enumerate(structure):
        output_size = output_sizes[layer]
        L_x, L_y = shape_check(shape)
        for x in range(L_x):
            for y in range(L_y):
                name = '_{0}_{1}_{2}'.format(layer, x, y)

                # Lateral nearest neighbours that actually exist. The bounds
                # check keeps an edge of length 1 from producing the
                # non-existent neighbour at position 1.
                nn_list = [(layer, x_, y)
                           for x_ in (x-1, x+1) if 0 <= x_ < L_x]
                nn_list += [(layer, x, y_)
                            for y_ in (y-1, y+1) if 0 <= y_ < L_y]

                # Only ask for the layer above/below when it exists, rather
                # than catching the AssertionError of feeds2/fedfrom (which
                # would also hide unrelated assertion failures).
                if layer < N_layers-1:
                    feeds2list = feeds2(structure, layer, x, y)
                else:
                    feeds2list = []
                if layer > 0:
                    fedfromlist = [(layer-1, x_, y_)
                                   for x_, y_ in
                                   fedfrom(structure, layer, x, y)]
                else:
                    fedfromlist = []

                if layer < N_layers-2:
                    latsuplist = nn_list + [(layer+1, x_, y_)
                                            for x_, y_ in feeds2list]\
                                            + [top_unit]
                elif layer == N_layers-2:
                    # the only unit above is the top unit itself
                    latsuplist = nn_list + [top_unit]
                else:
                    latsuplist = []

                connect_dict[name] = (Unit_count,
                                      output_size,
                                      ret_name(fedfromlist),
                                      ret_name(latsuplist))
                Unit_count += 1
    return connect_dict

In [None]:
connect_dict = make_connections([(16, 16), 8, 4, 3, 2, 1],
                                [1, 2*2, 4*4, 8*8, 6*6, 16*16])

In [None]:
connect_dict

In [None]:
def weight_initialize(connect_dict, input_size, hidden_size):
    """
    Randomly initialise the weights and biases of every PVM unit and build
    the index maps used to shuffle data between the different vectors.

    The weights are Gaussian with standard deviation 1/sqrt(fan-in) and
    are laid out as block-diagonal sparse matrices in CSR format
    (pointers, indices, values): one dense block per PVM unit.

    Parameters
    ----------
    connect_dict : ordered dict
        Output of make_connections or any other ordered dictionary
        with keys of the format '_{layer}_{column}_{row}' and values
        (unit_count, output_size, fedfromlist, latsuplist).
    input_size : int
        Size of the input to one pvm_unit in the lowest layer.
    hidden_size : int
        Number of hidden units per PVM unit.

    Returns
    -------
    tuple
        ((i2h_pointers, i2h_indices, i2h_weights, i2h_bias),
         (h2op_pointers, h2op_indices, h2op_weights, h2op_bias),
         (input_map, der_map, int_map, err_map, hid_map,
          hid_append_map, out_map, pred_map),
         input_new_unit, op_new_unit)
        The pointer arrays are int32 numpy arrays, the other weight and
        bias entries are concatenated numpy arrays, the maps are Python
        lists of integer indices, and input_new_unit / op_new_unit are
        lists with one start offset per unit plus the final end offset.
    """
    
    # lists with mappings for memory shuffling
    input_map = [] #full_input[input_map[i]] = input[i]
    der_map = [] #full_input[der_map[i]] = der[i]
    int_map = [] #full_input[int_map[i]] = integral[i]
    err_map = [] #full_input[err_map[i]] = error[i]
    
    hid_map = [] #full_input[j] = hidden[hid_map[j]] if hid_map[j] != -1
    hid_append_map = [] 
    #input_app[len(input)+i] = hidden[hid_append_map[i]]
    
    out_map = [] # output[i] = op[out_map[i]]
    pred_map = [] # pred[i] = op[pred_map[i]]
    
    # inputs (+ context) to hidden weights and biases
    i2h_bias = []
    # weights will be a sparse array in CSR format
    i2h_weights  = [] # the values of the weights
    i2h_pointers = [0] # sparse array pointers
    i2h_indices  = [] # column indices
    
    # start offset of each PVM unit's block inside the full input vector
    # (the columns of the i2h matrix); the last entry is the total length
    input_new_unit = [0] 
    
    # hidden to output & prediction weights and biases
    h2op_bias = []
    # weights will be a sparse array in CSR format
    h2op_weights  = []
    h2op_pointers = [0]
    h2op_indices  = []
    
    # start offset of each PVM unit's block inside the combined
    # output + prediction vector; it has length N_pvm_units + 1
    op_new_unit = [0] 
    
    i2h_col_start = 0
    h2op_col_start = 0
    for key, val in connect_dict.items():
        unit_count, output_size, fedfromlist, latsuplist = val
        
        # counting the number of units feeding into the array
        N_units_feeding_into = len(fedfromlist)
        N_units_latsup       = len(latsuplist)
        
        # a unit with nothing below it reads the raw input, every other
        # unit reads the hidden activations of the units feeding into it
        if N_units_feeding_into == 0:
            raw_fed_size = input_size
        else:
            raw_fed_size = N_units_feeding_into * hidden_size
        
        # input, derivative, integral, error, hidden and 
        # lateral/superior context are all contributing to
        # the input size to the sigmoid layer
        fed_size = 4 * raw_fed_size + (N_units_latsup + 1) * hidden_size
        
        # prediction of the input and output heatmap
        full_output_size = raw_fed_size + output_size
        
        
        i2h_col_end = i2h_col_start + fed_size
        # assigning the weights and bias for hidden calculation:
        # one dense row of fed_size columns per hidden unit
        for row in range(hidden_size):
            # random Gaussian variables with 1/fed_size variance
            i2h_weights.append(np.random.randn(fed_size) 
                               / math.sqrt(fed_size))
            i2h_pointers.append(i2h_pointers[-1] + fed_size)
            i2h_indices.append(np.arange(i2h_col_start, i2h_col_end))
        
        # NOTE(review): this draws fed_size bias values per unit although
        # the block above has hidden_size rows (h2op_bias below uses the
        # row count, full_output_size) - confirm against the consumers of
        # i2h_bias whether hidden_size was intended.
        i2h_bias.append(np.random.randn(fed_size))
        
        # the mappings for inputs, derivatives and integral
        if key[:2] == '_0':
            #raw_fed_size is the same as input_size in this case
            input_map += list(range(i2h_col_start,
                                    i2h_col_start + input_size))
            der_map += list(range(i2h_col_start + input_size,
                                  i2h_col_start + 2 * input_size))
            int_map += list(range(i2h_col_start + 2 * input_size,
                                  i2h_col_start + 3 * input_size))
            err_map += list(range(i2h_col_start + 3 * input_size,
                                  i2h_col_start + 4 * input_size))
        else:
            # no raw input for higher layers: the first raw_fed_size
            # columns are filled from hidden activations via hid_map below
            der_map += list(range(i2h_col_start + raw_fed_size,
                                  i2h_col_start + 2 * raw_fed_size))
            int_map += list(range(i2h_col_start + 2 * raw_fed_size,
                                  i2h_col_start + 3 * raw_fed_size))
            err_map += list(range(i2h_col_start + 3 * raw_fed_size,
                                  i2h_col_start + 4 * raw_fed_size))
        
        # initialize all hid_map in the relevant range to -1
        hid_map += [-1]*(fed_size)
        
        # hidden activations of the units feeding in from below go to the
        # start of this unit's block, one hidden_size chunk per unit
        hid_start = i2h_col_start
        hid_end = hid_start + hidden_size
        for unit in fedfromlist:
            uc = connect_dict[unit][0] # unit count of the unit feeding in
            hid_map[hid_start:hid_end] = list(range(uc 
                                                    * hidden_size,
                                                    (uc + 1)
                                                    * hidden_size))
            hid_append_map += hid_map[hid_start:hid_end]
            hid_start += hidden_size
            hid_end   += hidden_size
            
        
        # after the four raw_fed_size sections comes the unit's own
        # hidden state ...
        hid_start = i2h_col_start + 4 * raw_fed_size
        hid_end = hid_start + hidden_size
        
        hid_map[hid_start:hid_end] = list(range(unit_count 
                                                * hidden_size,
                                                (unit_count + 1)
                                                * hidden_size))
        
        # ... followed by the hidden states of the lateral and superior
        # context units, one hidden_size chunk each
        for unit in latsuplist:
            hid_start += hidden_size
            hid_end   += hidden_size
            uc = connect_dict[unit][0] #unit count of lat and superior
            hid_map[hid_start:hid_end] = list(range(uc 
                                                    * hidden_size,
                                                    (uc + 1)
                                                    * hidden_size))
        
        
        i2h_col_start = i2h_col_end
        input_new_unit.append(i2h_col_start)
        
        
        h2op_col_end = h2op_col_start + hidden_size
        # assigning the weights and biases of output + prediction:
        # one dense row of hidden_size columns per output/prediction value
        for row in range(full_output_size):
            # random Gaussian variables with 1/hidden_size variance
            h2op_weights.append(np.random.randn(hidden_size)
                                / math.sqrt(hidden_size))
            h2op_pointers.append(h2op_pointers[-1] + hidden_size)
            h2op_indices.append(np.arange(h2op_col_start, h2op_col_end))
        
        h2op_bias.append(np.random.randn(full_output_size))
        
        # within a unit's block of op the first output_size entries are
        # the output, the remaining raw_fed_size entries the prediction
        new_op_start = op_new_unit[-1]
        new_op_out_ends = new_op_start + output_size
        new_op_end   = op_new_unit[-1] + full_output_size
        out_map  += list(range(new_op_start, new_op_out_ends))
        pred_map += list(range(new_op_out_ends, new_op_end))
        
        h2op_col_start = h2op_col_end
        op_new_unit.append(new_op_end)
    
    # flatten the per-unit pieces into single arrays
    i2h_bias      = np.concatenate(i2h_bias,    axis = 0)
    i2h_indices   = np.concatenate(i2h_indices, axis = 0)
    i2h_weights   = np.concatenate(i2h_weights, axis = 0)
    i2h_pointers  = np.array(i2h_pointers, dtype=np.int32)
    
    h2op_bias     = np.concatenate(h2op_bias,    axis = 0)
    h2op_indices  = np.concatenate(h2op_indices, axis = 0)
    h2op_weights  = np.concatenate(h2op_weights, axis = 0)
    h2op_pointers = np.array(h2op_pointers, dtype=np.int32)
    
    
    return (i2h_pointers, i2h_indices, i2h_weights, i2h_bias),\
           (h2op_pointers, h2op_indices, h2op_weights, h2op_bias),\
           (input_map, der_map, int_map, err_map, hid_map, 
            hid_append_map, out_map, pred_map),\
            input_new_unit, op_new_unit

In [None]:
def tracker_weight_initialize(structure,
                              output_sizes, heatmap_size):
    """
    Randomly initialise the tracker read-out weights and biases.

    For every layer a dense block of heatmap_size rows is created whose
    columns cover all outputs of that layer (L_x * L_y * output_size).
    The blocks are stacked into one sparse matrix in CSR format. Weights
    are Gaussian with standard deviation 1/sqrt(block width); each layer
    also gets heatmap_size Gaussian bias values.

    Returns (tracker_pointers, tracker_indices, tracker_weights,
    tracker_bias); the pointers are an int32 numpy array, the others are
    concatenated numpy arrays.
    """
    layer_count = len(structure)

    assert layer_count == len(output_sizes), \
    "must specify the output sizes such that they match the number of layers"

    def edge_lengths(shape):
        # int -> square layer, length-2 sequence -> (L_x, L_y),
        # length-1 sequence -> square layer; anything else fails the assert
        if isinstance(shape, int):
            return shape, shape
        n_dims = len(shape)
        if n_dims == 2:
            first, second = shape
            return first, second
        if n_dims == 1:
            return shape[0], shape[0]
        assert n_dims < 2 and n_dims > 0

    bias_chunks = []
    weight_rows = []
    row_pointers = [0]
    index_rows = []

    col_begin = 0
    for layer, shape in enumerate(structure):
        n_x, n_y = edge_lengths(shape)
        block_width = n_x * n_y * output_sizes[layer]
        col_stop = col_begin + block_width

        # every row of this layer's block spans the same columns
        block_columns = np.arange(col_begin, col_stop)

        row = 0
        while row < heatmap_size:
            # random Gaussian variables with 1/block_width variance
            weight_rows.append(np.random.randn(block_width)
                               / math.sqrt(block_width))
            row_pointers.append(row_pointers[-1] + block_width)
            index_rows.append(block_columns)
            row += 1

        # drawn after the weights of the layer to keep the RNG sequence
        bias_chunks.append(np.random.randn(heatmap_size))

        col_begin = col_stop

    tracker_bias     = np.concatenate(bias_chunks, axis = 0)
    tracker_indices  = np.concatenate(index_rows,  axis = 0)
    tracker_weights  = np.concatenate(weight_rows, axis = 0)
    tracker_pointers = np.array(row_pointers, dtype=np.int32)

    return tracker_pointers, tracker_indices, tracker_weights, tracker_bias

In [None]:
int('_0_1_2'.split('_')[1])

In [None]:
outs = weight_initialize(connect_dict, 6*6*3, 49)

In [None]:
outs[2][0]

In [None]:
outs[0][0][-1] == len(outs[0][2])

In [None]:
outs[1][0][-1] == len(outs[1][2])

In [None]:
def test_no_overlap_in_mapping(outs, i):
    for idx in outs[2][i]:
        if outs[2][4][idx] != -1:
            print('We have a problem: ', idx)
    print('Done.')

for i in range(4):
    test_no_overlap_in_mapping(outs, i)

In [None]:
len(outs[2][4])

In [None]:
outs[1][0][-1]

In [None]:
outs[1][1][:100], len(outs[1][1])

In [None]:
np.concatenate([np.array([outs[3]]).T,
                np.array([outs[4]]).T], axis = 1)

In [None]:
len(outs[3]), len(outs[4])

In [None]:
filename = "./Useful_Kernels.cu"

In [None]:
# Read the CUDA source in `filename`, compile it, and prepare every kernel
# entry point. In the prepare() signatures 'P' stands for a pointer
# argument, while np.int32 / np.float64 are scalar arguments.
with open(filename) as fid:
    kernel_code = fid.read()
    
# compile the kernel code
mod = compiler.SourceModule(kernel_code)

# basic elementwise addition
# (I might want to compare the speed 
# with the default addition hook on gpu arrays)
add = mod.get_function('ArrayAddKernel')
add.prepare(['P', 'P', 'P', np.int32])

# basic elementwise subtraction
# (I might want to compare the speed
# with the default subtraction hook on gpu arrays)
sub = mod.get_function('ArrayDifferenceKernel')
sub.prepare(['P', 'P', 'P', np.int32])

# basic elementwise multiplication
# (I might want to compare the speed
# with the default multiplication hook on gpu arrays)
hadamard = mod.get_function('HadamardProductKernel')
hadamard.prepare(['P', 'P', 'P', np.int32])

# All vectors are dense, all matrices are CSR

# sets all the elements of an array equal to zero
# this is needed for the spmTv_csr_kernel since atomic add is used
zerofill = mod.get_function('ZeroFillKernel')
zerofill.prepare(['P', np.int32])

# outer product that maps result to the shape of the sparse weight matrix
# (indices and pointers needed)
kron = mod.get_function('spvv_csr_outer_kernel')
kron.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])

# sparse matrix-vector multiplication using CSR format 
dot = mod.get_function('spmv_csr_kernel')
dot.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])

# sparse matrix-transpose-vector multiplication using CSR format 
Tdot = mod.get_function('spmTv_csr_kernel')
Tdot.prepare([np.int32, 'P', 'P', 'P', 'P', 'P'])

# elementwise application of the sigmoid function
sig = mod.get_function('SigmoidKernel')
sig.prepare(['P', 'P', np.int32])

# elementwise application of the derivative of the sigmoid
dsig = mod.get_function('SigmoidPrimeKernel')
dsig.prepare(['P', 'P', np.int32])

# updating weights and biases
# (two pointers, one float64 scalar and one int32 scalar)
update = mod.get_function('UpdateKernel')
update.prepare(['P', 'P', np.float64, np.int32])

In [None]:
tracker_filename = "/home/mhazoglou/CUDA_Kernels/tracker_kernels.cu"

In [None]:
# Read the tracker-specific CUDA source in `tracker_filename`, compile it,
# and prepare every kernel entry point. In the prepare() signatures 'P'
# stands for a pointer argument and np.int32 for a 32-bit integer scalar.
# NOTE(review): the one-line descriptions below are inferred from the kernel
# names only - confirm against tracker_kernels.cu.
with open(tracker_filename) as fid:
    tracker_kernel_code = fid.read()
    
# compile the kernel code
mod_tracker = compiler.SourceModule(tracker_kernel_code)

# presumably computes the derivative and error terms of the input
der_and_error = mod_tracker.get_function('der_and_error_kernel')
der_and_error.prepare(['P', 'P', 'P', np.int32])

# presumably updates the integral term of the input
integral = mod_tracker.get_function('integral_kernel')
integral.prepare(['P', 'P', 'P', np.int32])

# presumably appends hidden activations to the input (cf. hid_append_map)
append_hid = mod_tracker.get_function('hid_append_kernel')
append_hid.prepare(['P', 'P', 'P', 'P', np.int32, np.int32])

# presumably averages the per-layer heat maps into a single heat map
avg_pool = mod_tracker.get_function('tracker_avg_pool_kernel')
avg_pool.prepare(['P', 'P', np.int32, np.int32])

# presumably scatters a vector into the full input through an index map
input_shuffling = mod_tracker.get_function('full_input_map_kernel')
input_shuffling.prepare(['P', 'P', 'P', np.int32])

# presumably fills the full input from the hidden vector (cf. hid_map)
input_hidden_shuffling = mod_tracker.get_function(\
                                        'hidden_map_to_full_input_kernel')
input_hidden_shuffling.prepare(['P', 'P', 'P', np.int32])

# presumably splits the combined op vector into output / prediction
output_pred_shuffling = mod_tracker.get_function('output_pred_map_kernel')
output_pred_shuffling.prepare(['P', 'P', 'P', np.int32])

# presumably the reverse of output_pred_map_kernel
rev_output_pred_shuffling = mod_tracker.get_function('rev_output_pred_map_kernel')
rev_output_pred_shuffling.prepare(['P', 'P', 'P', np.int32])

# presumably the derivative of the squared error of the tracker heat maps
sq_err_der_tracker = mod_tracker.get_function('SquareErrorDerTrackerKernel')
sq_err_der_tracker.prepare(['P', 'P', 'P', np.int32, np.int32])

In [None]:
i2h_stuff, h2op_stuff, map_stuff, input_new_unit, op_new_unit = \
                    weight_initialize(connect_dict, input_size, hidden_size)
i2h_pointers, i2h_indices, i2h_weights, i2h_bias = i2h_stuff
h2op_pointers, h2op_indices, h2op_weights, h2op_bias = h2op_stuff
input_map, der_map, int_map, err_map, hid_map, hid_append_map, out_map,\
                                                        pred_map = map_stuff

In [None]:
tracker_stuff = tracker_weight_initialize(structure,
                                          output_sizes, heat_map_size)
tracker_pointers, tracker_indices,\
                            tracker_weights, tracker_bias = tracker_stuff

In [None]:
from scipy import sparse as sp

# Host-side CSR copy of the tracker weights, built from (data, indices, indptr)
# while the three arrays are still host arrays. It is used further down as the
# CPU reference for the transposed GPU product.
tracker_w_sparse = sp.csr_matrix((tracker_weights, tracker_indices, tracker_pointers))

# CSR invariant: the last row pointer equals the number of stored weights.
# Enforced with an assertion so a malformed matrix stops the notebook here
# instead of only displaying False.
assert tracker_pointers[-1] == len(tracker_weights), \
    'tracker CSR row pointers do not match the number of stored weights'

In [None]:
# Upload the input->hidden parameters to the device. The host names are
# rebound to the device arrays.
i2h_pointers, i2h_indices, i2h_weights, i2h_bias = tuple(
    gpuarray.to_gpu(host_array)
    for host_array in (i2h_pointers, i2h_indices, i2h_weights, i2h_bias))

# Same for the hidden->(output, prediction) parameters.
h2op_pointers, h2op_indices, h2op_weights, h2op_bias = tuple(
    gpuarray.to_gpu(host_array)
    for host_array in (h2op_pointers, h2op_indices, h2op_weights, h2op_bias))

In [None]:
# Upload the tracker parameters to the device. The host names are rebound to
# the device arrays.
tracker_pointers, tracker_indices, tracker_weights, tracker_bias = tuple(
    gpuarray.to_gpu(host_array)
    for host_array in (tracker_pointers, tracker_indices,
                       tracker_weights, tracker_bias))

In [None]:
def _len_int32(sequence):
    """Return ``len(sequence)`` as ``np.int32``, the type the kernels are prepared with."""
    return np.int32(len(sequence))


# Lengths taken directly from the index maps.
L_hidden_inputs = _len_int32(hid_append_map)  # length of hidden as inputs
L_input = _len_int32(input_map)
# L_pred == len(der_map) == len(int_map) == len(err_map)
L_pred = _len_int32(pred_map)  # same as L_hidden_inputs + L_input
L_full_input = _len_int32(hid_map)  # length of inputs plus context
L_out = _len_int32(out_map)
L_op = np.int32(L_out + L_pred)

# Network dimensions.
N_units = _len_int32(connect_dict)
N_layers = _len_int32(structure)

# Derived buffer lengths.
L_hidden = np.int32(N_units * hidden_size)
L_heatmaps = np.int32(N_layers * heat_map_size)
L_avg_heatmap = np.int32(heat_map_size)

# Number of stored (non-zero) weights in each sparse matrix.
i2h_weights_nnz = _len_int32(i2h_weights)
h2op_weights_nnz = _len_int32(h2op_weights)
tracker_weights_nnz = _len_int32(tracker_weights)

In [None]:
maxthreads = 1024
max_grid_size = 1024


def _grid_blocks(n_elements, threads_per_block=maxthreads,
                 max_blocks=max_grid_size):
    """Return the x-dimension of a 1-D grid covering ``n_elements`` threads.

    The result is ceil(n_elements / threads_per_block), computed with integer
    arithmetic, clamped to the range [1, max_blocks]. The lower bound keeps the
    launch valid for an empty buffer; the upper bound is the cap this notebook
    already applied.
    """
    n_elements = int(n_elements)
    blocks = -(-n_elements // threads_per_block)  # integer ceil division
    return max(1, min(blocks, max_blocks))


# One grid size per buffer length that a kernel iterates over.
grid_x_i2h = _grid_blocks(L_hidden)
grid_x_input = _grid_blocks(L_input)
grid_x_int_der_err = _grid_blocks(L_pred)
grid_x_h2op = _grid_blocks(L_op)
grid_x_o_shuf = _grid_blocks(L_out)
grid_x_tracker = _grid_blocks(L_heatmaps)
grid_x_avgpool = _grid_blocks(L_avg_heatmap)

# Grids for the weight updates, which iterate over the stored weights.
grid_x_update_i2h = _grid_blocks(i2h_weights_nnz)
grid_x_update_h2op = _grid_blocks(h2op_weights_nnz)
grid_x_update_tracker = _grid_blocks(tracker_weights_nnz)

In [None]:
# Show the keys of train_data_reformat; the next cell indexes it with one of them.
train_data_reformat.keys()

In [None]:
# Work with one clip: element 0 holds the frames, element 1 the ground-truth
# heat maps.
selected_clip = train_data_reformat['green_ball_01_small']
clip_frames = selected_clip[0]
clip_heat_maps = selected_clip[1]

# First frame, the frame after it, and the heat map belonging to the first frame.
single_frame = clip_frames[0, :]
next_frame = clip_frames[1, :]
ground_truth_heatmap = clip_heat_maps[0]

In [None]:
# A dict cannot be multiplied by an int (TypeError), which stopped a top-to-bottom
# run at this cell. Build ten independent dicts instead, each with its own list.
# NOTE(review): the intended result of the original expression is assumed — confirm
# or delete this scratch cell.
dict_list = [{'a': []} for _ in range(10)]
dict_list

In [None]:
# Copy input_ back to the host for inspection.
# NOTE(review): input_ is first assigned in the "data to transfer" cell further
# down, so running the notebook top to bottom on a fresh kernel fails here with
# a NameError.
input_.get()

In [None]:
# Make the next frame the current input (both names then refer to the same
# device array).
# NOTE(review): next_input is first assigned in the "data to transfer" cell
# below; run in order on a fresh kernel this raises NameError, and the transfer
# cell afterwards overwrites input_ again.
input_ = next_input

In [None]:
# data to transfer
input_     = gpuarray.to_gpu(single_frame)
next_input = gpuarray.to_gpu(next_frame)
gt_heatmap = gpuarray.to_gpu(ground_truth_heat_map)

In [None]:
# allocating the arrays on the gpu
hidden = gpuarray.to_gpu(np.zeros(L_hidden))
hid_affine = gpuarray.empty_like(hidden)

hidden_inputs = gpuarray.to_gpu(np.zeros(L_hidden_inputs))
in_and_hid = gpuarray.to_gpu(np.zeros(L_pred))

der  = gpuarray.to_gpu(np.zeros(L_pred))
err  = gpuarray.to_gpu(np.zeros(L_pred))
int_ = gpuarray.to_gpu(np.zeros(L_pred))

out_and_pred = gpuarray.to_gpu(np.zeros(L_op))
out_and_pred_affine = gpuarray.empty_like(out_and_pred)

pred = gpuarray.to_gpu(np.zeros(L_pred))
output = gpuarray.to_gpu(np.zeros(L_out))

prev_input = gpuarray.to_gpu(np.zeros(L_pred))
full_input = gpuarray.to_gpu(np.zeros(L_full_input))

heatmaps = gpuarray.to_gpu(np.zeros(L_heatmaps))
heatmaps_affine = gpuarray.empty_like(heatmaps)
avg_heatmap = gpuarray.to_gpu(np.zeros(L_avg_heatmap))

In [None]:
# variables for back-propagation
ideal_pred    = gpuarray.to_gpu(np.zeros(L_pred))

delta_tracker = gpuarray.to_gpu(np.zeros(L_heatmaps))
delta_output  = gpuarray.to_gpu(np.zeros(L_out))
delta_pred    = gpuarray.to_gpu(np.zeros(L_pred))
delta_h2op    = gpuarray.to_gpu(np.zeros(L_op))
delta_i2h     = gpuarray.to_gpu(np.zeros(L_hidden))

a_prime_tracker = gpuarray.empty_like(heatmaps)
a_prime_h2op    = gpuarray.empty_like(out_and_pred)
a_prime_i2h     = gpuarray.empty_like(hidden)

grad_weight_tracker = gpuarray.empty_like(tracker_weights)
grad_weight_h2op    = gpuarray.empty_like(h2op_weights)
grad_weight_i2h     = gpuarray.empty_like(i2h_weights)

grad_bias_tracker = gpuarray.empty_like(tracker_bias)
grad_bias_h2op    = gpuarray.empty_like(h2op_bias)
grad_bias_i2h     = gpuarray.empty_like(i2h_bias)

In [None]:
# put all maps on gpu for memory shuffling between arrays
input_map_gpu = gpuarray.to_gpu(np.array(input_map, dtype=np.int32))
der_map_gpu   = gpuarray.to_gpu(np.array(der_map, dtype=np.int32))
int_map_gpu   = gpuarray.to_gpu(np.array(int_map, dtype=np.int32))
err_map_gpu   = gpuarray.to_gpu(np.array(err_map, dtype=np.int32))
hid_map       = gpuarray.to_gpu(np.array(hid_map, dtype=np.int32))
hid_append_map = gpuarray.to_gpu(np.array(hid_append_map, dtype=np.int32))
out_map_gpu   = gpuarray.to_gpu(np.array(out_map, dtype=np.int32))
pred_map_gpu  = gpuarray.to_gpu(np.array(pred_map, dtype=np.int32))

In [None]:
# Five independent CUDA streams for the asynchronous kernel launches below.
stream1, stream2, stream3, stream4, stream5 = tuple(
    driver.Stream() for _ in range(5))

In [None]:
# Launch hid_append_kernel without an explicit stream. Arguments, in order:
# the current input, the hidden state, the destination in_and_hid, the
# hid_append_map index map, and the lengths L_input and L_pred.
# NOTE(review): presumably writes the input followed by the mapped hidden
# values into in_and_hid — confirm against the kernel source.
append_hid.prepared_call((grid_x_int_der_err, 1), (maxthreads, 1, 1),
                         input_.gpudata, hidden.gpudata, in_and_hid.gpudata,
                         hid_append_map.gpudata, L_input, L_pred)

In [None]:
# Copy in_and_hid back to the host for inspection.
in_and_hid.get() 

In [None]:
%%time
# Three independent launches, each on its own stream and each over L_pred elements.
# stream1: der_and_error_kernel on (in_and_hid, prev_input), result in der.
der_and_error.prepared_async_call((grid_x_int_der_err, 1), (maxthreads, 1, 1),
                                  stream1,
                                  in_and_hid.gpudata, prev_input.gpudata,
                                  der.gpudata, L_pred)
# stream2: the same kernel on (pred, in_and_hid), result in err.
der_and_error.prepared_async_call((grid_x_int_der_err, 1), (maxthreads, 1, 1),
                                  stream2,
                                  pred.gpudata, in_and_hid.gpudata,
                                  err.gpudata, L_pred)
# stream3: integral_kernel on (in_and_hid, int_); int_ is both an input and
# the output buffer, so it is updated in place.
integral.prepared_async_call((grid_x_int_der_err, 1), (maxthreads, 1, 1),
                             stream3,
                             in_and_hid.gpudata, int_.gpudata,
                             int_.gpudata, L_pred)

In [None]:
# Copy err back to the host for inspection.
err.get()

In [None]:
# Host-side expression on the same two operands that were passed to the err launch.
# NOTE(review): presumably the reference formula for der_and_error_kernel,
# (a - b + 1) / 2 — confirm against the kernel source.
(pred.get() - in_and_hid.get() + 1)*0.5

In [None]:
# Copy int_ back to the host for inspection.
int_.get()

In [None]:
%%time

input_shuffling.prepared_async_call((grid_x_int_der_err, 1),
                                    (maxthreads, 1, 1), stream4,
                                    full_input.gpudata, input_.gpudata,
                                    input_map_gpu.gpudata, L_input)

input_hidden_shuffling.prepared_async_call((grid_x_i2h, 1),
                                           (maxthreads, 1, 1), stream5,
                                           full_input.gpudata, hidden.gpudata,
                                           hid_map.gpudata, L_full_input)

input_shuffling.prepared_async_call((grid_x_int_der_err, 1),
                                    (maxthreads, 1, 1), stream1,
                                    full_input.gpudata, der.gpudata,
                                    der_map_gpu.gpudata, L_pred)

input_shuffling.prepared_async_call((grid_x_int_der_err, 1),
                                    (maxthreads, 1, 1), stream2,
                                    full_input.gpudata, err.gpudata,
                                    err_map_gpu.gpudata, L_pred)

input_shuffling.prepared_async_call((grid_x_int_der_err, 1),
                                    (maxthreads, 1, 1), stream3,
                                    full_input.gpudata, int_.gpudata,
                                    int_map_gpu.gpudata, L_pred)

prev_input = in_and_hid[:]

In [None]:
# Compare the Python object identities of the two arrays.
# NOTE(review): different ids do not rule out shared device memory — a sliced
# GPUArray is a distinct Python object; comparing the device pointers would be
# the conclusive check.
id(prev_input), id(in_and_hid)

In [None]:
# Display both arrays side by side.
prev_input, in_and_hid

In [None]:
# Display a 1000-element window near the end of full_input.
full_input[-4000:-3000]

In [None]:
%%time
# Forward pass through the two sparse layers. Every launch here is issued
# without an explicit stream, in program order.

# Layer 1, step 1: hid_affine = W_i2h (CSR) * full_input.
dot.prepared_call((grid_x_i2h, 1), (maxthreads, 1, 1),
    # number of rows
    L_hidden,
    # CSR sparse matrix
    i2h_pointers.gpudata, i2h_indices.gpudata, i2h_weights.gpudata,
    # vector
    full_input.gpudata,
    # result
    hid_affine.gpudata)

# Layer 1, step 2: add the bias in place (hid_affine is input and output).
add.prepared_call((grid_x_i2h, 1), (maxthreads, 1, 1), 
    hid_affine.gpudata, i2h_bias.gpudata,
    hid_affine.gpudata,
    L_hidden)

# Layer 1, step 3: apply sig to hid_affine and store the result in hidden.
sig.prepared_call((grid_x_i2h, 1), (maxthreads, 1, 1),
    hid_affine.gpudata, hidden.gpudata,
    L_hidden)

# Layer 2, step 1: out_and_pred_affine = W_h2op (CSR) * hidden.
dot.prepared_call((grid_x_h2op, 1), (maxthreads, 1, 1),
                  # number of rows
                  L_op,
                  # CSR sparse matrix
                  h2op_pointers.gpudata, h2op_indices.gpudata,
                  h2op_weights.gpudata,
                  #vector
                  hidden.gpudata,
                  # results
                  out_and_pred_affine.gpudata)

# Layer 2, step 2: add the bias in place.
add.prepared_call((grid_x_h2op, 1), (maxthreads, 1, 1),
                  out_and_pred_affine.gpudata, h2op_bias.gpudata,
                  out_and_pred_affine.gpudata,
                  L_op)

# Layer 2, step 3: apply sig and store the result in out_and_pred.
sig.prepared_call((grid_x_h2op, 1), (maxthreads, 1, 1),
                  out_and_pred_affine.gpudata, out_and_pred.gpudata,
                  L_op)

In [None]:
# Copy full_input back to the host for inspection.
full_input.get()

In [None]:
# Display the hidden-layer pre-activations.
hid_affine

In [None]:
# Display the hidden-layer activations.
hidden

In [None]:
#%%time
# Split out_and_pred into its two parts with the index maps, on two streams:
# stream4 fills output (L_out entries) using out_map_gpu ...
output_pred_shuffling.prepared_async_call((grid_x_o_shuf, 1),
                                          (maxthreads, 1, 1), stream4,
                                          output.gpudata,
                                          out_and_pred.gpudata,
                                          out_map_gpu.gpudata, L_out)

# ... and stream5 fills pred (L_pred entries) using pred_map_gpu.
output_pred_shuffling.prepared_async_call((grid_x_int_der_err, 1),
                                          (maxthreads, 1, 1), stream5,
                                          pred.gpudata,
                                          out_and_pred.gpudata,
                                          pred_map_gpu.gpudata, L_pred)

In [None]:
# Display the host-side output index map.
out_map

In [None]:
# Display the combined output/prediction activations.
out_and_pred

In [None]:
%%time
# Tracker layer forward pass, issued without an explicit stream.
# Step 1: heatmaps_affine = W_tracker (CSR) * output.
dot.prepared_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                  # number of rows
                  L_heatmaps,
                  # CSR sparse matrix
                  tracker_pointers.gpudata, tracker_indices.gpudata,
                  tracker_weights.gpudata,
                  #vector
                  output.gpudata,
                  # results
                  heatmaps_affine.gpudata)

# Step 2: add the bias in place.
add.prepared_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                  heatmaps_affine.gpudata, tracker_bias.gpudata,
                  heatmaps_affine.gpudata,
                  L_heatmaps)

# Step 3: apply sig and store the result in heatmaps.
sig.prepared_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                  heatmaps_affine.gpudata, heatmaps.gpudata, L_heatmaps)

# Step 4: tracker_avg_pool_kernel reads heatmaps and writes avg_heatmap; it is
# given the averaged-map length and the number of layers.
avg_pool.prepared_call((grid_x_avgpool, 1), (maxthreads, 1, 1),
                       heatmaps.gpudata, avg_heatmap.gpudata,
                       L_avg_heatmap, N_layers)

In [None]:
# Buffers for checking the transposed product on the GPU: an all-ones vector
# with L_heatmaps entries and a zeroed result vector with L_out entries.
test = gpuarray.to_gpu(np.ones(L_heatmaps))
out_test = gpuarray.to_gpu(np.zeros(L_out))

In [None]:
# CPU reference: transposed tracker matrix times an all-ones vector.
ones_heatmaps = np.ones(L_heatmaps)
cpu_result = tracker_w_sparse.T.dot(ones_heatmaps)

In [None]:
# Display the CPU reference result.
cpu_result

In [None]:
# Clear the result buffer inside this cell so that re-running it gives the same
# answer. The back-propagation cell zero-fills the destination before every
# Tdot launch, which suggests Tdot adds into its output.
# NOTE(review): accumulate-vs-overwrite is assumed from that usage — confirm
# against the kernel source.
out_test.fill(0)

# GPU transposed product on the all-ones test vector; compared with
# cpu_result in the next cells.
Tdot.prepared_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                   # number of rows
                   L_heatmaps,
                   # CSR sparse matrix
                   tracker_pointers.gpudata, tracker_indices.gpudata,
                   tracker_weights.gpudata,
                   # vector
                   test.gpudata,
                   # results
                   out_test.gpudata)

In [None]:
# Euclidean distance between the CPU reference and the GPU result; a value
# near zero means the two transposed products agree. np.linalg.norm replaces
# the element-by-element builtin sum over the array.
np.linalg.norm(cpu_result - out_test.get())

In [None]:
# Largest stored CSR index next to the number of tracker rows.
# NOTE(review): in the (data, indices, indptr) CSR layout built earlier, the
# indices are column indices, so they address the vector multiplied in the
# forward pass (output, L_out entries) rather than the L_heatmaps rows —
# L_out looks like the relevant bound; confirm.
max(tracker_indices.get()), L_heatmaps

In [None]:
# Apply dsig to the pre-activations of each layer, one stream per layer,
# storing the results in the a_prime_* buffers.
# stream1: hidden layer.
dsig.prepared_async_call((grid_x_i2h, 1), (maxthreads, 1, 1),
                         stream1,
                         hid_affine.gpudata, a_prime_i2h.gpudata, L_hidden)

# stream2: tracker layer.
dsig.prepared_async_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                         stream2,
                         heatmaps_affine.gpudata,
                         a_prime_tracker.gpudata, L_heatmaps)

# stream3: output/prediction layer.
dsig.prepared_async_call((grid_x_h2op, 1), (maxthreads, 1, 1),
                         stream3,
                         out_and_pred_affine.gpudata, a_prime_h2op.gpudata, L_op)

In [None]:
# Host-side logistic sigmoid of the hidden pre-activations.
1/(1 + np.exp(-hid_affine.get()))

In [None]:
# Host-side sigmoid derivative s * (1 - s) of the hidden pre-activations, for
# comparison with a_prime_i2h in the next cell.
s = 1/(1 + np.exp(-hid_affine.get()))
s * (1 - s)

In [None]:
# Copy the GPU result of dsig for the hidden layer back to the host.
a_prime_i2h.get()

In [None]:
%%time
# Back-propagation of the error terms (deltas), from the two outputs down to
# the hidden layer.

# Build the prediction target from the next frame and the current hidden
# state, using the same kernel and map that built in_and_hid.
append_hid.prepared_call((grid_x_int_der_err, 1), (maxthreads, 1, 1),
                         next_input.gpudata, hidden.gpudata, ideal_pred.gpudata,
                         hid_append_map.gpudata, L_input, L_pred)

# stream1: sub kernel on (pred, ideal_pred), result in delta_pred.
sub.prepared_async_call((grid_x_int_der_err, 1), (maxthreads, 1, 1),
                        stream1,
                        pred.gpudata, ideal_pred.gpudata,
                        delta_pred.gpudata, L_pred)

# stream2: SquareErrorDerTrackerKernel on (avg_heatmap, gt_heatmap), result in
# delta_tracker; it is given the averaged-map length and the number of layers.
sq_err_der_tracker.prepared_async_call((grid_x_avgpool, 1), (maxthreads, 1, 1),
                                       stream2,
                                       avg_heatmap.gpudata, gt_heatmap.gpudata,
                                       delta_tracker.gpudata, L_avg_heatmap, N_layers)



# stream2: hadamard of delta_tracker with a_prime_tracker, written back into
# delta_tracker.
hadamard.prepared_async_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                             stream2,
                             delta_tracker.gpudata, a_prime_tracker.gpudata,
                             delta_tracker.gpudata, L_heatmaps)

# stream2: zero delta_output before the transposed product writes to it.
zerofill.prepared_async_call((grid_x_o_shuf, 1), (maxthreads, 1, 1),
                             stream2,
                             delta_output.gpudata, L_out)

# NOTE: the CSR row pointers go up to 446464, which exceeds 65535, the maximum
# of a 16-bit unsigned short. The kernel's pointer/index type must therefore be
# at least a 32-bit unsigned int.
# TODO(review): confirm the integer type used inside the Tdot kernel.
Tdot.prepared_async_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                         stream2,
                         L_heatmaps,
                         tracker_pointers.gpudata, tracker_indices.gpudata,
                         tracker_weights.gpudata,
                         #vector
                         delta_tracker.gpudata,
                         # results
                         delta_output.gpudata)

# Merge the two deltas into delta_h2op with the reverse index maps:
# delta_pred on stream1 ...
rev_output_pred_shuffling.prepared_async_call((grid_x_int_der_err, 1), (maxthreads, 1, 1),
                                              stream1,
                                              delta_pred.gpudata, delta_h2op.gpudata,
                                              pred_map_gpu.gpudata, L_pred)

# ... and delta_output on stream2, after the Tdot queued on the same stream.
rev_output_pred_shuffling.prepared_async_call((grid_x_o_shuf, 1), (maxthreads, 1, 1),
                                              stream2,
                                              delta_output.gpudata, delta_h2op.gpudata,
                                              out_map_gpu.gpudata, L_out)



# The remaining launches are issued without an explicit stream.
# hadamard of delta_h2op with a_prime_h2op, written back into delta_h2op.
hadamard.prepared_call((grid_x_h2op, 1), (maxthreads, 1, 1),
                       delta_h2op.gpudata, a_prime_h2op.gpudata,
                       delta_h2op.gpudata, L_op)

# Zero delta_i2h before the transposed product writes to it.
zerofill.prepared_call((grid_x_i2h, 1), (maxthreads, 1, 1),
                       delta_i2h.gpudata, L_hidden)

# Transposed product of the h2op matrix with delta_h2op, result in delta_i2h.
Tdot.prepared_call((grid_x_h2op, 1), (maxthreads, 1, 1),
                   L_op,
                   h2op_pointers.gpudata, h2op_indices.gpudata,
                   h2op_weights.gpudata,
                   delta_h2op.gpudata,
                   delta_i2h.gpudata)

# hadamard of delta_i2h with a_prime_i2h, written back into delta_i2h.
hadamard.prepared_call((grid_x_i2h, 1), (maxthreads, 1, 1),
                       delta_i2h.gpudata, a_prime_i2h.gpudata,
                       delta_i2h.gpudata, L_hidden)

In [None]:
# Learning rate passed to the update kernel, as a 64-bit float.
lr = np.float64(0.001)

In [None]:
%%time
kron.prepared_async_call((grid_x_i2h, 1), (maxthreads, 1, 1),
                         stream1,
                         L_hidden, i2h_pointers.gpudata, i2h_indices.gpudata,
                         delta_i2h.gpudata, hidden.gpudata,
                         grad_weight_i2h.gpudata)

kron.prepared_async_call((grid_x_h2op, 1), (maxthreads, 1, 1),
                         stream2,
                         L_op, h2op_pointers.gpudata, h2op_indices.gpudata,
                         delta_h2op.gpudata, out_and_pred.gpudata,
                         grad_weight_h2op.gpudata)

kron.prepared_async_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                         stream3,
                         L_heatmaps, tracker_pointers.gpudata, tracker_indices.gpudata,
                         delta_tracker.gpudata, heatmaps.gpudata,
                         grad_weight_tracker.gpudata)

update.prepared_async_call((grid_x_i2h, 1), (maxthreads, 1, 1),
                           stream1,
                           i2h_bias.gpudata, delta_i2h.gpudata,
                           lr, L_hidden)

update.prepared_async_call((grid_x_h2op, 1), (maxthreads, 1, 1),
                           stream2,
                           h2op_bias.gpudata, delta_h2op.gpudata,
                           lr, L_op)

update.prepared_async_call((grid_x_tracker, 1), (maxthreads, 1, 1),
                           stream3,
                           tracker_bias.gpudata, delta_tracker.gpudata, 
                           lr, L_heatmaps)

update.prepared_async_call((grid_x_update_i2h, 1), (maxthreads, 1, 1),
                           stream1,
                           i2h_weights.gpudata, grad_weight_i2h.gpudata,
                           lr, i2h_weights_nnz)

update.prepared_async_call((grid_x_update_h2op, 1), (maxthreads, 1, 1),
                           stream2,
                           h2op_weights.gpudata, grad_weight_h2op.gpudata,
                           lr, h2op_weights_nnz)

update.prepared_async_call((grid_x_update_tracker, 1), (maxthreads, 1, 1),
                           stream3,
                           tracker_weights.gpudata, grad_weight_tracker.gpudata, 
                           lr, tracker_weights_nnz)

In [None]:
# Display the input->hidden weight gradient.
grad_weight_i2h

In [None]:
# Display the hidden->(output, prediction) weight gradient.
grad_weight_h2op

In [None]:
# Display the tracker weight gradient.
grad_weight_tracker

In [None]:
# Print an 80-character separator line.
print('-'*80)

In [None]:
# Display the three weight arrays after the update.
tracker_weights, i2h_weights, h2op_weights

In [None]:
# Every 256th entry of delta_tracker, starting at index 1.
# NOTE(review): 256 matches heat_map_size, so this presumably samples the same
# heat-map position in each layer — confirm the layout.
delta_tracker.get()[1::256]

In [None]:
# Copy delta_output back to the host for inspection.
delta_output.get()

In [None]:
# Display the delta of the output/prediction layer.
delta_h2op

In [None]:
# Display the delta of the hidden layer.
delta_i2h

In [None]:
# Last stored CSR index of the tracker matrix.
tracker_indices.get()[-1]

In [None]:
# Length of the output vector.
L_out

In [None]:
# Total length of the stacked heat maps (N_layers * heat_map_size).
L_heatmaps

In [None]:
# Display the CSR row pointers of the tracker matrix.
tracker_pointers

In [None]:
# Display the CSR indices of the tracker matrix.
tracker_indices

In [None]:
# Total length of the stacked heat maps, shown again next to the index arrays above.
L_heatmaps