# Table of Contents
 <p>

READ (described in Sec. 3.2),
WRITE (described in Sec. 3.2).

ZERO(a, b) = 0, 
ONE(a, b) = 1, 
TWO(a, b) = 2, 

INC(a, b) = (a+1) mod M, 
ADD(a, b) = (a+b) mod M, 
SUB(a, b) = (a−b) mod M, 
DEC(a, b) = (a−1) mod M, 
LESS-THAN(a, b) = [a < b], 
LESS-OR-EQUAL-THAN(a, b) = [a ≤ b], 
EQUALITY-TEST(a, b) = [a = b], 
MIN(a, b) = mod M, 
SUB(a, b) = (a−b) mod M, 
DEC(a, b) = (a−1) mod M, 
LESS-THAN(a, b) = [a < b], 
LESS-OR-EQUAL-THAN(a, b) = [a ≤ b], 
EQUALITY-TEST(a, b) = [a = b], 
MIN(a, b) = min(a, b), 
MAX(a, b) = max(a, b), 
LESS-OR-EQUAL-THAN(a, b) = [a ≤ b], 
EQUALITY-TEST(a, b) = [a = b], 
MIN(a, b) = min(a, b), 
MAX(a, b) = max(a, b), 


In [1]:
import numpy as np
import theano

from theano import tensor
from collections import namedtuple
from theano.tensor.extra_ops import to_one_hot
from theano.tensor import roll
from theano.tensor import stack
from theano.tensor import batched_dot
from theano.tensor import concatenate
from theano.tensor import as_tensor
from theano.tensor import set_subtensor
from theano import shared
from numpy.random import uniform
from theano.tensor.nnet import softmax, relu, sigmoid
from theano import function, config, shared, sandbox, Out

def zero() :
    return to_one_hot(np.asarray([0]),M)
def one() :
    return to_one_hot(np.asarray([1]),M)
def two() :
    return to_one_hot(np.asarray([2]),M)
def get_const(value) :
    return to_one_hot(np.asarray([value % M]),M)
def create_memory_tape(init_val=0):
    m = stack([get_const(init_val) for i in range(M)], axis=1)
    #return shared(np.asarray(m.eval()), config.floatX) 
    return m
def inc(a) :
    return roll(a, 1, axis=1)
def negate(a) :
    return roll(a[:, ::-1], 1, axis=1)
def add(a,b) :
    rows = [roll(b[:,], j, axis=1) for j in range(M)]
    return (batched_dot(a, stack(rows, axis=1)))
def sub(a,b) :
    b_negative = negate(b)
    return add(a, b_negative)
def eq_zero(a) :
    r = tensor.zeros_like(as_tensor(a))
    r = set_subtensor(r[:,1], a[:, 0])
    r = set_subtensor(r[:, 0], 1 - a[:, 0])
    return r
def lt(a,b):
    return tensor.lt(as_tensor(a),as_tensor(b))
def lte(a,b):
    return tensor.le(as_tensor(a),as_tensor(b))
def eq(a,b):
    return tensor.eq(as_tensor(a),as_tensor(b))
def min(a,b):
    if(lte(a,b)):
        return as_tensor(a)
    else:
        return as_tensor(b)
def max(a,b):
    if(lt(a,b)):
        return as_tensor(b)
    else:
        return as_tensor(a)    
def read(mem, a) :
    ptr = as_tensor(a)
    return weighted_avg(mem, ptr), mem
def write(mem, a, b) :
    ptr = as_tensor(a)
    val = as_tensor(b)
    J = (tensor.ones_like(zero())).T
    erase_vector = (J-ptr.T).dot(J.T)
    x = tensor.mul(erase_vector, mem)
    y = (ptr.T).dot(val)
    mem = (x + y)
    #print("write gate: memory updated", mem.type)
    return val, mem
def write_external(a, b, ext_mem) :
    ptr = as_tensor(a)
    val = as_tensor(b)
    J = (tensor.ones_like(zero())).T
    erase_vector = (J-ptr.T).dot(J.T)
    x = tensor.mul(erase_vector, ext_mem)
    y = (ptr.T).dot(val)
    ext_mem = (x + y)
    return ext_mem

def get_registers(init_val):
    return stack([get_const(init_val) for i in range(R)], axis=1)

def weighted_avg(inputs, coefficient) :
    return batched_dot(inputs.transpose(0, 2, 1), coefficient.dimshuffle(0, 1, 'x')).flatten(2);

def compute_gate_new(module, inputs, coefficients, memory_tape, debug=False) :
    """ Arity of this gate must be equal to the number of given
        coefficients list
    """ 
    if (len(coefficients) != module.arity) :
        print("Error: Incorrect number of coefficients: ",  len(coefficients), " to module arity: ", module.arity)
    
    params = [weighted_avg(inputs, as_tensor(coef)) for coef in coefficients]
    
    if (debug == True):
        for i, p in enumerate(params):
            print("compute_gate_new: weighted param [", i ,"] = ", p.eval())
    
    if (module.memory_function == True) :
        #print("read/write now!")
        output, memory_tape = module.function(memory_tape, *params)
    else :   
        output = module.function(*params)
    
    if (debug == True):
        print("output from module: ", output.eval())
    
    #error check for constant gates
    return output, memory_tape


def get_n_tensor(t, count, idx):
    result=[]
    if count > 0: 
        result = [t[idx+i] for i in range(count)]
    return result, (idx+count)

def fuzzy_circuit(registers, gates, memory_tape, gate_coef, reg_coef, debug=False):
    # Initially, only the registers may be used as inputs.
    Q= len(gates)
    gate_inputs = registers
    idx=0    
    
    # Run through all the gates.
    for i in range(Q):
        c, idx = get_n_tensor(gate_coef, gates[i].arity, idx) 
        output, memory_tape = compute_gate_new(gates[i], gate_inputs, c, memory_tape)
        # Append the output of the gate as an input for future gates.
        gate_inputs = concatenate([gate_inputs, output.dimshuffle(0,'x',1)], axis=1)
        if (debug==True):
            print("gate i = ", i, "R :", R, "arity:", gates[i].arity, "my gate output: ", output.eval(), "my gate inputs: ", gate_inputs.eval())    
            print("Concatenated inputs: ", gate_inputs.eval())       
        
    # All leftover coefficients are for registers.
    new_registers = []
             
    for i in range(len(reg_coef)):
        # (R+Q) x M dot 1 X (R+Q) 
        new_registers.append(weighted_avg(gate_inputs, reg_coef[i]))
        if (debug == True) :
            print("new register: weighted_avg argmax of gate inputs and reg coef: ", weighted_avg(gate_inputs, reg_coef[i]).eval().argmax(), "reg_coef argmax", reg_coef[i].eval().argmax())
    return tensor.stack(new_registers, axis=1), memory_tape


# coefficients = [r1,r2..rR,g1_param1,g1_param2,...,gQ_param1,gQ_param2,c1,c1..CR,cR+1,..cR+Q]
def gen_random_weights(layer1, layer2, dtype=np.float64, _min=-1, _max=1):
    weights = uniform(low=_min, high=_max, size=(layer1, layer2))
    var = shared(weights.astype(dtype), name="w{0}x{1}".format(layer1, layer2))   
    #var = tensor.addbroadcast(var, 0)
    return var

def gen_network_weights(gates, layers):
    n_registers = R #input is R registers
    w = []
    current_layer_units = n_registers
    
    for next_layer_units in layers:
        w.append(gen_random_weights(current_layer_units + 1, next_layer_units))
        current_layer_units = next_layer_units
    
    #output wt for gate coefficients
    gate_coef = []
    for i, gate in enumerate(gates):
        print(i, gate)
        #each param to gates are R+i depending on arity 
        gate_output_units = n_registers + i
        for _ in range(gate.arity):
            gate_coef.append(gen_random_weights(current_layer_units + 1, gate_output_units))            
    
    #print("gen_network_weights => gate_coef", gate_coef)         
    
    #output wt for new registers
    reg_coef = []
    reg_output_units = n_registers + len(gates)
    for _ in range(n_registers):
        reg_coef.append(gen_random_weights(current_layer_units + 1, n_registers + len(gates)))    
    
    #print("gen_network_weights => reg_coef", reg_coef)      
     
    prob_completion_coef = (gen_random_weights(current_layer_units + 1, 1))

    for i in gate_coef:
        w.append(i)

    for x in reg_coef:
        w.append(x)
    
    w.append(prob_completion_coef)   
    return w

def aug_ones_col(inputs):
    print("OMFG")
    s = inputs.shape
    col_elems = s[1]
    ones = tensor.ones_like(to_one_hot(np.asarray([0]), col_elems))
    x = concatenate([inputs.T,ones], axis=1)
    return x.T

def aug_ones_col_new(inputs):
    x=[]
    s = inputs.shape
    #print(s.eval())
    d = inputs.ndim
    row_elems = s[d-2]
    #print("aug one col, dimension of inputs = ", d)
    matrix = tensor.reshape(inputs, (s[d-2],s[d-1]))
    #print(matrix.eval())
    matrix = tensor.shape_padleft(matrix)
    #print(matrix.eval())
    ones = tensor.ones_like(to_one_hot(np.asarray([0]), row_elems))
    #print(ones.eval())
    #print("shuffled ones: ", ones.dimshuffle(0,'x',1).eval())
    #print("shuffled matrix: ",matrix.dimshuffle(0,2,1).eval())
    x = concatenate([matrix.dimshuffle(0,2,1),ones.dimshuffle(0,'x',1)], axis=1)
    return x.dimshuffle(0,2,1)

def controller_forward_prop(n_registers, layers, weight_matrix, gates, registers) :
    inputs = registers[:,:,0]
    inputs = aug_ones_col_new(inputs)
    
    for i in range(len(layers)):
        W = weight_matrix[i]
        inputs = aug_ones_col_new(relu(inputs.dot(W)))  
    
    #extract gate coefficients
    gate_coef = []
    n_gate_coef = 0
    for i, gate in enumerate(gates): 
        #each param to gates are R+i depending on arity 
        gate_output_units = n_registers + i
        for _ in range(gate.arity):
            n_gate_coef += 1 
            
    for W in weight_matrix[len(layers):len(layers) + n_gate_coef]:
        gate_coef.append(softmax(inputs.dot(W)[0]))            

    reg_coef = []
    
    for W in (weight_matrix[len(layers)+n_gate_coef:len(layers)+n_gate_coef+n_registers]):
        reg_coef.append(softmax(inputs.dot(W)[0])) 
        
    p = sigmoid(inputs.dot(weight_matrix[-1]))
    
    return p, reg_coef, gate_coef

def calculate_cost_at_t(prob_complete_t, t, cum_cost, cum_prob_t, p_incomplete, memory_in, desired_output, output_len, debug) :
    e = 1e-100
    cost_t = 0
    #TODO: Find use for desired registers in calculating cost. Now, only desired memory layout is matched.
    
    for i in range(output_len):
        #Compute the loss for this register using the mask.
        mask = to_one_hot(desired_output.argmax(axis=2)[:,i], M)        
        a = tensor.log(memory_in[:, i, :] + e)
        x = mask * a
        loss = (x).sum(axis=1)
        cost_t += tensor.shape_padright(loss, 1)
    if (debug == True):
        print("Desired value at mem location [", i, "] is ", mask.eval().argmax(), "but actual value is [", memory_in[:,i,:].eval().argmax(), "loss is ", loss.eval(), " cost = ", cost_t.eval()) 

    if (t >= MAX_TIMESTEP):
        prob_complete = 1 - cum_prob_t
    else:
        prob_complete = prob_complete_t * p_incomplete
    
    p_incomplete *= (1 - prob_complete_t)   
    cum_prob_t += prob_complete
        
    cum_cost = -(cost_t*prob_complete_t)
    #print ("cost at t", cost_t)
    #print ("p of completion at t", prob_complete)
    
    return cum_cost, cum_prob_t, p_incomplete

def machine_compute_step_t(debug, R, layers, w, gates, t, desired_output, output_len, registers, memory_tape, cost_t, cum_prob, prob_incomplete) : 
    
    p, reg_coef, gate_coef = controller_forward_prop(R, layers, w, gates, registers)
    
    new_registers, new_memory_tape = fuzzy_circuit(registers, gates, memory_tape, gate_coef, reg_coef, debug)
    
    cost_t, cum_prob_t, prob_incomplete = calculate_cost_at_t(p, t, cost_t, cum_prob, prob_incomplete, new_memory_tape, desired_output, output_len, debug)
    
    return new_registers, new_memory_tape, cost_t, cum_prob_t, prob_incomplete

def compute_all_timesteps(gates, layers, registers, memory_tape, w, reg_lambda, output_len):
    #w = make_broadcastable(w)
    # Create symbolic variables for the input to the machine
    # and for the desired output of the machine.
      
    #initial_registers = registers
    #desired_output = memory_tape

    initial_registers = tensor.dtensor3("Registers")
    initial_memory = tensor.dtensor3("Memory_Tape")
    desired_output = tensor.dtensor3("Y")
     
    
    # Run the model for all timesteps. The arguments are 
    # registers, cost, cumulative probability complete, 
    # and probability incomplete. The latter are initialized
    # to zero and to one, respectively.
    
    v0 = as_tensor(0)
    v1 = as_tensor(1)
    output = [initial_registers, initial_memory, v0, v0, v1]
    intermediate_registers = []
    for timestep in range(MAX_TIMESTEP):
        print("compute_all_timesteps t = ", timestep)
        output = machine_compute_step_t(False, R, layers, w, gates, timestep+1, desired_output, output_len, *(output))
        intermediate_registers.append(output[0])
        
        #registers=new_registers

    # Add in regularization, to avoid overfitting simple examples.
    reg_cost = reg_lambda * sum((p * p).sum() for p in list(w))
    
    # Get the final cost: regularization plus loss.
    final_cost = reg_cost + output[2].sum()
    
    # Return the symbolic variables, the final cost, and the
    # intermediate register values for analysis and prediction.
    return initial_registers, initial_memory, desired_output, final_cost, intermediate_registers




In [2]:
#Maximum Representable Integer M is set below
M = 16
# Number of registers
R = 2
#Max number of timesteps
MAX_TIMESTEP = 5

Module = namedtuple("Module", "arity function memory_function")

m_zero = Module(0, zero, False)
m_one = Module(0, one, False)
m_two = Module(0, two, False)
m_inc = Module(1, inc, False)
m_negate = Module(1, negate, False)
m_add = Module(2, add, False)
m_sub = Module(2, sub, False)
m_eq_zero = Module(1, eq_zero, False)
m_read = Module(1, read, True)
m_write = Module(2, write, True)
m_lt = Module(2, lt, False)
m_lte = Module(2, lte, False)
m_eq = Module(2, eq, False)
m_min = Module(2, min, False)
m_max = Module(2, max, False)

gates = [m_read, m_inc, m_lt, m_min, m_write]
N = len(gates)

registers = get_registers(1)
memory_tape = create_memory_tape()
desired_out = create_memory_tape(0)
for i in range(M):
     desired_out = write_external(get_const(i), get_const(i), desired_out)
        
layers = [1,2]
w = gen_network_weights(gates, layers)
print("weight matrix: ", w)

(0, Module(arity=1, function=<function read at 0x106b86230>, memory_function=True))
(1, Module(arity=1, function=<function inc at 0x106b82cf8>, memory_function=False))
(2, Module(arity=2, function=<function lt at 0x106b82f50>, memory_function=False))
(3, Module(arity=2, function=<function min at 0x106b86140>, memory_function=False))
(4, Module(arity=2, function=<function write at 0x106b862a8>, memory_function=True))
('weight matrix: ', [w3x1, w2x2, w3x2, w3x3, w3x4, w3x4, w3x5, w3x5, w3x6, w3x6, w3x7, w3x7, w3x1])


In [3]:
reg_lambda = 0.1
output_len = M
result  = compute_all_timesteps(gates, layers, registers, memory_tape, w, reg_lambda, output_len)
initial_registers, initial_memory, desired_output, final_cost, intermediate_registers = result

('compute_all_timesteps t = ', 0)
('compute_all_timesteps t = ', 1)
('compute_all_timesteps t = ', 2)
('compute_all_timesteps t = ', 3)
('compute_all_timesteps t = ', 4)


In [4]:
initial_registers, initial_memory, desired_output, final_cost, intermediate_registers

(Registers,
 Memory_Tape,
 Y,
 Elemwise{add,no_inplace}.0,
 [Join.0, Join.0, Join.0, Join.0, Join.0])

In [5]:
gradients = theano.grad(final_cost, list(w)) #, disconnected_inputs='warn', return_disconnected='Disconnected')


In [6]:
for r in intermediate_registers:
    print (r.type)

TensorType(float64, 3D)
TensorType(float64, 3D)
TensorType(float64, 3D)
TensorType(float64, 3D)
TensorType(float64, 3D)


In [7]:
# Compile training function to compute gradients.
train = theano.function([initial_registers, initial_memory, desired_output], [final_cost] + gradients) #, on_unused_input='ignore', allow_input_downcast=True)

In [8]:
# Compile prediction function (registers after one timestep)
predict = theano.function([initial_registers, initial_memory], intermediate_registers[0])


In [9]:
for i in range(2):
    output_train = train(registers.eval(), memory_tape.eval(), desired_out.eval())

In [10]:
output_train

[array(1377.1944222635361), array([[ 0.01534355],
        [-0.05771496],
        [-0.1562771 ]]), array([[  1.10450928e-01,   1.36550612e-02],
        [  8.40694793e+01,  -1.86932764e-01]]), array([[-0.10239065, -0.02650435],
        [ 0.03336372, -0.10320154],
        [-0.02110737, -0.12504492]]), array([[-0.15608317, -0.03163312,  0.08399274],
        [ 0.09873842, -0.15461   ,  0.02544683],
        [ 0.07044295,  0.19657399, -0.04488378]]), array([[-0.15053301, -0.08745199,  0.17696243,  0.08211676],
        [-0.12523196,  0.06675526,  0.13385073, -0.05702877],
        [ 0.0391238 ,  0.08993994,  0.0035428 ,  0.00150313]]), array([[ 0.12012156,  0.11169121,  0.10317543,  0.17816953],
        [ 0.03958255,  0.08383777, -0.19307736, -0.11006563],
        [-0.0965774 , -0.06584564,  0.13872239,  0.04755252]]), array([[ 0.32460716, -0.05250864,  0.32285572,  0.02358304, -0.57819645],
        [-0.14311541,  0.13878391,  0.12230059, -0.1808767 ,  0.14280589],
        [ 0.74124373, -0.0680

In [11]:
#This code is borrowed from http://andrew.gibiansky.com/ based on ADAM optimization technique proposed in the paper
def adam_optimize(params, train, train_inputs, mem, train_outputs, output_len,
                  alpha=0.001, b1=0.9, b2=0.999,
                  epsilon=1e-8, batch_size=3):
    """Implementation of Adam optimization method, with hyperparameters
    taken as recommended by the original publication."""
    # Initialize first and second moment estimates to zero.
    # This causes some bias, which is addressed later.
    moment1 =  [0 for _ in params]
    moment2 = [0 for _ in params]
    
    timestep = 0  # Current optimization step
    batch = 0     # Where does this batch start
    
    converged = False
    gradients = []
    while not converged:
        timestep += 1
        
        # Train on a small batch.
        #inputs = train_inputs   
        inputs  = train_inputs[batch:batch+batch_size, :, :]
        #outputs = train_outputs 
        outputs = train_outputs[batch:batch+batch_size, :]   
        #print("training now")
        result = train(inputs, mem, outputs)
        cost = result[0]
        gradients = result[1:]
        #print("gradients now", gradients)
        
        # Advance to next batch.
        batch = (batch + batch_size) % train_inputs.shape[0]

        # Compute first and second moment estimates.
        # These are decaying moving averages; first moment
        # uses the gradient, second uses squared gradient.
        moment1  = [b1 * m + (1 - b1) * gradient
                    for (m, gradient)
                    in zip(moment1, gradients)]
        moment2 = [b2 * v + (1 - b2) * gradient ** 2
                   for (v, gradient)
                   in zip(moment2, gradients)]
        
        # Correct for initialization bias and compute new values.
        correction1 = 1. / (1 - b1 ** timestep)
        correction2 = 1. / (1 - b2 ** timestep)
        corrected1 = [correction1 * m for m in moment1]
        corrected2 = [correction2 * v for v in moment2]
        
        # Compute new parameter values.
        params_new = [p.get_value() - alpha * m1 / (np.sqrt(m2) + epsilon)
                      for (p, m1, m2) in zip(params, corrected1, corrected2)]

        # Check for convergence by looking at magnitude of delta.
        delta = [abs(p.get_value() - p_new)
                 for (p, p_new) in zip(params, params_new)]
        converged = all((d < 0.5 * alpha).all() for d in delta)        
        
        # Update parameters to new values.
        for p, p_new in zip(params, params_new):
            p.set_value(p_new.astype('float32'))
            
        # Provide some output for tracking during runtime.
        if timestep % 100 == 1 or converged:
            print("Cost (t = %4d): \t%.2f" % (timestep - 1, cost))
            


In [12]:
#registers = get_registers(2)
#memory_tape = create_memory_tape()
#desired_out = create_memory_tape(0)
#for i in range(M):
#     desired_out = write_external(get_const(i), get_const(i), desired_out)

In [13]:
import sys
sys.getrecursionlimit()
sys.setrecursionlimit(40000)
sys.getrecursionlimit()

40000

Task based Input Output Definition

In [14]:
#Access Task
input = [ 3,  1,  12,  4,  7,  12,  1,  13,  8,  2, 1, 3, 11, 11, 12, 0]
output = [ 4,  1,  12,  4,  7,  12,  1,  13,  8,  2, 1, 3, 11, 11, 12, 0]

memory_tape = stack([get_const(i) for i in input], axis=1)
desired_out = stack([get_const(i) for i in output], axis=1)

In [15]:
print ("desired_out.eval()", desired_out.eval().argmax(axis=2))
print ("memory_tape.eval()", memory_tape.eval().argmax(axis=2))
print ("registers.eval()", registers.eval().argmax(axis=2))

('desired_out.eval()', array([[ 4,  1, 12,  4,  7, 12,  1, 13,  8,  2,  1,  3, 11, 11, 12,  0]]))
('memory_tape.eval()', array([[ 3,  1, 12,  4,  7, 12,  1, 13,  8,  2,  1,  3, 11, 11, 12,  0]]))
('registers.eval()', array([[1, 1]]))


In [16]:
result = adam_optimize(w, train, registers.eval(), memory_tape.eval(), desired_out.eval(), 1)


Cost (t =    0): 	12.64
Cost (t =  100): 	10.68


KeyboardInterrupt: 

In [None]:
v0 = as_tensor(0)
v1 = as_tensor(1)
output = [registers, memory_tape, v0, v0, v1]
new_registers = []
for timestep in range(10):
    #print("compute_all_timesteps t = ", timestep)
    
    output = machine_compute_step_t(True, R, layers, w, gates, timestep+1, desired_out, 1, *(output))
    registers, memory_tape, cost_t, cum_prob_t, prob_incomplete = output
    
    print("registers = ", registers.eval().argmax(axis=2))
    print("memory tape:  ", memory_tape.eval().argmax(axis=2))
    print("desired tape: ", desired_out.eval().argmax(axis=2))
    print("cost = ", output[2].eval())
    
    #new_registers.append(output[0])
    #registers=new_registers
    #memory_tape=new_memory_tape

('gate i = ', 0, 'R :', 2, 'arity:', 1, 'my gate output: ', array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.]]), 'my gate inputs: ', array([[[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.]]]))
('Concatenated inputs: ', array([[[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.]]]))
('gate i = ', 1, 'R :', 2, 'arity:', 1, 'my gate output: ', array([[ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.]]), 'my gate inputs: ', array([[[ 0.,  1

In [39]:
from theano import pp, tensor as T
from theano import tensor as T, function, printing

p = printing.Print('p')
printed_r = p(initial_registers)
f = function([initial_registers], printed_r)
p_r = f(registers.eval())

p __str__ = [[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]
  [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.]]]


In [27]:
print("registers = ", registers.eval())
print("memory tape: ", memory_tape.eval().argmax(axis=2))
print("desired tape: ", desired_out.eval().argmax(axis=2))

registers =  [[[  0.00000000e+000   1.68733456e-048   3.93711398e-048   4.37598441e-080
     4.14789411e-020   6.51811920e-020   1.00000000e+000   2.45786088e-040
     0.00000000e+000   0.00000000e+000   0.00000000e+000   0.00000000e+000
     0.00000000e+000   0.00000000e+000   0.00000000e+000   0.00000000e+000
     0.00000000e+000   0.00000000e+000   0.00000000e+000   0.00000000e+000]
  [  5.92556329e-021   9.49183839e-029   5.33863566e-077   9.12919712e-040
     1.00000000e+000   2.37022516e-020   2.96278157e-020   3.99936297e-068
     0.00000000e+000   0.00000000e+000   0.00000000e+000   0.00000000e+000
     0.00000000e+000   0.00000000e+000   0.00000000e+000   0.00000000e+000
     0.00000000e+000   0.00000000e+000   0.00000000e+000   0.00000000e+000]
  [  3.89420956e-105   5.62444855e-049   3.20249434e-087   4.27974840e-060
     1.00000000e+000   1.20627068e-039   2.96278157e-020   3.99936297e-068
     0.00000000e+000   0.00000000e+000   0.00000000e+000   0.00000000e+000
     0.000

In [None]:
print_mem_op = printing.Print('print_mem')
printed_mem = print_mem_op(initial_memory)
f_mem = function([initial_memory], printed_mem)
p_mem = f(memory_tape.eval())

In [None]:
cost_t, cum_prob_t, prob_incomplete = calculate_cost_at_t(True, 0.5, 2, 0, 0, 0.5, memory_tape, desired_out)

