# Table of Contents
 <p>

READ (described in Sec. 3.2),
WRITE (described in Sec. 3.2).

ZERO(a, b) = 0, 
ONE(a, b) = 1, 
TWO(a, b) = 2, 

INC(a, b) = (a+1) mod M, 
ADD(a, b) = (a+b) mod M, 
SUB(a, b) = (a−b) mod M, 
DEC(a, b) = (a−1) mod M, 
LESS-THAN(a, b) = [a < b], 
LESS-OR-EQUAL-THAN(a, b) = [a ≤ b], 
EQUALITY-TEST(a, b) = [a = b], 
MIN(a, b) = mod M, 
SUB(a, b) = (a−b) mod M, 
DEC(a, b) = (a−1) mod M, 
LESS-THAN(a, b) = [a < b], 
LESS-OR-EQUAL-THAN(a, b) = [a ≤ b], 
EQUALITY-TEST(a, b) = [a = b], 
MIN(a, b) = min(a, b), 
MAX(a, b) = max(a, b), 
LESS-OR-EQUAL-THAN(a, b) = [a ≤ b], 
EQUALITY-TEST(a, b) = [a = b], 
MIN(a, b) = min(a, b), 
MAX(a, b) = max(a, b), 


In [1]:
import numpy as np
import theano

from theano import tensor
from collections import namedtuple
from theano.tensor.extra_ops import to_one_hot
from theano.tensor import roll
from theano.tensor import stack
from theano.tensor import batched_dot
from theano.tensor import concatenate
from theano.tensor import as_tensor
from theano.tensor import set_subtensor
from theano import shared
from numpy.random import uniform
from theano.tensor.nnet import softmax, relu, sigmoid
from theano import function, config, shared, sandbox, Out

def zero() :
    return to_one_hot(np.asarray([0]),M)
def one() :
    return to_one_hot(np.asarray([1]),M)
def two() :
    return to_one_hot(np.asarray([2]),M)
def get_const(value) :
    return to_one_hot(np.asarray([value % M]),M)
def create_memory_tape(init_val=0):
    m = stack([get_const(init_val) for i in range(M)], axis=1)
    #return shared(np.asarray(m.eval()), config.floatX) 
    return m
def inc(a) :
    return roll(a, 1, axis=1)
def negate(a) :
    return roll(a[:, ::-1], 1, axis=1)
def add(a,b) :
    rows = [roll(b[:,], j, axis=1) for j in range(M)]
    return (batched_dot(a, stack(rows, axis=1)))
def sub(a,b) :
    b_negative = negate(b)
    return add(a, b_negative)
def eq_zero(a) :
    r = tensor.zeros_like(as_tensor(a))
    r = set_subtensor(r[:,1], a[:, 0])
    r = set_subtensor(r[:, 0], 1 - a[:, 0])
    return r
def lt(a,b):
    a = as_tensor(a)
    b = as_tensor(b)
    b = set_subtensor(b[:,0], [0]) 
    b = roll(b[:,], -1, axis=1)
    rows = [set_subtensor(tensor.zeros_like(as_tensor(b))[:,j:M], b[:,j:M]) for j in range(M)]
    result = ((tensor.dot(as_tensor(a), stack(rows, axis =1))).sum()).dimshuffle('x')
    return set_subtensor(tensor.zeros_like(as_tensor(b))[:,1], result)
def eq(a,b):
    elemwise_result = tensor.eq(as_tensor(a),as_tensor(b))
    sum = elemwise_result.sum()
    result = tensor.eq(sum,M).dimshuffle('x')
    return set_subtensor(tensor.zeros_like(as_tensor(a))[:,1], result)
def max_(a,b):
    if(tensor.eq([0],lt(a,b))):
        return as_tensor(b)
    else:
        return as_tensor(a)
def min_(a,b):
    if(tensor.eq([1],lt(a,b))):
        return as_tensor(a)
    else:
        return as_tensor(b)    
def read(mem, a) :
    ptr = as_tensor(a)
    return weighted_avg(mem, ptr), mem
def write(mem, a, b) :
    ptr = as_tensor(a)
    val = as_tensor(b)
    J = (tensor.ones_like(zero())).T
    erase_vector = (J-ptr.T).dot(J.T)
    x = tensor.mul(erase_vector, mem)
    y = (ptr.T).dot(val)
    mem = (x + y)
    return val, mem
def write_external(a, b, ext_mem) :
    ptr = as_tensor(a)
    val = as_tensor(b)
    J = (tensor.ones_like(zero())).T
    erase_vector = (J-ptr.T).dot(J.T)
    x = tensor.mul(erase_vector, ext_mem)
    y = (ptr.T).dot(val)
    ext_mem = (x + y)
    return ext_mem

def get_registers(init_val):
    return stack([get_const(init_val) for i in range(R)], axis=1)

def weighted_avg(inputs, coefficient) :
    return batched_dot(inputs.transpose(0, 2, 1), coefficient.dimshuffle(0, 1, 'x')).flatten(2);

def compute_gate_new(module, inputs, coefficients, memory_tape, debug=False) :
    """ Arity of this gate must be equal to the number of given
        coefficients list
    """ 
    if (len(coefficients) != module.arity) :
        print("Error: Incorrect number of coefficients: ",  len(coefficients), " to module arity: ", module.arity)
    
    params = [weighted_avg(inputs, as_tensor(coef)) for coef in coefficients]
    
    if (debug == True):
        for i, p in enumerate(params):
            print("compute_gate_new: coeff", coefficients[i].eval(), " weighted param [", i ,"] = ", p.eval().argmax())
    
    if (module.memory_function == True) :
        #print("read/write ")
        output, memory_tape = module.function(memory_tape, *params)
    else :   
        output = module.function(*params)
    
    #error check for constant gates
    return output, memory_tape


def get_n_tensor(t, count, idx):
    result=[]
    if count > 0: 
        result = [t[idx+i] for i in range(count)]
    return result, (idx+count)

def fuzzy_circuit(registers, gates, memory_tape, gate_coef, reg_coef, debug=False):
    # Initially, only the registers may be used as inputs.
    Q= len(gates)
    gate_inputs = registers
    idx=0    
    
    # Run through all the gates.
    for i in range(Q):
        c, idx = get_n_tensor(gate_coef, gates[i].arity, idx) 
        if (debug==True):
            print("gate i = ", i, "arity:", gates[i].arity, "gate inputs: ", gate_inputs.eval().argmax(axis=2))
        output, memory_tape = compute_gate_new(gates[i], gate_inputs, c, memory_tape, debug)
        # Append the output of the gate as an input for future gates.
        gate_inputs = concatenate([gate_inputs, output.dimshuffle(0,'x',1)], axis=1)
        if (debug==True):
            print("gate output: ", output.eval().argmax())    
            print("concatenated inputs: ", gate_inputs.eval().argmax(axis=2))       
        
    # All leftover coefficients are for registers.
    new_registers = []
             
    for i in range(len(reg_coef)):
        # (R+Q) x M dot 1 X (R+Q) 
        new_registers.append(weighted_avg(gate_inputs, reg_coef[i]))
        if (debug == True) :
            print("register [", i, "]  new value: reg_coef", reg_coef[i].eval().argmax(),"weighted_avg of gate inputs and reg_coef: ", weighted_avg(gate_inputs, reg_coef[i]).eval().argmax())
    return tensor.stack(new_registers, axis=1), memory_tape


# coefficients = [r1,r2..rR,g1_param1,g1_param2,...,gQ_param1,gQ_param2,c1,c1..CR,cR+1,..cR+Q]
def gen_random_weights(layer1, layer2, dtype=np.float64, _min=-1, _max=1):
    weights = uniform(low=_min, high=_max, size=(layer1, layer2))
    var = shared(weights.astype(dtype), name="w{0}x{1}".format(layer1, layer2))   
    #var = tensor.addbroadcast(var, 0)
    return var

def gen_network_weights(gates, layers):
    n_registers = R #input is R registers
    w = []
    current_layer_units = n_registers
    for next_layer_units in layers:
        w.append(gen_random_weights(current_layer_units + 1, next_layer_units))
        current_layer_units = next_layer_units
    #output wt for gate coefficients
    gate_coef = []
    for i, gate in enumerate(gates):
        print(i, gate)
        #each param to gates are R+i depending on arity 
        gate_output_units = n_registers + i
        for _ in range(gate.arity):
            gate_coef.append(gen_random_weights(current_layer_units + 1, gate_output_units))            
    
    #print("gen_network_weights => gate_coef", gate_coef)         
    #output wt for new registers
    reg_coef = []
    reg_output_units = n_registers + len(gates)
    for _ in range(n_registers):
        reg_coef.append(gen_random_weights(current_layer_units + 1, n_registers + len(gates)))    
    
    #print("gen_network_weights => reg_coef", reg_coef)      
    prob_completion_coef = (gen_random_weights(current_layer_units + 1, 1))

    for i in gate_coef:
        w.append(i)

    for x in reg_coef:
        w.append(x)
    
    w.append(prob_completion_coef)   
    return w

def aug_ones_col(inputs):
    print("OMFG")
    s = inputs.shape
    col_elems = s[1]
    ones = tensor.ones_like(to_one_hot(np.asarray([0]), col_elems))
    x = concatenate([inputs.T,ones], axis=1)
    return x.T

def aug_ones_col_new(inputs):
    x=[]
    s = inputs.shape
    d = inputs.ndim
    row_elems = s[d-2]
    matrix = tensor.reshape(inputs, (s[d-2],s[d-1]))
    matrix = tensor.shape_padleft(matrix)
    ones = tensor.ones_like(to_one_hot(np.asarray([0]), row_elems))
    x = concatenate([matrix.dimshuffle(0,2,1),ones.dimshuffle(0,'x',1)], axis=1)
    return x.dimshuffle(0,2,1)

def controller_forward_prop(n_registers, layers, weight_matrix, gates, registers, debug) :
    inputs = aug_ones_col_new(registers[:,:,0])
    if (debug == True):
        print("registers: ", registers.eval())
        print("registers[:,:,0]: ", registers[:,:,0].eval())
        print("inputs: ", inputs.eval())
        
    for i in range(len(layers)):
        W = weight_matrix[i]
        inputs = aug_ones_col_new(relu(inputs.dot(W)))  
        if (debug == True):
            print("compute layers inputs: ", inputs.eval())
    
    #extract gate coefficients
    gate_coef = []
    n_gate_coef = 0
    
    for i, gate in enumerate(gates): 
        #each param to gates are R+i depending on arity 
        gate_output_units = n_registers + i
        for _ in range(gate.arity):
            n_gate_coef += 1  
            
    for W in weight_matrix[len(layers):len(layers) + n_gate_coef]:
        gate_coef.append(softmax(inputs.dot(W)[0]))            
    
    reg_coef = []
    for W in (weight_matrix[len(layers)+n_gate_coef:len(layers)+n_gate_coef+n_registers]):
        reg_coef.append(softmax(inputs.dot(W)[0]))  
    
    if (debug == True):
        print("p_complete inputs: ", inputs.eval())
        print("inputs.dot(weight_matrix[-1]: ", inputs.dot(weight_matrix[-1]).eval())  
              
    p = sigmoid(abs(inputs.dot(weight_matrix[-1])))
    if (debug == True):
        print("p_complete: ", p.eval())
        
    return p, reg_coef, gate_coef

def calculate_cost_at_t(prob_complete_t, t, cum_cost, cum_prob_t, p_incomplete, memory_in, desired_output, output_len, debug) :
    e_min = 1e-100
    e_max = 1e+100
    cost_t = 0
    #TODO: Find use for desired registers in calculating cost. Now, only desired memory layout is matched.
    for i in range(output_len):
        #Compute the loss for this register using the mask.
        y = to_one_hot(desired_output.argmax(axis=2)[:,i], M)   
        y_hat = to_one_hot(memory_in.argmax(axis=2)[:,i], M) 
        ln_y_hat = tensor.log(tensor.clip(memory_in[:, i, :], e_min, e_max))
        ln_1_minus_y_hat = tensor.log(tensor.clip(1 - memory_in[:, i, :], e_min, e_max))
        x1 = y * ln_y_hat
        x2 = (1 - y) * ln_1_minus_y_hat
        loss = (x1+x2).sum(axis=1) 
        cost_t += tensor.shape_padright(loss, 1)
        if (debug == True):
            #print("desired_output at i", desired_output[:,i].eval())
            #print("memory value at i", memory_in[:, i, :].eval())
            #print("y ", y.eval())
            #print("y_hat ", y_hat.eval())
            #print("ln_y_hat ", ln_y_hat.eval())
            #print("ln_1_minus_y_hat ", ln_1_minus_y_hat.eval())
            #print("x1 ", x1.eval())
            #print("x2 ", x2.eval())
            print("loss ", loss.eval())
            #print("tensor.shape_padright(loss, 1) ", tensor.shape_padright(loss, 1).eval())
            #print("Desired value at mem location [", i, "] is ", y.eval().argmax(), "but actual value is [", memory_in[:,i,:].eval().argmax(), "loss is ", loss.eval(), " cost = ", cost_t.eval()) 
    
    if (t == MAX_TIMESTEP):
        prob_complete = 1 - cum_prob_t
    else:
        prob_complete = (prob_complete_t * p_incomplete)
    
    cum_prob_t += prob_complete
    p_incomplete *= (1 - prob_complete_t)
    cum_cost -= (cost_t*prob_complete)
    
    if (debug == True):
        print("prob_complete_t", prob_complete_t.eval())
        print("p_incomplete", p_incomplete.eval())
        print("prob_complete", prob_complete.eval())
        print("cum_prob_t", cum_prob_t.eval())
        print("cost_t*prob_complete", (cost_t*prob_complete).eval())
        print("cum_cost", cum_cost.eval())
    return cum_cost, cum_prob_t, p_incomplete

def machine_compute_step_t(debug, R, layers, w, gates, t, desired_output, output_len, registers, memory_tape, cost_t, cum_prob, prob_incomplete) : 
    prob_complete_t, reg_coef, gate_coef = controller_forward_prop(R, layers, w, gates, registers, debug)
    new_registers, new_memory_tape = fuzzy_circuit(registers, gates, memory_tape, gate_coef, reg_coef, debug)
    cost_t, cum_prob_t, prob_incomplete = calculate_cost_at_t(prob_complete_t, t, cost_t, cum_prob, prob_incomplete, new_memory_tape, desired_output, output_len, debug)
    return new_registers, new_memory_tape, cost_t, cum_prob_t, prob_incomplete

def compute_all_timesteps(gates, layers, registers, memory_tape, w, reg_lambda, output_len):
    #w = make_broadcastable(w)
    # Create symbolic variables for the input to the machine
    # and for the desired output of the machine.
      
    #initial_registers = registers
    #desired_output = memory_tape

    initial_registers = tensor.dtensor3("Registers")
    initial_memory = tensor.dtensor3("Memory_Tape")
    desired_output = tensor.dtensor3("Y")
     
    
    # Run the model for all timesteps. The arguments are 
    # registers, cost, cumulative probability complete, 
    # and probability incomplete. The latter are initialized
    # to zero and to one, respectively.
    
    v0 = as_tensor(0)
    v1 = as_tensor(1)
    output = [initial_registers, initial_memory, v0, v0, v1]
    intermediate_registers = []
    for timestep in range(MAX_TIMESTEP):
        print("compute_all_timesteps t = ", timestep)
        output = machine_compute_step_t(False, R, layers, w, gates, timestep+1, desired_output, output_len, *(output))
        intermediate_registers.append(output[0])

    # Add in regularization, to avoid overfitting simple examples.
    reg_cost = reg_lambda * sum((p * p).sum() for p in list(w))
    
    # Get the final cost: regularization plus loss.
    final_cost = reg_cost + output[2].sum()
    
    # Return the symbolic variables, the final cost, and the
    # intermediate register values for analysis and prediction.
    return initial_registers, initial_memory, desired_output, final_cost, intermediate_registers


In [2]:
#Maximum Representable Integer M is set below
M = 16
# Number of registers
R = 2
#Max number of timesteps
MAX_TIMESTEP = 5

Module = namedtuple("Module", "arity function memory_function")

m_zero = Module(0, zero, False)
m_one = Module(0, one, False)
m_two = Module(0, two, False)
m_inc = Module(1, inc, False)
m_negate = Module(1, negate, False)
m_add = Module(2, add, False)
m_sub = Module(2, sub, False)
m_eq_zero = Module(1, eq_zero, False)
m_read = Module(1, read, True)
m_write = Module(2, write, True)
m_lt = Module(2, lt, False)
m_eq = Module(2, eq, False)
m_min = Module(2, min_, False)
m_max = Module(2, max_, False)

gates = [m_read, m_inc, m_zero, m_lt, m_min, m_write]
#gates = [m_read, m_write]
N = len(gates)

registers = get_registers(1)
memory_tape = create_memory_tape()
desired_out = create_memory_tape(0)
for i in range(M):
     desired_out = write_external(get_const(i), get_const(i), desired_out)
        
layers = [5,5]
w = gen_network_weights(gates, layers)
print("weight matrix: ", w)

(0, Module(arity=1, function=<function read at 0x106ac16e0>, memory_function=True))
(1, Module(arity=1, function=<function inc at 0x106ac12a8>, memory_function=False))
(2, Module(arity=0, function=<function zero at 0x106abdcf8>, memory_function=False))
(3, Module(arity=2, function=<function lt at 0x106ac1500>, memory_function=False))
(4, Module(arity=2, function=<function min_ at 0x106ac1668>, memory_function=False))
(5, Module(arity=2, function=<function write at 0x106ac1758>, memory_function=True))
('weight matrix: ', [w3x5, w6x5, w6x2, w6x3, w6x5, w6x5, w6x6, w6x6, w6x7, w6x7, w6x8, w6x8, w6x1])


In [3]:
reg_lambda = 0.1
output_len = 1
result  = compute_all_timesteps(gates, layers, registers, memory_tape, w, reg_lambda, output_len)
initial_registers, initial_memory, desired_output, final_cost, intermediate_registers = result

('compute_all_timesteps t = ', 0)
('compute_all_timesteps t = ', 1)
('compute_all_timesteps t = ', 2)
('compute_all_timesteps t = ', 3)
('compute_all_timesteps t = ', 4)


In [4]:
initial_registers, initial_memory, desired_output, final_cost, intermediate_registers

(Registers,
 Memory_Tape,
 Y,
 Elemwise{add,no_inplace}.0,
 [Join.0, Join.0, Join.0, Join.0, Join.0])

In [5]:
gradients = theano.grad(final_cost, list(w)) #, disconnected_inputs='warn', return_disconnected='Disconnected')


In [6]:
for i in range(len(gradients)):
    theano.gradient.grad_clip(gradients[i], -1, 1)

In [7]:
# Compile training function to compute gradients.
train = theano.function([initial_registers, initial_memory, desired_output], [final_cost] + gradients) #, on_unused_input='ignore', allow_input_downcast=True)

In [8]:
#Access Task
input = [ 7,  1,  12,  4,  7,  12,  1,  13,  8,  2, 1, 3, 11, 11, 12, 0]
output = [ 13 ]
registers = get_registers(0)
memory_tape = stack([get_const(i) for i in input], axis=1)
desired_out = stack([get_const(i) for i in output], axis=1)

In [9]:
print ("desired_out.eval()", desired_out.eval().argmax(axis=2))
print ("memory_tape.eval()", memory_tape.eval().argmax(axis=2))
print ("registers.eval()", registers.eval().argmax(axis=2))

('desired_out.eval()', array([[13]]))
('memory_tape.eval()', array([[ 7,  1, 12,  4,  7, 12,  1, 13,  8,  2,  1,  3, 11, 11, 12,  0]]))
('registers.eval()', array([[0, 0]]))


In [20]:
for i in range(1):
    output_train = train(registers.eval(), memory_tape.eval(), desired_out.eval())
    print(output_train)

[array(4.848957238226147), array([[  2.75153728e-02,   2.64778662e-02,   2.66734234e-02,
          3.72727044e-02,   1.86764015e-02],
       [  1.88653654e-02,   1.82650123e-02,   1.82135152e-02,
         -1.03098015e-04,  -3.07552576e-02],
       [  1.03379412e-01,   1.03571371e-01,   9.79211185e-02,
          6.25465226e-02,   9.99091066e-02]]), array([[ -1.69903979e-06,   7.29245096e-02,   9.57987305e-06,
          6.29119315e-02,   6.84682502e-02],
       [ -1.94570202e-06,   7.13589591e-02,   1.16366500e-06,
          6.16070111e-02,   6.72254519e-02],
       [ -2.08058827e-06,   7.04938709e-02,   9.11624666e-06,
          6.04710776e-02,   6.52345302e-02],
       [ -3.28394490e-06,   2.06030560e-02,   2.45001819e-06,
          7.34094902e-02,   6.71316927e-02],
       [ -1.53098226e-06,   2.20342900e-03,   4.29366764e-06,
          1.27747533e-01,   6.85497409e-03],
       [ -3.92097136e-06,   2.34216788e-01,   9.40974426e-07,
          7.10260939e-02,   1.73603401e-01]]), array(

In [23]:
#training data for "access" 
train_data_size = 50
train_inputs_access = []
train_outputs_access = []

input = []
output = []
for i in range(train_data_size):
    #generated a random array
    input = np.random.randint(1,15,size=(M))
    input_t = stack([get_const(i) for i in input], axis=1)
    #print("input array", input_t.eval().argmax(axis=2))
    train_inputs_access.append(input_t)
    first_elem = input[0]
    output = np.array([[input[first_elem]]])
    output_t = stack([get_const(i) for i in output], axis=1)
    #print(output_t.eval().argmax(axis=2))
    train_outputs_access.append(output_t)
print(train_inputs_access)    
print(train_outputs_access)    


[Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0]
[Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0, Join.0]


In [82]:
def clamp(n, minn, maxn):
    for i, item in enumerate(n):
        print(i, item)
        n[i] = (map(lambda x: max(min(maxn, x), minn), item))
    return n    
    

#This code is borrowed from http://andrew.gibiansky.com/ based on ADAM optimization technique proposed in the paper
def adam_optimize(params, train, registers, train_inputs, train_outputs, output_len,
                  alpha=0.001, b1=0.9, b2=0.999,
                  epsilon=1e-8, batch_size=1):
    """Implementation of Adam optimization method, with hyperparameters
    taken as recommended by the original publication."""
    # Initialize first and second moment estimates to zero.
    # This causes some bias, which is addressed later.
    moment1 =  [0 for _ in params]
    moment2 = [0 for _ in params]
    
    timestep = 0  # Current optimization step
    batch = 0     # Where does this batch start
    
    converged = False
    gradients = []
    
    while not converged:
        timestep += 1
        
        # Train on a small batch.
        #inputs = train_inputs   
        inputs  = train_inputs[batch:batch+batch_size][0]
        #print(inputs.eval().argmax(axis=2))
        #outputs = train_outputs 
        outputs = train_outputs[batch:batch+batch_size][0] 
        #print(outputs.eval().argmax())
        #print("training now")
        result = train(registers, inputs.eval(), outputs.eval())
        cost = result[0]
        gradients = result[1:]
        #print("gradients now", gradients)
        
        # Advance to next batch.
        batch = (batch + batch_size) % len(train_inputs)

        # Compute first and second moment estimates.
        # These are decaying moving averages; first moment
        # uses the gradient, second uses squared gradient.
        moment1  = [b1 * m + (1 - b1) * gradient
                    for (m, gradient)
                    in zip(moment1, gradients)]
        moment2 = [b2 * v + (1 - b2) * gradient ** 2
                   for (v, gradient)
                   in zip(moment2, gradients)]
        
        # Correct for initialization bias and compute new values.
        correction1 = 1. / (1 - b1 ** timestep)
        correction2 = 1. / (1 - b2 ** timestep)
        corrected1 = [correction1 * m for m in moment1]
        corrected2 = [correction2 * v for v in moment2]
        
        # Compute new parameter values.
        params_new = [p.get_value() - alpha * m1 / (np.sqrt(m2) + epsilon)
                      for (p, m1, m2) in zip(params, corrected1, corrected2)]

        # Check for convergence by looking at magnitude of delta.
        delta = [abs(p.get_value() - p_new)
                 for (p, p_new) in zip(params, params_new)]
        converged = all((d < 0.5 * alpha).all() for d in delta)        
        
        # Update parameters to new values.
        for p, p_new in zip(params, params_new):
            print("p.eval(): ",p.eval())
            print("p_new: ",p_new)
            y = clamp(p_new, -0.01, 1)
            print("after clamping: ", y)
            p.set_value(p_new.astype('float32'))
            print("p.eval() without clamping: ",p.eval())
            #print("p.eval() with clamping: ",p.eval())
            
        # Provide some output for tracking during runtime.
        if timestep % 100 == 1 or converged:
            print("Cost (t = %4d): \t%.2f" % (timestep - 1, cost))
            #print("gradients: ", gradients)

In [None]:
#def adam_optimize(params, train, train_inputs, mem, train_outputs, output_len,
#                  alpha=0.001, b1=0.9, b2=0.999,
#                  epsilon=1e-8, batch_size=1):
    
result = adam_optimize(w, train, registers.eval(), train_inputs_access, train_outputs_access, 1)

('p.eval(): ', array([[ 0.11745767,  0.11460486,  0.11266711,  0.13209605,  0.0987543 ],
       [ 0.09523928,  0.09333514,  0.0914875 ,  0.05237446,  0.04469303],
       [ 0.52981955,  0.53080857,  0.50304079,  0.38800201,  0.59197104]]))
('p_new: ', array([[ 0.11645768,  0.11360486,  0.11166712,  0.13109605,  0.0977543 ],
       [ 0.09423929,  0.09233515,  0.09048751,  0.05337445,  0.04569303],
       [ 0.53081955,  0.53180857,  0.50404079,  0.38900201,  0.59097104]]))
(0, array([ 0.11645768,  0.11360486,  0.11166712,  0.13109605,  0.0977543 ]))
(1, array([ 0.09423929,  0.09233515,  0.09048751,  0.05337445,  0.04569303]))
(2, array([ 0.53081955,  0.53180857,  0.50404079,  0.38900201,  0.59097104]))
('after clamping: ', array([[ 0.11645768,  0.11360486,  0.11166712,  0.13109605,  0.0977543 ],
       [ 0.09423929,  0.09233515,  0.09048751,  0.05337445,  0.04569303],
       [ 0.53081955,  0.53180857,  0.50404079,  0.38900201,  0.59097104]]))
('p.eval() without clamping: ', array([[ 0.116

In [21]:
for W in w:
    print(W.eval())

[[ 0.13340069  0.12847804  0.12914172  0.19359079  0.11188179]
 [ 0.09103771  0.08824333  0.08774117  0.00510197 -0.13935611]
 [ 0.49577555  0.49811393  0.46818787  0.35165867  0.59786719]]
[[ -8.49519893e-06   2.30213985e-01   4.07163097e-05   3.62966448e-01
    3.48213017e-01]
 [ -9.72851012e-06   2.22137854e-01  -1.37802431e-06   3.56533349e-01
    3.42011452e-01]
 [ -1.04029414e-05   2.25322813e-01   3.87862856e-05   3.48145634e-01
    3.31726283e-01]
 [ -1.64197245e-05   5.16907638e-03   7.02109264e-06   4.02276576e-01
    3.39925289e-01]
 [ -7.65491131e-06  -1.39430121e-01   1.34274724e-05   6.92989171e-01
    4.08997051e-02]
 [ -1.96048568e-05   9.19685423e-01  -8.73138288e-06   4.45776373e-01
    8.79080713e-01]]
[[  1.68236111e-05  -1.59535848e-05]
 [  1.54035790e-02  -1.52694760e-02]
 [  1.67716303e-06  -1.19942945e-06]
 [  1.24201579e-02  -1.18242204e-02]
 [  1.28548853e-02  -1.41935525e-02]
 [  7.37955142e-03  -8.91334470e-03]]
[[  2.63046104e-07  -8.34264756e-06   7.754991

In [14]:
#Access Task
input = [ 7,  1,  12,  4,  7,  12,  1,  13,  8,  2, 1, 3, 11, 11, 12, 0]
output = [ 13 ]
registers = get_registers(0)
memory_tape = stack([get_const(i) for i in input], axis=1)
desired_out = stack([get_const(i) for i in output], axis=1)

In [15]:
v0 = as_tensor(0)
v1 = as_tensor(1)
output = [registers, memory_tape, v0, v0, v1]
new_registers = []
print("registers = ", registers.eval().argmax(axis=2))
print("memory tape:  ", memory_tape.eval().argmax(axis=2))
print("desired tape: ", desired_out.eval().argmax(axis=2))
for timestep in range(MAX_TIMESTEP):
    output = machine_compute_step_t(False, R, layers, w, gates, timestep+1, desired_out, 1, *(output))
    registers, memory_tape, cost_t, cum_prob_t, prob_incomplete = output    
    print("registers = ", registers.eval().argmax(axis=2))
    print("memory tape:  ", memory_tape.eval().argmax(axis=2))
    print("desired tape: ", desired_out.eval().argmax(axis=2))
    print("cost = ", output[2].eval())
    print("prob_incomplete = ", prob_incomplete.eval())
    

('registers = ', array([[0, 0]]))
('memory tape:  ', array([[ 7,  1, 12,  4,  7, 12,  1, 13,  8,  2,  1,  3, 11, 11, 12,  0]]))
('desired tape: ', array([[13]]))
('registers = ', array([[0, 7]]))
('memory tape:  ', array([[ 7,  1, 12,  4,  7, 12,  1, 13,  8,  2,  1,  3, 11, 11, 12,  0]]))
('desired tape: ', array([[13]]))
('cost = ', array([[[ 116.15232372]]]))
('prob_incomplete = ', array([[[ 0.49812724]]]))
('registers = ', array([[0, 0]]))
('memory tape:  ', array([[ 7,  1, 12,  4,  7, 12,  1, 13,  8,  2,  1,  3, 11, 11, 12,  0]]))
('desired tape: ', array([[13]]))
('cost = ', array([[[ 116.97569676]]]))
('prob_incomplete = ', array([[[ 0.22918452]]]))
('registers = ', array([[0, 0]]))
('memory tape:  ', array([[ 7,  1, 12,  4,  7, 12,  1, 13,  8,  2,  1,  3, 11, 11, 12,  0]]))
('desired tape: ', array([[13]]))
('cost = ', array([[[ 117.3332402]]]))
('prob_incomplete = ', array([[[ 0.10765278]]]))
('registers = ', array([[0, 0]]))
('memory tape:  ', array([[ 0,  1, 12,  4,  7, 12,  

KeyboardInterrupt: 