In [175]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
import numpy as np

class Critic(tf.keras.Model):
    def __init__(self,nature, valreg=0.01, seed_val=0.3, pad_value=-7., dolinar_layers=2, tau=0.01):
        '''
        dolinar_layers= number of photodetections
        pad_value: value not considered by the lstm
        valreg: regularisation value
        seed_val: interval of random parameter inizialitaion.
        '''
        super(Critic,self).__init__()

        self.pad_value = pad_value
        self.nature = nature
        self.dolinar_layers = dolinar_layers
        self.mask = tf.keras.layers.Masking(mask_value=pad_value,
                                  input_shape=(self.dolinar_layers, 2)) #(beta1, pad), (n1, beta2), (n2, guess). In general i will have (layer+1)
        self.lstm = tf.keras.layers.LSTM(500, return_sequences=True)

        self.tau = tau
        self.l1 = Dense(250,kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
        bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
        kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg))

        self.l2 = Dense(100, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))

        self.l3 = Dense(100, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))

        self.l4 = Dense(1, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val))



    def update_target_parameters(self,primary_net):
        #### only
        # for i,j in zip(self.get_weights(), primary_net.get_weights()):
        #     tf.assign(i, tau*j + (i-tau)*i )
        prim_weights = primary_net.get_weights()
        targ_weights = self.get_weights()
        weights = []
        for i in tf.range(len(prim_weights)):
            weights.append(self.tau * prim_weights[i] + (1 - self.tau) * targ_weights[i])
        self.set_weights(weights)
        return

    def call(self, inputs):
        feat = self.mask(inputs)
        feat= self.lstm(feat)
        # feat = tf.nn.dropout(feat, rate=0.01)
        feat = tf.nn.relu(self.l1(feat))
        # feat = tf.nn.dropout(feat, rate=0.01)
        feat = tf.nn.relu(self.l2(feat))
        feat = tf.nn.relu(self.l3(feat))
        feat = tf.nn.sigmoid(self.l4(feat))
        return feat


    def process_sequence(self,sample_buffer):
        """"
        sample_buffer: array of shape (N,2*self.layers +1), N>1 (+1 for the reward)

        gets data obtained from N experiments: data.shape = (N, 2L+1),
        where +1 accounts for the guess and 2L for (beta, outcome).

        [[a0, o1, a1, o2, a2, o3, a4]
         [same but other experiment]
        ]

        and returns an array of shape (experiments, self.layers, 2 ), as accepted by an RNN
        """
        rr = np.ones(experiences.shape)*self.pad_value
        rr[:,1:] = experiences[:,:-1]
        rr = np.reshape(rr, (experiences.shape[0],self.dolinar_layers+1,2))
        #padded_data[:,selff.dolinar_layers] = data[:,[selff.dolinar_layers+1, selff.dolinar_layers+2]]
        rewards_obtained = np.zeros((experiences.shape[0], self.dolinar_layers+1))
        rewards_obtained[:,-1] = sample_buffer[:,-1]
        return rr, rewards_obtained


    def pad_single_sequence(self, seq):
        """"
        input: [a0, o1, a1, o2, a2, o3, a4]

        output: [[a0, pad], [o1, a1], [...]]

        the cool thing is that then you can put this to predict the greedy guess/action.
        """
        padded_data = np.ones((1,self.dolinar_layers+1, 2))*self.pad_value
        padded_data[0][0][0] = seq[0]
        #padded_data[0][0] = data[0]
        for k in range(1,self.dolinar_layers+1):
            padded_data[0][k] = seq[k:(k+2)]
        return padded_data

    def give_td_error_Kennedy_guess(self,batched_input,sequential_rews_with_zeros):
        # this function takes as input the actions as given by the target actor (but the first one!)
        #and outpus the correspoindg TD-errors for DDPG! To obtain them from sample of buffer
        #you call the method targeted_sequence from the actor_target and then the process_sequence
        #of this critic network.
        if self.nature != "target":
            raise AttributeError("I'm not the target!")
            return
        b = batched_input.copy()
        ll = sequential_rews_with_zeros.copy()
        for k in range(self.dolinar_layers):
            ll[:,k] = np.squeeze(self(b))[:,k+1] + ll[:,k]

        preds1 = self(b)
        b[:,-1][:,-1] = -b[:,1][:,1]
        preds2 = self(b)
        both = tf.concat([preds1,preds2],2)
        maxs = np.squeeze(tf.math.reduce_max(both,axis=2).numpy())
        ll[:,-2] = maxs[:,-1] # This is the last befre the guess.. so the label is max_g Q(h-L, g)
        ll = np.expand_dims(ll,axis=1)
        return ll


    def give_favourite_guess(self,sequence_with_plus):
        """"
            important !! the 1!
        sequence should be [[beta, pad], [outcome, 1]] """
        pred_1 = self(sequence_with_plus)
        sequence_with_plus[:,1][:,1] = -sequence_with_plus[:,1][:,1]
        pred_2 = self(sequence_with_plus)
        both = tf.concat([pred_1,pred_2],2)
        maxs = np.squeeze(tf.argmax(both,axis=2).numpy())[1]

        guess = (-1)**maxs
        return  guess




##### ACTOR CLASSS ####
class Actor(tf.keras.Model):
    def __init__(self, nature, valreg=0.01, seed_val=0.1, pad_value = -7.,
                 dolinar_layers=2,tau=0.01):
        super(Actor,self).__init__()
        self.dolinar_layers = dolinar_layers
        self.pad_value = pad_value
        self.nature = nature
        self.tau = tau

        if nature == "primary":
            self.lstm = tf.keras.layers.LSTM(500, return_sequences=True, stateful=True)
            self.mask = tf.keras.layers.Masking(mask_value=pad_value,
                                  input_shape=(1,1))#CHECK
        elif nature == "target":
            self.lstm = tf.keras.layers.LSTM(500, return_sequences=True, stateful=False)
            self.mask = tf.keras.layers.Masking(mask_value=pad_value,
                                  input_shape=(self.dolinar_layers, 1)) #'cause i feed altoghether.
        else:
            print("Hey! the character is either primary or target")
        self.l1 = Dense(250,kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
        bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
        kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg), dtype='float32')

        self.l2 = Dense(100, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val), dtype='float32')

        self.l3 = Dense(100, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val), dtype='float32')

        self.l4 = Dense(1, kernel_regularizer=tf.keras.regularizers.l1(valreg),
    activity_regularizer=tf.keras.regularizers.l2(valreg),
    kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
    bias_initializer = tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val), dtype='float32')



    def update_target_parameters(self,primary_net):
        #### only
        # for i,j in zip(self.get_weights(), primary_net.get_weights()):
        #     tf.assign(i, tau*j + (i-tau)*i )
        prim_weights = primary_net.get_weights()
        targ_weights = self.get_weights()
        weights = []
        for i in tf.range(len(prim_weights)):
            weights.append(self.tau * prim_weights[i] + (1 - self.tau) * targ_weights[i])
        self.set_weights(weights)
        return

    def call(self, inputs):
        feat = self.mask(inputs)
        feat= self.lstm(feat)
        # feat = tf.nn.dropout(feat, rate=0.01)
        feat = tf.nn.relu(self.l1(feat))
        # feat = tf.nn.dropout(feat, rate=0.01)
        feat = tf.nn.relu(self.l2(feat))
        feat = tf.nn.relu(self.l3(feat))
        feat = tf.nn.sigmoid(self.l4(feat))

        return feat

    def process_sequence_of_experiences(self, experiences):

        export = experiences.copy()
        for index in range(1,2*self.dolinar_layers-1,2): # I consider from first outcome to last one (but guess)
            export[:,index+1] = np.squeeze(self(np.reshape(np.array(export[:,index]),
                                                                 (experiences.shape[0],1,1))))
        return export

    def __str__(self):
        return self.name


In [176]:
def optimization_step(experiences, critic, critic_target, actor, actor_target, optimizer_critic, optimizer_actor):
    experiences = experiences.astype(np.float32)
    targeted_experience = actor_target.process_sequence_of_experiences(experiences)
    sequences, zeroed_rews = critic_target.process_sequence(targeted_experience)
    labels_critic = critic_target.give_td_error_Kennedy_guess( sequences, zeroed_rews)
    #
    ###### train the critic ######
    with tf.GradientTape() as tape:
        tape.watch(critic.trainable_variables)
        preds_critic = critic(sequences)
        loss_critic = tf.keras.losses.MSE(labels_critic, preds_critic)
        loss_critic = tf.reduce_mean(loss_critic)
        grads = tape.gradient(loss_critic, critic.trainable_variables)
        optimizer_critic.apply_gradients(zip(grads, critic.trainable_variables))
        loss_critic = np.squeeze(loss_critic.numpy())
    #
    #
    actor.lstm.reset_states()
    actor.lstm.stateful=False ### this is because the mask has trouble with differing the batch_size

    actions_indexed = [0.]*(actor.dolinar_layers)
    with tf.GradientTape() as tape:
        ##### get the actions only ######
        actions_with_outcomes = experiences.copy()
        act_ind=0
        for ind in range(len(experiences)): #experiences.shape[0] = 2L +2
            if (ind%2 == 0)&(ind < 2*actor.dolinar_layers):
                ac = tf.convert_to_tensor(np.reshape(experiences[:,ind], (len(experiences),1,1)))
                actions_indexed[act_ind] = ac
                act_ind+=1
        actions_indexed = tf.concat(actions_indexed,axis=1)
        tape.watch(actions_indexed) ####watch the ations

        ### now prepare the state acions to put them into the critic###
        padded_data = [tf.ones((experiences.shape[0],1))*actor.pad_value]
        watched_input_critic  = padded_data.copy()
        ind_actions=0
        for ind,k in enumerate(tf.unstack(tf.convert_to_tensor(experiences[:,:-1]),axis=1)):
            if (ind%2==0)&(ind < 2*actor.dolinar_layers):
                padded_data.append(actions_indexed[:,ind_actions]) ### i add the input of the critic the watched actions!
                ind_actions+=1
            else:
                padded_data.append(tf.expand_dims(k, axis=1))
            if ind == 0:
                watched_input_critic = tf.stack([padded_data[0], padded_data[1]], axis=2) #importantly i put the padd first (state_action.)
            if (ind%2 == 0)&(ind!=0):
                intermediate = tf.stack([padded_data[ind], padded_data[ind+1]], axis=2)
                watched_input_critic = tf.concat([watched_input_critic, intermediate], axis=1)

        qvals = critic(watched_input_critic)
        dq_da = tape.gradient(qvals, actions_indexed)

    with tf.GradientTape() as tape:

        pads = np.ones(len(experiences)).astype(np.float32)*actor.pad_value
        news = np.random.rand(experiences.shape[0], experiences.shape[1]+1).astype(np.float32)
        news[:,1:] = experiences
        news[:,0] = pads
        instances_actor = [i for i in range(0,2*actor.dolinar_layers,2)]
        actionss = actor(np.reshape(news[:,instances_actor], (experiences.shape[0],actor.dolinar_layers,1)).astype(np.float32))

        da_dtheta = tape.gradient(actionss, actor.trainable_variables, output_gradients=-dq_da)

    #
    optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
    actor.lstm.stateful=True
    return loss_critic

In [177]:
amplitude = 0.4
lr_critic = lr_actor = 0.01

critic = Critic(nature="primary",valreg=0.01)
critic_target = Critic(nature="target")
actor = Actor(nature="primary")
actor_target = Actor(nature="target")

optimizer_critic = tf.keras.optimizers.Adam(lr=lr_critic)
optimizer_actor = tf.keras.optimizers.Adam(lr=lr_actor) #0.001 works well

experiences = np.load("expe_2L.npy")

input_actor = np.reshape(np.array([actor.pad_value]),(1,1,1))
beta_would_do = np.squeeze(actor(input_actor))
    
#new_loss = optimization_step(experiences,critic, critic_target, actor, actor_target, optimizer_critic, optimizer_actor)


First steps

In [179]:
experiences = experiences[:5]
experiences

array([[ 0.69102163,  0.        ,  0.83695079,  0.        , -1.        ,
         1.        ],
       [ 0.98145692,  0.        ,  0.04985608,  0.        , -1.        ,
         1.        ],
       [ 0.69588898,  0.        ,  0.30091034,  0.        ,  1.        ,
         0.        ],
       [ 0.65480695,  0.        ,  0.72952473,  1.        , -1.        ,
         0.        ],
       [ 0.07486903,  0.        ,  0.39324444,  1.        ,  1.        ,
         1.        ]])

In [180]:
experiences = experiences.astype(np.float32)
experiences = experiences[:5]
targeted_experience = actor_target.process_sequence_of_experiences(experiences)
sequences, zeroed_rews = critic_target.process_sequence(targeted_experience)
labels_critic = critic_target.give_td_error_Kennedy_guess( sequences, zeroed_rews)



To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



targeted_experience = $(a^{target}_0, o_1, a^{target}_1, o_2, guess^{(targ = prim)}, reward)$

In [181]:
targeted_experience

array([[ 0.6910216 ,  0.        ,  0.5089872 ,  0.        , -1.        ,
         1.        ],
       [ 0.98145694,  0.        ,  0.5089872 ,  0.        , -1.        ,
         1.        ],
       [ 0.695889  ,  0.        ,  0.5089872 ,  0.        ,  1.        ,
         0.        ],
       [ 0.654807  ,  0.        ,  0.5089872 ,  1.        , -1.        ,
         0.        ],
       [ 0.07486903,  0.        ,  0.5089872 ,  1.        ,  1.        ,
         1.        ]], dtype=float32)

In [182]:
sequences, zeroed_rews = critic_target.process_sequence(targeted_experience)
sequences

array([[[-7.        ,  0.69102162],
        [ 0.        ,  0.83695078],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.98145694],
        [ 0.        ,  0.04985608],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.695889  ],
        [ 0.        ,  0.30091032],
        [ 0.        ,  1.        ]],

       [[-7.        ,  0.65480697],
        [ 0.        ,  0.72952473],
        [ 1.        , -1.        ]],

       [[-7.        ,  0.07486903],
        [ 0.        ,  0.39324445],
        [ 1.        ,  1.        ]]])

$\texttt{labels_critic = critic_target.give_td_error_Kennedy_guess( sequences, zeroed_rews)}$

this gives $r_{t+1} + Q(s_{t+1}, a^{target}(s_{t+1}))$, but for the last time-step that gives
$max_guess Q(h_L, g)$

In [183]:
labels_critic

array([[[0.5973852 , 0.61283683, 1.        ]],

       [[0.60361261, 0.61038331, 1.        ]],

       [[0.60347774, 0.61117616, 0.        ]],

       [[0.59924107, 0.61901563, 0.        ]],

       [[0.60647486, 0.61481086, 1.        ]]])

array([[[-7.        ,  0.69102162],
        [ 0.        ,  0.83695078],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.98145694],
        [ 0.        ,  0.04985608],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.695889  ],
        [ 0.        ,  0.30091032],
        [ 0.        ,  1.        ]],

       [[-7.        ,  0.65480697],
        [ 0.        ,  0.72952473],
        [ 1.        , -1.        ]],

       [[-7.        ,  0.07486903],
        [ 0.        ,  0.39324445],
        [ 1.        ,  1.        ]]])

Up to here we processed the sample from the buffer (whose form is mandatorily (a_0, o_1, a_1, o_2, a_3, ..., o_L, g=a_L+1, reward)

In [185]:
preds_critic = critic(sequences)
preds_critic

<tf.Tensor: shape=(5, 3, 1), dtype=float64, numpy=
array([[[0.48384659],
        [0.44745938],
        [0.46690037]],

       [[0.4811861 ],
        [0.44813932],
        [0.46690939]],

       [[0.48382078],
        [0.44667874],
        [0.46415958]],

       [[0.48403872],
        [0.44678101],
        [0.46208075]],

       [[0.48350252],
        [0.44920999],
        [0.45986259]]])>

In [187]:
loss_critic = tf.keras.losses.MSE(labels_critic, preds_critic)
loss_critic

<tf.Tensor: shape=(5, 3), dtype=float64, numpy=
array([[0.09864795, 0.1183762 , 0.10750632],
       [0.10028268, 0.11834842, 0.10781938],
       [0.08820658, 0.08372241, 0.08548918],
       [0.08859461, 0.08417403, 0.08565338],
       [0.09971124, 0.11850851, 0.11241751]])>

In [188]:
loss_critic = tf.reduce_mean(loss_critic)
loss_critic

<tf.Tensor: shape=(), dtype=float64, numpy=0.09983055987304452>

Now it comes the actor optimziation step.

<br> 
<br>
Firstly, we'll predict the actor's output on a batch, so we need to set stateful=False. We also reset the states.

In [189]:
actor.lstm.reset_states()
actor.lstm.stateful=False ### this is because the mask has trouble with differing the batch_size


Now we have to tape.watch the actions that would be output by the actor for each state (it's just a reshape of experiences)

In [190]:
actions_indexed = [0.]*(actor.dolinar_layers)
with tf.GradientTape() as tape:
    ##### get the actions only ######
    actions_with_outcomes = experiences.copy()
    act_ind=0
    for ind in range(len(experiences)): #experiences.shape[0] = 2L +2
        if (ind%2 == 0)&(ind < 2*actor.dolinar_layers):
            ac = tf.convert_to_tensor(np.reshape(experiences[:,ind], (len(experiences),1,1)))
            actions_indexed[act_ind] = ac
            act_ind+=1
    actions_indexed = tf.concat(actions_indexed,axis=1)
    tape.watch(actions_indexed) ####watch the ations

Let's compare them with experiences:

In [192]:
experiences

array([[ 0.6910216 ,  0.        ,  0.8369508 ,  0.        , -1.        ,
         1.        ],
       [ 0.98145694,  0.        ,  0.04985608,  0.        , -1.        ,
         1.        ],
       [ 0.695889  ,  0.        ,  0.30091032,  0.        ,  1.        ,
         0.        ],
       [ 0.654807  ,  0.        ,  0.72952473,  1.        , -1.        ,
         0.        ],
       [ 0.07486903,  0.        ,  0.39324445,  1.        ,  1.        ,
         1.        ]], dtype=float32)

In [191]:
actions_indexed

<tf.Tensor: shape=(5, 2, 1), dtype=float32, numpy=
array([[[0.6910216 ],
        [0.8369508 ]],

       [[0.98145694],
        [0.04985608]],

       [[0.695889  ],
        [0.30091032]],

       [[0.654807  ],
        [0.72952473]],

       [[0.07486903],
        [0.39324445]]], dtype=float32)>

Now we have to merge this watched variables (actions) to the inputs of the critic, in a tensor of shape (batch_size, L+1, 2),

to obtain Q(h_l, a_l), {h_l, a_l} in buffer

In [206]:
### now prepare the state acions to put them into the critic###
padded_data = [tf.ones((experiences.shape[0],1))*actor.pad_value]
watched_input_critic  = padded_data.copy()
watched_input_critic

[<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
 array([[-7.],
        [-7.],
        [-7.],
        [-7.],
        [-7.]], dtype=float32)>]

In [207]:
tf.unstack(tf.convert_to_tensor(experiences[:,:-1]),axis=1)

[<tf.Tensor: shape=(5,), dtype=float32, numpy=
 array([0.6910216 , 0.98145694, 0.695889  , 0.654807  , 0.07486903],
       dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 0., 0., 0.], dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=float32, numpy=
 array([0.8369508 , 0.04985608, 0.30091032, 0.72952473, 0.39324445],
       dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 0., 1., 1.], dtype=float32)>,
 <tf.Tensor: shape=(5,), dtype=float32, numpy=array([-1., -1.,  1., -1.,  1.], dtype=float32)>]

In [208]:
ind_actions=0
for ind,k in enumerate(tf.unstack(tf.convert_to_tensor(experiences[:,:-1]),axis=1)): #notice we get rid of the rewards here
    if (ind%2==0)&(ind < 2*actor.dolinar_layers):
        padded_data.append(actions_indexed[:,ind_actions]) ### i add the input of the critic the watched actions!
        ind_actions+=1
    else:
        padded_data.append(tf.expand_dims(k, axis=1))
    if ind == 0:
        watched_input_critic = tf.stack([padded_data[0], padded_data[1]], axis=2) #importantly i put the padd first (state_action.)
    if (ind%2 == 0)&(ind!=0):
        intermediate = tf.stack([padded_data[ind], padded_data[ind+1]], axis=2)
        watched_input_critic = tf.concat([watched_input_critic, intermediate], axis=1)

In [210]:
watched_input_critic

<tf.Tensor: shape=(5, 3, 2), dtype=float32, numpy=
array([[[-7.        ,  0.6910216 ],
        [ 0.        ,  0.8369508 ],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.98145694],
        [ 0.        ,  0.04985608],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.695889  ],
        [ 0.        ,  0.30091032],
        [ 0.        ,  1.        ]],

       [[-7.        ,  0.654807  ],
        [ 0.        ,  0.72952473],
        [ 1.        , -1.        ]],

       [[-7.        ,  0.07486903],
        [ 0.        ,  0.39324445],
        [ 1.        ,  1.        ]]], dtype=float32)>

Notice this is just the same as sequences, but now the tape is watching the actions

In [214]:
watched_input_critic - sequences

<tf.Tensor: shape=(5, 3, 2), dtype=float32, numpy=
array([[[0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.]]], dtype=float32)>

array([[[-7.        ,  0.69102162],
        [ 0.        ,  0.83695078],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.98145694],
        [ 0.        ,  0.04985608],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.695889  ],
        [ 0.        ,  0.30091032],
        [ 0.        ,  1.        ]],

       [[-7.        ,  0.65480697],
        [ 0.        ,  0.72952473],
        [ 1.        , -1.        ]],

       [[-7.        ,  0.07486903],
        [ 0.        ,  0.39324445],
        [ 1.        ,  1.        ]]])

In [216]:
experiences = experiences.astype(np.float32)
targeted_experience = actor_target.process_sequence_of_experiences(experiences)
sequences, zeroed_rews = critic_target.process_sequence(targeted_experience)
labels_critic = critic_target.give_td_error_Kennedy_guess( sequences, zeroed_rews)
#
###### train the critic ######
with tf.GradientTape() as tape:
    tape.watch(critic.trainable_variables)
    preds_critic = critic(sequences)
    loss_critic = tf.keras.losses.MSE(labels_critic, preds_critic)
    loss_critic = tf.reduce_mean(loss_critic)
    grads = tape.gradient(loss_critic, critic.trainable_variables)
    optimizer_critic.apply_gradients(zip(grads, critic.trainable_variables))
    loss_critic = np.squeeze(loss_critic.numpy())
#
#
#actor.lstm.reset_states()
actor.lstm.stateful=False ### this is because the mask has trouble with differing the batch_size

actions_indexed = [0.]*(actor.dolinar_layers)
with tf.GradientTape() as tape:
    ##### get the actions only ######
    actions_with_outcomes = experiences.copy()
    act_ind=0
    for ind in range(len(experiences)): #experiences.shape[0] = 2L +2
        if (ind%2 == 0)&(ind < 2*actor.dolinar_layers):
            ac = tf.convert_to_tensor(np.reshape(experiences[:,ind], (len(experiences),1,1)))
            actions_indexed[act_ind] = ac
            act_ind+=1
    actions_indexed = tf.concat(actions_indexed,axis=1)
    tape.watch(actions_indexed) ####watch the ations

    ### now prepare the state acions to put them into the critic###
    padded_data = [tf.ones((experiences.shape[0],1))*actor.pad_value]
    watched_input_critic  = padded_data.copy()
    ind_actions=0
    for ind,k in enumerate(tf.unstack(tf.convert_to_tensor(experiences[:,:-1]),axis=1)):
        if (ind%2==0)&(ind < 2*actor.dolinar_layers):
            padded_data.append(actions_indexed[:,ind_actions]) ### i add the input of the critic the watched actions!
            ind_actions+=1
        else:
            padded_data.append(tf.expand_dims(k, axis=1))
        if ind == 0:
            watched_input_critic = tf.stack([padded_data[0], padded_data[1]], axis=2) #importantly i put the padd first (state_action.)
        if (ind%2 == 0)&(ind!=0):
            intermediate = tf.stack([padded_data[ind], padded_data[ind+1]], axis=2)
            watched_input_critic = tf.concat([watched_input_critic, intermediate], axis=1)

    qvals = critic(watched_input_critic)
    dq_da = tape.gradient(qvals, actions_indexed)

with tf.GradientTape() as tape:

    pads = np.ones(len(experiences)).astype(np.float32)*actor.pad_value
    news = np.random.rand(experiences.shape[0], experiences.shape[1]+1).astype(np.float32)
    news[:,1:] = experiences
    news[:,0] = pads
    instances_actor = [i for i in range(0,2*actor.dolinar_layers,2)]
    actionss = actor(np.reshape(news[:,instances_actor], (experiences.shape[0],actor.dolinar_layers,1)).astype(np.float32))

    da_dtheta = tape.gradient(actionss, actor.trainable_variables, output_gradients=-dq_da)

#
optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))



To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float32 by default, call `tf.keras.backend.set_floatx('float32')`. To change just this layer, pass dtype='float32' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=1>

In [217]:
da_dtheta

[<tf.Tensor: shape=(1, 2000), dtype=float64, numpy=array([[0., 0., 0., ..., 0., 0., 0.]])>,
 <tf.Tensor: shape=(500, 2000), dtype=float64, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])>,
 <tf.Tensor: shape=(2000,), dtype=float64, numpy=array([0., 0., 0., ..., 0., 0., 0.])>,
 <tf.Tensor: shape=(500, 250), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(250,), dtype=float32, numpy=
 array([-2.17598850e-09, -8.45104875e-10,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -1.65575287e-09,  0.00000000e+0