In [43]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
import numpy as np

class Critic(tf.keras.Model):
    """Recurrent state-action value network.

    Architecture (see ``call``): Masking -> LSTM(500, return_sequences=True)
    -> Dense(250) -> Dense(100) -> Dense(100), each followed by ReLU,
    -> Dense(1) followed by a sigmoid. One value is produced per time-step.

    Inputs are sequences of ``dolinar_layers + 1`` steps with two features:
    ``(a0, pad), (o1, a1), ..., (o_L, guess)``.
    """

    def __init__(self,nature, valreg=0.01, seed_val=0.3, pad_value=-7., dolinar_layers=2, tau=0.01):
        '''
        nature: label of the network; only a network whose nature is "target"
            is allowed to compute TD labels (give_td_error_Kennedy_guess).
        valreg: regularisation value (l1 on kernels, l2 on activities).
        seed_val: half-width of the uniform interval used to initialise the
            Dense kernels and biases.
        pad_value: value not considered by the lstm (masked).
        dolinar_layers: number of photodetections.
        tau: interpolation factor of the soft target update.
        '''
        super(Critic,self).__init__()

        self.pad_value = pad_value
        self.nature = nature
        self.dolinar_layers = dolinar_layers
        # (beta1, pad), (n1, beta2), (n2, guess). In general there are (layers+1) steps.
        # NOTE(review): input_shape is only a hint here; the sequences produced by
        # process_sequence have dolinar_layers+1 steps.
        self.mask = tf.keras.layers.Masking(mask_value=pad_value,
                                  input_shape=(self.dolinar_layers, 2))
        self.lstm = tf.keras.layers.LSTM(500, return_sequences=True)

        self.tau = tau
        # Layers are created in the same order as before so that the weight
        # ordering returned by get_weights() is unchanged.
        self.l1 = self._dense(250, valreg, seed_val)
        self.l2 = self._dense(100, valreg, seed_val)
        self.l3 = self._dense(100, valreg, seed_val)
        self.l4 = self._dense(1, valreg, seed_val)

    @staticmethod
    def _dense(units, valreg, seed_val):
        """Build a Dense layer with the uniform init / l1-l2 regularisation shared by all heads."""
        return Dense(
            units,
            kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
            bias_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
            kernel_regularizer=tf.keras.regularizers.l1(valreg),
            activity_regularizer=tf.keras.regularizers.l2(valreg),
        )

    def update_target_parameters(self,primary_net):
        """Soft update: self <- tau * primary_net + (1 - tau) * self, weight by weight."""
        blended = [
            self.tau * primary + (1 - self.tau) * target
            for primary, target in zip(primary_net.get_weights(), self.get_weights())
        ]
        self.set_weights(blended)
        return

    def call(self, inputs):
        """Return one sigmoid-squashed value per time-step of ``inputs``."""
        feat = self.mask(inputs)
        feat = self.lstm(feat)
        feat = tf.nn.relu(self.l1(feat))
        feat = tf.nn.relu(self.l2(feat))
        feat = tf.nn.relu(self.l3(feat))
        feat = tf.nn.sigmoid(self.l4(feat))
        return feat

    def process_sequence(self,sample_buffer):
        """
        Turn a batch of flat experiences into critic inputs and per-step rewards.

        sample_buffer: array with one experiment per row, read as
            [a0, o1, a1, o2, ..., o_L, guess, ...]; the LAST column is used
            as the reward.

        Returns (padded_data, rewards_obtained):
            padded_data: shape (N, dolinar_layers+1, 2),
                [[a0, pad], [o1, a1], ..., [o_L, guess]] for every row.
            rewards_obtained: shape (N, dolinar_layers+1), zeros everywhere
                except the last step, which holds the reward.
        """
        sample_buffer = np.asarray(sample_buffer)
        batch_size = sample_buffer.shape[0]
        steps = self.dolinar_layers + 1

        padded_data = np.full((batch_size, steps, 2), self.pad_value, dtype=float)
        padded_data[:, 0, 0] = sample_buffer[:, 0]
        for k in range(1, steps):
            # Step k pairs outcome o_k (column 2k-1) with the action/guess that
            # follows it (column 2k). Indexing with [k, k+1] is only right for k=1.
            padded_data[:, k] = sample_buffer[:, [2 * k - 1, 2 * k]]

        rewards_obtained = np.zeros((batch_size, steps))
        rewards_obtained[:, -1] = sample_buffer[:, -1]
        return padded_data, rewards_obtained

    def pad_single_sequence(self, seq):
        """
        input: [a0, o1, a1, o2, a2, ...] (at least 2*dolinar_layers+1 entries)

        output: array of shape (1, dolinar_layers+1, 2):
            [[a0, pad], [o1, a1], [o2, a2], ...]

        The result can be fed to the network to predict the greedy guess/action.
        """
        padded_data = np.ones((1,self.dolinar_layers+1, 2))*self.pad_value
        padded_data[0][0][0] = seq[0]
        for k in range(1,self.dolinar_layers+1):
            # (o_k, a_k) live at positions 2k-1 and 2k of the flat sequence.
            padded_data[0][k] = seq[(2 * k - 1):(2 * k + 1)]
        return padded_data

    def give_td_error_Kennedy_guess(self,batched_input,sequential_rews_with_zeros):
        """
        Build the TD labels for the critic (target network only).

        batched_input: array (N, dolinar_layers+1, 2) as returned by
            process_sequence, where the actions (but the first one) are the
            ones given by the target actor.
        sequential_rews_with_zeros: array (N, dolinar_layers+1) as returned
            by process_sequence.

        Labels: step k < L-1 gets Q(step k+1) + r_k, step L-1 (last before
        the guess) gets max over both guesses of Q(last step), and the last
        step keeps the reward.

        Returns an array of shape (N, 1, dolinar_layers+1).
        NOTE(review): call() returns (N, dolinar_layers+1, 1); this shape
        does not line up element-wise with it - confirm what callers expect.

        Raises AttributeError if this network is not the target.
        """
        if self.nature != "target":
            raise AttributeError("I'm not the target!")
        b = batched_input.copy()
        ll = sequential_rews_with_zeros.copy()

        # One forward pass is enough: b is not modified until the guess flip below.
        preds1 = self(b)
        q_taken = preds1.numpy()[:, :, 0]
        for k in range(0,self.dolinar_layers-1):
            ll[:,k] = q_taken[:,k+1] + ll[:,k]

        # Flip the sign of the guess (last feature of the last step) to
        # evaluate the alternative guess.
        b[:, -1, -1] = -b[:, -1, -1]
        preds2 = self(b)
        both = tf.concat([preds1,preds2],2)
        maxs = tf.math.reduce_max(both,axis=2).numpy()
        ll[:,-2] = maxs[:,-1] # This is the last before the guess, so the label is max_g Q(h_L, g)
        ll = np.expand_dims(ll,axis=1)
        return ll

    def give_favourite_guess(self,sequence_with_plus):
        """
        Return the guess (+1 or -1) with the highest predicted value.

        Important: the guess slot must hold +1, e.g. for one layer the
        sequence should be [[beta, pad], [outcome, 1]]. Only the first
        sequence of the batch is considered. The input is not modified.
        """
        plus = np.array(sequence_with_plus, copy=True)
        minus = plus.copy()
        minus[:, -1, -1] = -minus[:, -1, -1]

        pred_1 = self(plus)
        pred_2 = self(minus)
        both = tf.concat([pred_1,pred_2],2)
        # index 0 -> guess as given (+1) is best, index 1 -> flipped guess (-1) is best
        maxs = tf.argmax(both,axis=2).numpy()[0, -1]

        guess = (-1)**maxs
        return  guess




##### ACTOR CLASSS ####
class Actor(tf.keras.Model):
    """Recurrent policy network.

    Architecture (see ``call``): Masking -> LSTM(500, return_sequences=True)
    -> Dense(250) -> Dense(100) -> Dense(100), each followed by ReLU,
    -> Dense(1) followed by a sigmoid.

    nature: "primary" builds a stateful LSTM (fed one step at a time),
        "target" builds a stateless LSTM (fed whole sequences).
    valreg: regularisation value (l1 on kernels, l2 on activities).
    seed_val: half-width of the uniform interval used to initialise the
        Dense kernels and biases.
    pad_value: value masked out before the LSTM.
    dolinar_layers: number of photodetections.
    tau: interpolation factor of the soft target update.

    Raises ValueError if ``nature`` is neither "primary" nor "target".
    """

    def __init__(self, nature, valreg=0.01, seed_val=0.1, pad_value = -7.,
                 dolinar_layers=2,tau=0.01):
        super(Actor,self).__init__()
        self.dolinar_layers = dolinar_layers
        self.pad_value = pad_value
        self.nature = nature
        self.tau = tau

        if nature == "primary":
            self.lstm = tf.keras.layers.LSTM(500, return_sequences=True, stateful=True)
            self.mask = tf.keras.layers.Masking(mask_value=pad_value,
                                  input_shape=(1,1))#CHECK
        elif nature == "target":
            self.lstm = tf.keras.layers.LSTM(500, return_sequences=True, stateful=False)
            self.mask = tf.keras.layers.Masking(mask_value=pad_value,
                                  input_shape=(self.dolinar_layers, 1)) # the whole sequence is fed at once
        else:
            # Without this the object would be left with no lstm/mask and fail
            # later, far from the actual mistake.
            raise ValueError(
                'nature must be either "primary" or "target", got {!r}'.format(nature))

        self.l1 = self._dense(250, valreg, seed_val)
        self.l2 = self._dense(100, valreg, seed_val)
        self.l3 = self._dense(100, valreg, seed_val)
        self.l4 = self._dense(1, valreg, seed_val)

    @staticmethod
    def _dense(units, valreg, seed_val):
        """Build a Dense layer with the uniform init / l1-l2 regularisation shared by all heads."""
        return Dense(
            units,
            kernel_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
            bias_initializer=tf.random_uniform_initializer(minval=-seed_val, maxval=seed_val),
            kernel_regularizer=tf.keras.regularizers.l1(valreg),
            activity_regularizer=tf.keras.regularizers.l2(valreg),
        )

    def update_target_parameters(self,primary_net):
        """Soft update: self <- tau * primary_net + (1 - tau) * self, weight by weight."""
        blended = [
            self.tau * primary + (1 - self.tau) * target
            for primary, target in zip(primary_net.get_weights(), self.get_weights())
        ]
        self.set_weights(blended)
        return

    def call(self, inputs):
        """Return one sigmoid-squashed output per time-step of ``inputs``."""
        feat = self.mask(inputs)
        feat = self.lstm(feat)
        feat = tf.nn.relu(self.l1(feat))
        feat = tf.nn.relu(self.l2(feat))
        feat = tf.nn.relu(self.l3(feat))
        feat = tf.nn.sigmoid(self.l4(feat))

        return feat

    def process_sequence_of_experiences(self, experiences):
        """
        Replace the intermediate actions of a batch of experiences by this
        network's own outputs.

        experiences: array with one experiment per row,
            (beta1, o1, beta2, o2, ..., o_L, guess, ...)

        Returns a copy in which every action that follows an outcome o_k,
        k = 1..L-1 (columns 2, 4, ..., 2L-2), is replaced by the output of
        this network when fed that single outcome. The first action, the
        last outcome, the guess and anything after it are left untouched.

        For the primary network this is meant to reproduce the actions that
        generated the experience; for the target it gives the actions the
        target would have taken.
        NOTE(review): the primary LSTM is stateful, so its output here also
        depends on the state left by previous calls - confirm this is intended.
        """
        export = experiences.copy()
        batch = experiences.shape[0]
        # from the first outcome to the last one but one (the guess is never replaced)
        for index in range(1,2*self.dolinar_layers-1,2):
            outcomes = np.reshape(np.array(export[:,index]), (batch,1,1))
            export[:,index+1] = np.reshape(self(outcomes), (batch,))
        return export

    def __str__(self):
        return self.name


In [54]:
def optimization_step(experiences, critic, critic_target, actor, actor_target, optimizer_critic, optimizer_actor):
    """Run one critic update on a batch of experiences and return the critic loss.

    experiences: numpy array, one experiment per row, in the layout expected
        by ``actor_target.process_sequence_of_experiences`` and
        ``critic_target.process_sequence``.
    critic, critic_target, actor, actor_target: the four networks.
    optimizer_critic: optimizer applied to the critic's trainable variables.
    optimizer_actor: currently unused (the actor update is not implemented yet).

    Returns the mean-squared TD loss of the critic (numpy scalar), evaluated
    before the gradient step.
    """
    targeted_experience = actor_target.process_sequence_of_experiences(experiences)
    sequences, zeroed_rews = critic_target.process_sequence(targeted_experience)
    labels_critic = critic_target.give_td_error_Kennedy_guess(sequences, zeroed_rews)

    ###### train the critic ######
    with tf.GradientTape() as tape:
        preds_critic = critic(sequences)
        # The labels hold one value per time-step but come with the singleton
        # axis in a different position than the predictions. Without aligning
        # them, MSE would broadcast every prediction against every label.
        labels = tf.reshape(tf.cast(labels_critic, preds_critic.dtype), tf.shape(preds_critic))
        loss_critic = tf.reduce_mean(tf.keras.losses.MSE(labels, preds_critic))
    # Trainable variables are watched automatically; gradients are taken
    # outside the tape context so the update itself is not recorded.
    grads = tape.gradient(loss_critic, critic.trainable_variables)
    optimizer_critic.apply_gradients(zip(grads, critic.trainable_variables))
    loss_critic = np.squeeze(loss_critic.numpy())

    ###### actor part (work in progress) ######
    actor.lstm.reset_states()
    actor.lstm.stateful=False
    try:
        with tf.GradientTape() as tape:
            actions_with_outcomes = experiences.copy()
            actions_indexed = []
            # from the first outcome to the last one but one (the guess is excluded)
            for index in range(1,2*actor.dolinar_layers-1,2):
                outcomes = np.reshape(actions_with_outcomes[:,index],
                                      (len(actions_with_outcomes[:,index]), 1, 1))
                ac = actor_target(outcomes)
                actions_indexed.append(ac)
                actions_with_outcomes[:,index+1] = np.squeeze(ac)
            tape.watch(actions_indexed)
            bbs, _ = critic.process_sequence(actions_with_outcomes)
            qvals = critic(bbs)
        # NOTE(review): the actions reach the critic through a numpy array, so
        # the tape has no path from qvals back to actions_indexed and the
        # entries of dq_da are None. The critic input must be assembled with
        # TensorFlow ops before dq_da can drive the actor update.
        # TODO: use dq_da and optimizer_actor to update the actor:
        #   da_dtheta = tape.gradient(actions, actor.trainable_variables, output_gradients=-dq_da)
        #   optimizer_actor.apply_gradients(zip(da_dtheta, actor.trainable_variables))
        dq_da = tape.gradient(qvals, actions_indexed)
    finally:
        # Always hand the actor back in stateful mode, even if something above fails.
        actor.lstm.stateful=True
    return loss_critic

In [423]:
experiences

array([[ 0.6910216 ,  0.        ,  0.8369508 ,  0.        , -1.        ],
       [ 0.98145694,  0.        ,  0.04985608,  0.        , -1.        ],
       [ 0.695889  ,  0.        ,  0.30091032,  0.        ,  1.        ],
       [ 0.654807  ,  0.        ,  0.72952473,  1.        , -1.        ]],
      dtype=float32)

In [55]:
# Build the four networks and their optimizers, load a buffer of
# experiences and run a single optimisation step.
amplitude = 0.4
lr_critic = lr_actor = 0.01

critic = Critic(nature="primary",valreg=0.01)
critic_target = Critic(nature="target")
actor = Actor(nature="primary")
actor_target = Actor(nature="target")

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer_critic = tf.keras.optimizers.Adam(learning_rate=lr_critic)
optimizer_actor = tf.keras.optimizers.Adam(learning_rate=lr_actor) #0.001 works well

experiences = np.load("expe_2L.npy")

# Call the primary actor once on a single padded step: this builds its
# weights and gives the first action it would currently choose.
input_actor = np.reshape(np.array([actor.pad_value]),(1,1,1))
beta_would_do = np.squeeze(actor(input_actor))

new_loss = optimization_step(experiences,critic, critic_target, actor, actor_target, optimizer_critic, optimizer_actor)




To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

0


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('flo

In [9]:
bbs, ll = critic.process_sequence(experiences)

In [19]:
# Sanity check: gradient of the MSE (against an all-ones target) with
# respect to the critic's predictions themselves.
with tf.GradientTape() as tape:
    preds = critic(bbs)
    tape.watch(preds)
    target_shape = preds.numpy().shape
    ll = tf.keras.losses.MSE(np.ones(target_shape), preds)
grads = tape.gradient(ll, preds)

In [21]:

grads

<tf.Tensor: shape=(100, 3, 1), dtype=float32, numpy=
array([[[-9.41753387e-06],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-8.10623169e-06],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-9.29832458e-06],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-9.53674316e-06],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.29938126e-05],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.04904175e-05],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-8.94069672e-06],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-8.58306885e-06],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-9.05990601e-06],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.02519989e-05],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.25169754e-05],
        [ 0.00000000e+00],
        [ 0.00000000e+00]],

       [[-1.02519989e-05],
       

In [None]:
experiences = experiences[:4]

In [422]:
experiences = experiences[:,:-1]

In [339]:
# Scratch cell: compute dq_da, the gradient of the critic's values with
# respect to the actions, building the critic input with TensorFlow ops only.
# Relies on `experiences`, `actor`, `critic` and `batch_size` from other cells.
actions_indexed = [0.]*(actor.dolinar_layers)
actions_indexed[0] = tf.convert_to_tensor(np.reshape(experiences[:,0], (len(experiences),1,1)))

with tf.GradientTape() as tape:
    ##### get the actions only ######
    actions_with_outcomes = experiences.copy()
    act_ind=0
    # NOTE(review): `ind` runs over range(len(experiences)) (number of rows) but
    # is used as a COLUMN index below; presumably experiences.shape[1] was meant.
    # `ind != len(experiences)` is always true inside this range.
    for ind in range(len(experiences)):
        if (ind%2 == 0)&(ind!=len(experiences)):
            # even columns are taken as actions, each reshaped to (rows, 1, 1)
            ac = tf.convert_to_tensor(np.reshape(experiences[:,ind], (len(experiences),1,1)))
            actions_indexed[act_ind] = ac
            act_ind+=1
    actions_indexed = tf.concat(actions_indexed,axis=1)
    tape.watch(actions_indexed) #### watch the actions
    
    ### now prepare the state-actions to feed them into the critic ###
    padded_data = [tf.ones((batch_size,1))*actor.pad_value]
    watched_input_critic  = padded_data.copy()
    ind_actions=0
    # walk over the columns of experiences; padded_data[0] is the pad column,
    # so column `ind` ends up at padded_data[ind+1]
    for ind,k in enumerate(tf.unstack(tf.convert_to_tensor(experiences),axis=1)):
        if (ind%2==0)&(ind != len(experiences)):
            padded_data.append(actions_indexed[:,ind_actions]) ### the critic input uses the watched actions, not the raw column
            ind_actions+=1
        else:
            padded_data.append(tf.expand_dims(k, axis=1))
        if ind == 0:
            watched_input_critic = tf.stack([padded_data[0], padded_data[1]], axis=2) # importantly the pad goes first: (pad, action)
        if (ind%2 == 0)&(ind!=0):
            # pair the previous column (padded_data[ind]) with the current one (padded_data[ind+1])
            intermediate = tf.stack([padded_data[ind], padded_data[ind+1]], axis=2)
            watched_input_critic = tf.concat([watched_input_critic, intermediate], axis=1)
    
    qvals = critic(watched_input_critic)
    dq_da = tape.gradient(qvals, actions_indexed)


In [419]:
# Scratch cell: back-propagate -dq_da (computed in another cell) through the
# actor to get the gradient with respect to the actor's trainable variables.
with tf.GradientTape() as tape:

    pads = np.ones(len(experiences))*actor.pad_value
    # `news` is experiences with one extra leading column holding the pad
    # value; the random values are only placeholders and are all overwritten.
    news = np.random.rand(experiences.shape[0], experiences.shape[1]+1)
    news[:,1:] = experiences
    news[:,0] = pads
    # columns 0, 2, ..., 2*dolinar_layers-2 of `news` are fed to the actor
    instances_actor = [i for i in range(0,2*actor.dolinar_layers,2)]
    actionss = actor(np.reshape(news[:,instances_actor], (experiences.shape[0],actor.dolinar_layers,1)))
    
    # NOTE(review): output_gradients must match the shape of actionss - confirm against dq_da
    da_dtheta = tape.gradient(actionss, actor.trainable_variables, output_gradients=-dq_da)


[<tf.Tensor: shape=(1, 2000), dtype=float32, numpy=array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(500, 2000), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(2000,), dtype=float32, numpy=array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)>,
 <tf.Tensor: shape=(500, 250), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(250,), dtype=float32, numpy=
 array([ 3.1427753e-03, -5.9653060e-03,  0.0000000e+00,  0.0000000e+00,
         1.3609142e-03,  2.61710

In [213]:
experiences = experiences.astype(np.float32)

In [214]:
# Scratch cell: assemble the critic input step by step with TensorFlow ops.
# Relies on `batch_size`, `actions_indexed` and `experiences` from other cells.
padded_data = tf.ones((batch_size,1))*actor.pad_value
# step 0: (pad, first action)
first_step = tf.stack([padded_data, actions_indexed[:,0]], axis=2)
# step 1: (column 1 of experiences, second action)
step_2 = tf.stack([np.expand_dims(experiences[:,1],1), actions_indexed[:,1]], axis=2)
step_2 = tf.concat([first_step, step_2], axis=1)
# last step: third-to-last and second-to-last columns of experiences
final_step =  tf.stack([np.expand_dims(experiences[:,-3],1), np.expand_dims(experiences[:,-2],1)], axis=2)
step_3 = tf.concat([step_2,final_step], axis=1)
step_3

<tf.Tensor: shape=(4, 3, 2), dtype=float32, numpy=
array([[[-7.        ,  0.49959072],
        [ 0.        ,  0.4999036 ],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.49959072],
        [ 0.        ,  0.49960613],
        [ 0.        , -1.        ]],

       [[-7.        ,  0.49959072],
        [ 0.        ,  0.4996912 ],
        [ 0.        ,  1.        ]],

       [[-7.        ,  0.49959072],
        [ 0.        ,  0.49986365],
        [ 1.        , -1.        ]]], dtype=float32)>

In [207]:
step_2

<tf.Tensor: shape=(4, 2, 2), dtype=float32, numpy=
array([[[-7.        ,  0.49959072],
        [ 0.        ,  0.4999036 ]],

       [[-7.        ,  0.49959072],
        [ 0.        ,  0.49960613]],

       [[-7.        ,  0.49959072],
        [ 0.        ,  0.4996912 ]],

       [[-7.        ,  0.49959072],
        [ 0.        ,  0.49986365]]], dtype=float32)>

In [203]:
experiences[:,4]

array([-1., -1.,  1., -1.])

In [204]:
experiences[:,5]

array([1., 1., 0., 0.])

#### for k in range(1,self.dolinar_layers+1):
    padded_data[:,k] = data[:,[k,k+1]]

rewards_obtained = np.zeros((batch_size, self.dolinar_layers+1))
rewards_obtained[:,-1] = sample_buffer[:,-1]

In [143]:
actions_indexed

[<tf.Tensor: shape=(4, 1, 1), dtype=float32, numpy=
 array([[[0.49959072]],
 
        [[0.49959072]],
 
        [[0.49959072]],
 
        [[0.49959072]]], dtype=float32)>,
 <tf.Tensor: shape=(4, 1, 1), dtype=float32, numpy=
 array([[[0.49959072]],
 
        [[0.49959072]],
 
        [[0.49959072]],
 
        [[0.49959072]]], dtype=float32)>]

In [141]:
batch_size = experiences.shape[0]
data = experiences[:,0:(actor.dolinar_layers+1+1)]


In [139]:
padded_data = np.ones((batch_size,self.dolinar_layers+1, 2))*self.pad_value
padded_data[:,0][:,0] = data[:,0]
for k in range(1,self.dolinar_layers+1):
    padded_data[:,k] = data[:,[k,k+1]]

rewards_obtained = np.zeros((batch_size, self.dolinar_layers+1))
rewards_obtained[:,-1] = sample_buffer[:,-1]

[<tf.Tensor: shape=(4, 1, 1), dtype=float32, numpy=
 array([[[0.49959072]],
 
        [[0.49959072]],
 
        [[0.49959072]],
 
        [[0.49959072]]], dtype=float32)>,
 <tf.Tensor: shape=(4, 1, 1), dtype=float32, numpy=
 array([[[0.49959072]],
 
        [[0.49959072]],
 
        [[0.49959072]],
 
        [[0.49959072]]], dtype=float32)>]

In [81]:

batch_size = experiences.shape[0]
data = sample_buffer[:,0:(self.dolinar_layers+1+1)]
padded_data = np.ones((batch_size,self.dolinar_layers+1, 2))*self.pad_value
padded_data[:,0][:,0] = data[:,0]
for k in range(1,self.dolinar_layers+1):
    padded_data[:,k] = data[:,[k,k+1]]

rewards_obtained = np.zeros((batch_size, self.dolinar_layers+1))
rewards_obtained[:,-1] = sample_buffer[:,-1]

array([[ 0.69102163,  0.        ,  0.83695079,  0.        , -1.        ,
         1.        ],
       [ 0.98145692,  0.        ,  0.04985608,  0.        , -1.        ,
         1.        ],
       [ 0.69588898,  0.        ,  0.30091034,  0.        ,  1.        ,
         0.        ],
       [ 0.65480695,  0.        ,  0.72952473,  1.        , -1.        ,
         0.        ]])

In [80]:
actions_indexed

<tf.Tensor: shape=(4, 1, 1), dtype=float32, numpy=
array([[[0.49959072]],

       [[0.49959072]],

       [[0.49959072]],

       [[0.49959072]]], dtype=float32)>

array([0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1.])

<tf.Tensor: shape=(100, 1, 1), dtype=float32, numpy=
array([[[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5804997e-05]],

       [[1.5804997e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5804997e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5804997e-05]],

       [[1.5804997e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5804997e-05]],

       [[1.5804997e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5804997e-05]],

       [[1.5804997e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

       [[1.5804997e-05]],

       [[1.5807092e-05]],

       [[1.5807092e-05]],

  