In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense


class Actor(tf.keras.Model):
    def __init__(self, state_shape, action_dim, max_action, units=[400, 300], name="Actor"):
        super().__init__(name=name)

        self.l1 = Dense(units[0], name="L1")
        self.l2 = Dense(units[1], name="L2")
        self.l3 = Dense(action_dim, name="L3")

        self.max_action = max_action

        with tf.device("/cpu:0"):
            self(tf.constant(np.zeros(shape=(1,)+state_shape, dtype=np.float32)))

    def call(self, inputs):
        features = tf.nn.relu(self.l1(inputs))
        features = tf.nn.relu(self.l2(features))
        features = self.l3(features)
        action = self.max_action * tf.nn.tanh(features)
        return action
    
class Critic(tf.keras.Model):
    def __init__(self, state_shape, action_dim, units=[400, 300], name="Critic"):
        super().__init__(name=name)

        self.l1 = Dense(units[0], name="L1")
        self.l2 = Dense(units[1], name="L2")
        self.l3 = Dense(1, name="L3")

        dummy_state = tf.constant(
            np.zeros(shape=(1,)+state_shape, dtype=np.float32))
        dummy_action = tf.constant(
            np.zeros(shape=[1, action_dim], dtype=np.float32))
        with tf.device("/cpu:0"):
            self([dummy_state, dummy_action])

    def call(self, inputs):
        states, actions = inputs
        features = tf.concat([states, actions], axis=1)
        features = tf.nn.relu(self.l1(features))
        features = tf.nn.relu(self.l2(features))
        features = self.l3(features)
        return features
    
def update_towards_net2(net1,net2,tau=.01):
    for source_variable,target_variable in zip(net1.trainable_variables, net2.trainable_variables):
        source_variable.assign(tau*source_variable + (1.0 - tau)*target_variable)
    return

PROBANDO 1-2-3

In [22]:
critic = Actor((1,),1,1)
critic(np.array([[[2.]]]))

f = []
for i in range(4):
    f.append([[[np.random.random()]]])
actions = np.array(f)

for i in critic(actions).numpy():
    print(i.flatten()[0])

W1021 17:31:22.844091 139949274855232 base_layer.py:1814] Layer Actor is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



-0.0047916854
-0.005579821
-0.00043133568
-0.003389843


PROBANDO 1-2-3

In [45]:
tf.constant([0.])

<tf.Tensor: id=1393, shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>

In [88]:
with tf.device("/cpu:0"):
    with tf.GradientTape() as tape:
        tape.watch(critic.trainable_variables)
        f = []
        rewards=[]
        for i in range(4):
            rewards.append(tf.constant(np.random.choice([0,0],1)))
            f.append([np.random.random()])
        actions = tf.constant(np.array(f))

        q_values = critic(actions)
        loss = tf.keras.losses.MSE(rewards, q_values)
        f = tape.gradient(loss,critic.trainable_variables)
        
     #   critic_loss = tf.keras.losses.MSE(rewards, q_values)
        #critic_loss = tf.reduce_mean(critic_loss) ??????
      #  critic_grad = tape.gradient(critic_loss, critic.trainable_variables)

In [84]:
q_values

<tf.Tensor: id=2529, shape=(4, 1), dtype=float32, numpy=
array([[-0.00140814],
       [-0.00445031],
       [-0.00218428],
       [-0.00036424]], dtype=float32)>

In [85]:
rewards

[<tf.Tensor: id=2507, shape=(1,), dtype=int64, numpy=array([0])>,
 <tf.Tensor: id=2508, shape=(1,), dtype=int64, numpy=array([0])>,
 <tf.Tensor: id=2509, shape=(1,), dtype=int64, numpy=array([0])>,
 <tf.Tensor: id=2510, shape=(1,), dtype=int64, numpy=array([0])>]

In [86]:
loss

<tf.Tensor: id=2534, shape=(4,), dtype=float32, numpy=
array([1.9828499e-06, 1.9805268e-05, 4.7710992e-06, 1.3267262e-07],
      dtype=float32)>

In [87]:
tf.square(q_values)

<tf.Tensor: id=2535, shape=(4, 1), dtype=float32, numpy=
array([[1.9828499e-06],
       [1.9805268e-05],
       [4.7710992e-06],
       [1.3267262e-07]], dtype=float32)>

In [89]:
f

[<tf.Tensor: id=2599, shape=(1, 400), dtype=float32, numpy=
 array([[ 8.83519533e-05,  0.00000000e+00, -5.76188846e-04,
          3.09322718e-06,  0.00000000e+00, -5.02533338e-04,
          0.00000000e+00,  0.00000000e+00,  4.90286911e-04,
          4.09229804e-04, -1.11992436e-03,  3.74499177e-05,
         -3.71537346e-04,  0.00000000e+00,  0.00000000e+00,
         -6.89497450e-04,  0.00000000e+00, -1.17019296e-03,
          0.00000000e+00,  0.00000000e+00, -3.84546671e-04,
          1.58389215e-03,  0.00000000e+00, -8.15397187e-04,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          4.89795988e-04,  0.00000000e+00,  0.00000000e+00,
          1.03772827e-03, -3.90804489e-04,  0.00000000e+00,
          3.45412031e-04, -1.17841433e-03,  0.00000000e+00,
          0.00000000e+00,  8.00446258e-04,  0.00000000e+00,
         -4.93085885e-04,  1.25912030e-03,  0.00000000e+00,
          2.85722112e-04,  0.00000000e+00, -2.83068221e-04,
          5.27078519e-05,  0.00000000e+0