In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense


In [7]:
class Actor(tf.keras.Model):
    """Policy network mapping a batch of states to bounded actions.

    The input goes through two ReLU-activated dense layers and a final linear
    dense layer; the result is squashed with tanh and multiplied by
    ``max_action``.
    """

    def __init__(self, state_shape, action_dim, max_action, units=[400, 300], name="Actor"):
        """Create the three dense layers and run one dummy forward pass.

        Args:
            state_shape: Tuple describing a single state; it is appended to
                ``(1,)`` to form the shape of the dummy batch.
            action_dim: Number of units in the output layer.
            max_action: Factor that multiplies the tanh output.
            units: Sizes of the two hidden layers (entries 0 and 1 are read).
            name: Name forwarded to ``tf.keras.Model``.
        """
        super().__init__(name=name)

        self.l1 = Dense(units[0], name="L1")
        self.l2 = Dense(units[1], name="L2")
        self.l3 = Dense(action_dim, name="L3")

        self.max_action = max_action

        # Call the model once on an all-zero batch of one sample
        # (Keras layers create their weights on first call).
        dummy_batch = np.zeros(shape=(1,) + state_shape, dtype=np.float32)
        with tf.device("/cpu:0"):
            self(tf.constant(dummy_batch))

    def call(self, inputs):
        """Return ``max_action * tanh(L3(relu(L2(relu(L1(inputs))))))``."""
        hidden = inputs
        for layer in (self.l1, self.l2):
            hidden = tf.nn.relu(layer(hidden))
        squashed = tf.nn.tanh(self.l3(hidden))
        return self.max_action * squashed
    
class Critic(tf.keras.Model):
    """Value network scoring (state, action) pairs with a single output unit.

    States and actions are concatenated along axis 1, passed through two
    ReLU-activated dense layers and a final linear dense layer of width 1.
    """

    def __init__(self, state_shape, action_dim, units=[400, 300], name="Critic"):
        """Create the three dense layers and run one dummy forward pass.

        Args:
            state_shape: Tuple describing a single state; it is appended to
                ``(1,)`` to form the shape of the dummy state batch.
            action_dim: Width of the dummy action batch, shaped
                ``(1, action_dim)``.
            units: Sizes of the two hidden layers (entries 0 and 1 are read).
            name: Name forwarded to ``tf.keras.Model``.
        """
        super().__init__(name=name)

        self.l1 = Dense(units[0], name="L1")
        self.l2 = Dense(units[1], name="L2")
        self.l3 = Dense(1, name="L3")

        # All-zero inputs for a single sample, used only for the first call
        # (Keras layers create their weights on first call).
        zero_state = tf.constant(np.zeros((1,) + state_shape, dtype=np.float32))
        zero_action = tf.constant(np.zeros((1, action_dim), dtype=np.float32))
        with tf.device("/cpu:0"):
            self([zero_state, zero_action])

    def call(self, inputs):
        """Return the critic output for ``inputs``, a ``[states, actions]`` pair."""
        states, actions = inputs
        hidden = tf.concat([states, actions], axis=1)
        for layer in (self.l1, self.l2):
            hidden = tf.nn.relu(layer(hidden))
        return self.l3(hidden)
    
    
 

PROBANDO 1-2-3

In [14]:
# Smoke test: build a 1-D-state / 1-D-action actor with max_action=3 and
# evaluate it on a single state. `state` stays a notebook-level name because
# give_action() reads it.
a = Actor(state_shape=(1,), action_dim=1, max_action=3)
state = np.array([[2]])
print(a(state))

# Feed the same state and the actor's action to a freshly built critic.
cc = Critic(state_shape=(1,), action_dim=1)
print(
    cc([
        tf.constant(state, dtype=np.float32),
        tf.constant(a(state), dtype=np.float32),
    ])
)

tf.Tensor([[0.21592781]], shape=(1, 1), dtype=float32)
tf.Tensor([[0.03099412]], shape=(1, 1), dtype=float32)


In [6]:
# Dimensions shared by every network below.
state_shape = (1,)
action_shape = 1

# Online networks.
actor = Actor(state_shape, action_dim=action_shape, max_action=2)
critic = Critic(state_shape, action_dim=action_shape)

# Target networks: separate instances built with the same arguments. Their
# weights are not copied from the online networks in this cell.
actor_target = Actor(state_shape, action_dim=action_shape, max_action=2)
critic_target = Critic(state_shape, action_dim=action_shape)

FIN PROBANDO 1-2-3

In [17]:
# Builds an Adam optimizer with learning rate 0.01. The instance is not bound
# to a name, so this cell only displays its repr.
tf.keras.optimizers.Adam(learning_rate=0.01)

<tensorflow.python.keras.optimizer_v2.adam.Adam at 0x7f8bfab89be0>

In [18]:
def update_towards_net2(net1,net2,tau=.01):
    """Blend net2's trainable variables into net1, in place.

    Every trainable variable ``v1`` of ``net1`` is overwritten with
    ``tau * v1 + (1.0 - tau) * v2``, where ``v2`` is the variable at the same
    position in ``net2.trainable_variables``. ``net2`` is only read.

    Variables are paired with ``zip``, so pairing stops at the shorter of the
    two lists. Returns ``None``.
    """
    pairs = zip(net1.trainable_variables, net2.trainable_variables)
    for updated, reference in pairs:
        blended = tau*updated + (1.0 - tau)*reference
        updated.assign(blended)

In [19]:
# In place: overwrites each trainable variable of `actor` with
# tau*actor + (1 - tau)*actor_target, using the default tau=.01.
# NOTE(review): this moves the online actor toward the target actor; confirm
# that this direction is the intended one.
update_towards_net2(actor,actor_target)

In [20]:
def give_action():
    """Return the actor's action for the notebook-level ``state`` plus noise.

    Evaluates the notebook-level ``actor`` on ``state``, adds zero-mean normal
    noise with standard deviation 0.1 of the same shape, and clips the sum to
    ``[-actor.max_action, actor.max_action]``. Runs under the "/cpu:0" device.
    """
    with tf.device("/cpu:0"):
        greedy = actor(state)
        noise = tf.random.normal(shape=greedy.shape, mean=0., stddev=0.1, dtype=tf.float32)
        limit = actor.max_action
        return tf.clip_by_value(greedy + noise, -limit, limit)


In [45]:
# Draw one noisy action, convert it to a NumPy array and display its first
# element as a scalar.
give_action().numpy().flatten()[0]

-0.2885803

In [22]:
@tf.function
def compute_tf_error_body(states, actions, next_states, rewards, dones, discount=0.99):
    """Compute TD errors ``target_Q - current_Q`` for a batch of transitions.

    The target is ``rewards + (1 - dones) * discount * Q_target(s', pi_target(s'))``
    with gradients stopped, and ``current_Q`` is ``critic([states, actions])``.

    This is a free function, so it uses the notebook-level ``actor_target``,
    ``critic_target`` and ``critic`` networks and the "/cpu:0" device, like the
    other TD-error helpers in this notebook. The earlier version read them
    from an undefined ``self``.

    Args:
        states: Batch of states fed to ``critic``.
        actions: Batch of actions fed to ``critic`` alongside ``states``.
        next_states: Batch of successor states fed to the target networks.
        rewards: Rewards added to the discounted target value
            (presumably shaped like the critic output; confirm with caller).
        dones: Terminal flags; ``1. - dones`` masks the bootstrap term.
        discount: Discount factor applied to the bootstrap term. The default
            0.99 is a conventional value, not one taken from this notebook.

    Returns:
        Tensor of TD errors, ``target_Q - current_Q``.
    """
    with tf.device("/cpu:0"):
        not_dones = 1. - dones
        next_actions = actor_target(next_states)
        target_Q = critic_target([next_states, next_actions])
        target_Q = rewards + (not_dones * discount * target_Q)
        # The target is treated as a constant when differentiating.
        target_Q = tf.stop_gradient(target_Q)
        current_Q = critic([states, actions])
        td_errors = target_Q - current_Q
        return td_errors


# The training cell calls this helper as `compute_td_error_body`; expose that
# spelling as well so both names resolve to the same function.
compute_td_error_body = compute_tf_error_body



In [None]:
@tf.function
def td_error_body_bandit(actions, rewards):
    """Draft TD-error computation that takes only actions and rewards.

    Computes ``rewards + critic_target(...)`` with gradients stopped, then
    subtracts ``critic(...)`` and returns the difference.

    NOTE(review): as written this cannot run:
      * ``next_states`` and ``statex`` are neither parameters nor assigned in
        this function, and no notebook-level definition of them is visible.
      * ``critic_target`` receives a one-element list, while ``Critic.call``
        unpacks its input into ``states, actions``.
    The intended state inputs need to be decided before fixing this.
    """
    with tf.device("/cpu:0"):
        # NOTE(review): `next_states` is undefined here; the list has one element.
        target_Q = critic_target( [ actor_target(next_states)] )
        target_Q = rewards + target_Q
        # The target is treated as a constant when differentiating.
        target_Q = tf.stop_gradient(target_Q)
        # NOTE(review): `statex` is undefined here.
        current_Q = critic([statex, actions])
        td_errors = target_Q - current_Q
        return td_errors

In [None]:
@tf.function
def td_error_body_L2(states, actions, next_states, rewards):
    """Compute TD errors without a discount factor or terminal mask.

    The target is ``rewards + critic_target([next_states, actor_target(next_states)])``
    with gradients stopped; the result is that target minus
    ``critic([states, actions])``. Uses the notebook-level ``actor_target``,
    ``critic_target`` and ``critic`` networks under the "/cpu:0" device.

    Args:
        states: Batch of states fed to ``critic``.
        actions: Batch of actions fed to ``critic`` alongside ``states``.
        next_states: Batch of successor states fed to the target networks.
        rewards: Rewards added to the target critic's output.

    Returns:
        Tensor of TD errors, ``target_Q - current_Q``.
    """
    with tf.device("/cpu:0"):
        next_actions = actor_target(next_states)
        target_Q = critic_target([next_states, next_actions])
        target_Q = rewards + target_Q
        # The target is treated as a constant when differentiating.
        target_Q = tf.stop_gradient(target_Q)
        # Uses the `states` parameter; the earlier draft read an undefined `statex`.
        current_Q = critic([states, actions])
        td_errors = target_Q - current_Q
        return td_errors

In [None]:
# Draft of the critic update step: records the TD-error computation on a
# gradient tape. The loss and optimizer steps below are still commented out.
# NOTE(review): `states`, `actions`, `next_states`, `rewards` and `done` are not
# assigned anywhere in the visible notebook. Also verify that
# `compute_td_error_body` resolves to the helper defined earlier, whose `def`
# is spelled `compute_tf_error_body`.
with tf.device("/cpu:0"):
    with tf.GradientTape() as tape:
        td_errors = compute_td_error_body(states, actions, next_states, rewards, done)
#         critic_loss = tf.reduce_mean(huber_loss(td_errors, delta=self.max_grad) * weights)

#             critic_grad = tape.gradient(
#                 critic_loss, self.critic.trainable_variables)
#             self.critic_optimizer.apply_gradients(
#                 zip(critic_grad, self.critic.trainable_variables))

#             with tf.GradientTape() as tape:
#                 next_action = self.actor(states)
#                 actor_loss = -tf.reduce_mean(self.critic([states, next_action]))

#             actor_grad = tape.gradient(
#                 actor_loss, self.actor.trainable_variables)
#             self.actor_optimizer.apply_gradients(
#                 zip(actor_grad, self.actor.trainable_variables))

#             # Update target networks
#             update_target_variables(
#                 self.critic_target.weights, self.critic.weights, self.tau)
#             update_target_variables(
#                 self.actor_target.weights, self.actor.weights, self.tau)
