In [207]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense


class Actor(tf.keras.Model):
    def __init__(self, state_shape, action_dim, max_action, units=[4, 3], name="Actor"):
        super().__init__(name=name)

        self.l1 = Dense(units[0], name="L1",kernel_initializer='random_uniform',
                bias_initializer='random_uniform')
        self.l2 = Dense(units[1], name="L2",kernel_initializer='random_uniform',
                bias_initializer='random_uniform')
        self.l3 = Dense(action_dim, name="L3",kernel_initializer='random_uniform',
                bias_initializer='random_uniform')

        self.max_action = max_action

        with tf.device("/cpu:0"):
            self(tf.constant(np.zeros(shape=(1,)+state_shape, dtype=np.float32)))

    def call(self, inputs):
        features = tf.nn.relu(self.l1(inputs))
        features = tf.nn.relu(self.l2(features))
        features = self.l3(features)
        action = self.max_action * tf.nn.tanh(features)
        return action
    
class Critic(tf.keras.Model):
    def __init__(self, state_shape, action_dim, units=[400, 300], name="Critic"):
        super().__init__(name=name)

        self.l1 = Dense(units[0], name="L1")
        self.l2 = Dense(units[1], name="L2")
        self.l3 = Dense(1, name="L3")

        dummy_state = tf.constant(
            np.zeros(shape=(1,)+state_shape, dtype=np.float32))
        dummy_action = tf.constant(
            np.zeros(shape=[1, action_dim], dtype=np.float32))
        with tf.device("/cpu:0"):
            self([dummy_state, dummy_action])

    def call(self, inputs):
        states, actions = inputs
        features = tf.concat([states, actions], axis=1)
        features = tf.nn.relu(self.l1(features))
        features = tf.nn.relu(self.l2(features))
        features = self.l3(features)
        return features
    
def update_towards_net2(net1,net2,tau=.01):
    for source_variable,target_variable in zip(net1.trainable_variables, net2.trainable_variables):
        source_variable.assign(tau*source_variable + (1.0 - tau)*target_variable)
    return

In [208]:
critic_optimizer = tf.keras.optimizers.Adam(lr=0.5)
actor_optimizer = tf.keras.optimizers.Adam(lr=0.01)

PROBANDO 1-2-3

In [195]:
critic = Actor((1,),1,1)
critic(np.array([[[2.]]]))

f = []
for i in range(4):
    f.append([[[np.random.random()]]])
actions = np.array(f)

for i in critic(actions).numpy():
    print(i.flatten()[0])

W1021 19:22:20.855508 139949274855232 base_layer.py:1814] Layer Actor is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



0.018672984
0.019036163
0.02072278
0.019677768


PROBANDO 1-2-3

In [196]:
tf.constant([0.])

<tf.Tensor: id=816216, shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>

In [197]:
with tf.device("/cpu:0"):
    with tf.GradientTape() as tape:
        tape.watch(critic.trainable_variables)
        actions = []
        rewards=[]
        for i in range(4):
            rewards.append(tf.constant(np.random.choice([0,0],1)))
            actions.append([np.random.random()])
        actions = tf.constant(np.array(actions))

        q_values = critic(actions)
        loss = tf.keras.losses.MSE(rewards, q_values)
        critic_grad = tape.gradient(loss,critic.trainable_variables)
        
        critic_optimizer.apply_gradients(zip(critic_grad, critic.trainable_variables))
     #   critic_loss = tf.keras.losses.MSE(rewards, q_values)
        #critic_loss = tf.reduce_mean(critic_loss) ??????
      #  critic_grad = tape.gradient(critic_loss, critic.trainable_variables)
        

In [198]:
def give_action():
    with tf.device("/cpu:0"):
        dummy_state = tf.constant([[1.]])
        action = actor(dummy_state)
        action += tf.random.normal(shape=action.shape, mean=0., stddev=0.01, dtype=tf.float32)
        return tf.clip_by_value(action, -actor.max_action, actor.max_action)

In [205]:
def learn(rewards, actions):
    with tf.device("/cpu:0"):
        with tf.GradientTape() as tape:
            tape.watch(critic.trainable_variables)
            
            q_values = critic(actions)
            loss = tf.keras.losses.MSE(rewards, q_values)
            critic_grad = tape.gradient(loss,critic.trainable_variables)
            critic_optimizer.apply_gradients(zip(critic_grad, critic.trainable_variables))
            
            
        with tf.GradientTape() as tape:
            tape.watch(actor.trainable_variables)
            actor_loss = -tf.reduce_mean(critic(actions))
            actor_grad = tape.gradient(actor_loss, actor.trainable_variables)
            #actor_optimizer.apply_gradients(zip(actor_grad, actor.trainable_variables))
            return

In [206]:
actor=Actor((1,),1,3)


length = 10**3
alpha=0.56
cum_freq=0
batch_length=10
rewards_batch=[]
actions_batch=[]
for i in range(length):
    phase = np.random.choice([-1,1],1)[0]
    beta = give_action()
    p0 = np.exp(-(beta.numpy().flatten()[0]-(phase*alpha))**2)
    outcome = np.random.choice([0,1],1,p=[p0,1-p0])
    if (-1)**(outcome+1) == phase:
        reward=1
    else:
        reward = 0
    rewards_batch.append(tf.constant([reward]))
    actions_batch.append(beta)
    
    if i%batch_length==1:
        actions_batch = tf.stack(actions_batch)
        learn(rewards_batch, actions_batch)
        actions_batch, rewards_batch = [], []

tf.Tensor(
[[1. 4.]
 [1. 4.]], shape=(2, 2), dtype=float32)
tf.Tensor(
[[1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]
 [1. 1. 4. 4. 4. 1. 1. 4. 1. 1.]], shape=(10, 10), dtype=float32)
tf.Tensor(
[[1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]
 [1. 4. 1. 1. 1. 4. 4. 4. 4. 4.]], shape=(10, 10), dtype=float32)
tf.Tensor(
[[1. 4. 1. 4. 4. 4. 4. 4. 4. 4.]
 [1. 4. 1. 4. 4. 4. 4. 4. 4. 4.]
 [1. 4. 1. 4. 4. 4. 4. 4. 4. 4.]
 [1. 4. 1. 4. 4. 4. 4. 4. 4. 4.]
 [1. 4. 1. 4. 4. 4. 4. 4. 4. 4.]
 [1. 4. 1. 4. 4.

tf.Tensor(
[[1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]
 [1. 1. 1. 4. 1. 4. 4. 4. 4. 4.]], shape=(10, 10), dtype=float32)
tf.Tensor(
[[1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]
 [1. 4. 4. 4. 1. 1. 1. 1. 1. 4.]], shape=(10, 10), dtype=float32)
tf.Tensor(
[[1. 4. 1. 1. 4. 1. 4. 4. 4. 1.]
 [1. 4. 1. 1. 4. 1. 4. 4. 4. 1.]
 [1. 4. 1. 1. 4. 1. 4. 4. 4. 1.]
 [1. 4. 1. 1. 4. 1. 4. 4. 4. 1.]
 [1. 4. 1. 1. 4. 1. 4. 4. 4. 1.]
 [1. 4. 1. 1. 4. 1. 4. 4. 4. 1.]
 [1. 4. 1. 1. 4. 1. 4. 4. 4. 1.]
 [1. 4. 1.

tf.Tensor(
[[4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]
 [4. 4. 4. 4. 4. 1. 1. 1. 4. 1.]], shape=(10, 10), dtype=float32)
tf.Tensor(
[[4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]
 [4. 1. 4. 1. 1. 4. 4. 4. 1. 1.]], shape=(10, 10), dtype=float32)
tf.Tensor(
[[4. 4. 4. 4. 1. 1. 4. 1. 1. 1.]
 [4. 4. 4. 4. 1. 1. 4. 1. 1. 1.]
 [4. 4. 4. 4. 1. 1. 4. 1. 1. 1.]
 [4. 4. 4. 4. 1. 1. 4. 1. 1. 1.]
 [4. 4. 4. 4. 1. 1. 4. 1. 1. 1.]
 [4. 4. 4. 4. 1. 1. 4. 1. 1. 1.]
 [4. 4. 4. 4. 1. 1. 4. 1. 1. 1.]
 [4. 4. 4.

tf.Tensor(
[[4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]
 [4. 1. 4. 1. 1. 4. 1. 1. 4. 1.]], shape=(10, 10), dtype=float32)
tf.Tensor(
[[4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]
 [4. 1. 4. 4. 1. 4. 1. 4. 4. 4.]], shape=(10, 10), dtype=float32)
tf.Tensor(
[[1. 1. 4. 4. 4. 4. 1. 4. 4. 1.]
 [1. 1. 4. 4. 4. 4. 1. 4. 4. 1.]
 [1. 1. 4. 4. 4. 4. 1. 4. 4. 1.]
 [1. 1. 4. 4. 4. 4. 1. 4. 4. 1.]
 [1. 1. 4. 4. 4. 4. 1. 4. 4. 1.]
 [1. 1. 4. 4. 4. 4. 1. 4. 4. 1.]
 [1. 1. 4. 4. 4. 4. 1. 4. 4. 1.]
 [1. 1. 4.