In [1]:
import numpy as np
import tensorflow as tf
import gym.spaces
from gym.spaces import Box, Tuple, Discrete
from ray.rllib.models.action_dist import ActionDistribution
from ray.rllib.utils.annotations import override


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
class DiagGaussianLastDimParams(ActionDistribution):

    def __init__(self, inputs):
        mean, log_std = tf.split(inputs, 2, axis=-1)
        self.mean = tf.squeeze(mean, -1)
        self.log_std = tf.squeeze(log_std, -1)
        self.std = tf.exp(self.log_std)
        ActionDistribution.__init__(self, inputs)
    
    @property
    def reduction_axes_to_batch_dim(self):
        return list(range(len(self.mean.shape)))[1:]

    @override(ActionDistribution)
    def logp(self, x):
        reduction_indeces = self.reduction_axes_to_batch_dim
        return (-0.5 * tf.reduce_sum(
            tf.square((x - self.mean) / self.std), reduction_indices=reduction_indeces) -
                0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
                tf.reduce_sum(self.log_std, reduction_indices=reduction_indeces))

    @override(ActionDistribution)
    def kl(self, other):
        assert isinstance(other, DiagGaussian)
        return tf.reduce_sum(
            other.log_std - self.log_std +
            (tf.square(self.std) + tf.square(self.mean - other.mean)) /
            (2.0 * tf.square(other.std)) - 0.5,
            reduction_indices=self.reduction_axes_to_batch_dim)

    @override(ActionDistribution)
    def entropy(self):
        return tf.reduce_sum(
            .5 * self.log_std + .5 * np.log(2.0 * np.pi * np.e),
            reduction_indices=self.reduction_axes_to_batch_dim)

    @override(ActionDistribution)
    def _build_sample_op(self):
        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
    
    @staticmethod
    def param_shape_for_action_space(action_space):
        return tuple(action_space.shape) + (2,)

In [3]:
# a multi-box distribution
action_space = Tuple([Box(0, 1, [1]), Box(0, 1, [2, 2]), Box(0, 1, [3, 3, 3])])

In [4]:
action_space.sample()

(array([0.2304567], dtype=float32), array([[0.63514733, 0.92726606],
        [0.21354426, 0.15934667]], dtype=float32), array([[[0.27304482, 0.10951104, 0.7014962 ],
         [0.97908866, 0.66205126, 0.23139764],
         [0.34884912, 0.3818943 , 0.06303718]],
 
        [[0.22497487, 0.6939296 , 0.47430623],
         [0.06586658, 0.23856476, 0.9842732 ],
         [0.37169626, 0.6850393 , 0.3766804 ]],
 
        [[0.7469263 , 0.8524631 , 0.48257282],
         [0.13400312, 0.08383126, 0.83112395],
         [0.46184042, 0.661182  , 0.7354287 ]]], dtype=float32))

In [5]:
# some function to ask action distributions what shape of parameter tensor they need for each component action space
def param_shape_for_action_space(space):
    if isinstance(space, Tuple):
        return tuple(param_shape_for_action_space(s) for s in space)
    # case for Dict action spaces
    if isinstance(space, Box):
        return DiagGaussianLastDimParams.param_shape_for_action_space(space)
    # other cases

In [6]:
param_shapes = param_shape_for_action_space(action_space)
# ^^ this goes into the Model's __init__ so it knows how many tensors it needs to return and what shapes

In [7]:
# define placeholders for the parameters for this tuple of distribution
param_placeholders = tuple([tf.placeholder(tf.float32, [None, *shape]) for shape in param_shapes])

In [8]:
param_placeholders

(<tf.Tensor 'Placeholder:0' shape=(?, 1, 2) dtype=float32>,
 <tf.Tensor 'Placeholder_1:0' shape=(?, 2, 2, 2) dtype=float32>,
 <tf.Tensor 'Placeholder_2:0' shape=(?, 3, 3, 3, 2) dtype=float32>)

In [9]:
action_distributions = [DiagGaussianLastDimParams(param) for param in param_placeholders]

In [10]:
# sample op for a tuple action distribution
sample_ops = tuple(dist.sample() for dist in action_distributions)

In [11]:
# generate some dummy batch of distribution parameter values
batch_size = 5
param_values = [np.random.randn(batch_size, *ph.shape.as_list()[1:]) for ph in param_placeholders]

In [12]:
# distribution parameter feed dict
param_feed_dict = dict(zip(param_placeholders, param_values))

In [13]:
sess = tf.Session()

In [14]:
total_entropy = tf.add_n([dist.entropy() for dist in action_distributions])

In [15]:
#batchwise statistics of a joint distribution
sess.run(total_entropy, param_feed_dict)

array([47.086502, 47.601696, 46.11048 , 47.55205 , 43.394684],
      dtype=float32)

In [16]:
action_distributions[-1].sample_op

<tf.Tensor 'add_2:0' shape=(?, 3, 3, 3) dtype=float32>

In [17]:
# get some sampled actions for our random parameter set
sampled_actions = sess.run(sample_ops, feed_dict=param_feed_dict)

In [18]:
action_placeholders = [tf.placeholder(tf.float32, [None, *s.shape]) for s in action_space]

In [19]:
action_feed_dict = dict(zip(action_placeholders, sampled_actions))

In [20]:
action_placeholders

[<tf.Tensor 'Placeholder_3:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'Placeholder_4:0' shape=(?, 2, 2) dtype=float32>,
 <tf.Tensor 'Placeholder_5:0' shape=(?, 3, 3, 3) dtype=float32>]

In [21]:
# a joint logp for a generic tuple distribution
logp_op = [dist.logp(action) for dist, action in zip(action_distributions, action_placeholders)]

Instructions for updating:
Use tf.cast instead.


In [22]:
sess.run(logp_op, {**action_feed_dict, **param_feed_dict})

[array([-0.986536  ,  0.6014351 , -1.5928472 , -0.9579828 , -0.86125576],
       dtype=float32),
 array([-5.5960627, -2.8220892, -4.082852 , -2.5123987, -4.4787593],
       dtype=float32),
 array([-12.479827, -23.272427, -24.689245, -21.244213, -16.407429],
       dtype=float32)]