In [1]:
import numpy as np
import tensorflow as tf
import gym.spaces
from gym.spaces import Box, Tuple, Discrete
from ray.rllib.models.action_dist import ActionDistribution
from ray.rllib.utils.annotations import override


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
class DiagGaussianLastDimParams(ActionDistribution):

    def __init__(self, inputs):
        mean, log_std = tf.split(inputs, 2, axis=-1)
        self.mean = tf.squeeze(mean, -1)
        self.log_std = tf.squeeze(log_std, -1)
        self.std = tf.exp(self.log_std)
        ActionDistribution.__init__(self, inputs)
    
    @property
    def reduction_axes_to_batch_dim(self):
        return list(range(len(self.mean.shape)))[1:]

    @override(ActionDistribution)
    def logp(self, x):
        reduction_indeces = self.reduction_axes_to_batch_dim
        return (-0.5 * tf.reduce_sum(
            tf.square((x - self.mean) / self.std), reduction_indices=reduction_indeces) -
                0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
                tf.reduce_sum(self.log_std, reduction_indices=reduction_indeces))

    @override(ActionDistribution)
    def kl(self, other):
        assert isinstance(other, DiagGaussian)
        return tf.reduce_sum(
            other.log_std - self.log_std +
            (tf.square(self.std) + tf.square(self.mean - other.mean)) /
            (2.0 * tf.square(other.std)) - 0.5,
            reduction_indices=self.reduction_axes_to_batch_dim)

    @override(ActionDistribution)
    def entropy(self):
        return tf.reduce_sum(
            .5 * self.log_std + .5 * np.log(2.0 * np.pi * np.e),
            reduction_indices=self.reduction_axes_to_batch_dim)

    @override(ActionDistribution)
    def _build_sample_op(self):
        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
    
    @staticmethod
    def param_shape_for_action_space(action_space):
        return tuple(action_space.shape) + (2,)

In [3]:
# a multi-box distribution
action_space = Tuple([Box(0, 1, [1]), Box(0, 1, [2, 2]), Box(0, 1, [3, 3, 3])])

In [4]:
action_space.sample()

(array([0.88121283], dtype=float32), array([[0.33298007, 0.36552036],
        [0.4007087 , 0.5866933 ]], dtype=float32), array([[[0.90620035, 0.6751731 , 0.90138113],
         [0.29133072, 0.7190684 , 0.04849643],
         [0.59938323, 0.11002094, 0.12665713]],
 
        [[0.35579494, 0.8894267 , 0.9703054 ],
         [0.27502245, 0.43402576, 0.22903115],
         [0.91382164, 0.5025509 , 0.41424957]],
 
        [[0.1830025 , 0.6247107 , 0.1169171 ],
         [0.2366039 , 0.4560243 , 0.27019507],
         [0.3699933 , 0.41588876, 0.6042304 ]]], dtype=float32))

In [5]:
# some function to ask action distributions what shape of parameter tensor they need for each component action space
def param_shape_for_action_space(space):
    if isinstance(space, Tuple):
        return tuple(param_shape_for_action_space(s) for s in space)
    # case for Dict action spaces
    if isinstance(space, Box):
        return DiagGaussianLastDimParams.param_shape_for_action_space(space)
    # other cases

In [6]:
param_shapes = param_shape_for_action_space(action_space)
# ^^ this goes into the Model's build_layers_v2 as the num_outputs so it knows how many tensors it needs to return and what shapes

In [7]:
param_shapes

((1, 2), (2, 2, 2), (3, 3, 3, 2))

In [8]:
# define placeholders for the parameters for this tuple of distribution
param_placeholders = tuple([tf.placeholder(tf.float32, [None, *shape]) for shape in param_shapes])

In [9]:
param_placeholders

(<tf.Tensor 'Placeholder:0' shape=(?, 1, 2) dtype=float32>,
 <tf.Tensor 'Placeholder_1:0' shape=(?, 2, 2, 2) dtype=float32>,
 <tf.Tensor 'Placeholder_2:0' shape=(?, 3, 3, 3, 2) dtype=float32>)

In [10]:
action_distributions = [DiagGaussianLastDimParams(param) for param in param_placeholders]

In [11]:
# sample op for a tuple action distribution
sample_ops = tuple(dist.sample() for dist in action_distributions)

In [12]:
sample_ops

(<tf.Tensor 'add:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'add_1:0' shape=(?, 2, 2) dtype=float32>,
 <tf.Tensor 'add_2:0' shape=(?, 3, 3, 3) dtype=float32>)

In [13]:
# generate some dummy batch of distribution parameter values (ie a frozen distribution)
batch_size = 5
param_values = [np.random.randn(batch_size, *ph.shape.as_list()[1:]) for ph in param_placeholders]

In [14]:
# distribution parameter feed dict
param_feed_dict = dict(zip(param_placeholders, param_values))

In [15]:
sess = tf.Session()

In [16]:
total_entropy = tf.add_n([dist.entropy() for dist in action_distributions])

In [17]:
#batchwise statistics of a joint distribution
sess.run(total_entropy, param_feed_dict)

array([44.31424 , 43.636604, 47.26198 , 47.43135 , 46.079258],
      dtype=float32)

In [18]:
action_distributions[-1].sample_op

<tf.Tensor 'add_2:0' shape=(?, 3, 3, 3) dtype=float32>

In [19]:
# get some sampled actions for our random parameter set
sampled_actions = sess.run(sample_ops, feed_dict=param_feed_dict)

In [20]:
action_placeholders = [tf.placeholder(tf.float32, [None, *s.shape]) for s in action_space]

In [21]:
action_feed_dict = dict(zip(action_placeholders, sampled_actions))

In [22]:
action_placeholders

[<tf.Tensor 'Placeholder_3:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'Placeholder_4:0' shape=(?, 2, 2) dtype=float32>,
 <tf.Tensor 'Placeholder_5:0' shape=(?, 3, 3, 3) dtype=float32>]

In [23]:
# a joint logp for a generic tuple distribution
logp_op = [dist.logp(action) for dist, action in zip(action_distributions, action_placeholders)]

Instructions for updating:
Use tf.cast instead.


In [24]:
sess.run(logp_op, {**action_feed_dict, **param_feed_dict}) # per-component statistics with both parameters and actions fed in

[array([-2.0074866 , -2.375213  , -0.8906841 , -1.313107  , -0.07342559],
       dtype=float32),
 array([-3.3032596, -4.1778593, -3.5050447, -5.103592 , -5.629133 ],
       dtype=float32),
 array([-11.28433 , -12.708602, -22.713045, -17.607649, -19.062397],
       dtype=float32)]