In [2]:
import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as kl
import gym
import os

# Mask all CUDA devices so TensorFlow only sees the CPU.
# NOTE(review): this runs after `import tensorflow`; the recorded output lists
# only the CPU, so it took effect here, but setting the variable before the
# import would be the more robust ordering — confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

# Report the physical devices TensorFlow can still see.
my_devices = tf.config.experimental.list_physical_devices()
print(f'TF2 available devices: {my_devices}')

class ProbabilityDistribution(tf.keras.Model):
    """Keras model that samples one categorical action per row of logits."""

    def call(self, logits, **kwargs):
        """Return one sampled class index for each row of `logits`."""
        # One draw per row; the result carries a trailing sample axis of size 1.
        sampled = tf.random.categorical(logits, num_samples=1)
        # Drop that trailing sample axis.
        return tf.squeeze(sampled, axis=-1)


class Model(tf.keras.Model):
    """Two-headed MLP producing action logits and a state-value estimate.

    Each head has its own 128-unit ReLU hidden layer; both hidden layers read
    the same input tensor.
    """

    def __init__(self, num_actions):
        """Create the layers.

        Args:
            num_actions: Number of discrete actions, i.e. the width of the
                logits output.
        """
        # Pass the model name by keyword: `tf.keras.Model` takes `name` as a
        # keyword argument, and a positional string is not picked up as the
        # name of a subclassed model.
        super().__init__(name='mlp_policy')
        # Note: no tf.get_variable(), just simple Keras API!
        self.hidden1 = kl.Dense(128, activation='relu')
        self.hidden2 = kl.Dense(128, activation='relu')
        self.value = kl.Dense(1, name='value')
        # Logits are unnormalized log probabilities.
        self.logits = kl.Dense(num_actions, name='policy_logits')
        self.dist = ProbabilityDistribution()

    def call(self, inputs, **kwargs):
        """Run the forward pass.

        Args:
            inputs: Batch of observations (array-like; converted to a tensor).

        Returns:
            Tuple `(logits, value)` from the policy head and the value head.
        """
        # Inputs is a numpy array, convert to a tensor.
        x = tf.convert_to_tensor(inputs)
        # Separate hidden layers from the same input tensor.
        hidden_logs = self.hidden1(x)
        hidden_vals = self.hidden2(x)
        return self.logits(hidden_logs), self.value(hidden_vals)

    def action_value(self, obs):
        """Sample an action and estimate the value for a batch of observations.

        Args:
            obs: Observations with a leading batch axis.

        Returns:
            Tuple `(action, value)`, each with its last axis squeezed away.
            The sampled actions have shape `(batch,)`, so the squeeze on
            `action` only succeeds for a batch of exactly one observation.
        """
        # Executes `call()` under the hood.
        logits, value = self.predict_on_batch(obs)
        action = self.dist.predict_on_batch(logits)
        # Another way to sample actions:
        #   action = tf.random.categorical(logits, 1)
        # Will become clearer later why we don't use it.
        return np.squeeze(action, axis=-1), np.squeeze(value, axis=-1)

TF2 available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [3]:
# Build the CartPole environment and a model sized to its action space.
env = gym.make('CartPole-v0')
model = Model(num_actions=env.action_space.n)

# Take the initial observation and add a leading batch axis before querying
# the model. No feed_dict or tf.Session() needed at all!
obs = env.reset()
action, value = model.action_value(obs[np.newaxis, :])
# The action is sampled, and the recorded output differs from the value quoted
# in the original comment, so expect different numbers on each run.
print(action, value)

1 [0.00696009]


In [4]:
# Query the model again on the same observation; the cell displays the
# resulting (action, value) pair.
model.action_value(obs[np.newaxis, :])

(array(1, dtype=int64), array([0.00696009], dtype=float32))

In [5]:
# Index 0 selects the first of the two outputs returned by `Model.call`:
# the policy logits.
model.predict(obs[np.newaxis, :])[0]

array([[0.00584153, 0.00161778]], dtype=float32)

In [26]:
# Small classifier: three 32-unit ReLU layers feeding a 2-way softmax output
# layer named 'final'.
# NOTE(review): the model is built for 7 input features; confirm that this
# matches the data it is fed later in the notebook.
modelk = tf.keras.Sequential([
    kl.Dense(32, activation='relu'),
    kl.Dense(32, activation='relu'),
    kl.Dense(32, activation='relu'),
    kl.Dense(2, activation='softmax', name='final'),
])
modelk.build(input_shape=[None, 7])

In [27]:
# Print the layer-by-layer parameter summary of `modelk`.
modelk.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             multiple                  256       
_________________________________________________________________
dense_15 (Dense)             multiple                  1056      
_________________________________________________________________
dense_16 (Dense)             multiple                  1056      
_________________________________________________________________
final (Dense)                multiple                  66        
Total params: 2,434
Trainable params: 2,434
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Print the layer-by-layer parameter summary of `modelk`.
# NOTE(review): the output recorded below lists a two-layer `sequential_2`
# model, not the four-layer model constructed earlier in this notebook — it
# appears to come from an earlier execution; re-run to refresh.
modelk.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  640       
_________________________________________________________________
dense_4 (Dense)              multiple                  258       
Total params: 898
Trainable params: 898
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Run `modelk` on the single observation with a leading batch axis added.
# NOTE(review): `modelk` is built with `input_shape=[None,7]`, while `obs` is
# the CartPole observation from `env.reset()` — confirm the feature widths
# agree. The recorded output below contains a negative value, which a softmax
# output layer would not produce, so it was presumably produced by an earlier
# definition of `modelk`.
modelk.predict_on_batch(obs[None, :])

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.00702143,  0.00067079]], dtype=float32)>