###### Utilities

In [4]:
import gym
from collections import deque
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.utils import shuffle

# Notebook-wide random seed: applied to NumPy's global RNG here and reused
# below when seeding the gym environment and constructing the agent.
seed = 0
np.random.seed(seed)

  from ._conv import register_converters as _register_converters


In [5]:
# Imports specifically so we can render outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display


def display_frames_as_gif(frames):
    """Render a sequence of image frames as a looping animation in the notebook.

    Args:
        frames: Indexable, sized sequence of images that ``plt.imshow`` can
            draw. The first frame is drawn immediately; the animation then
            swaps in ``frames[i]`` for each step ``i``.
    """
    # Draw the first frame once; later frames only replace its pixel data.
    image_artist = plt.imshow(frames[0])
    plt.axis('off')

    def show_frame(frame_index):
        image_artist.set_data(frames[frame_index])

    current_figure = plt.gcf()
    frame_count = len(frames)
    movie = animation.FuncAnimation(
        current_figure,
        show_frame,
        frames=frame_count,
        interval=5,
    )
    widget = display_animation(movie, default_mode='loop')
    display(widget)

###### PPO Agent

In [14]:
class PPOAgent_softmax():
    """PPO agent skeleton that builds its policy network in a private TF1 graph.

    Constructing the agent creates a dedicated ``tf.Graph`` containing the
    input placeholders and a network with two tanh hidden layers followed by
    a linear layer of ``act_dim`` units (exposed as ``self.mean``).
    Session creation is still a stub.
    """

    def __init__(self,
                 obs_dim,
                 act_dim,
                 epochs=5,
                 lr=3e-5,
                 hdim=32,
                 clip_value=0.2,
                 seed=0):
        """Store the hyper-parameters and build the graph.

        Args:
            obs_dim: Size of the second axis of the observation placeholder.
            act_dim: Size of the second axis of the action placeholder and
                number of units in the output layer.
            epochs: Stored on the instance; not used by the code in this class yet.
            lr: Stored on the instance; not used by the code in this class yet.
            hdim: Number of units in each of the two hidden layers.
            clip_value: Stored on the instance; not used by the code in this class yet.
            seed: Seed handed to the weight initializers.
        """
        # Honour the caller's seed (previously hard-coded to 0).
        self.seed = seed
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.epochs = epochs
        self.lr = lr
        self.hdim = hdim
        self.clip_value = clip_value

        self.g = None  # graph
        self.obs_ph = None  # observation placeholder
        self.act_ph = None  # action placeholder
        self.advantages_ph = None  # advantage placeholder
        self.advantage_ph = None  # legacy alias of advantages_ph
        self.lr_ph = None  # learning rate placeholder
        self.old_std_ph = None  # 'old_std' placeholder
        self.old_mean_ph = None  # 'old_means' placeholder
        self.mean = None  # output tensor of the policy network

        self._build_graph()
        self._init_session()

    def _build_graph(self):
        """Create a fresh graph and populate it with placeholders and the network."""
        self.g = tf.Graph()
        with self.g.as_default():
            self._placeholders()
            self._policy_nn()

    def _init_session(self):
        """Placeholder for session setup; intentionally does nothing yet."""
        pass

    def _placeholders(self):
        """Define the float32 input placeholders in the current default graph."""
        self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs')
        self.act_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'act')
        self.advantages_ph = tf.placeholder(tf.float32, (None, ), 'advantages')
        # Keep the name declared in __init__ pointing at the same tensor so
        # both spellings refer to the real placeholder instead of None.
        self.advantage_ph = self.advantages_ph

        self.lr_ph = tf.placeholder(tf.float32, (), 'lr')

        self.old_std_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'old_std')
        self.old_mean_ph = tf.placeholder(tf.float32, (None, self.act_dim), 'old_means')

    def _policy_nn(self):
        """Build the policy network on top of ``obs_ph`` and store it in ``self.mean``."""
        hid1_size = self.hdim
        hid2_size = self.hdim

        out = tf.layers.dense(
            self.obs_ph, hid1_size, tf.tanh,
            kernel_initializer=tf.random_normal_initializer(stddev=0.01, seed=self.seed),
            name='h1')
        out = tf.layers.dense(
            out, hid2_size, tf.tanh,
            kernel_initializer=tf.random_normal_initializer(stddev=0.01, seed=self.seed),
            name='h2')
        # Glorot-uniform is what dense layers fall back to when no initializer
        # is given; spelling it out lets the output layer be seeded as well, so
        # the whole network is reproducible from `seed`. The layer is left
        # unnamed to keep its auto-generated variable names unchanged.
        self.mean = tf.layers.dense(
            out, self.act_dim,
            kernel_initializer=tf.glorot_uniform_initializer(seed=self.seed))

In [15]:
# Create the CartPole environment and seed it with the notebook-wide seed.
env = gym.make('CartPole-v1')
env.seed(seed=seed)
# Network input/output sizes are read from the environment's spaces.
obs_dim = env.observation_space.shape[0]
n_act = env.action_space.n

# Constructing the agent also builds its TensorFlow graph (see __init__).
agent = PPOAgent_softmax(obs_dim, n_act, epochs=5, hdim=32, clip_value=0.45, lr=3e-4, seed=seed)



[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [23]:
# Sanity check: print the output tensors of every operation in the agent's graph.
for op in agent.g.get_operations():
    print(op.outputs)

[<tf.Tensor 'obs:0' shape=(?, 4) dtype=float32>]
[<tf.Tensor 'act:0' shape=(?, 2) dtype=float32>]
[<tf.Tensor 'advantages:0' shape=(?,) dtype=float32>]
[<tf.Tensor 'lr:0' shape=() dtype=float32>]
[<tf.Tensor 'old_std:0' shape=(?, 2) dtype=float32>]
[<tf.Tensor 'old_means:0' shape=(?, 2) dtype=float32>]
