# Rebuild HW2, HW3

In [2]:
# The goal of this notebook is to flatten the code from CS294-HW2, parameters will be hard coded
# First we will work on train_ac

In [1]:
# Load libraries
import numpy as np
import tensorflow as tf
import gym
import time

In [2]:
# Set parameters

# the gym environment name
env_name = "CartPole-v0"
#env_name = "Breakout-ram-v0"

# number of hidden layers
n_layers = 2

# random seed
seed_init = 0

# number of interations
n_iter = 100

# maximum steps can be taken by the simulation
max_path_length = None

# dimension of the hidden layer
size = 64

# default LR
learning_rate = 5e-3

# number of iterations in one update of critic
num_target_updates = 10

# number of gradients per iteration 
num_grad_steps_per_target_update = 10

# corresponds to 'render', whether to animate the trajactory or not
animate = "store_true"

# each timestep is a single cycle of the decision sequence
min_timesteps_per_batch = 1000

# discount over future reward, assume no discount for now
gamma = 1.0

# Whether the advantage should be normalized or not
normalize_advantages = "store_true"

In [3]:
# Setup the environment and randomness
env = gym.make(env_name)
seed = seed_init + 10*1

# Set random seeds
tf.set_random_seed(seed)
np.random.seed(seed)
env.seed(seed)

# Maximum length for episodes, 'or' served as a simple None check
# max number of steps in environment
max_path_length = max_path_length or env.spec.max_episode_steps

# Is this env continuous, or self.discrete?
# check if env.action_space is a type of gym.spaces.Discrete
discrete = isinstance(env.action_space, gym.spaces.Discrete)

# Observation and action sizes
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

# printoOut
tp = (env_name, "isDiscrete: " + str(discrete), "ob_dim: "+ str(ob_dim), "ac_dim:" + str(ac_dim))
print("This test works on environment [" + ', '.join(tp) + "]")

This test works on environment [CartPole-v0, isDiscrete: True, ob_dim: 4, ac_dim:2]


In [4]:
class Agent(object):
    def __init__(self, computation_graph_args, sample_trajectory_args, estimate_advantage_args):
        super(Agent, self).__init__()
        self.ob_dim = computation_graph_args['ob_dim']
        self.ac_dim = computation_graph_args['ac_dim']
        self.discrete = computation_graph_args['discrete']
        self.size = computation_graph_args['size']
        self.n_layers = computation_graph_args['n_layers']
        self.learning_rate = computation_graph_args['learning_rate']
        self.num_target_updates = computation_graph_args['num_target_updates']
        self.num_grad_steps_per_target_update = computation_graph_args['num_grad_steps_per_target_update']

        self.animate = sample_trajectory_args['animate']
        self.max_path_length = sample_trajectory_args['max_path_length']
        self.min_timesteps_per_batch = sample_trajectory_args['min_timesteps_per_batch']

        self.gamma = estimate_advantage_args['gamma']
        self.normalize_advantages = estimate_advantage_args['normalize_advantages']

    # Get a runnable session
    def init_tf_sess(self):
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        tf_config.gpu_options.allow_growth = True # may need if using GPU
        self.sess = tf.Session(config=tf_config)
        self.sess.__enter__() # equivalent to `with self.sess:`
        tf.global_variables_initializer().run() #pylint: disable=E1101

    def define_placeholders(self):
        """
            Placeholders for batch observations / actions / advantages in actor critic
            loss function.
            See Agent.build_computation_graph for notation

            returns:
                sy_ob_no: placeholder for observations
                sy_ac_na: placeholder for actions
                sy_adv_n: placeholder for advantages
        """
        sy_ob_no = tf.placeholder(shape=[None, self.ob_dim], name="ob", dtype=tf.float32)
        if self.discrete:
            sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
        else:
            sy_ac_na = tf.placeholder(shape=[None, self.ac_dim], name="ac", dtype=tf.float32)

        # YOUR HW2 CODE HERE
        sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
        return sy_ob_no, sy_ac_na, sy_adv_n

    def policy_forward_pass(self, sy_ob_no):
        """ Constructs the symbolic operation for the policy network outputs,
            which are the parameters of the policy distribution p(a|s)

            arguments:
                sy_ob_no: (batch_size, self.ob_dim)

            returns:
                the parameters of the policy.

                if discrete, the parameters are the logits of a categorical distribution
                    over the actions
                    sy_logits_na: (batch_size, self.ac_dim)

                if continuous, the parameters are a tuple (mean, log_std) of a Gaussian
                    distribution over actions. log_std should just be a trainable
                    variable, not a network output.
                    sy_mean: (batch_size, self.ac_dim)
                    sy_logstd: (self.ac_dim,)

            Hint: use the 'build_mlp' function to output the logits (in the discrete case)
                and the mean (in the continuous case).
                Pass in self.n_layers for the 'n_layers' argument, and
                pass in self.size for the 'size' argument.
        """
        if self.discrete:
            # YOUR HW2 CODE HERE
            sy_logits_na = build_mlp(sy_ob_no, self.ac_dim, "policy_forward_pass", self.n_layers, self.size)
            return sy_logits_na
        else:
            # YOUR HW2 CODE HERE
            sy_mean = build_mlp(sy_ob_no, self.ac_dim, "policy_forward_pass", self.n_layers, self.size)
            sy_logstd = tf.get_variable("logstd", shape=[self.ac_dim], trainable=True)
            return (sy_mean, sy_logstd)

    def sample_action(self, policy_parameters):
        """ Constructs a symbolic operation for stochastically sampling from the policy
            distribution

            arguments:
                policy_parameters
                    if discrete: logits of a categorical distribution over actions
                        sy_logits_na: (batch_size, self.ac_dim)
                    if continuous: (mean, log_std) of a Gaussian distribution over actions
                        sy_mean: (batch_size, self.ac_dim)
                        sy_logstd: (self.ac_dim,)

            returns:
                sy_sampled_ac:
                    if discrete: (batch_size)
                    if continuous: (batch_size, self.ac_dim)

            Hint: for the continuous case, use the reparameterization trick:
                 The output from a Gaussian distribution with mean 'mu' and std 'sigma' is

                      mu + sigma * z,         z ~ N(0, I)

                 This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
        """
        if self.discrete:
            sy_logits_na = policy_parameters
            # YOUR HW2 CODE_HERE
            samples = tf.multinomial(logits=sy_logits_na, num_samples=1) # output of shape [batch_size, num_samples]
            sy_sampled_ac = tf.reshape(samples, [-1]) # flatten to be of shape [batch_size]
        else:
            sy_mean, sy_logstd = policy_parameters
            # YOUR HW2 CODE_HERE
            z = tf.random_normal(tf.shape(sy_mean), mean=0.0, stddev=1.0)
            sy_std = tf.exp(sy_logstd)
            sy_sampled_ac = sy_mean + sy_std * z
        return sy_sampled_ac

    def get_log_prob(self, policy_parameters, sy_ac_na):
        """ Constructs a symbolic operation for computing the log probability of a set of actions
            that were actually taken according to the policy

            arguments:
                policy_parameters
                    if discrete: logits of a categorical distribution over actions
                        sy_logits_na: (batch_size, self.ac_dim)
                    if continuous: (mean, log_std) of a Gaussian distribution over actions
                        sy_mean: (batch_size, self.ac_dim)
                        sy_logstd: (self.ac_dim,)

                sy_ac_na: (batch_size, self.ac_dim)

            returns:
                sy_logprob_n: (batch_size)

            Hint:
                For the discrete case, use the log probability under a categorical distribution.
                For the continuous case, use the log probability under a multivariate gaussian.
        """
        if self.discrete:
            sy_logits_na = policy_parameters
            # YOUR HW2 CODE_HERE
            sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na)
        else:
            sy_mean, sy_logstd = policy_parameters
            # YOUR HW2 CODE_HERE
            # calculate the z score of the sampled actions under the policy
            sy_z = (sy_ac_na - sy_mean) / tf.exp(sy_logstd)
            # this maximizes likelihood by pushing z towards 0 (mean of distribution)
            sy_logprob_n = -0.5 * tf.reduce_mean(tf.square(sy_z), axis=1)
        return sy_logprob_n

    def build_computation_graph(self):
        """
            Notes on notation:

            Symbolic variables have the prefix sy_, to distinguish them from the numerical values
            that are computed later in the function

            Prefixes and suffixes:
            ob - observation
            ac - action
            _no - this tensor should have shape (batch self.size /n/, observation dim)
            _na - this tensor should have shape (batch self.size /n/, action dim)
            _n  - this tensor should have shape (batch self.size /n/)

            Note: batch self.size /n/ is defined at runtime, and until then, the shape for that axis
            is None

            ----------------------------------------------------------------------------------
            loss: a function of self.sy_logprob_n and self.sy_adv_n that we will differentiate
                to get the policy gradient.
        """
        self.sy_ob_no, self.sy_ac_na, self.sy_adv_n = self.define_placeholders()

        # The policy takes in an observation and produces a distribution over the action space
        self.policy_parameters = self.policy_forward_pass(self.sy_ob_no)

        # We can sample actions from this action distribution.
        # This will be called in Agent.sample_trajectory() where we generate a rollout.
        self.sy_sampled_ac = self.sample_action(self.policy_parameters)

        # We can also compute the logprob of the actions that were actually taken by the policy
        # This is used in the loss function.
        self.sy_logprob_n = self.get_log_prob(self.policy_parameters, self.sy_ac_na)

        actor_loss = tf.reduce_sum(-self.sy_logprob_n * self.sy_adv_n)
        self.actor_update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(actor_loss)

        # define the critic
        self.critic_prediction = tf.squeeze(build_mlp(
                                self.sy_ob_no,
                                1,
                                "nn_critic",
                                n_layers=self.n_layers,
                                size=self.size))
        self.sy_target_n = tf.placeholder(shape=[None], name="critic_target", dtype=tf.float32)
        self.critic_loss = tf.losses.mean_squared_error(self.sy_target_n, self.critic_prediction)
        self.critic_update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.critic_loss)

    def sample_trajectories(self, itr, env):
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            animate_this_episode=(len(paths)==0 and (itr % 1 == 0) and self.animate)
            path = self.sample_trajectory(env, animate_this_episode)
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > self.min_timesteps_per_batch:
                break
        return paths, timesteps_this_batch

    def sample_trajectory(self, env, animate_this_episode):
        ob = env.reset()
        obs, acs, rewards, next_obs, terminals = [], [], [], [], []
        steps = 0
        while True:
            if animate_this_episode:
                env.render()
                time.sleep(0.04)
            obs.append(ob)
            ac = self.sess.run(self.sy_sampled_ac, feed_dict={self.sy_ob_no: np.expand_dims(ob, axis=0)}) # YOUR HW2 CODE HERE
            ac = ac[0]
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
            # add the observation after taking a step to next_obs
            # YOUR CODE HERE
            next_obs.append(ob)
            rewards.append(rew)
            steps += 1
            # If the episode ended, the corresponding terminal value is 1
            # otherwise, it is 0
            # YOUR CODE HERE
            if done or steps > self.max_path_length:
                terminals.append(1)
                break
            else:
                terminals.append(0)
        path = {"observation" : np.array(obs, dtype=np.float32),
                "reward" : np.array(rewards, dtype=np.float32),
                "action" : np.array(acs, dtype=np.float32),
                "next_observation": np.array(next_obs, dtype=np.float32),
                "terminal": np.array(terminals, dtype=np.float32)}
        return path

    def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
        """
            Estimates the advantage function value for each timestep.

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
                re_n: length: sum_of_path_lengths. Each element in re_n is a scalar containing
                    the reward for each timestep
                terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
                    at that timestep of 0 if the episode did not end

            returns:
                adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                    advantages whose length is the sum of the lengths of the paths
        """
        # First, estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
        # To get the advantage, subtract the V(s) to get A(s, a) = Q(s, a) - V(s)
        # This requires calling the critic twice --- to obtain V(s') when calculating Q(s, a),
        # and V(s) when subtracting the baseline
        # Note: don't forget to use terminal_n to cut off the V(s') term when computing Q(s, a)
        # otherwise the values will grow without bound.
        # YOUR CODE HERE
        v_s = self.sess.run(self.critic_prediction, feed_dict={self.sy_ob_no: ob_no})
        v_s_next = self.sess.run(self.critic_prediction, feed_dict={self.sy_ob_no: next_ob_no})
        adv_n = re_n + (1 - terminal_n) * self.gamma * v_s_next - v_s

        if self.normalize_advantages:
            adv_n = (adv_n - adv_n.mean()) / adv_n.std() # YOUR HW2 CODE_HERE
        return adv_n

    def update_critic(self, ob_no, next_ob_no, re_n, terminal_n):
        """
            Update the parameters of the critic.

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
                re_n: length: sum_of_path_lengths. Each element in re_n is a scalar containing
                    the reward for each timestep
                terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
                    at that timestep of 0 if the episode did not end

            returns:
                nothing
        """
        # Use a bootstrapped target values to update the critic
        # Compute the target values r(s, a) + gamma*V(s') by calling the critic to compute V(s')
        # In total, take n=self.num_grad_steps_per_target_update*self.num_target_updates gradient update steps
        # Every self.num_grad_steps_per_target_update steps, recompute the target values
        # by evaluating V(s') on the updated critic
        # Note: don't forget to use terminal_n to cut off the V(s') term when computing the target
        # otherwise the values will grow without bound.
        # YOUR CODE HERE
        for _ in range(self.num_target_updates):
            v_s_next = self.sess.run(self.critic_prediction, feed_dict={self.sy_ob_no: next_ob_no})
            target_values = re_n + (1 - terminal_n) * self.gamma * v_s_next

            for _ in range(self.num_grad_steps_per_target_update):
                self.sess.run(self.critic_update_op, feed_dict={self.sy_target_n: target_values, self.sy_ob_no: ob_no})

    def update_actor(self, ob_no, ac_na, adv_n):
        """
            Update the parameters of the policy.

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                ac_na: shape: (sum_of_path_lengths).
                adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                    advantages whose length is the sum of the lengths of the paths

            returns:
                nothing

        """
        self.sess.run(self.actor_update_op,
            feed_dict={self.sy_ob_no: ob_no, self.sy_ac_na: ac_na, self.sy_adv_n: adv_n})


In [5]:
#============================================================================================#
# Utilities
#============================================================================================#

def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=tf.tanh, output_activation=None):
    """
        Builds a feedforward neural network

        arguments:
            input_placeholder: placeholder variable for the state (batch_size, input_size)
            output_size: size of the output layer
            scope: variable scope of the network
            n_layers: number of hidden layers
            size: dimension of the hidden layer
            activation: activation of the hidden layers
            output_activation: activation of the ouput layers

        returns:
            output placeholder of the network (the result of a forward pass)

        Hint: use tf.layers.dense
    """
    inputs = input_placeholder

    with tf.variable_scope(scope):
        for layer in range(n_layers):
            inputs = tf.layers.dense(inputs=inputs, units=size, activation=activation)

        output_placeholder = tf.layers.dense(inputs=inputs, units=output_size, activation=output_activation)

    return output_placeholder

def pathlength(path):
    return len(path["reward"])

def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_AC)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

In [6]:
computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
        }

sample_trajectory_args = {
    'animate': animate,
    'max_path_length': max_path_length,
    'min_timesteps_per_batch': min_timesteps_per_batch,
}

estimate_advantage_args = {
    'gamma': gamma,
    'normalize_advantages': normalize_advantages,
}

agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args) #estimate_return_args

# build computation graph
agent.build_computation_graph()

# tensorflow: config, session, variable initialization
agent.init_tf_sess()

total_timesteps = 0

W0711 18:01:55.977897 140735683527552 deprecation.py:323] From <ipython-input-5-e5e140580e84>:27: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0711 18:01:55.983305 140735683527552 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0711 18:01:56.556996 140735683527552 deprecation.py:323] From <ipython-input-4-7d3ecbc61f3b>:111: multinomial (from tensorflow.python.ops.random_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.random.categorical` instead.
W0711 18:01:56.911659 140735683527552 d

In [7]:
n_iter = 1

try:
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        
        print(paths)
        
        total_timesteps += timesteps_this_batch
    
        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])
    
        # Call tensorflow operations to:
        # (1) update the critic, by calling agent.update_critic
        # (2) use the updated critic to compute the advantage by, calling agent.estimate_advantage
        # (3) use the estimated advantage values to update the actor, by calling agent.update_actor
    
        # YOUR CODE HERE
        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)
        
        returns = [path["reward"].sum() for path in paths]
        print(np.max(returns))
finally:
    env.close()

********** Iteration 0 ************
[{'observation': array([[ 9.2834039e-03,  2.1692464e-02, -4.8822626e-02,  2.8070265e-02],
       [ 9.7172530e-03, -1.7269658e-01, -4.8261221e-02,  3.0495822e-01],
       [ 6.2633213e-03, -3.6709872e-01, -4.2162057e-02,  5.8203864e-01],
       [-1.0786533e-03, -5.6160539e-01, -3.0521285e-02,  8.6114734e-01],
       [-1.2310761e-02, -7.5629872e-01, -1.3298337e-02,  1.1440794e+00],
       [-2.7436735e-02, -9.5124441e-01,  9.5832516e-03,  1.4325626e+00],
       [-4.6461623e-02, -7.5624204e-01,  3.8234502e-02,  1.1428899e+00],
       [-6.1586462e-02, -5.6163996e-01,  6.1092298e-02,  8.6243832e-01],
       [-7.2819263e-02, -3.6740065e-01,  7.8341067e-02,  5.8957285e-01],
       [-8.0167279e-02, -5.6352711e-01,  9.0132527e-02,  9.0586895e-01],
       [-9.1437817e-02, -7.5974631e-01,  1.0824990e-01,  1.2254661e+00],
       [-1.0663275e-01, -9.5608264e-01,  1.3275923e-01,  1.5500102e+00],
       [-1.2575440e-01, -7.6277995e-01,  1.6375943e-01,  1.3015242e+00]

68.0
