## Goal
The goal of this notebook is to flatten the code from CS294-HW2, parameters will be hard coded

In [1]:
# Load libraries
import numpy as np
import tensorflow as tf
import gym
import time

In [2]:
# Set parameters

# the gym environment name
#env_name = "InvertedPendulum-v2"
#env_name = "Breakout-ram-v0"
env_name = "CartPole-v1"

# number of hidden layers
n_layers = 2

# random seed
seed_init = 0

# number of interations
n_iter = 100

# maximum steps can be taken by the simulation
max_path_length = None

# dimension of the hidden layer
size = 64

# default LR
learning_rate = 5e-3
#learning_rate = 5e-1

# number of iterations in one update of critic
num_target_updates = 10

# number of gradients per iteration 
num_grad_steps_per_target_update = 10

# corresponds to 'render', whether to animate the trajactory or not
animate = "store_true"

# the minimum sum of total timestep for all trajectories in a single run
min_timesteps_per_batch = 1000

# discount over future reward, assume no discount for now
gamma = 0.99

# Whether the advantage should be normalized or not
normalize_advantages = "store_true"

# whether we use reward_to_go as baseline or not
reward_to_go = "store_true"
#reward_to_go = None

# whether we like baseline or not
nn_baseline = "store_true"

# iterations between animation
animation_interval = 10

In [3]:
env = gym.make(env_name)
seed = seed_init + 10*1

# Set random seeds
tf.set_random_seed(seed)
np.random.seed(seed)
env.seed(seed)

# Maximum length for episodes
max_path_length = max_path_length or env.spec.max_episode_steps

# Is this env continuous, or self.discrete?
discrete = isinstance(env.action_space, gym.spaces.Discrete)

# Observation and action sizes
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

In [4]:
class Agent(object):
    def __init__(self, computation_graph_args, sample_trajectory_args, estimate_return_args):
        super(Agent, self).__init__()
        self.ob_dim = computation_graph_args['ob_dim']
        self.ac_dim = computation_graph_args['ac_dim']
        self.discrete = computation_graph_args['discrete']
        self.size = computation_graph_args['size']
        self.n_layers = computation_graph_args['n_layers']
        self.learning_rate = computation_graph_args['learning_rate']

        self.animate = sample_trajectory_args['animate']
        self.max_path_length = sample_trajectory_args['max_path_length']
        self.min_timesteps_per_batch = sample_trajectory_args['min_timesteps_per_batch']
        self.animation_interval = sample_trajectory_args['animation_interval']

        self.gamma = estimate_return_args['gamma']
        self.reward_to_go = estimate_return_args['reward_to_go']
        self.nn_baseline = estimate_return_args['nn_baseline']
        self.normalize_advantages = estimate_return_args['normalize_advantages']
        
        self.timesteps_this_batch = 0

    def init_tf_sess(self):
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        self.sess = tf.Session(config=tf_config)
        self.sess.__enter__() # equivalent to `with self.sess:`
        tf.global_variables_initializer().run() #pylint: disable=E1101

    #========================================================================================#
    #                           ----------PROBLEM 2----------
    #========================================================================================#
    def define_placeholders(self):
        """
            Placeholders for batch observations / actions / advantages in policy gradient
            loss function.
            See Agent.build_computation_graph for notation

            returns:
                sy_ob_no: placeholder for observations
                sy_ac_na: placeholder for actions - this is the policy pi
                sy_adv_n: placeholder for advantages
        """
        sy_ob_no = tf.placeholder(shape=[None, self.ob_dim], name="ob", dtype=tf.float32)

        if self.discrete:
            # In case a single number is used to represent the action 1 or -1
            sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
        else:
            # Simultaneously actions at the same time
            sy_ac_na = tf.placeholder(shape=[None, self.ac_dim], name="ac", dtype=tf.float32)

        # YOUR CODE HERE
        # Average advantage across all 
        sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
        steps = tf.placeholder(shape=[None], name="steps", dtype=tf.float32)

        return sy_ob_no, sy_ac_na, sy_adv_n, steps


    #========================================================================================#
    #                           ----------PROBLEM 2----------
    #========================================================================================#
    def policy_forward_pass(self, sy_ob_no):
        """ Constructs the symbolic operation for the policy network outputs,
            which are the parameters of the policy distribution p(a|s)

            arguments:
                sy_ob_no: (batch_size, self.ob_dim)

            returns:
                the parameters of the policy.

                if discrete, the parameters are the logits of a categorical distribution
                    over the actions
                    sy_logits_na: (batch_size, self.ac_dim)

                if continuous, the parameters are a tuple (mean, log_std) of a Gaussian
                    distribution over actions. log_std should just be a trainable
                    variable, not a network output.
                    sy_mean: (batch_size, self.ac_dim)
                    sy_logstd: (self.ac_dim,)

            Hint: use the 'build_mlp' function to output the logits (in the discrete case)
                and the mean (in the continuous case).
                Pass in self.n_layers for the 'n_layers' argument, and
                pass in self.size for the 'size' argument.
        """
        if self.discrete:
            # YOUR HW2 CODE HERE
            sy_logits_na = build_mlp(sy_ob_no, self.ac_dim, "policy_forward_pass", self.n_layers, self.size)
            return sy_logits_na
        else:
            # YOUR HW2 CODE HERE
            sy_mean = build_mlp(sy_ob_no, self.ac_dim, "policy_forward_pass", self.n_layers, self.size)
            sy_logstd = tf.get_variable("logstd", shape=[self.ac_dim], trainable=True)
            return (sy_mean, sy_logstd)

    #========================================================================================#
    #                           ----------PROBLEM 2----------
    #========================================================================================#
    def sample_action(self, policy_parameters):
        """ Constructs a symbolic operation for stochastically sampling from the policy
            distribution

            arguments:
                policy_parameters
                    if discrete: logits of a categorical distribution over actions
                        sy_logits_na: (batch_size, self.ac_dim)
                    if continuous: (mean, log_std) of a Gaussian distribution over actions
                        sy_mean: (batch_size, self.ac_dim)
                        sy_logstd: (self.ac_dim,)

            returns:
                sy_sampled_ac:
                    if discrete: (batch_size,)
                    if continuous: (batch_size, self.ac_dim)

            Hint: for the continuous case, use the reparameterization trick:
                 The output from a Gaussian distribution with mean 'mu' and std 'sigma' is

                      mu + sigma * z,         z ~ N(0, I)

                 This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
        """
        if self.discrete:
            sy_logits_na = policy_parameters
            # YOUR HW2 CODE_HERE
            samples = tf.multinomial(logits=sy_logits_na, num_samples=1) # output of shape [batch_size, num_samples]
            sy_sampled_ac = tf.reshape(samples, [-1]) # flatten to be of shape [batch_size]
        else:
            sy_mean, sy_logstd = policy_parameters
            # YOUR HW2 CODE_HERE
            z = tf.random_normal(tf.shape(sy_mean), mean=0.0, stddev=1.0)
            sy_std = tf.exp(sy_logstd)
            sy_sampled_ac = sy_mean + sy_std * z
        return sy_sampled_ac

    #========================================================================================#
    #                           ----------PROBLEM 2----------
    #========================================================================================#
    def get_log_prob(self, policy_parameters, sy_ac_na):
        """ Constructs a symbolic operation for computing the log probability of a set of actions
            that were actually taken according to the policy

            arguments:
                policy_parameters
                    if discrete: logits of a categorical distribution over actions
                        sy_logits_na: (batch_size, self.ac_dim)
                    if continuous: (mean, log_std) of a Gaussian distribution over actions
                        sy_mean: (batch_size, self.ac_dim)
                        sy_logstd: (self.ac_dim,)

                sy_ac_na: 
                    if discrete: (batch_size,)
                    if continuous: (batch_size, self.ac_dim)

            returns:
                sy_logprob_n: (batch_size)

            Hint:
                For the discrete case, use the log probability under a categorical distribution.
                For the continuous case, use the log probability under a multivariate gaussian.
        """
        if self.discrete:
            sy_logits_na = policy_parameters
            # YOUR HW2 CODE_HERE
            # Softmax the raw policy_parameters to become probabilities which can sum up to 1
            # Then times the action dimension
            sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na)
        else:
            sy_mean, sy_logstd = policy_parameters
            # YOUR HW2 CODE_HERE
            # calculate the z score of the sampled actions under the policy
            sy_z = (sy_ac_na - sy_mean) / tf.exp(sy_logstd)
            # this maximizes likelihood by pushing z towards 0 (mean of distribution)
            sy_logprob_n = -0.5 * tf.reduce_mean(tf.square(sy_z), axis=1)
        return sy_logprob_n

    def build_computation_graph(self):
        """
            Notes on notation:

            Symbolic variables have the prefix sy_, to distinguish them from the numerical values
            that are computed later in the function

            Prefixes and suffixes:
            ob - observation
            ac - action
            _no - this tensor should have shape (batch self.size /n/, observation dim)
            _na - this tensor should have shape (batch self.size /n/, action dim), this is the policy tensor
            _n  - this tensor should have shape (batch self.size /n/)

            Note: batch self.size /n/ is defined at runtime, and until then, the shape for that axis
            is None

            ----------------------------------------------------------------------------------
            loss: a function of self.sy_logprob_n and self.sy_adv_n that we will differentiate
                to get the policy gradient.
        """
        self.sy_ob_no, self.sy_ac_na, self.sy_adv_n, self.steps = self.define_placeholders()

        # The policy takes in an observation and produces a distribution over the action space
        # sy_ob_no serves as a sequence of state, then result is a new policy (probability of action)
        # based on this sequence of state
        # each row in policy_parameters is a pi(at | st) with the st given by sy_ob_no
        # 
        self.policy_parameters = self.policy_forward_pass(self.sy_ob_no)

        # We can sample actions from this action distribution.
        # This will be called in Agent.sample_trajectory() where we generate a rollout.
        self.sy_sampled_ac = self.sample_action(self.policy_parameters)

        # We can also compute the logprob of the actions that were actually taken by the policy
        # This is used in the loss function.
        
        ### Comment ### 
        #### policy_parameters is the probability of action according to policy
        #### self.sy_ac_na is the actual action label taken in the sampling
        self.sy_logprob_n = self.get_log_prob(self.policy_parameters, self.sy_ac_na)

        #========================================================================================#
        #                           ----------PROBLEM 2----------
        # Loss Function and Training Operation
        #========================================================================================#
        loss = None # YOUR CODE HERE
        ## add a minus to make this problem a minimum problem
        #loss = - tf.reduce_sum(self.sy_logprob_n) * tf.reduce_sum(self.sy_adv_n)
        loss = - tf.reduce_sum(self.sy_logprob_n * self.sy_adv_n)
        loss = tf.math.divide(loss, self.steps)
        self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

        #========================================================================================#
        #                           ----------PROBLEM 6----------
        # Optional Baseline
        #
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        #========================================================================================#
        if self.nn_baseline:
            self.baseline_prediction = tf.squeeze(build_mlp(
                                    self.sy_ob_no,
                                    1,
                                    "nn_baseline",
                                    n_layers=self.n_layers,
                                    size=self.size))
            # YOUR_CODE_HERE
            
            self.sy_target_n = tf.placeholder(shape=[None], name="baseline_target", dtype=tf.float32)
            baseline_loss = tf.losses.mean_squared_error(self.sy_target_n, self.baseline_prediction)

            self.baseline_update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(baseline_loss)

    def sample_trajectories(self, itr, env):
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            animate_this_episode=(len(paths)==0 and (itr % self.animation_interval == 0) and self.animate)
            path = self.sample_trajectory(env, animate_this_episode)
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > self.min_timesteps_per_batch:
                self.timesteps_this_batch = timesteps_this_batch
                break
        return paths, timesteps_this_batch

    def sample_trajectory(self, env, animate_this_episode):
        ob = env.reset()
        obs, acs, rewards = [], [], []
        steps = 0
        while True:
            if animate_this_episode:
                env.render()
                time.sleep(0.1)
            obs.append(ob)
           
            #====================================================================================#
            #                           ----------PROBLEM 3----------
            #====================================================================================#
            
            
            # expand the observation to a 2-D array and sample the action based on it
            ac = self.sess.run(self.sy_sampled_ac, 
                               feed_dict={self.sy_ob_no: np.expand_dims(ob, axis=0)}) # YOUR HW2 CODE HERE
            ac = ac[0]
            
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
            rewards.append(rew)
            steps += 1
            if done or steps > self.max_path_length:
                break
        path = {"observation" : np.array(obs, dtype=np.float32),
                "reward" : np.array(rewards, dtype=np.float32),
                "action" : np.array(acs, dtype=np.float32)}
        return path

    #====================================================================================#
    #                           ----------PROBLEM 3----------
    #====================================================================================#
    def sum_of_rewards(self, re_n):
        """
            Monte Carlo estimation of the Q function.

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                re_n: length: num_paths. Each element in re_n is a numpy array
                    containing the rewards for the particular path

            returns:
                q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                    whose length is the sum of the lengths of the paths

            ----------------------------------------------------------------------------------

            Your code should construct numpy arrays for Q-values which will be used to compute
            advantages (which will in turn be fed to the placeholder you defined in
            Agent.define_placeholders).

            Recall that the expression for the policy gradient PG is

                  PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]

            where

                  tau=(s_0, a_0, ...) is a trajectory,
                  Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
                  and b_t is a baseline which may depend on s_t.

            You will write code for two cases, controlled by the flag 'reward_to_go':

              Case 1: trajectory-based PG

                  (reward_to_go = False)

                  Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
                  entire trajectory (regardless of which time step the Q-value should be for).

                      For this case, the policy gradient estimator is

                          E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]

                  where

                      Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.

                  Thus, you should compute

                      Q_t = Ret(tau)

              Case 2: reward-to-go PG

                  (reward_to_go = True)

                  Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
                  from time step t. Thus, you should compute

                      Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}


            Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
            like the 'ob_no' and 'ac_na' above.
        """
        
        q_n = []
        if self.reward_to_go:
            for ri in re_n:
                disc = 1
                sum = 0
                q = []
                for rj in reversed(ri):
                    sum += rj*disc
                    disc *= gamma
                    q.insert(0,sum)
                q_n += q
        else:
            for ri in re_n:
                disc = 1
                sum = 0
                for rj in ri:
                    sum += rj*disc
                    disc *= gamma
                    q_n.append(sum)
        return np.array(q_n)

    def compute_advantage(self, ob_no, q_n):
        """
            Computes advantages by (possibly) subtracting a baseline from the estimated Q values

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                    whose length is the sum of the lengths of the paths

            returns:
                adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                    advantages whose length is the sum of the lengths of the paths
        """
        #====================================================================================#
        #                           ----------PROBLEM 6----------
        # Computing Baselines
        #====================================================================================#
        if self.nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current batch of Q-values. (Goes with Hint
            # #bl2 in Agent.update_parameters.
            
            
            #raise NotImplementedError
            ### TODO
            b_n = self.sess.run(self.baseline_prediction,
                          feed_dict={self.sy_ob_no: ob_no})
            b_n = (b_n - q_n.mean()) / q_n.std()
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()
        return adv_n

    def estimate_return(self, ob_no, re_n):
        """
            Estimates the returns over a set of trajectories.

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                re_n: length: num_paths. Each element in re_n is a numpy array
                    containing the rewards for the particular path

            returns:
                q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                    whose length is the sum of the lengths of the paths
                adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                    advantages whose length is the sum of the lengths of the paths
        """
        q_n = self.sum_of_rewards(re_n)
        adv_n = self.compute_advantage(ob_no, q_n)
        #====================================================================================#
        #                           ----------PROBLEM 3----------
        # Advantage Normalization
        #====================================================================================#
        #if self.normalize_advantages:
        # On the next line, implement a trick which is known empirically to reduce variance
        # in policy gradient methods: normalize adv_n to have mean zero and std=1.
        
        if self.normalize_advantages:
            #mean = sum(adv_n) / len(adv_n)
            #std = (sum([(ai - mean)**2 for ai in adv_n]) / len(adv_n)) ** .5
            adv_n = np.array(adv_n)
            adv_n = (adv_n - adv_n.mean()) / adv_n.std() # YOUR HW2 CODE_HERE

        return q_n, adv_n

    def update_parameters(self, ob_no, ac_na, q_n, adv_n):
        """
            Update the parameters of the policy and (possibly) the neural network baseline,
            which is trained to approximate the value function.

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                ac_na: shape: (sum_of_path_lengths).
                q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                    whose length is the sum of the lengths of the paths
                adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                    advantages whose length is the sum of the lengths of the paths

            returns:
                nothing

        """
        #====================================================================================#
        #                           ----------PROBLEM 6----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if self.nn_baseline:
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 in
            # Agent.compute_advantage.)

            # YOUR_CODE_HERE
            # TODO: Why q_n is target
            target_n = (q_n - q_n.mean()) / q_n.std() # YOUR HW2 CODE_HERE
            self.sess.run(self.baseline_update_op,
                      feed_dict={self.sy_ob_no: ob_no, self.sy_target_n: target_n})


        #====================================================================================#
        #                           ----------PROBLEM 3----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the currenat batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        
        ####Comment####
        # run the Adam optimizer we defined above with everything updated
        self.sess.run(self.update_op,
                      feed_dict={self.sy_ob_no: ob_no, self.sy_ac_na: ac_na, self.sy_adv_n: adv_n, self.steps : np.array([1000])})

In [5]:
def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=tf.tanh, output_activation=None):
    """
        Builds a feedforward neural network

        arguments:
            input_placeholder: placeholder variable for the state (batch_size, input_size)
            output_size: size of the output layer
            scope: variable scope of the network
            n_layers: number of hidden layers
            size: dimension of the hidden layer
            activation: activation of the hidden layers
            output_activation: activation of the ouput layers

        returns:
            output placeholder of the network (the result of a forward pass)

        Hint: use tf.layers.dense
    """

    # YOUR CODE HERE
    # with variable scope _scope
    batch_size = input_placeholder[1]
    input_size = input_placeholder[2]

    # network = tf.keras.models.Sequential([
    # tf.keras.layers.Flatten(input_shape=input_size),
    # tf.keras.layers.Dense(size, activation=activation),
    # tf.keras.layers.Dropout(0.2),
    # tf.keras.layers.Dense(output_size, activation=output_activation)
    # ])
    #input_layer = tf.reshape(input_placeholder, [-1, 28, 28, 1])
    input_layer = input_placeholder
    hidden_input = input_layer

    for i in range(0, n_layers):
        hidden_input = tf.layers.dense(inputs=hidden_input, units=size, activation=activation)

    output_placeholder = tf.layers.dense(inputs=hidden_input, units=output_size, activation=output_activation)

    return output_placeholder

def pathlength(path):
    return len(path["reward"])

In [6]:
computation_graph_args = {
    'n_layers': n_layers,
    'ob_dim': ob_dim,
    'ac_dim': ac_dim,
    'discrete': discrete,
    'size': size,
    'learning_rate': learning_rate
}

sample_trajectory_args = {
    'animate': animate,
    'max_path_length': max_path_length,
    'min_timesteps_per_batch': min_timesteps_per_batch,
    'animation_interval' : animation_interval
}

estimate_return_args = {
    'gamma': gamma,
    'reward_to_go': reward_to_go,
    'nn_baseline': nn_baseline,
    'normalize_advantages': normalize_advantages,
}

agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

# build computation graph
agent.build_computation_graph()

# tensorflow: config, session, variable initialization
agent.init_tf_sess()

#========================================================================================#
# Training Loop
#========================================================================================#

total_timesteps = 0

W0722 23:24:11.038002 140735683527552 deprecation.py:323] From <ipython-input-5-afc69693cd93>:36: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0722 23:24:11.043020 140735683527552 deprecation.py:506] From /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0722 23:24:11.561825 140735683527552 deprecation.py:323] From <ipython-input-4-13851732996c>:128: multinomial (from tensorflow.python.ops.random_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.random.categorical` instead.
W0722 23:24:11.875457 140735683527552 d

In [7]:
n_iter = 100

for itr in range(n_iter):
    print("********** Iteration %i ************"%itr)
    paths, timesteps_this_batch = agent.sample_trajectories(itr, env)   
    total_timesteps += timesteps_this_batch

    # Build arrays for observation, action for the policy gradient update by concatenating
    # across paths
    ob_no = np.concatenate([path["observation"] for path in paths])
    ac_na = np.concatenate([path["action"] for path in paths])
    re_n = [path["reward"] for path in paths]

    q_n, adv_n = agent.estimate_return(ob_no, re_n)

    agent.update_parameters(ob_no, ac_na, q_n, adv_n)
    
    returns = [path["reward"].sum() for path in paths]
    
    print("timesteps: ", agent.timesteps_this_batch);    
    print("returns:", np.mean(returns))

********** Iteration 0 ************
timesteps:  1004
returns: 27.88889
********** Iteration 1 ************
timesteps:  1026
returns: 35.37931
********** Iteration 2 ************
timesteps:  1008
returns: 37.333332
********** Iteration 3 ************
timesteps:  1028
returns: 54.105263
********** Iteration 4 ************
timesteps:  1017
returns: 63.5625
********** Iteration 5 ************
timesteps:  1046
returns: 69.73333
********** Iteration 6 ************
timesteps:  1033
returns: 64.5625
********** Iteration 7 ************
timesteps:  1011
returns: 67.4
********** Iteration 8 ************
timesteps:  1046
returns: 74.71429
********** Iteration 9 ************
timesteps:  1041
returns: 74.35714
********** Iteration 10 ************
timesteps:  1079
returns: 77.07143
********** Iteration 11 ************
timesteps:  1007
returns: 100.7
********** Iteration 12 ************
timesteps:  1043
returns: 173.83333
********** Iteration 13 ************
timesteps:  1109
returns: 221.8
********** 

KeyboardInterrupt: 

## Softmax:

**Softmax** $(x_{i}) = e^{x_i}  \sum_{j}e^{x_j}$

Convert logits into probabilities which can sum up to one

In [None]:
# Test softmax
import numpy as np
import math

x = [ 0.8360188,   0.11314284,  0.05083836]

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

print(softmax(x))

## Cross Entropy

**CrossEntropy** $(x) = -\sum_{i} p(x_i) \log q(x_i)$

maximizing the likelihood of training set is the same as minimizing the cross entropy.

p(x) in cross-entropy equation is true distribution, while q(x) is the distribution obtained from softmax

## Playground

In [61]:
reward1 = [1,2,3,4]
path1 = {"reward" : np.array(reward1, dtype=np.float32)}
reward2 = [3,4,5,6]
path2 = {"reward" : np.array(reward2, dtype=np.float32)}
paths = [path1, path2]
r = [path["reward"] for path in paths]

In [84]:
q_n = []
gamma = 0.99
for ri in r:
    disc = 1
    sum = 0
    q = []
    for rj in reversed(ri):
        sum += rj*disc
        disc *= gamma
        #q_n.append(sum)
        q.insert(0,sum)
    q_n += q
    
#     for rj in ri:

        
q_n

[9.900499, 8.9302, 6.97, 4.0, 17.781297, 14.8704, 10.95, 6.0]

In [2]:
#import math
a = [1,2,3,4,5]



In [3]:
mean

3.0

1.4142135623730951