# Deep RL - Policy Gradient

This notebook implements the policy gradient algorithm applied on several gym environments.

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from multiprocessing import Process
import time
import gym
from pprint import pprint

## Build the DNN for Policy, Value fn Estimator.

Build the computation graph for DNN, which can be used as our policy as well as the value function estimator.

In [3]:
def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=None, output_activation=None):
    """
    Builds a feed foreward Neural Network
    
    Arguments:
        input_placeholder - placeholder for the state observation - [batch_size, input_size]
        output_size - size of the output place holder
        scope - variable scope for the computation graph.
        n_layers - number of layers in the NN
        size - no of neurons in the hidden layers
        activatio_fn - activation function to be used in hidden layers.
        output_activation - activation function to be used in the output layer.
    Returns:
        output_placeholder - result of a forward pass.
    """
    
    with tf.variable_scope(scope):
        
        layer = input_placeholder
        for n in range(n_layers):
            layer = tf.layers.dense(layer, size, activation=activation)
            
        output_placeholder = tf.layers.dense(layer, output_size)
        
    return output_placeholder

### The Agent

The RL agent that interacts with the environment. collect experiences and improve itself to maximize the reward.

In [4]:
EPSILON = 1e-8

def pathlength(path):
    return len(path['reward'])

In [5]:
class Agent(object):
    
    def __init__(self, computation_graph_args, sample_trajectory_args, estimate_return_args):
        """
        computation_graph_args: params used to define the policy/value fn approximator computatin graph
        sample_trajectory_args : params used to sample trajectory from the environment.
        estimate_return_rgs : params used to estimate the returns/rewards of the sampled trajectory
        """
        
        # Computation Graph Params
        self.ob_dim = computation_graph_args['ob_dim']
        self.ac_dim = computation_graph_args['ac_dim']
        self.discrete = computation_graph_args['discrete']
        self.size = computation_graph_args['size']
        self.n_layers = computation_graph_args['n_layer']
        self.learning_rate = computation_graph_args['learning_rate']
        
        # Sample Trajectory Params
        self.animate = sample_trajectory_args['animate']
        self.max_length_path = sample_trajectory_args['max_path_length']
        self.min_timsteps_per_batch = sample_trajectory_args['min_timesteps_per_batch']
        
        # Estimate returns params
        self.gamma = estimate_return_args['gamma']
        self.reward_to_go = estimate_return_args['reward_to_go']
        self.nn_baseline = estimate_return_args['nn_baseline']
        self.normalize_advantages = estimate_return_args['normalize_advantages']
        
        
    def init_tf_sess(self):
        
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        self.sess = tf.Session(config=tf_config)
        self.sess.__enter__()
        tf.global_variables_initializer().run()

        
    def define_placeholders(self):
        """
        placeholders for the inputs such as batch observations, actions and advantages in policy gradient loss function.
        """
        
        sy_ob_no = tf.placeholder(dtype=tf.float32, shape=[None, self.ob_dim], name='observation')
        
        if self.discrete:
            sy_ac_na = tf.placeholder(dtype=tf.int32, shape=[None], name='action')
        else:
            sy_ac_na = tf.placeholder(dtype=tf.float32, shap=e[None, self.ac_dim], name='actions')
        
        sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name='advantage')
        
        return sy_ob_no, sy_ac_na, sy_adv_n
    
    def policy_forward_pass(self, sy_ob_no):
        """
        Construct the symbolic operations/computation graph for the policy network outputs. which are the parameters of the
        policy distribution p(a|s)
        
        arguments:
            sy_ob_no: (batch_size, self.ob_dim)
            
        returns:
            the parameters of the policy
            
            if discrete, the parameters are the logits of a categorical distribution over the action
            
                sy_logits_na: (batchsize, self.ac_dim)
            
            if continous, the parameters are a tuple (mean, log_std) of a Guassian
                distribution over actions, log_std should be a trainable variable.
            
        """
        variable_scope = 'nn_policy'
        if self.discrete:
            sy_logits_na = build_mlp(sy_ob_no, self.ac_dim, variable_scope, self.n_layers, self.size, activation=tf.nn.relu, output_activation=None)
            return sy_logits_na
        else:
            sy_mean = build_mlp(sy_ob_no, self.ac_dim, variable_scope, self.n_layers, self.size, activation=tf.nn.relu, output_activatoin=None)
            sy_logstd = tf.Variable(tf.zeros(self.ac_dim), name='sy_logstd')
            return sy_mean, sy_logstd
        
        
    def sample_action(self, policy_parameters):
        """
        Constructs a symbolic operation for stochastically sampling from the policy distribution. 
        
        arguments:
            policy_parameters:
                if discrete, logits of categorical distribution over actions
                    sy_logits_na: (batch_size, self.ac_dim)
                if continous, (mean, log_std) of Guassian distribution over actions.
                    sy_mean: (batch_size, self.ac_dim)
                    sy_logstd: (self.ac_dim, )
                    
        returns:
            sy_sampled_ac:
                if discrete, (batch_size)
                if continous, (batch_size, self.ac_dim)
        """
        
        with tf.variable_scope('sampled_action'):
            if self.discrete:
                sy_logits_na = policy_parameters
                sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)
            else:
                sy_mean, sy_logstd = policy_parameters
                sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(shape=tf.shape(sy_mean))
                
        return sy_sampled_ac
    
    
    def get_log_prob(self, policy_parameters, sy_ac_na):
        """
        Constructs a symbolic operation for computing the log probabilities of a set of actions that were actually taken
        according to the policy
        
        arguments:
            policy_parameters
                if discrete, logits of categorical distribution over actions
                    sy_logits_na: (batch_size, self.ac_dim)
                if continours, (mean, log_std) of a Guassian distribution over actions.
                    sy_mean: (batc_size, self.ac_dim)
                    sy_logstd: (self.ac_dim,)
            sy_ac_na:
                if discrete: (batch_size, )
                if continous: (batch_size, self.ac_dim)
                    
        returns:
            sy_logprob_n: (batchsize)
            
        """
        if self.discrete:
            sy_logits_na = policy_parameters
            sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=sy_logits_na, labels=sy_ac_na)
        else:
            sy_mean, sy_logstd = policy_parameters
            probabilities = tf.distributions.Normal(sy_mean, tf.exp(sy_logstdg)).prob(sy_ac_na)
            sy_logprob_n = tf.log(tf.reduce_prod(probabilities, axis=1))
        
        return sy_logprob_n
    
    def build_computation_graph(self):
        """
        build full computation graph of the training.
        """
        
        # input placeholders
        self.sy_ob_no, self.sy_ac_na, self.sy_adv_n = self.define_placeholders()
        
        # the policy takes in an observation and produces a distribution over actions
        self.policy_parameters = self.policy_forward_pass(self.sy_ob_no)
        
        # We can sample action from this action distribution
        # this is used in Agent.sample_trajectory() where we generate a rollout
        self.sy_sampled_ac = self.sample_action(self.policy_parameters)
        
        # We can also compute the log probability of actions that were actually taken by the policy
        # This is used in loss function
        self.sy_logprob_n = self.get_log_prob(self.policy_parameters, self.sy_ac_na)
        
        # THE LOSS FUNCTION
        with tf.variable_scope("log_probability_weighted_by_advantage"):
            sy_weighted_logprob_n = tf.multiply(self.sy_logprob_n, self.sy_adv_n)
            
        with tf.variable_scope("loss"):
            self.sy_loss = tf.reduce_mean(sy_weighted_logprob_n)
            
        self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.sy_loss)
        
        # In case of baseline estimation using another neural network
        if self.nn_baseline:
            
            self.baseline_prediction = tf.squeeze(build_mlp(
                                        self.sy_ob_no,
                                        1,
                                        "nn_baseline",
                                        n_layers=self.n_layers,
                                        size=self.size
                                    ))
            self.sy_target_n = tf.placeholder(dtype=tf.float32, shape=[None], name='reward_label')
            self.baseline_loss =tf.losses.mean_squared_error(self.sy_target_n, self.baseline_prediction, scope='nn_baseline_loss')
            self.baseline_update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.baseline_loss)
            
        
    def sample_trajectories(self, itr, env):
        """
        Collect paths until we have enough timesteps.
        """
        timesteps_this_batch = 0
        paths = []
        
        while True:
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and self.animate)
            path = self.sample_trajectory(env, animate_this_episode)
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > self.min_timsteps_per_batch:
                break
        return paths, timesteps_this_batch
            
            
    def sample_trajectory(self, env, animate_this_episode):
        
        ob = env.reset()
        obs, acs, rewards = [], [], []
        steps = 0
        while True:
            if animate_this_episode:
                env.render()
                time.sleep(0.1)
            obs.append(ob)
            
            ac, policy_parameters = self.sess.run([self.sy_sampled_ac, self.policy_parameters], feed_dict={self.sy_ob_no: ob[None, :]})
            ac = ac[0]
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
            rewards.append(rew)
            steps += 1
            
            if done or steps > self.max_length_path:
                break
                
        path = {
            "observation": np.array(obs, dtype=np.float32),
            "reward": np.array(rewards, dtype=np.float32),
            "action": np.array(acs, dtype=np.float32)
        }
        
        return path
    
    def sum_of_rewards(self, re_n):
        """
        Monte Carlo estimation of the Q function.
        
        arguments:
            re_n: lenghth: num_paths, Each element in re_n is numpy array containing the rewards for particular path
            
        returns:
            q_n: shape: (sum_of_path_lengths): A single vector of estimated Q values whole length is the sum of the
                lengths of the paths.
        """
        
        q_n = []
        if self.reward_to_go:
            for path_i in re_n:
                reversed_path = np.flip(path_i)
                reward_i = []
                curr_reward = 0
                for r in reversed_path:
                    curr_reward = curr_reward * self.gamma + r
                    reward_i.append(curr_reward)
                reward_i.reverse()
                q_n += reward_i
        else: # At every timestep, gradient is weighted by the full reward.
            for path_i in re_n:
                reversed_path = np.flip(path_i)
                reward_i = []
                curr_reward = 0
                for r in reversed_path:
                    curr_reward = curr_reward * self.gamma + r
                reward_i = [curr_reward for i in range(len(path_i))]
                q_n += reward_i
        return np.array(q_n)
    
    def compute_advantage(self, ob_no, q_n):
        """
        Compute advantages by possibly subtracting a baseline from estimated q values.
        arguments: 
            ob_no: shape: (sum_of_path_lengths, ob_dim)
            q_n: shape: (sum_of_path_lengths), A single vector of estimated q values.
                
        returns:
            adv_n: shape: (sum_of_path_lengths), A single vector of the estimated advantages whose lenght is thes sum of length
                    of paths
        """
        
        if self.nn_baseline: # basline estimated with NN
            b_n = self.sess.run(self.baseline_prediction, feed_dict={self.sy_ob_no: ob_no})
            b_n = (b_n - b_n.mean()) / (b_n.std() + EPSILON) * q_n.std() + q_n.mean() # rescale b_n to match the statistics of q_n
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()
        return adv_n
    
    def estimate_returns(self, ob_no, re_n):
        """
        Estimate the returns over a set of trajectories.
        
        arguments: 
            ob_no: shape: (sum_of_path_length, ob_dim)
            re_n: length: num_paths, each element is a numpy array containing rewards of a paricular path.
            
        returns:
            q_n: shape: (sum_of_path_lengths), A single vector of estimated Q values
            adv_n: shape: (sum_of_path_lengths),  A single vector of estimated Advantages.
        """
        
        q_n = self.sum_of_rewards(re_n)
        adv_n = self.compute_advantage(ob_no, q_n)
        
        # normalize the advantages to reduce the variance.
        if self.normalize_advantages:
            adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + EPSILON)
            
        return q_n, adv_n
    
    def update_parameters(self, ob_no, ac_na, q_n, adv_n):
        """
        Update the parameters of the policy and (possibly) the neural network baseline.
        
        argumenst: 
            ob_no: (sum_of_path_lengths, ob_dim)
            ac_na: (sum_of_path_lengths)
            q_n: (sum_of_path_lengths)
            adv_n: (sum_of_path_lengths)
            
        returns:
            nothing
        """
        
        # Optimising NN baseling
        if self.nn_baseline:
            target_n = (q_n - q_n.mean()) / (q_n.std() + EPSILON)
            _, target_loss = self.sess.run([self.baseline_update_op, self.baseline_loss], feed_dict={self.sy_ob_no: ob_no, self.sy_target_n: target_n})
            
        # Policy Update
        _, loss, policy_parameters, logprob_n = self.sess.run([self.update_op, self.sy_loss, self.policy_parameters, self.sy_logprob_n],
                                                             feed_dict={self.sy_ob_no: ob_no, self.sy_ac_na: ac_na, self.sy_adv_n: adv_n})
        return loss   

### Training the Agent

* Make the Gym Environment
* Initialize the observation and action shapes.
* Initialize computation graph params ( for setting up the training op ), sample_params ( for sampling trajectories) and estimate_return params ( for estimating rewards and advantages of sampled episodes)
* Initialze the Agent with those params
* Build the computation graph of the agent.
    * Initialize the place holders for observations(inputs), actions(loss) and advantage(loss)
    * Build the forward propagation of the policy - build the MLP - returns logits.
    * Add the graph for sampling action from those logits- Discrete and Continous.
    * Add the graph for getting the log probability between the logits and sampled action - used in loss.
    * Add the graph for log probability weighted advantage.
    * Add the graph for reduce mean of above - loss function.
    * Add the graph of update op, AdamOptimizer
    * Incase of Baseline Neural Network.
        * Build separate graph for baseline estimation
        * Create a place holder for reward label
        * Add the graph for loss function
        * Add the grapg for update op
* Initialize Tensorflow Session
* for iterations:
    * sample multiple trajectories/paths
        * Till we have the desired batchsize of obs, acs, and rewards.
            * sample paths
                * env.reset()
                * sample action by running the sample_ac graph
                * env.step()
                * collect reward
    * Stack sampled paths' obs and acs on top of another. rewards still list of lists(cause we need to calculate discounted sum of rewards which depends on the episode)
    * Estimate the returns. using observations and rewards from sampling
        * q_n : discounted sum of rewards
        * adv_n: advantage. q_n - b_n : b_n with a NN ( obs is used here to pass to the network)
    * Update parameters of Policy and NN Baseline
        * for baseline NN, target is q_n: session run baseline_update_op
        * for policy, session run update_op

In [6]:
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, reward_to_go,
             animate, logdir, normalize_advantages, nn_baseline, seed, n_layers, size):
    """
    Train the PG agent.
    arguments:
        exp_name: name of the experiment
        env_name: name of the gym env
        n_iter: epochs to train
        gamma: discount factor
        min_timesteps_per_batch: how many steps in a batch of training data.
        max_path_length: max path length of a trajectory/episode
        learning_rate: 
        reward_to_go: whether to use reward_to_go or full_reward when doing MC evaluation: True/False
        animate: Render env or not, True/False
        logdir: directory to store logs
        normalize_advantages: whether to normalize the advantanges calculated.
        nn_baseline: whether to use a NN for baseline estimation
        seed : random seet
        n_layers : number of layers in the MLP
        size : number neurons in the hidden layer of MLP
    """
    
    start = time.time()
    
    # Make the gym environment
    env = gym.make(env_name)
    
    # Set the random seed
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    
    # Maximum lenth of episodes.
    max_path_length = max_path_length or env.spec.max_episode_steps
    
    # Is this env continous or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    
    # Observation and action sizes.
    ob_dim = env.observation_space.shape[0]
    print("Observation Shape: ", env.observation_space.shape)
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print("Action Shape: ", env.action_space.n)
    
    
    # Initializing the agent
    computation_graph_args = {
        'n_layer': n_layer,
        'ob_dim' : ob_dim,
        'ac_dim' : ac_dim,
        'discrete' : discrete,
        'size' : size,
        'learning_rate' : learning_rate,
    }
    
    sample_trajectory_args={
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch' : min_timesteps_per_batch,
        
    }
    
    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline' : nn_baseline,
        'normalize_advantages' : normalize_advantages
    }
    
    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)
    
    # build the computation grapg
    agent.build_computation_graph()
    
    # tensorflow config, session, variable initialization.
    agent.init_tf_sess()
    
    
    # Training Loop
    
    total_timesteps = 0
    for itr in range(n_iter):
        print("************** Iteration %d ******************" % itr)
        # collect experiences.
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch
        
        # Build arrays of observation, action for the policy gradient update by concatenating accross paths.
        # put all data into a big tensor.
        ob_no = np.concatenate([path['observation'] for path in paths])
        ac_na = np.concatenate([path['action'] for path in paths])
        re_n = [path['reward'] for path in paths]
        print("Observation")
        pprint(ob_no.shape)
        print("Actions")
        pprint(ac_na.shape)
        print("Rewrads")
        pprint(len(re_n))
        
        # estimate q and advantage
        q_n, adv_n = agent.estimate_returns(ob_no, re_n)
        
        # update parameters of policy, nn baseline
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)
        
        # log stats
        returns = [path['reward'].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        print("Time: ", time.time() - start)
        print("Iteration: ,", itr)
        print("Average Return: ", np.mean(returns))
        print("Std Return: ", np.std(returns))
        print("Max Return: ", np.max(returns))
        print("MinReturn: ", np.min(returns))
        print("EplenMean: ,", np.mean(ep_lengths))
        print("EplenStd: ,", np.std(ep_lengths))
        print("TimeStepsThisBatch: ", timesteps_this_batch)
        print("TimeStepsSoFar: ", total_timesteps)
    
    

In [7]:
exp_name = "test"
env_name = 'CartPole-v0'
render = True
gamma = 0.99
n_iter = 20
batch_size = 1000
ep_len = -1
learning_rate = 5e-3
reward_to_go = True
normalize_advantages = True
nn_baseline = True
seed = 42
n_experiments = 1
n_layer = 2
size = 64

In [9]:
processes = []
for i in range(n_experiments):
    
    seed = seed + 10
    print("Running Experiment with seed: ", seed)
    
    def train_func():
        train_PG(
            exp_name=exp_name,
            env_name=env_name,
            n_iter=n_iter,
            gamma=gamma,
            min_timesteps_per_batch=batch_size,
            max_path_length=None,
            learning_rate=learning_rate,
            reward_to_go=reward_to_go,
            animate=render,
            logdir=None,
            normalize_advantages=normalize_advantages,
            nn_baseline=nn_baseline,
            seed=seed,
            n_layers=n_layer,
            size=size
        )
        
    p = Process(target=train_func, args=tuple())
    p.start()
    processes.append(p)
    
for p in processes:
    p.join()

Running Experiment with seed:  62
Observation Shape:  (4,)
Action Shape:  2
************** Iteration 0 ******************
Observation
(1006, 4)
Actions
(1006,)
Rewrads
49
Time:  5.984406232833862
Iteration: , 0
Average Return:  20.530613
Std Return:  8.86659
Max Return:  57.0
MinReturn:  9.0
EplenMean: , 20.53061224489796
EplenStd: , 8.866589526375844
TimeStepsThisBatch:  1006
TimeStepsSoFar:  1006
************** Iteration 1 ******************
Observation
(1017, 4)
Actions
(1017,)
Rewrads
39
Time:  6.835920333862305
Iteration: , 1
Average Return:  26.076923
Std Return:  13.278833
Max Return:  69.0
MinReturn:  11.0
EplenMean: , 26.076923076923077
EplenStd: , 13.278833389028193
TimeStepsThisBatch:  1017
TimeStepsSoFar:  2023
************** Iteration 2 ******************
Observation
(1004, 4)
Actions
(1004,)
Rewrads
36
Time:  7.509249448776245
Iteration: , 2
Average Return:  27.88889
Std Return:  11.034738
Max Return:  54.0
MinReturn:  10.0
EplenMean: , 27.88888888888889
EplenStd: , 11.03