In [1]:
# Reload edited modules automatically before executing each cell.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import tensorflow as tf
import numpy as np
import gym
import time

In [4]:
# --- experiment identity / environment ---
exp_name = "test"
env_name = "CartPole-v0"
seed = 42
n_experiments = 1

# --- training loop ---
n_iter = 1
batch_size = 1000          # minimum number of environment steps gathered per iteration
max_path_length = None     # None -> train_PG falls back to the environment's own limit
render = True

# --- policy / baseline network ---
n_layers = 2
size = 64
learning_rate = 1e-4

# --- return / advantage estimation ---
gamma = 0.99
reward_to_go = True
nn_baseline = True
normalize_advantages = True

In [3]:
class Agent(object):
    """Policy-gradient agent written against the TensorFlow 1.x graph API.

    The agent owns the policy network (and optionally a state-value baseline
    network), samples trajectories from a gym-style environment and turns the
    collected rewards into Q-value and advantage estimates.
    """

    def __init__(self, computation_graph_params, sample_trajectories_params, estimate_returns_params):
        """
        Store the hyper-parameters; no TensorFlow ops are created here.

        arguments:
            computation_graph_params: dict with keys 'ob_dim', 'ac_dim', 'n_layers',
                'size', 'learning_rate', 'discrete'.
            sample_trajectories_params: dict with keys 'max_path_length',
                'timesteps_per_batch', 'animate'.
            estimate_returns_params: dict with keys 'reward_to_go', 'nn_baseline',
                'normalize_advantages', 'gamma'.
        """
        # computation graph params
        self.ob_dim = computation_graph_params['ob_dim']
        self.ac_dim = computation_graph_params['ac_dim']
        self.n_layers = computation_graph_params['n_layers']
        self.size = computation_graph_params['size']
        self.learning_rate = computation_graph_params['learning_rate']
        self.is_discrete = computation_graph_params['discrete']

        # Sample trajectory params.
        # Earlier revisions read the misspelled key 'max_path_lenght'; accept it as a
        # fallback so callers using either spelling keep working.
        if 'max_path_length' in sample_trajectories_params:
            self.max_path_length = sample_trajectories_params['max_path_length']
        else:
            self.max_path_length = sample_trajectories_params['max_path_lenght']
        self.timesteps_per_batch = sample_trajectories_params['timesteps_per_batch']
        self.animate = sample_trajectories_params['animate']

        # Estimate return params
        self.reward_to_go = estimate_returns_params['reward_to_go']
        self.nn_baseline = estimate_returns_params['nn_baseline']
        self.normalize_advantages = estimate_returns_params['normalize_advantages']
        # Misspelled alias kept so code written against the old attribute name still works.
        self.normalize_advatantages = self.normalize_advantages
        self.gamma = estimate_returns_params['gamma']

    def define_placeholders(self):
        """
        define placeholders for the inputs such as observations, actions(in loss), advantage(in loss)

        returns:
            sy_ob_no: float32, shape (batch_size, self.ob_dim)
            sy_ac_na: int32 of shape (batch_size,) if discrete,
                      else float32 of shape (batch_size, self.ac_dim)
            sy_adv_n: float32, shape (batch_size,)
        """
        # observation
        sy_ob_no = tf.placeholder(dtype=tf.float32, shape=[None, self.ob_dim], name='observation')

        # action
        if self.is_discrete:
            sy_ac_na = tf.placeholder(dtype=tf.int32, shape=[None], name='action')
        else:
            sy_ac_na = tf.placeholder(dtype=tf.float32, shape=[None, self.ac_dim], name='actions')

        # advantage
        sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name='advantage')

        return sy_ob_no, sy_ac_na, sy_adv_n

    def build_mlp(self, input_placeholder, output_size, variable_scope, n_layers=2, hidden_size=64, activation=None,
                  output_activation=None):
        """
        build the neural network graph

        arguments:
            input_placeholder: input tensor of shape (batch_size, input_dim)
            output_size: width of the output layer
            variable_scope: name of the variable scope the layers are created in
            n_layers: number of hidden layers
            hidden_size: width of every hidden layer
            activation: activation applied after every hidden layer
            output_activation: activation applied after the output layer
        returns:
            output tensor of shape (batch_size, output_size)
        """
        with tf.variable_scope(variable_scope):
            layer = input_placeholder
            for _ in range(n_layers):
                layer = tf.layers.dense(layer, hidden_size, activation=activation)

            output_placeholder = tf.layers.dense(layer, output_size, activation=output_activation)

        return output_placeholder

    def policy_forward(self, sy_ob_no):
        """
        graph for the forward pass of the policy.

        returns:
            if is_discrete: sy_logits_na, shape (batch_size, self.ac_dim)
            else: (sy_mean, sy_logstd) of a diagonal gaussian, with shapes
                (batch_size, self.ac_dim) and (self.ac_dim,)
        """
        variable_scope = 'nn_policy'
        if self.is_discrete:
            sy_logits_na = self.build_mlp(input_placeholder=sy_ob_no, output_size=self.ac_dim,
                                          variable_scope=variable_scope, n_layers=self.n_layers,
                                          hidden_size=self.size, activation=tf.nn.relu,
                                          output_activation=None)
            return sy_logits_na
        else:  # mean and log-std of gaussian
            sy_mean = self.build_mlp(input_placeholder=sy_ob_no, output_size=self.ac_dim,
                                     variable_scope=variable_scope, n_layers=self.n_layers,
                                     hidden_size=self.size, activation=tf.nn.relu,
                                     output_activation=None)
            # State-independent, trainable log standard deviation (initialised to std = 1).
            sy_logstd = tf.Variable(tf.zeros(self.ac_dim), name='sy_logstd')
            return sy_mean, sy_logstd

    def sample_action(self, policy_parameters):
        """
        graph for sampling action given the policy parameters
        (logits if discrete, otherwise the (mean, logstd) pair).
        """
        with tf.variable_scope('sampled_action'):

            if self.is_discrete:
                sy_sampled_ac = tf.squeeze(tf.multinomial(policy_parameters, 1), axis=1)
            else:
                sy_mean, sy_logstd = policy_parameters
                # reparameterised sample: mean + std * unit gaussian noise
                sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(shape=tf.shape(sy_mean))

            return sy_sampled_ac

    def get_logprob(self, policy_parameters, sy_ac_na):
        """
        graph for computing the log probability of the actions taken under the policy.

        arguments:
            policy_parameters:
                if is_discrete:
                    sy_logits_na: shape(batch_size, self.ac_dim)
                else:
                    sy_mean_na : shape: (batch_size, self.ac_dim)
                    sy_logstd_n : shape: (self.ac_dim, )
            sy_ac_na:
                if is_discrete:
                    shape : (batch_size,)
                else:
                    shape : (batch_size, self.ac_dim)
        returns:
            sy_logprob_n: shape : (batch_size, )
        """
        if self.is_discrete:
            sy_logits_na = policy_parameters
            # The cross entropy is the NEGATIVE log-probability of the taken action,
            # so negate it to return log pi(a|s) in both branches.
            sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=sy_logits_na, labels=sy_ac_na)
        else:
            sy_mean, sy_logstd = policy_parameters
            distribution = tf.distributions.Normal(loc=sy_mean, scale=tf.exp(sy_logstd))
            # Summing per-dimension log-probs avoids the underflow of log(prod(prob)).
            sy_logprob_n = tf.reduce_sum(distribution.log_prob(sy_ac_na), axis=1)

        return sy_logprob_n

    def build_computation_graph(self):
        """
        build the computation graph of the training
        """

        # define input placeholders.
        self.sy_ob_no, self.sy_ac_na, self.sy_adv_n = self.define_placeholders()

        # build the graph forward pass through the policy
        self.policy_parameters = self.policy_forward(self.sy_ob_no)

        # add graph for sampling actions from the policy.
        self.sy_sampled_ac = self.sample_action(self.policy_parameters)

        # add graph for getting log probability of the actions taken.
        self.sy_logprob_n = self.get_logprob(self.policy_parameters, self.sy_ac_na)

        # add graph for log probability weighted by the advantage
        with tf.variable_scope("log_prob_with_weighted_adv"):
            sy_weighted_logprob_n = tf.multiply(self.sy_logprob_n, self.sy_adv_n)

        # add graph for the loss function.
        # The optimizer minimises, so negate the objective E[log pi(a|s) * A] we want to maximise.
        with tf.variable_scope("loss"):
            self.sy_loss = -tf.reduce_mean(sy_weighted_logprob_n)

        # optimizer.
        self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.sy_loss)

        # in case of nn baseline
        if self.nn_baseline:
            # Squeeze only the trailing unit axis so a batch of one keeps shape (1,).
            self.baseline_predictions = tf.squeeze(
                self.build_mlp(self.sy_ob_no, 1, 'nn_baseline',
                               n_layers=self.n_layers, hidden_size=self.size,
                               activation=tf.nn.relu),
                axis=1)
            self.sy_target_n = tf.placeholder(dtype=tf.float32, shape=[None], name='reward_label')
            self.baseline_loss = tf.losses.mean_squared_error(self.sy_target_n, self.baseline_predictions,
                                                              scope='nn_baseline_loss')
            self.baseline_update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.baseline_loss)

    def init_tf_sess(self):
        """
        initialize tensorflow session and all graph variables
        """
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        self.sess = tf.Session(config=tf_config)
        self.sess.__enter__()  # make this the default session
        # The initializer is an op: it has to be run, not merely created.
        self.sess.run(tf.global_variables_initializer())

    def batchlength(self, path):
        """number of timesteps in a sampled path"""
        return len(path['rewards'])

    def sample_trajectories(self, iteration, env):
        """
        sample a set of trajectories until more than self.timesteps_per_batch
        environment steps have been collected.

        returns:
            paths: list of path dicts (see sample_trajectory)
            timesteps_this_batch: total number of steps across the paths
        """
        paths = []
        timesteps_this_batch = 0
        while True:
            # only the first episode of every 10th iteration is rendered
            animate_this_episode = len(paths) == 0 and (iteration % 10 == 0) and self.animate
            path = self.sample_trajectory(env, animate_this_episode)
            paths.append(path)
            timesteps_this_batch += self.batchlength(path)
            if timesteps_this_batch > self.timesteps_per_batch:
                break
        return paths, timesteps_this_batch

    def sample_trajectory(self, env, animate):
        """
        sample a trajectory/episode from the environment

        returns:
            path: dict with numpy arrays under the keys
                'observation', 'actions' and 'rewards', one entry per timestep.
        """
        ob = env.reset()
        path = dict()
        obs, acs, rewards = [], [], []
        steps = 0
        while True:
            if animate:
                env.render()
                time.sleep(0.01)
            obs.append(ob)
            # The placeholder expects a batch, so add a leading axis and strip it from the result.
            ac = self.sess.run(self.sy_sampled_ac, feed_dict={self.sy_ob_no: np.asarray(ob)[None]})
            ac = ac[0]
            acs.append(ac)
            ob, reward, done, _ = env.step(ac)
            rewards.append(reward)
            steps += 1

            # max_path_length may be None (no explicit cap).
            if done or (self.max_path_length is not None and steps >= self.max_path_length):
                break

        path['observation'] = np.array(obs, dtype=np.float32)
        # continuous actions and rewards are real-valued; casting them to int would truncate.
        path['actions'] = np.array(acs, dtype=np.int32 if self.is_discrete else np.float32)
        path['rewards'] = np.array(rewards, dtype=np.float32)

        return path

    def _discounted_returns(self, re_n):
        """per-timestep discounted return estimates for every path, concatenated into one 1D array"""
        q_per_path = []
        for path_rewards in re_n:
            path_rewards = np.asarray(path_rewards, dtype=np.float64)
            horizon = len(path_rewards)
            if self.reward_to_go:
                # q_t = sum_{t' >= t} gamma^(t' - t) * r_t', accumulated backwards in time
                q_path = np.zeros(horizon, dtype=np.float64)
                running = 0.0
                for t in reversed(range(horizon)):
                    running = path_rewards[t] + self.gamma * running
                    q_path[t] = running
            else:
                # every timestep gets the discounted return of the whole trajectory
                discounts = self.gamma ** np.arange(horizon)
                q_path = np.full(horizon, np.sum(discounts * path_rewards), dtype=np.float64)
            q_per_path.append(q_path)

        if not q_per_path:
            return np.zeros(0, dtype=np.float64)
        return np.concatenate(q_per_path)

    def estimate_returns(self, ob_no, re_n):
        """
        estimate the Q-values and advantages for a batch of sampled paths.

        arguments:
            ob_no: observations of all paths concatenated, shape (sum_of_path_lengths, self.ob_dim).
                Only used when self.nn_baseline is set.
            re_n: list with one reward sequence per path.
        returns:
            q_n: discounted return estimates, shape (sum_of_path_lengths,)
            adv_n: advantages, shape (sum_of_path_lengths,). Equal to q_n minus the
                baseline prediction when self.nn_baseline is set, and standardised to
                zero mean / unit std when self.normalize_advantages is set.
        """
        q_n = self._discounted_returns(re_n)

        if self.nn_baseline and q_n.size:
            b_n = self.sess.run(self.baseline_predictions, feed_dict={self.sy_ob_no: ob_no})
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        if self.normalize_advantages and adv_n.size:
            # small epsilon guards against division by zero when all advantages are equal
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        return q_n, adv_n

In [None]:
def train_PG(exp_name, env_name, n_iter, n_layers, size, learning_rate, batch_size, render,
             reward_to_go, nn_baseline, normalize_advantages, gamma, seed, max_path_length):
    """
    Set up a gym environment and an Agent, then run the policy-gradient sampling loop.

    arguments:
        exp_name: experiment label (not used in the part of the loop shown here)
        env_name: id passed to gym.make
        n_iter: number of sampling iterations
        n_layers, size, learning_rate: network depth, hidden width and Adam step size
        batch_size: minimum number of environment steps collected per iteration
        render: whether the agent may render episodes
        reward_to_go, nn_baseline, normalize_advantages, gamma: return-estimation settings
        seed: seed applied to tensorflow, numpy and the environment
        max_path_length: per-episode step cap; falls back to env.spec.max_episode_steps
            when falsy
    """
    env = gym.make(env_name)

    # set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    is_discrete = isinstance(env.action_space, gym.spaces.Discrete)

    obs_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if is_discrete else env.action_space.shape[0]

    max_path_length = max_path_length if max_path_length else env.spec.max_episode_steps

    # initializing the parameters for the agent
    computation_graph_params = {
        'n_layers': n_layers,
        'size': size,
        'learning_rate': learning_rate,
        'ob_dim': obs_dim,
        'ac_dim': ac_dim,
        'discrete': is_discrete,
    }

    sample_trajectory_params = {
        'timesteps_per_batch': batch_size,
        'max_path_length': max_path_length,
        'animate': render
    }

    estimate_return_params = {
        'gamma': gamma,
        'nn_baseline': nn_baseline,
        'reward_to_go': reward_to_go,
        'normalize_advantages': normalize_advantages
    }

    # initialize the agent
    agent = Agent(computation_graph_params, sample_trajectory_params, estimate_return_params)

    # build the computation graph of the agent training.
    agent.build_computation_graph()

    # init tensorflow session.
    agent.init_tf_sess()

    # running count of environment steps across all iterations
    total_timesteps = 0

    for iteration in range(n_iter):
        print("************* Iteration %d ***************" % iteration)
        paths, timesteps_this_batch = agent.sample_trajectories(iteration, env)
        total_timesteps += timesteps_this_batch

        # Concatenate observations and actions of all paths along the time axis.
        # Rewards are kept per path: they are immediate rewards, and the return
        # estimate has to be computed for each episode separately.
        # The keys match what Agent.sample_trajectory stores ('actions', 'observation', 'rewards').
        ac_na = np.concatenate([path['actions'] for path in paths])
        ob_no = np.concatenate([path['observation'] for path in paths])
        re_n = [path['rewards'] for path in paths]  # not stacked yet.

        # calculate q_n and adv_n for each state and action. Here we stack it into a single 1D array
        q_n, adv_n = agent.estimate_returns(ob_no, re_n)