diff --git a/baselines/gail/trpo_mpi.py b/baselines/gail/trpo_mpi.py index 57c28bdb0a..6b973756bd 100644 --- a/baselines/gail/trpo_mpi.py +++ b/baselines/gail/trpo_mpi.py @@ -146,8 +146,9 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank, dist = meankl all_var_list = pi.get_trainable_variables() - var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] - vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] + var_list = [v for v in all_var_list if v.name.split("/")[1] == "pol"] + vf_var_list = [v for v in all_var_list if v.name.split("/")[1] == "vf"] + assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) diff --git a/baselines/ppo1/mlp_policy.py b/baselines/ppo1/mlp_policy.py index e03284a03b..7f979b3495 100644 --- a/baselines/ppo1/mlp_policy.py +++ b/baselines/ppo1/mlp_policy.py @@ -22,21 +22,23 @@ def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) - obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) - last_out = obz - for i in range(num_hid_layers): - last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="vffc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(last_out, 1, name='vffinal', kernel_initializer=U.normc_initializer(1.0))[:,0] - - last_out = obz - for i in range(num_hid_layers): - last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='polfc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) - if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): - mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='polfinal', kernel_initializer=U.normc_initializer(0.01)) - logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) - pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) - else: - pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='polfinal', kernel_initializer=U.normc_initializer(0.01)) + with tf.variable_scope('vf'): + obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) + last_out = obz + for i in range(num_hid_layers): + last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))) + self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0] + + with tf.variable_scope('pol'): + last_out = obz + for i in range(num_hid_layers): + last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) + if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): + mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) + logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) + pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) + else: + pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam)