In [1]:
import logging

import torch
from torch import nn

import pfrl

import gym
import gym.spaces
import gym.wrappers

In [2]:
import robosuite as suite
from robosuite.wrappers import GymWrapper

In [3]:
### Create the Sawyer "Lift" task and expose it through the gym-style wrapper
env = GymWrapper(
    suite.make(
        "Lift",
        robots="Sawyer",               # use the Sawyer robot
        control_freq=20,               # control fast enough that the simulation looks smooth
        reward_shaping=True,           # dense (shaped) rewards
        use_camera_obs=False,          # no pixel observations
        has_renderer=False,            # on-screen rendering is switched off
        has_offscreen_renderer=False,  # not needed since pixel observations are off
    )
)

adding key: robot0_robot-state
adding key: object-state




In [4]:
# Reference configuration:
# https://github.com/ARISE-Initiative/robosuite-benchmark/blob/master/runs/Lift-Sawyer-OSC-POSE-SEED129/Lift_Sawyer_OSC_POSE_SEED129_2020_09_21_20_07_20_0000--s-0/variant.json
### args setup

# --- output / logging ---
outdir = "results"
log_level = logging.INFO

# --- training budget and evaluation schedule ---
steps = 2_000_000       # total step budget handed to the training call
eval_interval = 800     # steps / 2500 (the original note calls 2500 num_eval_steps_per_epoch)
eval_n_runs = 1         # evaluation episodes per evaluation phase
offset = 3300           # passed as step_offset to the training call
trpo_update_interval = 5000

# --- reproducibility ---
seed = 0

# --- run-mode flags ---
# NOTE(review): none of the flags below are read by the visible cells - confirm they are still needed.
render = False
demo = True
load_pretrained = False
pretrained_type = "best"

In [5]:
## initial setup
# Configure the root logger at the level chosen in the args cell.
logging.basicConfig(level=log_level)
# Seed the random number generators through PFRL's helper.
pfrl.utils.set_random_seed(seed)
# NOTE(review): `outdir` is rebound from its own previous value, so re-running
# this cell without re-running the args cell feeds the already-prepared path
# back in as the base directory - confirm whether that is acceptable.
outdir = pfrl.experiments.prepare_output_dir(None, outdir)

In [6]:
def make_env(env, seed, outdir, test=False, monitor=False, render=False):
    """Seed ``env`` and wrap it for use with PFRL.

    Args:
        env: Environment to seed and wrap; it is seeded in place via ``env.seed``.
        seed: Base random seed.
        outdir: Directory handed to ``gym.wrappers.Monitor`` when ``monitor`` is set.
        test: If true, seed with ``2 ** 32 - 1 - seed`` instead of ``seed`` so
            that train and test environments use different seeds.
        monitor: If true, wrap with ``gym.wrappers.Monitor``.
        render: If true, wrap with ``pfrl.wrappers.Render``.

    Returns:
        The wrapped environment.
    """
    # Train and test environments get different random seeds.
    if test:
        env_seed = 2 ** 32 - 1 - seed
    else:
        env_seed = seed
    env.seed(env_seed)

    # The model uses float32, so observations are cast to match.
    wrapped = pfrl.wrappers.CastObservationToFloat32(env)
    if monitor:
        wrapped = gym.wrappers.Monitor(wrapped, outdir)
    if render:
        wrapped = pfrl.wrappers.Render(wrapped)
    return wrapped

In [7]:
# Training environment: seeded with `seed`, observations cast to float32.
sawyer_env = make_env(env=env, seed=seed, outdir=outdir)

In [8]:
# Episode length cap, hard-coded rather than read from sawyer_env.spec.max_episode_steps.
timestep_limit = 500
obs_space, action_space = sawyer_env.observation_space, sawyer_env.action_space
print(f"Observation space: {obs_space}")
print(f"Action space: {action_space}")

Observation space: Box(-inf, inf, (42,), float32)
Action space: Box(-1.0, 1.0, (8,), float32)


In [9]:
# Sanity check: the model-building cell reads `obs_space.low`, so require a Box space.
assert isinstance(obs_space, gym.spaces.Box)

In [10]:
# Flattened sizes of the observation and action vectors.
obs_size = obs_space.low.size
action_size = action_space.low.size

# Observations are normalized with their empirical mean and variance.
obs_normalizer = pfrl.nn.EmpiricalNormalization(obs_size, clip_threshold=5)

# Policy: two tanh hidden layers of 64 units feeding a Gaussian head whose
# diagonal covariance does not depend on the state.
policy = nn.Sequential(
    nn.Linear(obs_size, 64),
    nn.Tanh(),
    nn.Linear(64, 64),
    nn.Tanh(),
    nn.Linear(64, action_size),
    pfrl.policies.GaussianHeadWithStateIndependentCovariance(
        action_size=action_size,
        var_type="diagonal",
        # The learned parameter is the log std, so variance = exp(2 * log_std).
        var_func=lambda log_std: torch.exp(2 * log_std),
        # log std = 0  =>  std = 1
        var_param_init=0,
    ),
)

  torch.tensor(np.broadcast_to(var_param_init, var_size), dtype=torch.float,)


In [11]:
# Value function: two tanh hidden layers of 64 units and a single scalar output.
vf = nn.Sequential(
    nn.Linear(obs_size, 64),
    nn.Tanh(),
    nn.Linear(64, 64),
    nn.Tanh(),
    nn.Linear(64, 1),
)

In [12]:
# The original paper drew initial weights from a normal distribution; like the
# latest openai/baselines, orthogonal initialization is used here instead.
def ortho_init(layer, gain):
    """Initialize ``layer`` in place: orthogonal weights, zero bias.

    Args:
        layer: Module exposing ``weight`` and ``bias`` tensors (e.g. ``nn.Linear``).
        gain: Scaling factor forwarded to ``nn.init.orthogonal_``.
    """
    weight, bias = layer.weight, layer.bias
    nn.init.orthogonal_(weight, gain=gain)
    nn.init.zeros_(bias)

# Both networks share the same layout: Linear layers sit at indices 0, 2 and 4.
# Hidden layers use gain 1; the final Linear layer uses a small gain (1e-2).
# The policy is initialized before the value function, layer by layer.
for network in (policy, vf):
    for layer_index, layer_gain in ((0, 1), (2, 1), (4, 1e-2)):
        ortho_init(network[layer_index], gain=layer_gain)

In [13]:
# Only the value function gets an Optimizer: TRPO updates the policy with
# conjugate gradient plus a line search rather than gradient steps.
vf_opt = torch.optim.Adam(params=vf.parameters())

In [14]:
 # Hyperparameters in http://arxiv.org/abs/1709.06560
agent = pfrl.agents.TRPO(
    policy=policy,
    vf=vf,
    vf_optimizer=vf_opt,
    obs_normalizer=obs_normalizer,
    # Use the first CUDA device when one is available; otherwise fall back to
    # the CPU (-1 under PFRL's gpu-id convention) so the notebook still runs
    # on machines without a GPU instead of failing at agent construction.
    gpu=0 if torch.cuda.is_available() else -1,
    # Number of environment steps collected between policy updates.
    update_interval=trpo_update_interval,
    max_kl=0.01,
    conjugate_gradient_max_iter=20,
    conjugate_gradient_damping=1e-1,
    gamma=0.995,
    lambd=0.97,
    vf_epochs=5,
    entropy_coef=0,
)

In [None]:
# Run training with periodic evaluation.
# NOTE(review): the evaluation env wraps the same `env` object as `sawyer_env`
# and is built with test=False, so it does not get the separate test seed that
# make_env offers - confirm this is intended.
pfrl.experiments.train_agent_with_evaluation(
    agent=agent,
    env=sawyer_env,
    eval_env=make_env(env, seed, outdir),
    outdir=outdir,
    # Step budget; counting starts from `offset`.
    steps=steps,
    step_offset=offset,
    train_max_episode_len=timestep_limit,
    # Evaluate by episode count, not step count.
    eval_n_steps=None,
    eval_n_episodes=eval_n_runs,
    eval_interval=eval_interval,
)


INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:3800 episode:0 R:0.1665350169807385
INFO:pfrl.experiments.train_agent:statistics:[('average_value', -0.0005691441), ('average_entropy', 11.351507), ('average_kl', nan), ('average_policy_step_size', nan), ('explained_variance', nan)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:4300 episode:1 R:1.3981484230507155
INFO:pfrl.experiments.train_agent:statistics:[('average_value', -0.001488245), ('average_entropy', 11.351506), ('average_kl', nan), ('average_policy_step_size', nan), ('explained_variance', nan)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:0.5202092157166793
INFO:pfrl.experiments.train_agent:The best score is updated -3.4028235e+38 -> 0.5202092157166793
INFO:pfrl.experiments.train_agent:Saved the agent to results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f\best


INFO:pfrl.experiments.train_agent:statistics:[('average_value', 0.021015633), ('average_entropy', 11.329737), ('average_kl', 0.007035945542156696), ('average_policy_step_size', 1.0), ('explained_variance', 0.004829827545341003)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:6.405681833749427
INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.029495913928258233
INFO:pfrl.agents.trpo:KL divergence: 0.0075407009571790695
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:13300 episode:19 R:0.26763957566655155
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 0.04738365), ('average_entropy', 11.329737), ('average_kl', 0.007288323249667883), ('average_policy_step_size', 1.0), ('explained_variance', 0.48417587702128406)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:21800 episode:36 R:3.4203400067937526
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 0.60220987), ('average_entropy', 11.315655), ('average_kl', 0.0069348242444296675), ('average_policy_step_size', 1.0), ('explained_variance', 0.10845825688815303)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:3.3434105901788147
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:22300 episode:37 R:14.72092137877535
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 1.3718466), ('average_entropy', 11.315655), ('average_kl', 0.0069348242444296675), ('average_policy_step_size', 1.0), ('explained_variance', 0.10845825688815303)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:22800 episode:38 R:0.20060401139034648
INFO:

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:30800 episode:54 R:1.5170580714326287
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 0.96609837), ('average_entropy', 11.336156), ('average_kl', 0.00693213315680623), ('average_policy_step_size', 1.0), ('explained_variance', 0.534629162158107)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:3.9412698947578653
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:31300 episode:55 R:20.350485803914324
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 1.1313936), ('average_entropy', 11.336156), ('average_kl', 0.00693213315680623), ('average_policy_step_size', 1.0), ('explained_variance', 0.534629162158107)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:4.118635836844037
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398

INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:27.665168387860867
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:39800 episode:72 R:45.84642543312321
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 4.832034), ('average_entropy', 11.359972), ('average_kl', 0.006923716781394822), ('average_policy_step_size', 1.0), ('explained_variance', 0.6140310255805482)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:40300 episode:73 R:27.863730329966938
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 4.710304), ('average_entropy', 11.359972), ('average_kl', 0.006923716781394822), ('average_policy_step_size', 1.0), ('explained_variance', 0.6140310255805482)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:37.221246320475025
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa39

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 3.773225), ('average_entropy', 11.356841), ('average_kl', 0.0069838992526961696), ('average_policy_step_size', 1.0), ('explained_variance', 0.6999097937948486)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:42.953174820201895
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:48800 episode:90 R:40.63603520637918
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 4.5992827), ('average_entropy', 11.350135), ('average_kl', 0.0069838992526961696), ('average_policy_step_size', 1.0), ('explained_variance', 0.6999097937948486)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:13.765523970199336
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:49300 episode:91 R:27.263277031219367
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 3.7516992)

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 5.6775155), ('average_entropy', 11.346522), ('average_kl', 0.00694857188500464), ('average_policy_step_size', 1.0), ('explained_variance', 0.790246209624362)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:57800 episode:108 R:15.278135173971068
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 6.0404873), ('average_entropy', 11.346522), ('average_kl', 0.00694857188500464), ('average_policy_step_size', 1.0), ('explained_variance', 0.790246209624362)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:28.898447136811015
INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.0277934325276874
INFO:pfrl.agents.trpo:KL divergence: 0.006609225180000067
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:58300 episod

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 9.363654), ('average_entropy', 11.394568), ('average_kl', 0.006903950745860736), ('average_policy_step_size', 1.0), ('explained_variance', 0.8197901103843979)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:66800 episode:126 R:36.228266951500416
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 11.539191), ('average_entropy', 11.394568), ('average_kl', 0.006903950745860736), ('average_policy_step_size', 1.0), ('explained_variance', 0.8197901103843979)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:42.85644775158427
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:67300 episode:127 R:31.193353422780078
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 8.816221), ('average_entropy', 11.394568), ('average_kl', 0.006903950745860736), ('average_policy

INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:81.51091369339848
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:75800 episode:144 R:57.97770031126254
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 12.830227), ('average_entropy', 11.386721), ('average_kl', 0.006855484796687961), ('average_policy_step_size', 1.0), ('explained_variance', 0.7977323406321339)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:76300 episode:145 R:33.36135213331126
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 12.626918), ('average_entropy', 11.386721), ('average_kl', 0.006855484796687961), ('average_policy_step_size', 1.0), ('explained_variance', 0.7977323406321339)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:59.94742159810845
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa3

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 15.712686), ('average_entropy', 11.394789), ('average_kl', 0.006843853712780401), ('average_policy_step_size', 1.0), ('explained_variance', 0.7879667104178821)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:84.2161986348784
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:85300 episode:163 R:118.91329366547919
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 15.950294), ('average_entropy', 11.394789), ('average_kl', 0.006843853712780401), ('average_policy_step_size', 1.0), ('explained_variance', 0.7879667104178821)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:85800 episode:164 R:87.45054732213832
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 17.923351), ('average_entropy', 11.394789), ('average_kl', 0.006843853712780401), ('average_policy

INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:49.67533011292653
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:94300 episode:181 R:67.17922537403244
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 19.187193), ('average_entropy', 11.359137), ('average_kl', 0.006802211489735378), ('average_policy_step_size', 1.0), ('explained_variance', 0.8529337306511491)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:94800 episode:182 R:60.50260160605654
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 18.98168), ('average_entropy', 11.359137), ('average_kl', 0.006802211489735378), ('average_policy_step_size', 1.0), ('explained_variance', 0.8529337306511491)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:42.07051398109645
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa39

INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.02999834856018424
INFO:pfrl.agents.trpo:KL divergence: 0.00639085890725255
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:103300 episode:199 R:74.80282168925577
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 22.890085), ('average_entropy', 11.3571005), ('average_kl', 0.006769008445553481), ('average_policy_step_size', 1.0), ('explained_variance', 0.865154079139719)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:64.03450354421446
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:103800 episode:200 R:82.26931001909831
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 21.186441), ('average_entropy', 11.347249), ('average_kl', 0.006769008445553481), ('average_policy_step_size', 1.0), ('explained_va

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 24.563211), ('average_entropy', 11.32475), ('average_kl', 0.006746281781012104), ('average_policy_step_size', 1.0), ('explained_variance', 0.8941282122824623)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:124.17354143808488
INFO:pfrl.experiments.train_agent:The best score is updated 114.62440155536989 -> 124.17354143808488
INFO:pfrl.experiments.train_agent:Saved the agent to results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f\best
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:112800 episode:218 R:56.65043448180905
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 19.852331), ('average_entropy', 11.32475), ('average_kl', 0.006746281781012104), ('average_policy_step_size', 1.0), ('explained_variance', 0.8941282122824623)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:66.21564708149704
I

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:121300 episode:235 R:77.45328594633835
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 23.131863), ('average_entropy', 11.349364), ('average_kl', 0.006716285933456991), ('average_policy_step_size', 1.0), ('explained_variance', 0.8035879869747498)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:121800 episode:236 R:61.21743467409283
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 21.314264), ('average_entropy', 11.349364), ('average_kl', 0.006716285933456991), ('average_policy_step_size', 1.0), ('explained_variance', 0.8035879869747498)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:55.55503318880764
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:122300 episode:237 R:82.11675811365413
INFO:pfr

INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:106.62895374112259
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:130300 episode:253 R:69.1697116932717
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 24.98907), ('average_entropy', 11.324143), ('average_kl', 0.006686082575470209), ('average_policy_step_size', 1.0), ('explained_variance', 0.8392840312105586)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:130800 episode:254 R:54.925555144061434
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 23.835426), ('average_entropy', 11.324143), ('average_kl', 0.006686082575470209), ('average_policy_step_size', 1.0), ('explained_variance', 0.8392840312105586)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:46.77352707174164
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581a

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:139300 episode:271 R:86.3606377513315
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 26.626234), ('average_entropy', 11.356349), ('average_kl', 0.00665432200939567), ('average_policy_step_size', 1.0), ('explained_variance', 0.8975352004943064)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:90.7687362526333
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:139800 episode:272 R:138.15157563360108
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 26.215687), ('average_entropy', 11.356349), ('average_kl', 0.00665432200939567), ('average_policy_step_size', 1.0), ('explained_variance', 0.8975352004943064)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:140300 episode:273 R:112.1252188154855
INFO:pfrl.e

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 29.058926), ('average_entropy', 11.325395), ('average_kl', 0.006625752815784052), ('average_policy_step_size', 1.0), ('explained_variance', 0.9227890081312357)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:96.13232857313328
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:148800 episode:290 R:108.32098392705318
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 28.366337), ('average_entropy', 11.314224), ('average_kl', 0.006625752815784052), ('average_policy_step_size', 1.0), ('explained_variance', 0.9227890081312357)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:135.71661312251356
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:149300 episode:291 R:102.68723587656854
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 30.5092

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:157800 episode:308 R:119.08948274308185
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 30.303005), ('average_entropy', 11.295445), ('average_kl', 0.006558751470098893), ('average_policy_step_size', 1.0), ('explained_variance', 0.8326870913431812)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:114.1392169073829
INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.03006012024707161
INFO:pfrl.agents.trpo:KL divergence: 0.006494430359452963
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:158300 episode:309 R:113.87855377776877
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 28.66399), ('average_entropy', 11.295445), ('average_kl', 0.006556676595561927), ('average_policy_step_size', 1.0), ('explained_

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 34.048733), ('average_entropy', 11.321231), ('average_kl', 0.006548305143951438), ('average_policy_step_size', 1.0), ('explained_variance', 0.8829260126155334)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:57.02688667113497
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:167300 episode:327 R:114.42400386835541
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 30.490555), ('average_entropy', 11.321231), ('average_kl', 0.006548305143951438), ('average_policy_step_size', 1.0), ('explained_variance', 0.8829260126155334)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:111.82493849747526
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:167800 episode:328 R:102.07823739870342
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 33.4520

INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:103.12714290615563
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:175800 episode:344 R:102.45719261959077
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 33.548737), ('average_entropy', 11.286119), ('average_kl', 0.006518987048526897), ('average_policy_step_size', 1.0), ('explained_variance', 0.8458895427629602)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:176300 episode:345 R:112.38240972146957
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 32.739197), ('average_entropy', 11.286119), ('average_kl', 0.006518987048526897), ('average_policy_step_size', 1.0), ('explained_variance', 0.8458895427629602)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:82.61458240158963
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c5

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:184800 episode:362 R:119.3023533529984
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 37.940475), ('average_entropy', 11.2590275), ('average_kl', 0.0065008771554049515), ('average_policy_step_size', 1.0), ('explained_variance', 0.8293548154093593)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:126.3437913382164
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:185300 episode:363 R:109.61722197094515
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 37.49182), ('average_entropy', 11.2590275), ('average_kl', 0.0065008771554049515), ('average_policy_step_size', 1.0), ('explained_variance', 0.8293548154093593)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:185800 episode:364 R:102.74161195728233
INF

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 38.170162), ('average_entropy', 11.279526), ('average_kl', 0.006482196199756704), ('average_policy_step_size', 1.0), ('explained_variance', 0.7348946986526906)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:109.26600433166323
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:194300 episode:381 R:126.65473119473413
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 37.848392), ('average_entropy', 11.295655), ('average_kl', 0.006482196199756704), ('average_policy_step_size', 1.0), ('explained_variance', 0.7348946986526906)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:194800 episode:382 R:85.5417224343186
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 35.745975), ('average_entropy', 11.295655), ('average_kl', 0.006482196199756704), ('average_pol

INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.028116692090407014
INFO:pfrl.agents.trpo:KL divergence: 0.0052462294697761536
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:203300 episode:399 R:136.29600530247578
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 36.702545), ('average_entropy', 11.268663), ('average_kl', 0.006443396396934986), ('average_policy_step_size', 1.0), ('explained_variance', 0.8429111349830336)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:103.76551939479377
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:203800 episode:400 R:124.50794922699427
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 40.113956), ('average_entropy', 11.267175), ('average_kl', 0.006443396396934986), ('average_policy_step_size', 1.0), ('explai

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 41.242237), ('average_entropy', 11.28354), ('average_kl', 0.006441730403954663), ('average_policy_step_size', 1.0), ('explained_variance', 0.877006041962669)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:129.26001234827459
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:212800 episode:418 R:91.60765278584716
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 42.519), ('average_entropy', 11.28354), ('average_kl', 0.006441730403954663), ('average_policy_step_size', 1.0), ('explained_variance', 0.877006041962669)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:112.61111726493016
INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.036623685533413664
INFO:pfrl.agents.trpo:KL divergence: 0.004804680589586496
INFO:pfrl.experiments.train_agent:outdi

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 45.022007), ('average_entropy', 11.283391), ('average_kl', 0.006386863327650137), ('average_policy_step_size', 1.0), ('explained_variance', 0.7864979487778413)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:221800 episode:436 R:159.6025090475832
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 45.385338), ('average_entropy', 11.283391), ('average_kl', 0.006386863327650137), ('average_policy_step_size', 1.0), ('explained_variance', 0.7864979487778413)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:118.3893639309895
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:222300 episode:437 R:95.13154554528572
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 37.924416), ('average_entropy', 11.283391), ('average_kl', 0.006386863327650137), ('average_poli

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:230800 episode:454 R:105.87839343016181
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 40.460957), ('average_entropy', 11.291979), ('average_kl', 0.0063462459896173745), ('average_policy_step_size', 1.0), ('explained_variance', 0.839609662370181)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:126.12896804162149
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:231300 episode:455 R:121.32289952777731
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 45.274437), ('average_entropy', 11.291979), ('average_kl', 0.0063462459896173745), ('average_policy_step_size', 1.0), ('explained_variance', 0.839609662370181)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:131.54678815800068
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:239800 episode:472 R:139.17432428418354
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 46.3422), ('average_entropy', 11.300409), ('average_kl', 0.006326384961288026), ('average_policy_step_size', 1.0), ('explained_variance', 0.7540694878090096)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:240300 episode:473 R:132.18062037092002
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 47.57417), ('average_entropy', 11.300409), ('average_kl', 0.006326384961288026), ('average_policy_step_size', 1.0), ('explained_variance', 0.7540694878090096)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:116.71419191509523
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:240800 episode:474 R:135.53288349273754
INFO:pf

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 43.632378), ('average_entropy', 11.2799835), ('average_kl', 0.006328194848812965), ('average_policy_step_size', 1.0), ('explained_variance', 0.8628292486454274)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:140.06173139515099
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:249300 episode:491 R:121.34996646675822
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 45.232445), ('average_entropy', 11.266631), ('average_kl', 0.006328194848812965), ('average_policy_step_size', 1.0), ('explained_variance', 0.8628292486454274)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:249800 episode:492 R:124.5656045922435
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 48.451588), ('average_entropy', 11.266631), ('average_kl', 0.006328194848812965), ('average_p

INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.02952539840771351
INFO:pfrl.agents.trpo:KL divergence: 0.006422804668545723
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:258300 episode:509 R:153.41016674750847
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 48.807255), ('average_entropy', 11.258682), ('average_kl', 0.00632261892086735), ('average_policy_step_size', 1.0), ('explained_variance', 0.7890972424436757)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:258800 episode:510 R:142.68793915441358
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.1716), ('average_entropy', 11.266164), ('average_kl', 0.00632261892086735), ('average_policy_step_size', 1.0), ('explained_variance', 0.7890972424436757)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 le

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 48.45399), ('average_entropy', 11.267813), ('average_kl', 0.006329871785755341), ('average_policy_step_size', 1.0), ('explained_variance', 0.8375570620884712)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:122.45383240730328
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:267800 episode:528 R:148.92597373928805
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 47.873985), ('average_entropy', 11.267813), ('average_kl', 0.006329871785755341), ('average_policy_step_size', 1.0), ('explained_variance', 0.8375570620884712)]
INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.026950978266540915
INFO:pfrl.agents.trpo:KL divergence: 0.0058974046260118484
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:26830

INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:123.16631888534049
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:276800 episode:546 R:143.4167476809255
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.063026), ('average_entropy', 11.202894), ('average_kl', 0.006326450862818294), ('average_policy_step_size', 1.0), ('explained_variance', 0.8005604345422954)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:143.66607361660016
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:277300 episode:547 R:136.8269961670978
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.858788), ('average_entropy', 11.202894), ('average_kl', 0.006326450862818294), ('average_policy_step_size', 1.0), ('explained_variance', 0.8005604345422954)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c58

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:285800 episode:564 R:157.32227356257056
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 46.866695), ('average_entropy', 11.19156), ('average_kl', 0.0063181631932301184), ('average_policy_step_size', 1.0), ('explained_variance', 0.8410306240742254)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:132.48564003463198
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:286300 episode:565 R:149.7385942548888
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.321465), ('average_entropy', 11.19156), ('average_kl', 0.0063181631932301184), ('average_policy_step_size', 1.0), ('explained_variance', 0.8410306240742254)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:286800 episode:566 R:141.81359378159618
INFO:

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 49.568874), ('average_entropy', 11.184159), ('average_kl', 0.0062988603262808815), ('average_policy_step_size', 1.0), ('explained_variance', 0.8481620279543133)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:136.38835058866047
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:295300 episode:583 R:145.031470118545
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 51.76904), ('average_entropy', 11.184159), ('average_kl', 0.0062988603262808815), ('average_policy_step_size', 1.0), ('explained_variance', 0.8481620279543133)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:83.10419763506322
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:295800 episode:584 R:125.49229735928274
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 52.74623

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 53.222454), ('average_entropy', 11.1765175), ('average_kl', 0.006306017345438401), ('average_policy_step_size', 1.0), ('explained_variance', 0.5955015601737137)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:304300 episode:601 R:149.37888025185921
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 52.261955), ('average_entropy', 11.164412), ('average_kl', 0.006306017345438401), ('average_policy_step_size', 1.0), ('explained_variance', 0.5955015601737137)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:192.4692791486627
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:304800 episode:602 R:144.23587633229366
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 53.23564), ('average_entropy', 11.164412), ('average_kl', 0.006306017345438401), ('average_po

INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.029656778322532773
INFO:pfrl.agents.trpo:KL divergence: 0.004275864455848932
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:313300 episode:619 R:142.85601314583536
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 51.87504), ('average_entropy', 11.153686), ('average_kl', 0.006265857609950246), ('average_policy_step_size', 1.0), ('explained_variance', 0.5372958911016726)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:313800 episode:620 R:76.60682699122579
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 48.732876), ('average_entropy', 11.154977), ('average_kl', 0.006265857609950246), ('average_policy_step_size', 1.0), ('explained_variance', 0.5372958911016726)]
INFO:pfrl.experiments.train_agent:evaluation episode 0

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.267), ('average_entropy', 11.138747), ('average_kl', 0.006266462269224345), ('average_policy_step_size', 1.0), ('explained_variance', 0.8756834689278593)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:322800 episode:638 R:158.2376273350518
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 51.412495), ('average_entropy', 11.138747), ('average_kl', 0.006266462269224345), ('average_policy_step_size', 1.0), ('explained_variance', 0.8756834689278593)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:98.79243151696978
INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.031533102825051174
INFO:pfrl.agents.trpo:KL divergence: 0.006160613149404526
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:323300 epi

INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:139.99463298487186
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:331800 episode:656 R:149.68034738378114
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 54.01373), ('average_entropy', 11.039607), ('average_kl', 0.006264814318945775), ('average_policy_step_size', 1.0), ('explained_variance', 0.8491061042776469)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:332300 episode:657 R:132.64522265703664
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.89795), ('average_entropy', 11.039607), ('average_kl', 0.006264814318945775), ('average_policy_step_size', 1.0), ('explained_variance', 0.8491061042776469)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:149.8858349348007
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581

INFO:pfrl.experiments.train_agent:The best score is updated 196.78586875202896 -> 207.06279099377477
INFO:pfrl.experiments.train_agent:Saved the agent to results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f\best
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:340800 episode:674 R:124.34142887995202
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 49.631092), ('average_entropy', 11.031291), ('average_kl', 0.006266954452244204), ('average_policy_step_size', 1.0), ('explained_variance', 0.868948426530408)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:135.1601091432284
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:341300 episode:675 R:76.69810499227839
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 43.882), ('average_entropy', 11.031291), ('average_kl', 0.006266954452244204), ('average_policy

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:349800 episode:692 R:137.18226363591216
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 51.90579), ('average_entropy', 11.007437), ('average_kl', 0.0062395565252265205), ('average_policy_step_size', 1.0), ('explained_variance', 0.9060922571602279)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:119.02519444070195
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:350300 episode:693 R:113.38548981524839
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 51.821167), ('average_entropy', 11.007437), ('average_kl', 0.0062395565252265205), ('average_policy_step_size', 1.0), ('explained_variance', 0.9060922571602279)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:350800 episode:694 R:132.45929855890674
INF

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 54.475643), ('average_entropy', 10.972481), ('average_kl', 0.006246320122826687), ('average_policy_step_size', 1.0), ('explained_variance', 0.8222206411294609)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:160.3900580014953
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:359300 episode:711 R:145.65742227531683
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 55.182518), ('average_entropy', 10.950815), ('average_kl', 0.006246320122826687), ('average_policy_step_size', 1.0), ('explained_variance', 0.8222206411294609)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:143.52342265506505
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:359800 episode:712 R:131.37557437955346
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 53.5675

INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.03446093856473453
INFO:pfrl.agents.trpo:KL divergence: 0.005964208859950304
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:368300 episode:729 R:153.17782990965
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 52.970287), ('average_entropy', 10.957845), ('average_kl', 0.006243935007959196), ('average_policy_step_size', 1.0), ('explained_variance', 0.7892975030562925)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:154.31091547577654
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:368800 episode:730 R:47.12679263194983
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 36.486176), ('average_entropy', 10.952879), ('average_kl', 0.006243935007959196), ('average_policy_step_size', 1.0), ('explained_va

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:377300 episode:747 R:161.63015008438617
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 52.728912), ('average_entropy', 10.951275), ('average_kl', 0.006238444632775075), ('average_policy_step_size', 1.0), ('explained_variance', 0.9585789605165306)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:377800 episode:748 R:136.64267630368482
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 54.14512), ('average_entropy', 10.951275), ('average_kl', 0.006238444632775075), ('average_policy_step_size', 1.0), ('explained_variance', 0.9585789605165306)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:156.79070156377844
INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.031230800683260895
INFO:pfrl.agents.trpo:KL

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 51.258778), ('average_entropy', 10.945177), ('average_kl', 0.006239177389560561), ('average_policy_step_size', 1.0), ('explained_variance', 0.8523254857203315)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:386800 episode:766 R:158.16913827468002
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.66243), ('average_entropy', 10.945177), ('average_kl', 0.006239177389560561), ('average_policy_step_size', 1.0), ('explained_variance', 0.8523254857203315)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:145.56943415285076
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:387300 episode:767 R:150.06221618644244
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 55.295773), ('average_entropy', 10.945177), ('average_kl', 0.006239177389560561), ('average_po

INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:148.39730009671342
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:395800 episode:784 R:154.50853433606352
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.50222), ('average_entropy', 10.944543), ('average_kl', 0.006207268816442826), ('average_policy_step_size', 1.0), ('explained_variance', 0.8071264717138438)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:396300 episode:785 R:152.2732688548364
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 50.79643), ('average_entropy', 10.944543), ('average_kl', 0.006207268816442826), ('average_policy_step_size', 1.0), ('explained_variance', 0.8071264717138438)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:144.00795878035314
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581

INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:404800 episode:802 R:142.30380089855583
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 56.204327), ('average_entropy', 10.9242115), ('average_kl', 0.006194862991105765), ('average_policy_step_size', 1.0), ('explained_variance', 0.7214885862943324)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:139.85047291773145
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:405300 episode:803 R:143.21716827210415
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 56.7736), ('average_entropy', 10.9242115), ('average_kl', 0.006194862991105765), ('average_policy_step_size', 1.0), ('explained_variance', 0.7214885862943324)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:405800 episode:804 R:127.19913625840488
INFO

INFO:pfrl.experiments.train_agent:statistics:[('average_value', 55.003326), ('average_entropy', 10.925545), ('average_kl', 0.006191581061745926), ('average_policy_step_size', 1.0), ('explained_variance', 0.9765327351001888)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:125.55316825972746
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:414300 episode:821 R:145.61209952275001
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 54.930866), ('average_entropy', 10.917374), ('average_kl', 0.006191581061745926), ('average_policy_step_size', 1.0), ('explained_variance', 0.9765327351001888)]
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:414800 episode:822 R:143.13789185710803
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 56.240807), ('average_entropy', 10.917374), ('average_kl', 0.006191581061745926), ('average_p

INFO:pfrl.agents.trpo:Line search iteration: 0 step size: 1.0
INFO:pfrl.agents.trpo:Surrogate objective improve: 0.04205893170001218
INFO:pfrl.agents.trpo:KL divergence: 0.005277795251458883
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:423300 episode:839 R:59.84289169641732
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 40.80189), ('average_entropy', 10.9101305), ('average_kl', 0.006178876750969461), ('average_policy_step_size', 1.0), ('explained_variance', 0.9307349164506944)]
INFO:pfrl.experiments.train_agent:evaluation episode 0 length:500 R:134.2821898578202
INFO:pfrl.experiments.train_agent:outdir:results\74b7ccfb1886c5e2ff20c581aa398dfd7298d1cf-00000000-1554415f step:423800 episode:840 R:82.55282263693898
INFO:pfrl.experiments.train_agent:statistics:[('average_value', 30.373318), ('average_entropy', 10.91127), ('average_kl', 0.006178876750969461), ('average_policy_step_size', 1.0), ('explained_va

In [None]:
# IPython help query: the trailing `?` asks IPython to display the documentation
# of pfrl.experiments.train_agent_with_evaluation instead of calling it.
# This is IPython-only syntax (not valid plain Python), so keep the `?` as the
# last character of its line -- do not append an inline comment after it.
# NOTE(review): looks like a leftover interactive lookup; consider deleting this
# cell before sharing the notebook.
pfrl.experiments.train_agent_with_evaluation?