In [1]:
import coax
import gym
import haiku as hk
import jax
import jax.numpy as jnp
import optax
from coax.value_losses import mse



# the name of this script
name = 'a2c'

# the cart-pole MDP
env = gym.make('CartPole-v0')
env = coax.wrappers.TrainMonitor(env, name=name, tensorboard_dir=f"./data/tensorboard/{name}")

In [2]:

def func_pi(S, is_training):
    logits = hk.Sequential((
        hk.Linear(8), jax.nn.relu,
        hk.Linear(8), jax.nn.relu,
        hk.Linear(8), jax.nn.relu,
        hk.Linear(env.action_space.n, w_init=jnp.zeros)
    ))
    return {'logits': logits(S)}


def func_v(S, is_training):
    value = hk.Sequential((
        hk.Linear(8), jax.nn.relu,
        hk.Linear(8), jax.nn.relu,
        hk.Linear(8), jax.nn.relu,
        hk.Linear(1, w_init=jnp.zeros), jnp.ravel
    ))
    return value(S)



In [6]:
from emlp import T, Scalar
from emlp.groups import SO, S, O, Trivial
from emlp_haiku import EMLPBlock, Sequential, Linear,EMLP
from emlp.reps import Rep
from emlp.nn import gated,gate_indices,uniform_rep
from math import prod

group=Trivial(2)
rep_in = T(0)*prod(env.observation_space.shape)
rep_out = T(0)*env.action_space.n#prod(env.action_space.shape)
nn_pi = EMLP(rep_in,rep_out,group,ch=100,num_layers=2)
nn_v = EMLP(rep_in,T(0),group,ch=100,num_layers=2)

def func_pi(S, is_training):
    return {'logits': nn_pi(S)}


def func_v(S, is_training):
    return nn_v(S).reshape(-1)

[a2c|root|INFO] Initing EMLP
[a2c|root|INFO] Linear W components:400 rep:96V⁰+48V+20V²+8V³+4V⁴
[a2c|root|INFO] Linear W components:10000 rep:576V⁰+576V+384V²+216V³+121V⁴+44V⁵+14V⁶+4V⁷+V⁸
[a2c|root|INFO] Linear W components:200 rep:48V⁰+24V+10V²+4V³+2V⁴
[a2c|root|INFO] Initing EMLP
[a2c|root|INFO] Linear W components:400 rep:96V⁰+48V+20V²+8V³+4V⁴
[a2c|root|INFO] Linear W components:10000 rep:576V⁰+576V+384V²+216V³+121V⁴+44V⁵+14V⁶+4V⁷+V⁸
[a2c|root|INFO] Linear W components:100 rep:24V⁰+12V+5V²+2V³+V⁴


In [None]:

# these optimizers collect batches of grads before applying updates
optimizer_v = optax.chain(optax.apply_every(k=32), optax.adam(0.002))
optimizer_pi = optax.chain(optax.apply_every(k=32), optax.adam(0.001))


# value function and its derived policy
v = coax.V(func_v, env)
pi = coax.Policy(func_pi, env)

# experience tracer
tracer = coax.reward_tracing.NStep(n=1, gamma=0.9)

# updaters
vanilla_pg = coax.policy_objectives.VanillaPG(pi, optimizer=optimizer_pi)
simple_td = coax.td_learning.SimpleTD(v, loss_function=mse, optimizer=optimizer_v)


# train
for ep in range(1000):
    s = env.reset()
    er = 0
    for t in range(env.spec.max_episode_steps):
        a = pi(s)
        s_next, r, done, info = env.step(a)
        
        if done and (t == env.spec.max_episode_steps - 1):
            r = 1 / (1 - tracer.gamma)
        er+=r
        tracer.add(s, a, r, done)
        while tracer:
            transition_batch = tracer.pop()
            metrics_v, td_error = simple_td.update(transition_batch, return_td_error=True)
            metrics_pi = vanilla_pg.update(transition_batch, td_error)
            env.record_metrics(metrics_v)
            env.record_metrics(metrics_pi)

        if done:
            break

        s = s_next
    
    print("Epoch reward",er)
    # early stopping
    if env.avg_G > env.spec.reward_threshold:
        break


# run env one more time to render
#coax.utils.generate_gif(env, policy=pi, filepath=f"./data/{name}.gif", duration=25)

[a2c|TrainMonitor|INFO] ep: 229,	T: 6,594,	G: 11,	avg_r: 1,	avg_G: 54.3,	t: 11,	dt: 19073.448ms,	SimpleTD/loss: 0.0919,	VanillaPG/loss: -0.0509
[a2c|TrainMonitor|INFO] ep: 230,	T: 6,612,	G: 17,	avg_r: 1,	avg_G: 50.6,	t: 17,	dt: 461.121ms,	SimpleTD/loss: 0.453,	VanillaPG/loss: 0.646


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 231,	T: 6,628,	G: 15,	avg_r: 1,	avg_G: 47,	t: 15,	dt: 15.362ms,	SimpleTD/loss: 0.466,	VanillaPG/loss: 0.657
[a2c|TrainMonitor|INFO] ep: 232,	T: 6,638,	G: 9,	avg_r: 1,	avg_G: 43.2,	t: 9,	dt: 14.995ms,	SimpleTD/loss: 0.344,	VanillaPG/loss: 0.315


Epoch reward 15.0
Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 233,	T: 6,656,	G: 17,	avg_r: 1,	avg_G: 40.6,	t: 17,	dt: 14.915ms,	SimpleTD/loss: 0.333,	VanillaPG/loss: 0.352


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 234,	T: 6,671,	G: 14,	avg_r: 1,	avg_G: 37.9,	t: 14,	dt: 14.720ms,	SimpleTD/loss: 0.587,	VanillaPG/loss: 0.471
[a2c|TrainMonitor|INFO] ep: 235,	T: 6,682,	G: 10,	avg_r: 1,	avg_G: 35.2,	t: 10,	dt: 14.264ms,	SimpleTD/loss: 3.96,	VanillaPG/loss: 0.0658


Epoch reward 14.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 236,	T: 6,695,	G: 12,	avg_r: 1,	avg_G: 32.8,	t: 12,	dt: 14.271ms,	SimpleTD/loss: 4.12,	VanillaPG/loss: 0.0902
[a2c|TrainMonitor|INFO] ep: 237,	T: 6,705,	G: 9,	avg_r: 1,	avg_G: 30.5,	t: 9,	dt: 14.316ms,	SimpleTD/loss: 2.98,	VanillaPG/loss: 0.0923


Epoch reward 12.0
Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 238,	T: 6,716,	G: 10,	avg_r: 1,	avg_G: 28.4,	t: 10,	dt: 14.699ms,	SimpleTD/loss: 0.117,	VanillaPG/loss: 0.00942
[a2c|TrainMonitor|INFO] ep: 239,	T: 6,727,	G: 10,	avg_r: 1,	avg_G: 26.6,	t: 10,	dt: 14.593ms,	SimpleTD/loss: 0.323,	VanillaPG/loss: 0.00014


Epoch reward 10.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 240,	T: 6,736,	G: 8,	avg_r: 1,	avg_G: 24.7,	t: 8,	dt: 15.270ms,	SimpleTD/loss: 0.0192,	VanillaPG/loss: -0.00448
[a2c|TrainMonitor|INFO] ep: 241,	T: 6,747,	G: 10,	avg_r: 1,	avg_G: 23.2,	t: 10,	dt: 14.751ms,	SimpleTD/loss: 0.503,	VanillaPG/loss: 0.0056


Epoch reward 8.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 242,	T: 6,757,	G: 9,	avg_r: 1,	avg_G: 21.8,	t: 9,	dt: 14.376ms,	SimpleTD/loss: 0.999,	VanillaPG/loss: 0.0148
[a2c|TrainMonitor|INFO] ep: 243,	T: 6,768,	G: 10,	avg_r: 1,	avg_G: 20.6,	t: 10,	dt: 14.819ms,	SimpleTD/loss: 1.17,	VanillaPG/loss: 0.0174


Epoch reward 9.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 244,	T: 6,777,	G: 8,	avg_r: 1,	avg_G: 19.4,	t: 8,	dt: 14.576ms,	SimpleTD/loss: 0.64,	VanillaPG/loss: 0.0207
[a2c|TrainMonitor|INFO] ep: 245,	T: 6,787,	G: 9,	avg_r: 1,	avg_G: 18.3,	t: 9,	dt: 14.058ms,	SimpleTD/loss: 0.273,	VanillaPG/loss: 0.00918


Epoch reward 8.0
Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 246,	T: 6,797,	G: 9,	avg_r: 1,	avg_G: 17.4,	t: 9,	dt: 15.149ms,	SimpleTD/loss: 0.204,	VanillaPG/loss: 0.00751
[a2c|TrainMonitor|INFO] ep: 247,	T: 6,809,	G: 11,	avg_r: 1,	avg_G: 16.8,	t: 11,	dt: 14.725ms,	SimpleTD/loss: 0.0593,	VanillaPG/loss: 0.00569


Epoch reward 9.0
Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 248,	T: 6,819,	G: 9,	avg_r: 1,	avg_G: 16,	t: 9,	dt: 15.602ms,	SimpleTD/loss: 0.0665,	VanillaPG/loss: 0.00241
[a2c|TrainMonitor|INFO] ep: 249,	T: 6,830,	G: 10,	avg_r: 1,	avg_G: 15.4,	t: 10,	dt: 14.518ms,	SimpleTD/loss: 0.0201,	VanillaPG/loss: 0.000972


Epoch reward 9.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 250,	T: 6,846,	G: 15,	avg_r: 1,	avg_G: 15.3,	t: 15,	dt: 14.670ms,	SimpleTD/loss: 0.0487,	VanillaPG/loss: 0.288
[a2c|TrainMonitor|INFO] ep: 251,	T: 6,859,	G: 12,	avg_r: 1,	avg_G: 15,	t: 12,	dt: 13.910ms,	SimpleTD/loss: 0.0697,	VanillaPG/loss: 0.245


Epoch reward 15.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 252,	T: 6,870,	G: 10,	avg_r: 1,	avg_G: 14.5,	t: 10,	dt: 14.443ms,	SimpleTD/loss: 0.0632,	VanillaPG/loss: -0.00289
[a2c|TrainMonitor|INFO] ep: 253,	T: 6,883,	G: 12,	avg_r: 1,	avg_G: 14.3,	t: 12,	dt: 14.511ms,	SimpleTD/loss: 0.109,	VanillaPG/loss: 0.233


Epoch reward 10.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 254,	T: 6,894,	G: 10,	avg_r: 1,	avg_G: 13.8,	t: 10,	dt: 14.296ms,	SimpleTD/loss: 6.29,	VanillaPG/loss: 0.13
[a2c|TrainMonitor|INFO] ep: 255,	T: 6,908,	G: 13,	avg_r: 1,	avg_G: 13.8,	t: 13,	dt: 14.265ms,	SimpleTD/loss: 0.0604,	VanillaPG/loss: -0.00309


Epoch reward 10.0
Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 256,	T: 6,919,	G: 10,	avg_r: 1,	avg_G: 13.4,	t: 10,	dt: 15.275ms,	SimpleTD/loss: 0.551,	VanillaPG/loss: 0.17


Epoch reward 10.0
Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 257,	T: 6,934,	G: 14,	avg_r: 1,	avg_G: 13.4,	t: 14,	dt: 14.097ms,	SimpleTD/loss: 0.0911,	VanillaPG/loss: 0.34
[a2c|TrainMonitor|INFO] ep: 258,	T: 6,944,	G: 9,	avg_r: 1,	avg_G: 13,	t: 9,	dt: 15.764ms,	SimpleTD/loss: 0.0527,	VanillaPG/loss: 0.0184
[a2c|TrainMonitor|INFO] ep: 259,	T: 6,957,	G: 12,	avg_r: 1,	avg_G: 12.9,	t: 12,	dt: 15.372ms,	SimpleTD/loss: 0.324,	VanillaPG/loss: 0.187


Epoch reward 9.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 260,	T: 6,970,	G: 12,	avg_r: 1,	avg_G: 12.8,	t: 12,	dt: 14.628ms,	SimpleTD/loss: 0.0338,	VanillaPG/loss: 0.133
[a2c|TrainMonitor|INFO] ep: 261,	T: 6,981,	G: 10,	avg_r: 1,	avg_G: 12.5,	t: 10,	dt: 15.568ms,	SimpleTD/loss: 0.00192,	VanillaPG/loss: 0.00388


Epoch reward 12.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 262,	T: 6,993,	G: 11,	avg_r: 1,	avg_G: 12.4,	t: 11,	dt: 15.758ms,	SimpleTD/loss: 0.281,	VanillaPG/loss: 0.0329
[a2c|TrainMonitor|INFO] ep: 263,	T: 7,005,	G: 11,	avg_r: 1,	avg_G: 12.2,	t: 11,	dt: 14.830ms,	SimpleTD/loss: 0.81,	VanillaPG/loss: 0.118


Epoch reward 11.0
Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 264,	T: 7,016,	G: 10,	avg_r: 1,	avg_G: 12,	t: 10,	dt: 14.220ms,	SimpleTD/loss: 0.357,	VanillaPG/loss: -0.000799


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 265,	T: 7,034,	G: 17,	avg_r: 1,	avg_G: 12.5,	t: 17,	dt: 13.903ms,	SimpleTD/loss: 1.61,	VanillaPG/loss: 0.0713
[a2c|TrainMonitor|INFO] ep: 266,	T: 7,047,	G: 12,	avg_r: 1,	avg_G: 12.5,	t: 12,	dt: 13.833ms,	SimpleTD/loss: 2.21,	VanillaPG/loss: -0.0135


Epoch reward 17.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 267,	T: 7,060,	G: 12,	avg_r: 1,	avg_G: 12.4,	t: 12,	dt: 14.403ms,	SimpleTD/loss: 0.307,	VanillaPG/loss: 0.044
[a2c|TrainMonitor|INFO] ep: 268,	T: 7,070,	G: 9,	avg_r: 1,	avg_G: 12.1,	t: 9,	dt: 14.266ms,	SimpleTD/loss: 0.198,	VanillaPG/loss: 0.0238


Epoch reward 12.0
Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 269,	T: 7,082,	G: 11,	avg_r: 1,	avg_G: 12,	t: 11,	dt: 14.407ms,	SimpleTD/loss: 0.132,	VanillaPG/loss: 0.093
[a2c|TrainMonitor|INFO] ep: 270,	T: 7,093,	G: 10,	avg_r: 1,	avg_G: 11.8,	t: 10,	dt: 15.185ms,	SimpleTD/loss: 0.571,	VanillaPG/loss: 0.0112


Epoch reward 11.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 271,	T: 7,103,	G: 9,	avg_r: 1,	avg_G: 11.5,	t: 9,	dt: 14.045ms,	SimpleTD/loss: 0.512,	VanillaPG/loss: 0.00294
[a2c|TrainMonitor|INFO] ep: 272,	T: 7,114,	G: 10,	avg_r: 1,	avg_G: 11.3,	t: 10,	dt: 14.266ms,	SimpleTD/loss: 0.371,	VanillaPG/loss: 1.44


Epoch reward 9.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 273,	T: 7,123,	G: 8,	avg_r: 1,	avg_G: 11,	t: 8,	dt: 14.738ms,	SimpleTD/loss: 0.439,	VanillaPG/loss: -0.026
[a2c|TrainMonitor|INFO] ep: 274,	T: 7,134,	G: 10,	avg_r: 1,	avg_G: 10.9,	t: 10,	dt: 14.185ms,	SimpleTD/loss: 1.36,	VanillaPG/loss: -0.816


Epoch reward 8.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 275,	T: 7,145,	G: 10,	avg_r: 1,	avg_G: 10.8,	t: 10,	dt: 14.377ms,	SimpleTD/loss: 2.13,	VanillaPG/loss: -0.188
[a2c|TrainMonitor|INFO] ep: 276,	T: 7,158,	G: 12,	avg_r: 1,	avg_G: 10.9,	t: 12,	dt: 14.531ms,	SimpleTD/loss: 0.572,	VanillaPG/loss: 0.178


Epoch reward 10.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 277,	T: 7,170,	G: 11,	avg_r: 1,	avg_G: 10.9,	t: 11,	dt: 14.344ms,	SimpleTD/loss: 0.871,	VanillaPG/loss: 0.379
[a2c|TrainMonitor|INFO] ep: 278,	T: 7,183,	G: 12,	avg_r: 1,	avg_G: 11,	t: 12,	dt: 14.524ms,	SimpleTD/loss: 1.07,	VanillaPG/loss: 0.685


Epoch reward 11.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 279,	T: 7,195,	G: 11,	avg_r: 1,	avg_G: 11,	t: 11,	dt: 14.542ms,	SimpleTD/loss: 0.231,	VanillaPG/loss: 0.235
[a2c|TrainMonitor|INFO] ep: 280,	T: 7,209,	G: 13,	avg_r: 1,	avg_G: 11.2,	t: 13,	dt: 14.516ms,	SimpleTD/loss: 0.262,	VanillaPG/loss: 0.245


Epoch reward 11.0
Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 281,	T: 7,220,	G: 10,	avg_r: 1,	avg_G: 11.1,	t: 10,	dt: 14.784ms,	SimpleTD/loss: 0.206,	VanillaPG/loss: -0.0466


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 282,	T: 7,236,	G: 15,	avg_r: 1,	avg_G: 11.5,	t: 15,	dt: 14.359ms,	SimpleTD/loss: 0.554,	VanillaPG/loss: 0.125
[a2c|TrainMonitor|INFO] ep: 283,	T: 7,250,	G: 13,	avg_r: 1,	avg_G: 11.7,	t: 13,	dt: 15.024ms,	SimpleTD/loss: 0.406,	VanillaPG/loss: 0.0143


Epoch reward 15.0
Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 284,	T: 7,270,	G: 19,	avg_r: 1,	avg_G: 12.4,	t: 19,	dt: 14.440ms,	SimpleTD/loss: 0.603,	VanillaPG/loss: 0.043


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 285,	T: 7,343,	G: 72,	avg_r: 1,	avg_G: 18.3,	t: 72,	dt: 14.220ms,	SimpleTD/loss: 0.888,	VanillaPG/loss: 0.00207
[a2c|TrainMonitor|INFO] ep: 286,	T: 7,357,	G: 13,	avg_r: 1,	avg_G: 17.8,	t: 13,	dt: 14.697ms,	SimpleTD/loss: 3.53,	VanillaPG/loss: -0.64


Epoch reward 72.0
Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 287,	T: 7,375,	G: 17,	avg_r: 1,	avg_G: 17.7,	t: 17,	dt: 14.502ms,	SimpleTD/loss: 0.17,	VanillaPG/loss: 0.26


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 288,	T: 7,391,	G: 15,	avg_r: 1,	avg_G: 17.5,	t: 15,	dt: 16.071ms,	SimpleTD/loss: 0.778,	VanillaPG/loss: 0.581
[a2c|TrainMonitor|INFO] ep: 289,	T: 7,403,	G: 11,	avg_r: 1,	avg_G: 16.8,	t: 11,	dt: 14.016ms,	SimpleTD/loss: 1.35,	VanillaPG/loss: 0.563


Epoch reward 15.0
Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 290,	T: 7,421,	G: 17,	avg_r: 1,	avg_G: 16.8,	t: 17,	dt: 13.854ms,	SimpleTD/loss: 0.786,	VanillaPG/loss: 0.957


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 291,	T: 7,464,	G: 42,	avg_r: 1,	avg_G: 19.3,	t: 42,	dt: 14.363ms,	SimpleTD/loss: 0.479,	VanillaPG/loss: 0.273


Epoch reward 42.0


[a2c|TrainMonitor|INFO] ep: 292,	T: 7,483,	G: 18,	avg_r: 1,	avg_G: 19.2,	t: 18,	dt: 13.774ms,	SimpleTD/loss: 1.32,	VanillaPG/loss: -0.0324


Epoch reward 18.0


[a2c|TrainMonitor|INFO] ep: 293,	T: 7,537,	G: 53,	avg_r: 1,	avg_G: 22.6,	t: 53,	dt: 13.820ms,	SimpleTD/loss: 0.492,	VanillaPG/loss: 0.0856


Epoch reward 53.0


[a2c|TrainMonitor|INFO] ep: 294,	T: 7,597,	G: 59,	avg_r: 1,	avg_G: 26.2,	t: 59,	dt: 14.319ms,	SimpleTD/loss: 0.845,	VanillaPG/loss: -0.0365


Epoch reward 59.0


[a2c|TrainMonitor|INFO] ep: 295,	T: 7,640,	G: 42,	avg_r: 1,	avg_G: 27.8,	t: 42,	dt: 14.225ms,	SimpleTD/loss: 0.677,	VanillaPG/loss: -0.158
[a2c|TrainMonitor|INFO] ep: 296,	T: 7,652,	G: 11,	avg_r: 1,	avg_G: 26.1,	t: 11,	dt: 13.978ms,	SimpleTD/loss: 2.28,	VanillaPG/loss: -0.355


Epoch reward 42.0
Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 297,	T: 7,668,	G: 15,	avg_r: 1,	avg_G: 25,	t: 15,	dt: 14.423ms,	SimpleTD/loss: 0.876,	VanillaPG/loss: -0.0111
[a2c|TrainMonitor|INFO] ep: 298,	T: 7,681,	G: 12,	avg_r: 1,	avg_G: 23.7,	t: 12,	dt: 14.455ms,	SimpleTD/loss: 0.881,	VanillaPG/loss: -0.132


Epoch reward 15.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 299,	T: 7,699,	G: 17,	avg_r: 1,	avg_G: 23,	t: 17,	dt: 14.770ms,	SimpleTD/loss: 0.458,	VanillaPG/loss: 0.2
[a2c|TrainMonitor|INFO] ep: 300,	T: 7,712,	G: 12,	avg_r: 1,	avg_G: 21.9,	t: 12,	dt: 14.691ms,	SimpleTD/loss: 0.686,	VanillaPG/loss: -0.025


Epoch reward 17.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 301,	T: 7,725,	G: 12,	avg_r: 1,	avg_G: 20.9,	t: 12,	dt: 14.465ms,	SimpleTD/loss: 0.423,	VanillaPG/loss: 0.0426
[a2c|TrainMonitor|INFO] ep: 302,	T: 7,736,	G: 10,	avg_r: 1,	avg_G: 19.9,	t: 10,	dt: 14.169ms,	SimpleTD/loss: 0.384,	VanillaPG/loss: -0.00102


Epoch reward 12.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 303,	T: 7,746,	G: 9,	avg_r: 1,	avg_G: 18.8,	t: 9,	dt: 14.913ms,	SimpleTD/loss: 0.239,	VanillaPG/loss: -0.171
[a2c|TrainMonitor|INFO] ep: 304,	T: 7,757,	G: 10,	avg_r: 1,	avg_G: 17.9,	t: 10,	dt: 14.912ms,	SimpleTD/loss: 0.43,	VanillaPG/loss: -0.00204


Epoch reward 9.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 305,	T: 7,767,	G: 9,	avg_r: 1,	avg_G: 17,	t: 9,	dt: 14.416ms,	SimpleTD/loss: 0.124,	VanillaPG/loss: 0.153


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 306,	T: 7,782,	G: 14,	avg_r: 1,	avg_G: 16.7,	t: 14,	dt: 15.723ms,	SimpleTD/loss: 0.375,	VanillaPG/loss: 0.457
[a2c|TrainMonitor|INFO] ep: 307,	T: 7,794,	G: 11,	avg_r: 1,	avg_G: 16.1,	t: 11,	dt: 15.002ms,	SimpleTD/loss: 0.246,	VanillaPG/loss: 0.36


Epoch reward 14.0
Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 308,	T: 7,810,	G: 15,	avg_r: 1,	avg_G: 16,	t: 15,	dt: 14.839ms,	SimpleTD/loss: 1.1,	VanillaPG/loss: 0.483
[a2c|TrainMonitor|INFO] ep: 309,	T: 7,821,	G: 10,	avg_r: 1,	avg_G: 15.4,	t: 10,	dt: 14.862ms,	SimpleTD/loss: 0.328,	VanillaPG/loss: 0.0378


Epoch reward 15.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 310,	T: 7,838,	G: 16,	avg_r: 1,	avg_G: 15.5,	t: 16,	dt: 15.098ms,	SimpleTD/loss: 0.791,	VanillaPG/loss: 0.0617


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 311,	T: 7,882,	G: 43,	avg_r: 1,	avg_G: 18.2,	t: 43,	dt: 15.003ms,	SimpleTD/loss: 0.58,	VanillaPG/loss: 0.0198


Epoch reward 43.0


[a2c|TrainMonitor|INFO] ep: 312,	T: 7,902,	G: 19,	avg_r: 1,	avg_G: 18.3,	t: 19,	dt: 14.784ms,	SimpleTD/loss: 1.3,	VanillaPG/loss: -0.335


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 313,	T: 7,934,	G: 31,	avg_r: 1,	avg_G: 19.6,	t: 31,	dt: 14.319ms,	SimpleTD/loss: 0.663,	VanillaPG/loss: -0.00121


Epoch reward 31.0


[a2c|TrainMonitor|INFO] ep: 314,	T: 7,955,	G: 20,	avg_r: 1,	avg_G: 19.6,	t: 20,	dt: 14.804ms,	SimpleTD/loss: 0.983,	VanillaPG/loss: -0.108


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 315,	T: 7,977,	G: 21,	avg_r: 1,	avg_G: 19.8,	t: 21,	dt: 14.676ms,	SimpleTD/loss: 0.394,	VanillaPG/loss: 0.108


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 316,	T: 7,996,	G: 18,	avg_r: 1,	avg_G: 19.6,	t: 18,	dt: 14.964ms,	SimpleTD/loss: 0.546,	VanillaPG/loss: 0.0699


Epoch reward 18.0


[a2c|TrainMonitor|INFO] ep: 317,	T: 8,020,	G: 23,	avg_r: 1,	avg_G: 19.9,	t: 23,	dt: 14.096ms,	SimpleTD/loss: 0.564,	VanillaPG/loss: 0.1


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 318,	T: 8,041,	G: 20,	avg_r: 1,	avg_G: 19.9,	t: 20,	dt: 14.505ms,	SimpleTD/loss: 0.4,	VanillaPG/loss: 0.107
[a2c|TrainMonitor|INFO] ep: 319,	T: 8,055,	G: 13,	avg_r: 1,	avg_G: 19.2,	t: 13,	dt: 14.847ms,	SimpleTD/loss: 0.803,	VanillaPG/loss: 0.032


Epoch reward 20.0
Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 320,	T: 8,071,	G: 15,	avg_r: 1,	avg_G: 18.8,	t: 15,	dt: 14.598ms,	SimpleTD/loss: 0.929,	VanillaPG/loss: 0.114


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 321,	T: 8,087,	G: 15,	avg_r: 1,	avg_G: 18.4,	t: 15,	dt: 14.616ms,	SimpleTD/loss: 0.638,	VanillaPG/loss: -0.206


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 322,	T: 8,138,	G: 50,	avg_r: 1,	avg_G: 21.6,	t: 50,	dt: 15.068ms,	SimpleTD/loss: 0.453,	VanillaPG/loss: 0.0232


Epoch reward 50.0


[a2c|TrainMonitor|INFO] ep: 323,	T: 8,181,	G: 42,	avg_r: 1,	avg_G: 23.6,	t: 42,	dt: 14.163ms,	SimpleTD/loss: 1.32,	VanillaPG/loss: -0.0867


Epoch reward 42.0


[a2c|TrainMonitor|INFO] ep: 324,	T: 8,211,	G: 29,	avg_r: 1,	avg_G: 24.2,	t: 29,	dt: 14.264ms,	SimpleTD/loss: 0.554,	VanillaPG/loss: -0.238


Epoch reward 29.0


[a2c|TrainMonitor|INFO] ep: 325,	T: 8,231,	G: 19,	avg_r: 1,	avg_G: 23.6,	t: 19,	dt: 14.042ms,	SimpleTD/loss: 0.962,	VanillaPG/loss: 0.0945


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 326,	T: 8,262,	G: 30,	avg_r: 1,	avg_G: 24.3,	t: 30,	dt: 14.467ms,	SimpleTD/loss: 0.805,	VanillaPG/loss: 0.0313


Epoch reward 30.0


[a2c|TrainMonitor|INFO] ep: 327,	T: 8,287,	G: 24,	avg_r: 1,	avg_G: 24.3,	t: 24,	dt: 15.978ms,	SimpleTD/loss: 1.13,	VanillaPG/loss: -0.289


Epoch reward 24.0


[a2c|TrainMonitor|INFO] ep: 328,	T: 8,341,	G: 53,	avg_r: 1,	avg_G: 27.1,	t: 53,	dt: 15.746ms,	SimpleTD/loss: 0.502,	VanillaPG/loss: 0.164


Epoch reward 53.0


[a2c|TrainMonitor|INFO] ep: 329,	T: 8,402,	G: 60,	avg_r: 1,	avg_G: 30.4,	t: 60,	dt: 15.612ms,	SimpleTD/loss: 0.548,	VanillaPG/loss: -0.293


Epoch reward 60.0


[a2c|TrainMonitor|INFO] ep: 330,	T: 8,435,	G: 32,	avg_r: 1,	avg_G: 30.6,	t: 32,	dt: 15.475ms,	SimpleTD/loss: 0.916,	VanillaPG/loss: -0.182


Epoch reward 32.0


[a2c|TrainMonitor|INFO] ep: 331,	T: 8,452,	G: 16,	avg_r: 1,	avg_G: 29.1,	t: 16,	dt: 15.825ms,	SimpleTD/loss: 1.64,	VanillaPG/loss: -0.114


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 332,	T: 8,476,	G: 23,	avg_r: 1,	avg_G: 28.5,	t: 23,	dt: 16.184ms,	SimpleTD/loss: 0.518,	VanillaPG/loss: 0.123


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 333,	T: 8,493,	G: 16,	avg_r: 1,	avg_G: 27.3,	t: 16,	dt: 13.987ms,	SimpleTD/loss: 0.781,	VanillaPG/loss: -0.273


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 334,	T: 8,516,	G: 22,	avg_r: 1,	avg_G: 26.7,	t: 22,	dt: 14.460ms,	SimpleTD/loss: 0.782,	VanillaPG/loss: 0.0731


Epoch reward 22.0


[a2c|TrainMonitor|INFO] ep: 335,	T: 8,536,	G: 19,	avg_r: 1,	avg_G: 26,	t: 19,	dt: 14.705ms,	SimpleTD/loss: 0.796,	VanillaPG/loss: 0.0969


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 336,	T: 8,561,	G: 24,	avg_r: 1,	avg_G: 25.8,	t: 24,	dt: 14.419ms,	SimpleTD/loss: 0.599,	VanillaPG/loss: 0.122


Epoch reward 24.0


[a2c|TrainMonitor|INFO] ep: 337,	T: 8,585,	G: 23,	avg_r: 1,	avg_G: 25.5,	t: 23,	dt: 14.402ms,	SimpleTD/loss: 0.72,	VanillaPG/loss: 0.0586


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 338,	T: 8,616,	G: 30,	avg_r: 1,	avg_G: 25.9,	t: 30,	dt: 14.628ms,	SimpleTD/loss: 0.419,	VanillaPG/loss: 0.131


Epoch reward 30.0


[a2c|TrainMonitor|INFO] ep: 339,	T: 8,634,	G: 17,	avg_r: 1,	avg_G: 25,	t: 17,	dt: 14.681ms,	SimpleTD/loss: 0.905,	VanillaPG/loss: -0.197


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 340,	T: 8,669,	G: 34,	avg_r: 1,	avg_G: 25.9,	t: 34,	dt: 14.619ms,	SimpleTD/loss: 0.486,	VanillaPG/loss: 0.0518


Epoch reward 34.0


[a2c|TrainMonitor|INFO] ep: 341,	T: 8,690,	G: 20,	avg_r: 1,	avg_G: 25.3,	t: 20,	dt: 14.153ms,	SimpleTD/loss: 0.892,	VanillaPG/loss: -0.0021


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 342,	T: 8,721,	G: 30,	avg_r: 1,	avg_G: 25.8,	t: 30,	dt: 14.254ms,	SimpleTD/loss: 0.333,	VanillaPG/loss: 0.121


Epoch reward 30.0


[a2c|TrainMonitor|INFO] ep: 343,	T: 8,743,	G: 21,	avg_r: 1,	avg_G: 25.3,	t: 21,	dt: 14.384ms,	SimpleTD/loss: 0.395,	VanillaPG/loss: 0.0884


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 344,	T: 8,775,	G: 31,	avg_r: 1,	avg_G: 25.9,	t: 31,	dt: 14.143ms,	SimpleTD/loss: 0.431,	VanillaPG/loss: 0.0243


Epoch reward 31.0


[a2c|TrainMonitor|INFO] ep: 345,	T: 8,834,	G: 58,	avg_r: 1,	avg_G: 29.1,	t: 58,	dt: 14.628ms,	SimpleTD/loss: 0.309,	VanillaPG/loss: 0.076


Epoch reward 58.0


[a2c|TrainMonitor|INFO] ep: 346,	T: 8,873,	G: 38,	avg_r: 1,	avg_G: 30,	t: 38,	dt: 14.729ms,	SimpleTD/loss: 0.339,	VanillaPG/loss: 0.0188


Epoch reward 38.0


[a2c|TrainMonitor|INFO] ep: 347,	T: 8,963,	G: 89,	avg_r: 1,	avg_G: 35.9,	t: 89,	dt: 14.125ms,	SimpleTD/loss: 0.968,	VanillaPG/loss: -0.0265


Epoch reward 89.0


[a2c|TrainMonitor|INFO] ep: 348,	T: 9,002,	G: 38,	avg_r: 1,	avg_G: 36.1,	t: 38,	dt: 14.854ms,	SimpleTD/loss: 0.431,	VanillaPG/loss: -0.29


Epoch reward 38.0


[a2c|TrainMonitor|INFO] ep: 349,	T: 9,086,	G: 83,	avg_r: 1,	avg_G: 40.8,	t: 83,	dt: 15.076ms,	SimpleTD/loss: 0.375,	VanillaPG/loss: 0.143


Epoch reward 83.0


[a2c|TrainMonitor|INFO] ep: 350,	T: 9,146,	G: 59,	avg_r: 1,	avg_G: 42.6,	t: 59,	dt: 14.312ms,	SimpleTD/loss: 0.275,	VanillaPG/loss: -0.225


Epoch reward 59.0


[a2c|TrainMonitor|INFO] ep: 351,	T: 9,262,	G: 115,	avg_r: 1,	avg_G: 49.9,	t: 115,	dt: 14.121ms,	SimpleTD/loss: 0.524,	VanillaPG/loss: -0.0614


Epoch reward 115.0


[a2c|TrainMonitor|INFO] ep: 352,	T: 9,346,	G: 83,	avg_r: 1,	avg_G: 53.2,	t: 83,	dt: 14.551ms,	SimpleTD/loss: 0.526,	VanillaPG/loss: 0.00791


Epoch reward 83.0


[a2c|TrainMonitor|INFO] ep: 353,	T: 9,408,	G: 61,	avg_r: 1,	avg_G: 54,	t: 61,	dt: 14.195ms,	SimpleTD/loss: 0.26,	VanillaPG/loss: 0.0334


Epoch reward 61.0


[a2c|TrainMonitor|INFO] ep: 354,	T: 9,455,	G: 46,	avg_r: 1,	avg_G: 53.2,	t: 46,	dt: 14.621ms,	SimpleTD/loss: 0.459,	VanillaPG/loss: 0.0349


Epoch reward 46.0


[a2c|TrainMonitor|INFO] ep: 355,	T: 9,544,	G: 88,	avg_r: 1,	avg_G: 56.6,	t: 88,	dt: 14.310ms,	SimpleTD/loss: 0.161,	VanillaPG/loss: 0.0357


Epoch reward 88.0


[a2c|TrainMonitor|INFO] ep: 356,	T: 9,587,	G: 42,	avg_r: 1,	avg_G: 55.2,	t: 42,	dt: 14.829ms,	SimpleTD/loss: 0.116,	VanillaPG/loss: -0.00557


Epoch reward 42.0


[a2c|TrainMonitor|INFO] ep: 357,	T: 9,641,	G: 53,	avg_r: 1,	avg_G: 55,	t: 53,	dt: 14.071ms,	SimpleTD/loss: 0.0795,	VanillaPG/loss: 0.0227


Epoch reward 53.0


[a2c|TrainMonitor|INFO] ep: 358,	T: 9,675,	G: 33,	avg_r: 1,	avg_G: 52.8,	t: 33,	dt: 14.345ms,	SimpleTD/loss: 0.407,	VanillaPG/loss: -0.199


Epoch reward 33.0


[a2c|TrainMonitor|INFO] ep: 359,	T: 9,729,	G: 53,	avg_r: 1,	avg_G: 52.8,	t: 53,	dt: 14.385ms,	SimpleTD/loss: 0.668,	VanillaPG/loss: 0.114


Epoch reward 53.0


[a2c|TrainMonitor|INFO] ep: 360,	T: 9,833,	G: 103,	avg_r: 1,	avg_G: 57.8,	t: 103,	dt: 14.565ms,	SimpleTD/loss: 0.261,	VanillaPG/loss: -0.0275


Epoch reward 103.0


[a2c|TrainMonitor|INFO] ep: 361,	T: 10,034,	G: 200,	avg_r: 1,	avg_G: 72,	t: 200,	dt: 14.471ms,	SimpleTD/loss: 0.00555,	VanillaPG/loss: -0.0204


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 362,	T: 10,105,	G: 70,	avg_r: 1,	avg_G: 71.8,	t: 70,	dt: 14.668ms,	SimpleTD/loss: 0.373,	VanillaPG/loss: 0.0117


Epoch reward 70.0


[a2c|TrainMonitor|INFO] ep: 363,	T: 10,152,	G: 46,	avg_r: 1,	avg_G: 69.2,	t: 46,	dt: 14.446ms,	SimpleTD/loss: 0.383,	VanillaPG/loss: 0.0165


Epoch reward 46.0


[a2c|TrainMonitor|INFO] ep: 364,	T: 10,185,	G: 32,	avg_r: 1,	avg_G: 65.5,	t: 32,	dt: 14.135ms,	SimpleTD/loss: 0.37,	VanillaPG/loss: -0.0874


Epoch reward 32.0


[a2c|TrainMonitor|INFO] ep: 365,	T: 10,224,	G: 38,	avg_r: 1,	avg_G: 62.8,	t: 38,	dt: 15.555ms,	SimpleTD/loss: 0.327,	VanillaPG/loss: 0.0376


Epoch reward 38.0


[a2c|TrainMonitor|INFO] ep: 366,	T: 10,263,	G: 38,	avg_r: 1,	avg_G: 60.3,	t: 38,	dt: 13.951ms,	SimpleTD/loss: 0.345,	VanillaPG/loss: -0.0223


Epoch reward 38.0


[a2c|TrainMonitor|INFO] ep: 367,	T: 10,295,	G: 31,	avg_r: 1,	avg_G: 57.4,	t: 31,	dt: 14.612ms,	SimpleTD/loss: 0.266,	VanillaPG/loss: 0.0163


Epoch reward 31.0


[a2c|TrainMonitor|INFO] ep: 368,	T: 10,342,	G: 46,	avg_r: 1,	avg_G: 56.2,	t: 46,	dt: 14.820ms,	SimpleTD/loss: 0.06,	VanillaPG/loss: 0.0414


Epoch reward 46.0


[a2c|TrainMonitor|INFO] ep: 369,	T: 10,378,	G: 35,	avg_r: 1,	avg_G: 54.1,	t: 35,	dt: 14.391ms,	SimpleTD/loss: 0.281,	VanillaPG/loss: 0.0176


Epoch reward 35.0


[a2c|TrainMonitor|INFO] ep: 370,	T: 10,418,	G: 39,	avg_r: 1,	avg_G: 52.6,	t: 39,	dt: 14.711ms,	SimpleTD/loss: 0.247,	VanillaPG/loss: -0.0803


Epoch reward 39.0


[a2c|TrainMonitor|INFO] ep: 371,	T: 10,456,	G: 37,	avg_r: 1,	avg_G: 51,	t: 37,	dt: 14.781ms,	SimpleTD/loss: 0.0704,	VanillaPG/loss: 0.06


Epoch reward 37.0


[a2c|TrainMonitor|INFO] ep: 372,	T: 10,584,	G: 127,	avg_r: 1,	avg_G: 58.6,	t: 127,	dt: 14.362ms,	SimpleTD/loss: 0.14,	VanillaPG/loss: 0.0331


Epoch reward 127.0


[a2c|TrainMonitor|INFO] ep: 373,	T: 10,663,	G: 78,	avg_r: 1,	avg_G: 60.6,	t: 78,	dt: 14.099ms,	SimpleTD/loss: 0.338,	VanillaPG/loss: -0.0762


Epoch reward 78.0


[a2c|TrainMonitor|INFO] ep: 374,	T: 10,748,	G: 84,	avg_r: 1,	avg_G: 62.9,	t: 84,	dt: 14.373ms,	SimpleTD/loss: 0.407,	VanillaPG/loss: -0.0379


Epoch reward 84.0


[a2c|TrainMonitor|INFO] ep: 375,	T: 10,828,	G: 79,	avg_r: 1,	avg_G: 64.5,	t: 79,	dt: 14.587ms,	SimpleTD/loss: 0.513,	VanillaPG/loss: 0.041


Epoch reward 79.0


[a2c|TrainMonitor|INFO] ep: 376,	T: 11,029,	G: 200,	avg_r: 1,	avg_G: 78.1,	t: 200,	dt: 14.372ms,	SimpleTD/loss: 0.0209,	VanillaPG/loss: -0.00736


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 377,	T: 11,188,	G: 158,	avg_r: 1,	avg_G: 86.1,	t: 158,	dt: 14.621ms,	SimpleTD/loss: 0.202,	VanillaPG/loss: -0.0339


Epoch reward 158.0


[a2c|TrainMonitor|INFO] ep: 378,	T: 11,245,	G: 56,	avg_r: 1,	avg_G: 83.1,	t: 56,	dt: 14.621ms,	SimpleTD/loss: 0.437,	VanillaPG/loss: -0.413


Epoch reward 56.0


[a2c|TrainMonitor|INFO] ep: 379,	T: 11,446,	G: 200,	avg_r: 1,	avg_G: 94.7,	t: 200,	dt: 14.798ms,	SimpleTD/loss: 0.0102,	VanillaPG/loss: -0.00883


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 380,	T: 11,647,	G: 200,	avg_r: 1,	avg_G: 105,	t: 200,	dt: 14.644ms,	SimpleTD/loss: 0.00319,	VanillaPG/loss: -0.0107


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 381,	T: 11,848,	G: 200,	avg_r: 1,	avg_G: 115,	t: 200,	dt: 14.762ms,	SimpleTD/loss: 0.00176,	VanillaPG/loss: -0.0191


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 382,	T: 12,040,	G: 191,	avg_r: 1,	avg_G: 122,	t: 191,	dt: 14.209ms,	SimpleTD/loss: 0.196,	VanillaPG/loss: -0.0343


Epoch reward 191.0


[a2c|TrainMonitor|INFO] ep: 383,	T: 12,119,	G: 78,	avg_r: 1,	avg_G: 118,	t: 78,	dt: 14.518ms,	SimpleTD/loss: 0.176,	VanillaPG/loss: 0.018


Epoch reward 78.0


[a2c|TrainMonitor|INFO] ep: 384,	T: 12,245,	G: 125,	avg_r: 1,	avg_G: 119,	t: 125,	dt: 14.589ms,	SimpleTD/loss: 0.0593,	VanillaPG/loss: 0.00326


Epoch reward 125.0


[a2c|TrainMonitor|INFO] ep: 385,	T: 12,446,	G: 200,	avg_r: 1,	avg_G: 127,	t: 200,	dt: 14.395ms,	SimpleTD/loss: 0.0247,	VanillaPG/loss: -0.0141


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 386,	T: 12,647,	G: 200,	avg_r: 1,	avg_G: 134,	t: 200,	dt: 14.676ms,	SimpleTD/loss: 0.00408,	VanillaPG/loss: -0.0137


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 387,	T: 12,848,	G: 200,	avg_r: 1,	avg_G: 141,	t: 200,	dt: 15.066ms,	SimpleTD/loss: 0.00196,	VanillaPG/loss: -0.00342


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 388,	T: 13,049,	G: 200,	avg_r: 1,	avg_G: 147,	t: 200,	dt: 14.430ms,	SimpleTD/loss: 0.000721,	VanillaPG/loss: -0.00896


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 389,	T: 13,250,	G: 200,	avg_r: 1,	avg_G: 152,	t: 200,	dt: 14.071ms,	SimpleTD/loss: 0.00265,	VanillaPG/loss: 0.000195


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 390,	T: 13,451,	G: 200,	avg_r: 1,	avg_G: 157,	t: 200,	dt: 14.391ms,	SimpleTD/loss: 0.00378,	VanillaPG/loss: -0.0136


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 391,	T: 13,613,	G: 161,	avg_r: 1,	avg_G: 157,	t: 161,	dt: 14.324ms,	SimpleTD/loss: 0.264,	VanillaPG/loss: -0.00895


Epoch reward 161.0


[a2c|TrainMonitor|INFO] ep: 392,	T: 13,764,	G: 150,	avg_r: 1,	avg_G: 156,	t: 150,	dt: 14.879ms,	SimpleTD/loss: 0.144,	VanillaPG/loss: 0.0284


Epoch reward 150.0


[a2c|TrainMonitor|INFO] ep: 393,	T: 13,846,	G: 81,	avg_r: 1,	avg_G: 149,	t: 81,	dt: 14.262ms,	SimpleTD/loss: 0.484,	VanillaPG/loss: 0.013


Epoch reward 81.0


[a2c|TrainMonitor|INFO] ep: 394,	T: 13,987,	G: 140,	avg_r: 1,	avg_G: 148,	t: 140,	dt: 14.486ms,	SimpleTD/loss: 0.174,	VanillaPG/loss: 0.0296


Epoch reward 140.0


[a2c|TrainMonitor|INFO] ep: 395,	T: 14,152,	G: 164,	avg_r: 1,	avg_G: 150,	t: 164,	dt: 14.218ms,	SimpleTD/loss: 0.267,	VanillaPG/loss: -0.00415


Epoch reward 164.0


[a2c|TrainMonitor|INFO] ep: 396,	T: 14,279,	G: 126,	avg_r: 1,	avg_G: 147,	t: 126,	dt: 14.636ms,	SimpleTD/loss: 0.227,	VanillaPG/loss: 0.0301


Epoch reward 126.0


[a2c|TrainMonitor|INFO] ep: 397,	T: 14,305,	G: 25,	avg_r: 1,	avg_G: 135,	t: 25,	dt: 14.781ms,	SimpleTD/loss: 2.07,	VanillaPG/loss: -0.106


Epoch reward 25.0


[a2c|TrainMonitor|INFO] ep: 398,	T: 14,337,	G: 31,	avg_r: 1,	avg_G: 125,	t: 31,	dt: 14.450ms,	SimpleTD/loss: 0.855,	VanillaPG/loss: -0.0163


Epoch reward 31.0


[a2c|TrainMonitor|INFO] ep: 399,	T: 14,364,	G: 26,	avg_r: 1,	avg_G: 115,	t: 26,	dt: 14.362ms,	SimpleTD/loss: 0.88,	VanillaPG/loss: -0.262


Epoch reward 26.0


[a2c|TrainMonitor|INFO] ep: 400,	T: 14,389,	G: 24,	avg_r: 1,	avg_G: 106,	t: 24,	dt: 14.968ms,	SimpleTD/loss: 0.957,	VanillaPG/loss: -0.0277


Epoch reward 24.0


[a2c|TrainMonitor|INFO] ep: 401,	T: 14,407,	G: 17,	avg_r: 1,	avg_G: 96.8,	t: 17,	dt: 15.118ms,	SimpleTD/loss: 1.41,	VanillaPG/loss: 0.0435
[a2c|TrainMonitor|INFO] ep: 402,	T: 14,421,	G: 13,	avg_r: 1,	avg_G: 88.4,	t: 13,	dt: 14.573ms,	SimpleTD/loss: 1.06,	VanillaPG/loss: 0.0105


Epoch reward 17.0
Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 403,	T: 14,433,	G: 11,	avg_r: 1,	avg_G: 80.7,	t: 11,	dt: 14.247ms,	SimpleTD/loss: 0.731,	VanillaPG/loss: -0.0358
[a2c|TrainMonitor|INFO] ep: 404,	T: 14,444,	G: 10,	avg_r: 1,	avg_G: 73.6,	t: 10,	dt: 14.776ms,	SimpleTD/loss: 0.604,	VanillaPG/loss: -0.132


Epoch reward 11.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 405,	T: 14,457,	G: 12,	avg_r: 1,	avg_G: 67.5,	t: 12,	dt: 14.535ms,	SimpleTD/loss: 0.289,	VanillaPG/loss: 0.0426


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 406,	T: 14,478,	G: 20,	avg_r: 1,	avg_G: 62.7,	t: 20,	dt: 15.272ms,	SimpleTD/loss: 0.902,	VanillaPG/loss: 0.00659


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 407,	T: 14,502,	G: 23,	avg_r: 1,	avg_G: 58.7,	t: 23,	dt: 14.372ms,	SimpleTD/loss: 0.912,	VanillaPG/loss: 0.0401


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 408,	T: 14,522,	G: 19,	avg_r: 1,	avg_G: 54.8,	t: 19,	dt: 14.144ms,	SimpleTD/loss: 1.08,	VanillaPG/loss: 0.0435


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 409,	T: 14,554,	G: 31,	avg_r: 1,	avg_G: 52.4,	t: 31,	dt: 15.155ms,	SimpleTD/loss: 0.734,	VanillaPG/loss: 0.013


Epoch reward 31.0


[a2c|TrainMonitor|INFO] ep: 410,	T: 14,581,	G: 26,	avg_r: 1,	avg_G: 49.8,	t: 26,	dt: 14.383ms,	SimpleTD/loss: 0.852,	VanillaPG/loss: -1.21


Epoch reward 26.0


[a2c|TrainMonitor|INFO] ep: 411,	T: 14,627,	G: 45,	avg_r: 1,	avg_G: 49.3,	t: 45,	dt: 14.333ms,	SimpleTD/loss: 0.323,	VanillaPG/loss: 0.0233


Epoch reward 45.0


[a2c|TrainMonitor|INFO] ep: 412,	T: 14,675,	G: 47,	avg_r: 1,	avg_G: 49.1,	t: 47,	dt: 14.818ms,	SimpleTD/loss: 0.319,	VanillaPG/loss: 0.0121


Epoch reward 47.0


[a2c|TrainMonitor|INFO] ep: 413,	T: 14,721,	G: 45,	avg_r: 1,	avg_G: 48.6,	t: 45,	dt: 14.165ms,	SimpleTD/loss: 0.285,	VanillaPG/loss: 0.00506


Epoch reward 45.0


[a2c|TrainMonitor|INFO] ep: 414,	T: 14,789,	G: 67,	avg_r: 1,	avg_G: 50.5,	t: 67,	dt: 14.703ms,	SimpleTD/loss: 0.171,	VanillaPG/loss: 0.00604


Epoch reward 67.0


[a2c|TrainMonitor|INFO] ep: 415,	T: 14,844,	G: 54,	avg_r: 1,	avg_G: 50.8,	t: 54,	dt: 14.935ms,	SimpleTD/loss: 0.105,	VanillaPG/loss: -0.0035


Epoch reward 54.0


[a2c|TrainMonitor|INFO] ep: 416,	T: 14,929,	G: 84,	avg_r: 1,	avg_G: 54.2,	t: 84,	dt: 14.681ms,	SimpleTD/loss: 0.0547,	VanillaPG/loss: 0.00226


Epoch reward 84.0


[a2c|TrainMonitor|INFO] ep: 417,	T: 15,003,	G: 73,	avg_r: 1,	avg_G: 56,	t: 73,	dt: 14.615ms,	SimpleTD/loss: 0.0367,	VanillaPG/loss: 0.00851


Epoch reward 73.0


[a2c|TrainMonitor|INFO] ep: 418,	T: 15,089,	G: 85,	avg_r: 1,	avg_G: 58.9,	t: 85,	dt: 13.863ms,	SimpleTD/loss: 0.264,	VanillaPG/loss: -0.0105


Epoch reward 85.0


[a2c|TrainMonitor|INFO] ep: 419,	T: 15,163,	G: 73,	avg_r: 1,	avg_G: 60.3,	t: 73,	dt: 14.610ms,	SimpleTD/loss: 0.303,	VanillaPG/loss: -0.047


Epoch reward 73.0


[a2c|TrainMonitor|INFO] ep: 420,	T: 15,293,	G: 129,	avg_r: 1,	avg_G: 67.2,	t: 129,	dt: 14.331ms,	SimpleTD/loss: 0.272,	VanillaPG/loss: 0.0204


Epoch reward 129.0


[a2c|TrainMonitor|INFO] ep: 421,	T: 15,449,	G: 155,	avg_r: 1,	avg_G: 76,	t: 155,	dt: 14.641ms,	SimpleTD/loss: 0.121,	VanillaPG/loss: 0.00601


Epoch reward 155.0


[a2c|TrainMonitor|INFO] ep: 422,	T: 15,603,	G: 153,	avg_r: 1,	avg_G: 83.7,	t: 153,	dt: 14.403ms,	SimpleTD/loss: 0.0731,	VanillaPG/loss: 0.00458


Epoch reward 153.0


[a2c|TrainMonitor|INFO] ep: 423,	T: 15,785,	G: 181,	avg_r: 1,	avg_G: 93.4,	t: 181,	dt: 14.235ms,	SimpleTD/loss: 0.147,	VanillaPG/loss: 0.00666


Epoch reward 181.0


[a2c|TrainMonitor|INFO] ep: 424,	T: 15,940,	G: 154,	avg_r: 1,	avg_G: 99.5,	t: 154,	dt: 14.442ms,	SimpleTD/loss: 0.202,	VanillaPG/loss: -0.00467


Epoch reward 154.0


[a2c|TrainMonitor|INFO] ep: 425,	T: 16,141,	G: 200,	avg_r: 1,	avg_G: 110,	t: 200,	dt: 14.548ms,	SimpleTD/loss: 0.0356,	VanillaPG/loss: 0.00253


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 426,	T: 16,271,	G: 129,	avg_r: 1,	avg_G: 111,	t: 129,	dt: 14.680ms,	SimpleTD/loss: 0.277,	VanillaPG/loss: 6.52e-05


Epoch reward 129.0


[a2c|TrainMonitor|INFO] ep: 427,	T: 16,389,	G: 117,	avg_r: 1,	avg_G: 112,	t: 117,	dt: 14.631ms,	SimpleTD/loss: 0.204,	VanillaPG/loss: 0.00415


Epoch reward 117.0


[a2c|TrainMonitor|INFO] ep: 428,	T: 16,552,	G: 162,	avg_r: 1,	avg_G: 117,	t: 162,	dt: 15.057ms,	SimpleTD/loss: 0.0525,	VanillaPG/loss: -0.0431


Epoch reward 162.0


[a2c|TrainMonitor|INFO] ep: 429,	T: 16,611,	G: 58,	avg_r: 1,	avg_G: 111,	t: 58,	dt: 14.304ms,	SimpleTD/loss: 0.608,	VanillaPG/loss: 0.00558


Epoch reward 58.0


[a2c|TrainMonitor|INFO] ep: 430,	T: 16,688,	G: 76,	avg_r: 1,	avg_G: 108,	t: 76,	dt: 14.537ms,	SimpleTD/loss: 0.266,	VanillaPG/loss: 0.0146


Epoch reward 76.0


[a2c|TrainMonitor|INFO] ep: 431,	T: 16,774,	G: 85,	avg_r: 1,	avg_G: 105,	t: 85,	dt: 14.208ms,	SimpleTD/loss: 0.114,	VanillaPG/loss: 0.0105


Epoch reward 85.0


[a2c|TrainMonitor|INFO] ep: 432,	T: 16,861,	G: 86,	avg_r: 1,	avg_G: 103,	t: 86,	dt: 14.327ms,	SimpleTD/loss: 0.0969,	VanillaPG/loss: 0.00666


Epoch reward 86.0


[a2c|TrainMonitor|INFO] ep: 433,	T: 16,943,	G: 81,	avg_r: 1,	avg_G: 101,	t: 81,	dt: 14.338ms,	SimpleTD/loss: 0.0417,	VanillaPG/loss: 0.00752


Epoch reward 81.0


[a2c|TrainMonitor|INFO] ep: 434,	T: 17,012,	G: 68,	avg_r: 1,	avg_G: 97.9,	t: 68,	dt: 14.385ms,	SimpleTD/loss: 0.331,	VanillaPG/loss: -0.00705


Epoch reward 68.0


[a2c|TrainMonitor|INFO] ep: 435,	T: 17,155,	G: 142,	avg_r: 1,	avg_G: 102,	t: 142,	dt: 14.730ms,	SimpleTD/loss: 0.144,	VanillaPG/loss: 0.00915


Epoch reward 142.0


[a2c|TrainMonitor|INFO] ep: 436,	T: 17,312,	G: 156,	avg_r: 1,	avg_G: 108,	t: 156,	dt: 14.413ms,	SimpleTD/loss: 0.142,	VanillaPG/loss: 0.0031


Epoch reward 156.0


[a2c|TrainMonitor|INFO] ep: 437,	T: 17,513,	G: 200,	avg_r: 1,	avg_G: 117,	t: 200,	dt: 14.640ms,	SimpleTD/loss: 0.00962,	VanillaPG/loss: 0.00589


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 438,	T: 17,714,	G: 200,	avg_r: 1,	avg_G: 125,	t: 200,	dt: 14.275ms,	SimpleTD/loss: 0.00145,	VanillaPG/loss: -0.0064


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 439,	T: 17,855,	G: 140,	avg_r: 1,	avg_G: 127,	t: 140,	dt: 14.430ms,	SimpleTD/loss: 0.255,	VanillaPG/loss: 0.00153


Epoch reward 140.0


[a2c|TrainMonitor|INFO] ep: 440,	T: 17,987,	G: 131,	avg_r: 1,	avg_G: 127,	t: 131,	dt: 14.323ms,	SimpleTD/loss: 0.269,	VanillaPG/loss: 0.00597


Epoch reward 131.0


[a2c|TrainMonitor|INFO] ep: 441,	T: 18,188,	G: 200,	avg_r: 1,	avg_G: 134,	t: 200,	dt: 14.303ms,	SimpleTD/loss: 0.00246,	VanillaPG/loss: 0.0039


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 442,	T: 18,389,	G: 200,	avg_r: 1,	avg_G: 141,	t: 200,	dt: 14.386ms,	SimpleTD/loss: 0.00176,	VanillaPG/loss: -0.00561


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 443,	T: 18,590,	G: 200,	avg_r: 1,	avg_G: 147,	t: 200,	dt: 14.869ms,	SimpleTD/loss: 0.00146,	VanillaPG/loss: 0.000901


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 444,	T: 18,791,	G: 200,	avg_r: 1,	avg_G: 152,	t: 200,	dt: 14.456ms,	SimpleTD/loss: 0.00179,	VanillaPG/loss: 4.22e-05


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 445,	T: 18,992,	G: 200,	avg_r: 1,	avg_G: 157,	t: 200,	dt: 14.617ms,	SimpleTD/loss: 0.0022,	VanillaPG/loss: -0.00154


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 446,	T: 19,169,	G: 176,	avg_r: 1,	avg_G: 159,	t: 176,	dt: 14.353ms,	SimpleTD/loss: 0.226,	VanillaPG/loss: -0.00522


Epoch reward 176.0


[a2c|TrainMonitor|INFO] ep: 447,	T: 19,370,	G: 200,	avg_r: 1,	avg_G: 163,	t: 200,	dt: 15.243ms,	SimpleTD/loss: 0.0141,	VanillaPG/loss: 0.0031


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 448,	T: 19,489,	G: 118,	avg_r: 1,	avg_G: 158,	t: 118,	dt: 16.224ms,	SimpleTD/loss: 0.416,	VanillaPG/loss: -0.0108


Epoch reward 118.0


[a2c|TrainMonitor|INFO] ep: 449,	T: 19,594,	G: 104,	avg_r: 1,	avg_G: 153,	t: 104,	dt: 14.879ms,	SimpleTD/loss: 0.535,	VanillaPG/loss: 0.0171


Epoch reward 104.0


[a2c|TrainMonitor|INFO] ep: 450,	T: 19,703,	G: 108,	avg_r: 1,	avg_G: 149,	t: 108,	dt: 14.997ms,	SimpleTD/loss: 0.309,	VanillaPG/loss: 0.0175


Epoch reward 108.0


[a2c|TrainMonitor|INFO] ep: 451,	T: 19,743,	G: 39,	avg_r: 1,	avg_G: 138,	t: 39,	dt: 14.558ms,	SimpleTD/loss: 0.803,	VanillaPG/loss: -0.0117


Epoch reward 39.0


[a2c|TrainMonitor|INFO] ep: 452,	T: 19,767,	G: 23,	avg_r: 1,	avg_G: 126,	t: 23,	dt: 14.080ms,	SimpleTD/loss: 1.08,	VanillaPG/loss: 0.00743


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 453,	T: 19,791,	G: 23,	avg_r: 1,	avg_G: 116,	t: 23,	dt: 14.536ms,	SimpleTD/loss: 1.03,	VanillaPG/loss: 0.0184


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 454,	T: 19,818,	G: 26,	avg_r: 1,	avg_G: 107,	t: 26,	dt: 14.274ms,	SimpleTD/loss: 0.806,	VanillaPG/loss: 0.0474


Epoch reward 26.0


[a2c|TrainMonitor|INFO] ep: 455,	T: 19,853,	G: 34,	avg_r: 1,	avg_G: 99.5,	t: 34,	dt: 14.262ms,	SimpleTD/loss: 0.589,	VanillaPG/loss: 0.00997


Epoch reward 34.0


[a2c|TrainMonitor|INFO] ep: 456,	T: 19,890,	G: 36,	avg_r: 1,	avg_G: 93.2,	t: 36,	dt: 14.583ms,	SimpleTD/loss: 0.475,	VanillaPG/loss: -0.00788


Epoch reward 36.0


[a2c|TrainMonitor|INFO] ep: 457,	T: 19,942,	G: 51,	avg_r: 1,	avg_G: 89,	t: 51,	dt: 14.848ms,	SimpleTD/loss: 0.364,	VanillaPG/loss: -0.0197


Epoch reward 51.0


[a2c|TrainMonitor|INFO] ep: 458,	T: 19,986,	G: 43,	avg_r: 1,	avg_G: 84.4,	t: 43,	dt: 14.478ms,	SimpleTD/loss: 0.539,	VanillaPG/loss: 0.00777


Epoch reward 43.0


[a2c|TrainMonitor|INFO] ep: 459,	T: 20,045,	G: 58,	avg_r: 1,	avg_G: 81.7,	t: 58,	dt: 14.589ms,	SimpleTD/loss: 0.31,	VanillaPG/loss: 0.0174


Epoch reward 58.0


[a2c|TrainMonitor|INFO] ep: 460,	T: 20,165,	G: 119,	avg_r: 1,	avg_G: 85.5,	t: 119,	dt: 14.423ms,	SimpleTD/loss: 0.145,	VanillaPG/loss: 0.0166


Epoch reward 119.0


[a2c|TrainMonitor|INFO] ep: 461,	T: 20,224,	G: 58,	avg_r: 1,	avg_G: 82.7,	t: 58,	dt: 14.609ms,	SimpleTD/loss: 0.645,	VanillaPG/loss: -0.00385


Epoch reward 58.0


[a2c|TrainMonitor|INFO] ep: 462,	T: 20,274,	G: 49,	avg_r: 1,	avg_G: 79.3,	t: 49,	dt: 14.745ms,	SimpleTD/loss: 0.54,	VanillaPG/loss: -0.0614


Epoch reward 49.0


[a2c|TrainMonitor|INFO] ep: 463,	T: 20,382,	G: 107,	avg_r: 1,	avg_G: 82.1,	t: 107,	dt: 14.610ms,	SimpleTD/loss: 0.166,	VanillaPG/loss: -0.024


Epoch reward 107.0


[a2c|TrainMonitor|INFO] ep: 464,	T: 20,526,	G: 143,	avg_r: 1,	avg_G: 88.2,	t: 143,	dt: 14.661ms,	SimpleTD/loss: 0.348,	VanillaPG/loss: 0.00359


Epoch reward 143.0


[a2c|TrainMonitor|INFO] ep: 465,	T: 20,727,	G: 200,	avg_r: 1,	avg_G: 99.4,	t: 200,	dt: 14.774ms,	SimpleTD/loss: 0.0366,	VanillaPG/loss: -0.00511


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 466,	T: 20,885,	G: 157,	avg_r: 1,	avg_G: 105,	t: 157,	dt: 14.528ms,	SimpleTD/loss: 0.258,	VanillaPG/loss: -0.00812


Epoch reward 157.0


[a2c|TrainMonitor|INFO] ep: 467,	T: 21,086,	G: 200,	avg_r: 1,	avg_G: 115,	t: 200,	dt: 14.976ms,	SimpleTD/loss: 0.0331,	VanillaPG/loss: 0.00348


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 468,	T: 21,276,	G: 189,	avg_r: 1,	avg_G: 122,	t: 189,	dt: 14.688ms,	SimpleTD/loss: 0.16,	VanillaPG/loss: -0.0199


Epoch reward 189.0


[a2c|TrainMonitor|INFO] ep: 469,	T: 21,477,	G: 200,	avg_r: 1,	avg_G: 130,	t: 200,	dt: 14.361ms,	SimpleTD/loss: 0.025,	VanillaPG/loss: -0.00195


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 470,	T: 21,678,	G: 200,	avg_r: 1,	avg_G: 137,	t: 200,	dt: 14.351ms,	SimpleTD/loss: 0.0141,	VanillaPG/loss: 0.000402


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 471,	T: 21,879,	G: 200,	avg_r: 1,	avg_G: 143,	t: 200,	dt: 15.707ms,	SimpleTD/loss: 0.00465,	VanillaPG/loss: -0.00428


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 472,	T: 22,077,	G: 197,	avg_r: 1,	avg_G: 149,	t: 197,	dt: 14.702ms,	SimpleTD/loss: 0.14,	VanillaPG/loss: -0.00174


Epoch reward 197.0
