In [1]:
import coax
import gym
import haiku as hk
import jax
import jax.numpy as jnp
import optax
from coax.value_losses import mse
from incline_cartpole import InclinedCartpole


# the name of this script
name = 'a2c'

# the cart-pole MDP
env = gym.make('CartPole-v0')
# env = gym.make("rpp_gym:InclinedCartpole-v0")

In [2]:
env.alpha = 0.2

In [3]:
env = coax.wrappers.TrainMonitor(env, name=name, tensorboard_dir=f"./data/tensorboard/{name}")

In [4]:
# env.spec.max_episode_steps = 200
# env.spec.reward_threshold = 195.0

In [5]:
from emlp import T, Scalar
from emlp.groups import SO, S, O, Trivial,Z
from emlp_haiku import EMLPBlock, Sequential, Linear,EMLP, MLP
from emlp.reps import Rep
from emlp.nn import gated,gate_indices,uniform_rep
from math import prod
from representations import PseudoScalar
from mixed_emlp_haiku import MixedEMLP

## Trivial
# group=Trivial(2)
# rep_in = T(0)*prod(env.observation_space.shape)
# rep_out = T(0)*env.action_space.n#prod(env.action_space.shape)

## Reflection
group=Z(2)
rep_in = PseudoScalar()*prod(env.observation_space.shape)
rep_out = T(1)#*env.action_space.n#prod(env.action_space.shape)

# nn_pi = EMLP(rep_in,rep_out,group,ch=100,num_layers=2)
# nn_v = EMLP(rep_in,T(0),group,ch=100,num_layers=2)

# nn_pi = MixedEMLP(rep_in,rep_out(group),group,ch=100,num_layers=2)
# nn_v = MixedEMLP(rep_in,T(0),group,ch=100,num_layers=2)


# def func_pi(S, is_training):
#     return {'logits': nn_pi(S)}


# def func_v(S, is_training):
#     return nn_v(S).reshape(-1)



def func_pi(S, is_training):
    logits = hk.Sequential((
        hk.Linear(16), jax.nn.relu,
        hk.Linear(16), jax.nn.relu,
        hk.Linear(16), jax.nn.relu,
        hk.Linear(env.action_space.n, w_init=jnp.zeros)
    ))
    return {'logits': logits(S)}


def func_v(S, is_training):
    value = hk.Sequential((
        hk.Linear(16), jax.nn.relu,
        hk.Linear(16), jax.nn.relu,
        hk.Linear(16), jax.nn.relu,
        hk.Linear(1, w_init=jnp.zeros), jnp.ravel
    ))
    return value(S)


In [6]:
# these optimizers collect batches of grads before applying updates
optimizer_v = optax.chain(optax.apply_every(k=32), optax.adam(0.002))
optimizer_pi = optax.chain(optax.apply_every(k=32), optax.adam(0.001))


# value function and its derived policy
v = coax.V(func_v, env)
pi = coax.Policy(func_pi, env)

In [7]:
store = v.params

In [8]:
# experience tracer
tracer = coax.reward_tracing.NStep(n=1, gamma=0.9)

# updaters
vanilla_pg = coax.policy_objectives.VanillaPG(pi, optimizer=optimizer_pi)
simple_td = coax.td_learning.SimpleTD(v, loss_function=mse, optimizer=optimizer_v)


# train
for ep in range(1000):
    s = env.reset()
    er = 0
    for t in range(env.spec.max_episode_steps):
        a = pi(s)
        s_next, r, done, info = env.step(a)
        
        if done and (t == env.spec.max_episode_steps - 1):
            r = 1 / (1 - tracer.gamma)
        er+=r
        tracer.add(s, a, r, done)
        while tracer:
            transition_batch = tracer.pop()
            metrics_v, td_error = simple_td.update(transition_batch, return_td_error=True)
            metrics_pi = vanilla_pg.update(transition_batch, td_error)
            env.record_metrics(metrics_v)
            env.record_metrics(metrics_pi)

        if done:
            break

        s = s_next
    
    print("Epoch reward",er)
    # early stopping
    if env.avg_G > env.spec.reward_threshold:
        break


# run env one more time to render
#coax.utils.generate_gif(env, policy=pi, filepath=f"./data/{name}.gif", duration=25)

[a2c|TrainMonitor|INFO] ep: 1,	T: 15,	G: 14,	avg_r: 1,	avg_G: 14,	t: 14,	dt: 370.929ms,	SimpleTD/loss: 0.5,	VanillaPG/loss: 0.693


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 2,	T: 27,	G: 11,	avg_r: 1,	avg_G: 12.5,	t: 11,	dt: 23.628ms,	SimpleTD/loss: 0.5,	VanillaPG/loss: 0.693


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 3,	T: 45,	G: 17,	avg_r: 1,	avg_G: 14,	t: 17,	dt: 24.417ms,	SimpleTD/loss: 0.499,	VanillaPG/loss: 0.692


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 4,	T: 57,	G: 11,	avg_r: 1,	avg_G: 13.2,	t: 11,	dt: 24.562ms,	SimpleTD/loss: 0.497,	VanillaPG/loss: 0.691


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 5,	T: 79,	G: 21,	avg_r: 1,	avg_G: 14.8,	t: 21,	dt: 25.227ms,	SimpleTD/loss: 0.496,	VanillaPG/loss: 0.69


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 6,	T: 115,	G: 35,	avg_r: 1,	avg_G: 18.2,	t: 35,	dt: 26.697ms,	SimpleTD/loss: 0.49,	VanillaPG/loss: 0.683


Epoch reward 35.0


[a2c|TrainMonitor|INFO] ep: 7,	T: 132,	G: 16,	avg_r: 1,	avg_G: 17.9,	t: 16,	dt: 24.710ms,	SimpleTD/loss: 0.484,	VanillaPG/loss: 0.677


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 8,	T: 152,	G: 19,	avg_r: 1,	avg_G: 18,	t: 19,	dt: 24.366ms,	SimpleTD/loss: 0.477,	VanillaPG/loss: 0.661


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 9,	T: 164,	G: 11,	avg_r: 1,	avg_G: 17.2,	t: 11,	dt: 25.966ms,	SimpleTD/loss: 0.467,	VanillaPG/loss: 0.628


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 10,	T: 179,	G: 14,	avg_r: 1,	avg_G: 16.9,	t: 14,	dt: 25.138ms,	SimpleTD/loss: 0.476,	VanillaPG/loss: 0.623


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 11,	T: 194,	G: 14,	avg_r: 1,	avg_G: 16.6,	t: 14,	dt: 24.793ms,	SimpleTD/loss: 0.463,	VanillaPG/loss: 0.618


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 12,	T: 214,	G: 19,	avg_r: 1,	avg_G: 16.8,	t: 19,	dt: 24.715ms,	SimpleTD/loss: 0.452,	VanillaPG/loss: 0.653


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 13,	T: 227,	G: 12,	avg_r: 1,	avg_G: 16.4,	t: 12,	dt: 25.603ms,	SimpleTD/loss: 0.475,	VanillaPG/loss: 0.627


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 14,	T: 286,	G: 58,	avg_r: 1,	avg_G: 20.5,	t: 58,	dt: 25.999ms,	SimpleTD/loss: 0.604,	VanillaPG/loss: 0.544


Epoch reward 58.0


[a2c|TrainMonitor|INFO] ep: 15,	T: 359,	G: 72,	avg_r: 1,	avg_G: 25.7,	t: 72,	dt: 26.127ms,	SimpleTD/loss: 2.01,	VanillaPG/loss: 0.357


Epoch reward 72.0


[a2c|TrainMonitor|INFO] ep: 16,	T: 396,	G: 36,	avg_r: 1,	avg_G: 26.7,	t: 36,	dt: 25.144ms,	SimpleTD/loss: 0.72,	VanillaPG/loss: 0.294


Epoch reward 36.0


[a2c|TrainMonitor|INFO] ep: 17,	T: 411,	G: 14,	avg_r: 1,	avg_G: 25.4,	t: 14,	dt: 24.399ms,	SimpleTD/loss: 1.65,	VanillaPG/loss: 0.121


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 18,	T: 446,	G: 34,	avg_r: 1,	avg_G: 26.3,	t: 34,	dt: 25.593ms,	SimpleTD/loss: 0.838,	VanillaPG/loss: 0.339


Epoch reward 34.0


[a2c|TrainMonitor|INFO] ep: 19,	T: 464,	G: 17,	avg_r: 1,	avg_G: 25.4,	t: 17,	dt: 23.330ms,	SimpleTD/loss: 2.45,	VanillaPG/loss: 0.117


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 20,	T: 493,	G: 28,	avg_r: 1,	avg_G: 25.6,	t: 28,	dt: 25.303ms,	SimpleTD/loss: 2.95,	VanillaPG/loss: 0.192


Epoch reward 28.0


[a2c|TrainMonitor|INFO] ep: 21,	T: 505,	G: 11,	avg_r: 1,	avg_G: 24.2,	t: 11,	dt: 24.078ms,	SimpleTD/loss: 5.02,	VanillaPG/loss: 0.0873


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 22,	T: 525,	G: 19,	avg_r: 1,	avg_G: 23.6,	t: 19,	dt: 24.746ms,	SimpleTD/loss: 1.06,	VanillaPG/loss: 0.14


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 23,	T: 538,	G: 12,	avg_r: 1,	avg_G: 22.5,	t: 12,	dt: 25.341ms,	SimpleTD/loss: 2.31,	VanillaPG/loss: 0.0731


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 24,	T: 556,	G: 17,	avg_r: 1,	avg_G: 21.9,	t: 17,	dt: 24.563ms,	SimpleTD/loss: 0.737,	VanillaPG/loss: 0.109


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 25,	T: 571,	G: 14,	avg_r: 1,	avg_G: 21.1,	t: 14,	dt: 25.114ms,	SimpleTD/loss: 2.02,	VanillaPG/loss: 0.165


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 26,	T: 585,	G: 13,	avg_r: 1,	avg_G: 20.3,	t: 13,	dt: 24.379ms,	SimpleTD/loss: 1.17,	VanillaPG/loss: 0.183


Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 27,	T: 596,	G: 10,	avg_r: 1,	avg_G: 19.3,	t: 10,	dt: 24.880ms,	SimpleTD/loss: 2.32,	VanillaPG/loss: -0.551


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 28,	T: 607,	G: 10,	avg_r: 1,	avg_G: 18.4,	t: 10,	dt: 24.794ms,	SimpleTD/loss: 3.14,	VanillaPG/loss: -0.931


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 29,	T: 627,	G: 19,	avg_r: 1,	avg_G: 18.4,	t: 19,	dt: 24.180ms,	SimpleTD/loss: 1.6,	VanillaPG/loss: 0.274


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 30,	T: 639,	G: 11,	avg_r: 1,	avg_G: 17.7,	t: 11,	dt: 24.508ms,	SimpleTD/loss: 2.38,	VanillaPG/loss: 0.211


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 31,	T: 651,	G: 11,	avg_r: 1,	avg_G: 17,	t: 11,	dt: 25.351ms,	SimpleTD/loss: 1.74,	VanillaPG/loss: 0.164


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 32,	T: 664,	G: 12,	avg_r: 1,	avg_G: 16.5,	t: 12,	dt: 25.130ms,	SimpleTD/loss: 1.28,	VanillaPG/loss: 0.232


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 33,	T: 680,	G: 15,	avg_r: 1,	avg_G: 16.4,	t: 15,	dt: 26.451ms,	SimpleTD/loss: 0.572,	VanillaPG/loss: -0.197


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 34,	T: 700,	G: 19,	avg_r: 1,	avg_G: 16.6,	t: 19,	dt: 25.880ms,	SimpleTD/loss: 1.28,	VanillaPG/loss: 0.244


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 35,	T: 717,	G: 16,	avg_r: 1,	avg_G: 16.6,	t: 16,	dt: 24.740ms,	SimpleTD/loss: 0.515,	VanillaPG/loss: 0.337


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 36,	T: 729,	G: 11,	avg_r: 1,	avg_G: 16,	t: 11,	dt: 24.297ms,	SimpleTD/loss: 0.749,	VanillaPG/loss: 0.18


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 37,	T: 742,	G: 12,	avg_r: 1,	avg_G: 15.6,	t: 12,	dt: 25.069ms,	SimpleTD/loss: 2.82,	VanillaPG/loss: 0.146


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 38,	T: 760,	G: 17,	avg_r: 1,	avg_G: 15.7,	t: 17,	dt: 24.157ms,	SimpleTD/loss: 0.769,	VanillaPG/loss: 0.262


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 39,	T: 783,	G: 22,	avg_r: 1,	avg_G: 16.4,	t: 22,	dt: 26.370ms,	SimpleTD/loss: 0.93,	VanillaPG/loss: 0.157


Epoch reward 22.0


[a2c|TrainMonitor|INFO] ep: 40,	T: 805,	G: 21,	avg_r: 1,	avg_G: 16.8,	t: 21,	dt: 26.061ms,	SimpleTD/loss: 0.825,	VanillaPG/loss: 0.247


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 41,	T: 819,	G: 13,	avg_r: 1,	avg_G: 16.5,	t: 13,	dt: 26.355ms,	SimpleTD/loss: 0.671,	VanillaPG/loss: 0.138


Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 42,	T: 829,	G: 9,	avg_r: 1,	avg_G: 15.7,	t: 9,	dt: 24.776ms,	SimpleTD/loss: 0.654,	VanillaPG/loss: 0.058


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 43,	T: 841,	G: 11,	avg_r: 1,	avg_G: 15.2,	t: 11,	dt: 26.730ms,	SimpleTD/loss: 0.489,	VanillaPG/loss: 0.128


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 44,	T: 853,	G: 11,	avg_r: 1,	avg_G: 14.8,	t: 11,	dt: 26.285ms,	SimpleTD/loss: 0.474,	VanillaPG/loss: 0.138


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 45,	T: 869,	G: 15,	avg_r: 1,	avg_G: 14.8,	t: 15,	dt: 25.434ms,	SimpleTD/loss: 0.39,	VanillaPG/loss: 0.246


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 46,	T: 882,	G: 12,	avg_r: 1,	avg_G: 14.5,	t: 12,	dt: 25.167ms,	SimpleTD/loss: 0.449,	VanillaPG/loss: 0.267


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 47,	T: 893,	G: 10,	avg_r: 1,	avg_G: 14.1,	t: 10,	dt: 24.831ms,	SimpleTD/loss: 0.754,	VanillaPG/loss: -0.708


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 48,	T: 908,	G: 14,	avg_r: 1,	avg_G: 14.1,	t: 14,	dt: 25.308ms,	SimpleTD/loss: 0.46,	VanillaPG/loss: 0.23


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 49,	T: 927,	G: 18,	avg_r: 1,	avg_G: 14.5,	t: 18,	dt: 25.989ms,	SimpleTD/loss: 0.383,	VanillaPG/loss: 0.145


Epoch reward 18.0


[a2c|TrainMonitor|INFO] ep: 50,	T: 949,	G: 21,	avg_r: 1,	avg_G: 15.1,	t: 21,	dt: 25.934ms,	SimpleTD/loss: 3.12,	VanillaPG/loss: -0.286


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 51,	T: 959,	G: 9,	avg_r: 1,	avg_G: 14.5,	t: 9,	dt: 25.626ms,	SimpleTD/loss: 0.706,	VanillaPG/loss: -0.0435


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 52,	T: 970,	G: 10,	avg_r: 1,	avg_G: 14.1,	t: 10,	dt: 25.502ms,	SimpleTD/loss: 0.654,	VanillaPG/loss: 0.00166


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 53,	T: 980,	G: 9,	avg_r: 1,	avg_G: 13.6,	t: 9,	dt: 25.730ms,	SimpleTD/loss: 17.3,	VanillaPG/loss: -0.066


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 54,	T: 992,	G: 11,	avg_r: 1,	avg_G: 13.3,	t: 11,	dt: 25.292ms,	SimpleTD/loss: 2.55,	VanillaPG/loss: -0.0403


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 55,	T: 1,004,	G: 11,	avg_r: 1,	avg_G: 13.1,	t: 11,	dt: 24.721ms,	SimpleTD/loss: 0.195,	VanillaPG/loss: 0.165


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 56,	T: 1,014,	G: 9,	avg_r: 1,	avg_G: 12.7,	t: 9,	dt: 25.496ms,	SimpleTD/loss: 0.198,	VanillaPG/loss: 0.0897


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 57,	T: 1,030,	G: 15,	avg_r: 1,	avg_G: 12.9,	t: 15,	dt: 25.574ms,	SimpleTD/loss: 0.207,	VanillaPG/loss: 0.228


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 58,	T: 1,040,	G: 9,	avg_r: 1,	avg_G: 12.5,	t: 9,	dt: 26.838ms,	SimpleTD/loss: 2.66,	VanillaPG/loss: 0.159


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 59,	T: 1,070,	G: 29,	avg_r: 1,	avg_G: 14.2,	t: 29,	dt: 25.570ms,	SimpleTD/loss: 0.639,	VanillaPG/loss: 0.32


Epoch reward 29.0


[a2c|TrainMonitor|INFO] ep: 60,	T: 1,086,	G: 15,	avg_r: 1,	avg_G: 14.2,	t: 15,	dt: 24.201ms,	SimpleTD/loss: 1.08,	VanillaPG/loss: 0.199


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 61,	T: 1,101,	G: 14,	avg_r: 1,	avg_G: 14.2,	t: 14,	dt: 24.504ms,	SimpleTD/loss: 2.08,	VanillaPG/loss: 0.177


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 62,	T: 1,121,	G: 19,	avg_r: 1,	avg_G: 14.7,	t: 19,	dt: 24.846ms,	SimpleTD/loss: 1.34,	VanillaPG/loss: 0.161


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 63,	T: 1,136,	G: 14,	avg_r: 1,	avg_G: 14.6,	t: 14,	dt: 24.309ms,	SimpleTD/loss: 0.424,	VanillaPG/loss: 0.217


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 64,	T: 1,146,	G: 9,	avg_r: 1,	avg_G: 14.1,	t: 9,	dt: 25.239ms,	SimpleTD/loss: 3.1,	VanillaPG/loss: 0.0247


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 65,	T: 1,157,	G: 10,	avg_r: 1,	avg_G: 13.7,	t: 10,	dt: 25.061ms,	SimpleTD/loss: 1.71,	VanillaPG/loss: 0.0131


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 66,	T: 1,169,	G: 11,	avg_r: 1,	avg_G: 13.4,	t: 11,	dt: 24.799ms,	SimpleTD/loss: 0.543,	VanillaPG/loss: 0.157


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 67,	T: 1,185,	G: 15,	avg_r: 1,	avg_G: 13.6,	t: 15,	dt: 24.784ms,	SimpleTD/loss: 0.439,	VanillaPG/loss: 0.223


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 68,	T: 1,207,	G: 21,	avg_r: 1,	avg_G: 14.3,	t: 21,	dt: 26.053ms,	SimpleTD/loss: 0.692,	VanillaPG/loss: 0.148


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 69,	T: 1,225,	G: 17,	avg_r: 1,	avg_G: 14.6,	t: 17,	dt: 26.020ms,	SimpleTD/loss: 0.83,	VanillaPG/loss: 0.18


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 70,	T: 1,242,	G: 16,	avg_r: 1,	avg_G: 14.7,	t: 16,	dt: 24.328ms,	SimpleTD/loss: 1.13,	VanillaPG/loss: 0.111


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 71,	T: 1,263,	G: 20,	avg_r: 1,	avg_G: 15.2,	t: 20,	dt: 25.097ms,	SimpleTD/loss: 0.461,	VanillaPG/loss: 0.171


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 72,	T: 1,281,	G: 17,	avg_r: 1,	avg_G: 15.4,	t: 17,	dt: 24.733ms,	SimpleTD/loss: 0.622,	VanillaPG/loss: 0.128


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 73,	T: 1,304,	G: 22,	avg_r: 1,	avg_G: 16.1,	t: 22,	dt: 25.309ms,	SimpleTD/loss: 0.425,	VanillaPG/loss: 0.142


Epoch reward 22.0


[a2c|TrainMonitor|INFO] ep: 74,	T: 1,326,	G: 21,	avg_r: 1,	avg_G: 16.6,	t: 21,	dt: 24.906ms,	SimpleTD/loss: 0.436,	VanillaPG/loss: 0.00242


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 75,	T: 1,350,	G: 23,	avg_r: 1,	avg_G: 17.2,	t: 23,	dt: 25.574ms,	SimpleTD/loss: 0.408,	VanillaPG/loss: 0.0772


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 76,	T: 1,361,	G: 10,	avg_r: 1,	avg_G: 16.5,	t: 10,	dt: 25.246ms,	SimpleTD/loss: 2.31,	VanillaPG/loss: -0.173


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 77,	T: 1,372,	G: 10,	avg_r: 1,	avg_G: 15.8,	t: 10,	dt: 24.225ms,	SimpleTD/loss: 0.736,	VanillaPG/loss: -0.0277


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 78,	T: 1,389,	G: 16,	avg_r: 1,	avg_G: 15.9,	t: 16,	dt: 23.905ms,	SimpleTD/loss: 1.96,	VanillaPG/loss: -0.203


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 79,	T: 1,406,	G: 16,	avg_r: 1,	avg_G: 15.9,	t: 16,	dt: 26.268ms,	SimpleTD/loss: 0.988,	VanillaPG/loss: 0.0575


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 80,	T: 1,417,	G: 10,	avg_r: 1,	avg_G: 15.3,	t: 10,	dt: 27.672ms,	SimpleTD/loss: 0.325,	VanillaPG/loss: 0.0381


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 81,	T: 1,432,	G: 14,	avg_r: 1,	avg_G: 15.2,	t: 14,	dt: 25.242ms,	SimpleTD/loss: 1.08,	VanillaPG/loss: -0.012


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 82,	T: 1,449,	G: 16,	avg_r: 1,	avg_G: 15.2,	t: 16,	dt: 24.228ms,	SimpleTD/loss: 0.922,	VanillaPG/loss: 0.0897


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 83,	T: 1,493,	G: 43,	avg_r: 1,	avg_G: 18,	t: 43,	dt: 24.815ms,	SimpleTD/loss: 0.176,	VanillaPG/loss: 0.23


Epoch reward 43.0


[a2c|TrainMonitor|INFO] ep: 84,	T: 1,517,	G: 23,	avg_r: 1,	avg_G: 18.5,	t: 23,	dt: 24.920ms,	SimpleTD/loss: 1.05,	VanillaPG/loss: -0.00545


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 85,	T: 1,535,	G: 17,	avg_r: 1,	avg_G: 18.4,	t: 17,	dt: 25.733ms,	SimpleTD/loss: 1.11,	VanillaPG/loss: -0.111


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 86,	T: 1,546,	G: 10,	avg_r: 1,	avg_G: 17.5,	t: 10,	dt: 24.724ms,	SimpleTD/loss: 2,	VanillaPG/loss: -0.17


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 87,	T: 1,556,	G: 9,	avg_r: 1,	avg_G: 16.7,	t: 9,	dt: 24.711ms,	SimpleTD/loss: 0.722,	VanillaPG/loss: -0.177


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 88,	T: 1,567,	G: 10,	avg_r: 1,	avg_G: 16,	t: 10,	dt: 25.501ms,	SimpleTD/loss: 1.41,	VanillaPG/loss: -0.329


Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 89,	T: 1,579,	G: 11,	avg_r: 1,	avg_G: 15.5,	t: 11,	dt: 24.850ms,	SimpleTD/loss: 1.03,	VanillaPG/loss: -0.147


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 90,	T: 1,600,	G: 20,	avg_r: 1,	avg_G: 16,	t: 20,	dt: 23.569ms,	SimpleTD/loss: 0.605,	VanillaPG/loss: 0.0867


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 91,	T: 1,626,	G: 25,	avg_r: 1,	avg_G: 16.9,	t: 25,	dt: 25.090ms,	SimpleTD/loss: 0.3,	VanillaPG/loss: 0.188


Epoch reward 25.0


[a2c|TrainMonitor|INFO] ep: 92,	T: 1,639,	G: 12,	avg_r: 1,	avg_G: 16.4,	t: 12,	dt: 24.836ms,	SimpleTD/loss: 1.12,	VanillaPG/loss: 0.175


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 93,	T: 1,651,	G: 11,	avg_r: 1,	avg_G: 15.8,	t: 11,	dt: 23.669ms,	SimpleTD/loss: 0.959,	VanillaPG/loss: 0.032


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 94,	T: 1,677,	G: 25,	avg_r: 1,	avg_G: 16.8,	t: 25,	dt: 24.666ms,	SimpleTD/loss: 0.366,	VanillaPG/loss: 0.121


Epoch reward 25.0


[a2c|TrainMonitor|INFO] ep: 95,	T: 1,693,	G: 15,	avg_r: 1,	avg_G: 16.6,	t: 15,	dt: 24.998ms,	SimpleTD/loss: 0.661,	VanillaPG/loss: -0.0475


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 96,	T: 1,732,	G: 38,	avg_r: 1,	avg_G: 18.7,	t: 38,	dt: 24.488ms,	SimpleTD/loss: 0.26,	VanillaPG/loss: 0.108


Epoch reward 38.0


[a2c|TrainMonitor|INFO] ep: 97,	T: 1,754,	G: 21,	avg_r: 1,	avg_G: 18.9,	t: 21,	dt: 24.333ms,	SimpleTD/loss: 0.87,	VanillaPG/loss: 0.00558


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 98,	T: 1,770,	G: 15,	avg_r: 1,	avg_G: 18.6,	t: 15,	dt: 24.598ms,	SimpleTD/loss: 1.1,	VanillaPG/loss: -0.0522


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 99,	T: 1,783,	G: 12,	avg_r: 1,	avg_G: 17.9,	t: 12,	dt: 25.210ms,	SimpleTD/loss: 1.85,	VanillaPG/loss: -0.0933


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 100,	T: 1,808,	G: 24,	avg_r: 1,	avg_G: 18.5,	t: 24,	dt: 24.549ms,	SimpleTD/loss: 0.691,	VanillaPG/loss: -0.00557


Epoch reward 24.0


[a2c|TrainMonitor|INFO] ep: 101,	T: 1,847,	G: 38,	avg_r: 1,	avg_G: 20.5,	t: 38,	dt: 24.921ms,	SimpleTD/loss: 0.753,	VanillaPG/loss: 0.108


Epoch reward 38.0


[a2c|TrainMonitor|INFO] ep: 102,	T: 1,863,	G: 15,	avg_r: 1,	avg_G: 19.9,	t: 15,	dt: 25.267ms,	SimpleTD/loss: 1.38,	VanillaPG/loss: -0.064


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 103,	T: 1,901,	G: 37,	avg_r: 1,	avg_G: 21.6,	t: 37,	dt: 24.101ms,	SimpleTD/loss: 0.421,	VanillaPG/loss: 0.0773


Epoch reward 37.0


[a2c|TrainMonitor|INFO] ep: 104,	T: 1,915,	G: 13,	avg_r: 1,	avg_G: 20.8,	t: 13,	dt: 24.624ms,	SimpleTD/loss: 0.97,	VanillaPG/loss: -0.117


Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 105,	T: 1,948,	G: 32,	avg_r: 1,	avg_G: 21.9,	t: 32,	dt: 25.327ms,	SimpleTD/loss: 0.401,	VanillaPG/loss: 0.0834


Epoch reward 32.0


[a2c|TrainMonitor|INFO] ep: 106,	T: 1,969,	G: 20,	avg_r: 1,	avg_G: 21.7,	t: 20,	dt: 25.025ms,	SimpleTD/loss: 0.771,	VanillaPG/loss: -0.0118


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 107,	T: 1,999,	G: 29,	avg_r: 1,	avg_G: 22.4,	t: 29,	dt: 24.178ms,	SimpleTD/loss: 0.501,	VanillaPG/loss: -0.000681


Epoch reward 29.0


[a2c|TrainMonitor|INFO] ep: 108,	T: 2,020,	G: 20,	avg_r: 1,	avg_G: 22.2,	t: 20,	dt: 24.029ms,	SimpleTD/loss: 0.731,	VanillaPG/loss: -0.0681


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 109,	T: 2,039,	G: 18,	avg_r: 1,	avg_G: 21.8,	t: 18,	dt: 24.746ms,	SimpleTD/loss: 0.679,	VanillaPG/loss: -0.0864


Epoch reward 18.0


[a2c|TrainMonitor|INFO] ep: 110,	T: 2,066,	G: 26,	avg_r: 1,	avg_G: 22.2,	t: 26,	dt: 24.467ms,	SimpleTD/loss: 0.581,	VanillaPG/loss: 0.0622


Epoch reward 26.0


[a2c|TrainMonitor|INFO] ep: 111,	T: 2,101,	G: 34,	avg_r: 1,	avg_G: 23.4,	t: 34,	dt: 24.938ms,	SimpleTD/loss: 0.468,	VanillaPG/loss: 0.0449


Epoch reward 34.0


[a2c|TrainMonitor|INFO] ep: 112,	T: 2,115,	G: 13,	avg_r: 1,	avg_G: 22.3,	t: 13,	dt: 25.612ms,	SimpleTD/loss: 1.77,	VanillaPG/loss: -0.224


Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 113,	T: 2,143,	G: 27,	avg_r: 1,	avg_G: 22.8,	t: 27,	dt: 25.081ms,	SimpleTD/loss: 0.407,	VanillaPG/loss: 0.0413


Epoch reward 27.0


[a2c|TrainMonitor|INFO] ep: 114,	T: 2,184,	G: 40,	avg_r: 1,	avg_G: 24.5,	t: 40,	dt: 23.305ms,	SimpleTD/loss: 0.465,	VanillaPG/loss: 0.0355


Epoch reward 40.0


[a2c|TrainMonitor|INFO] ep: 115,	T: 2,203,	G: 18,	avg_r: 1,	avg_G: 23.9,	t: 18,	dt: 25.470ms,	SimpleTD/loss: 0.476,	VanillaPG/loss: -0.0519


Epoch reward 18.0


[a2c|TrainMonitor|INFO] ep: 116,	T: 2,231,	G: 27,	avg_r: 1,	avg_G: 24.2,	t: 27,	dt: 24.553ms,	SimpleTD/loss: 1.04,	VanillaPG/loss: -0.0649


Epoch reward 27.0


[a2c|TrainMonitor|INFO] ep: 117,	T: 2,249,	G: 17,	avg_r: 1,	avg_G: 23.5,	t: 17,	dt: 25.816ms,	SimpleTD/loss: 0.8,	VanillaPG/loss: -0.199


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 118,	T: 2,268,	G: 18,	avg_r: 1,	avg_G: 22.9,	t: 18,	dt: 24.381ms,	SimpleTD/loss: 0.604,	VanillaPG/loss: -0.153


Epoch reward 18.0


[a2c|TrainMonitor|INFO] ep: 119,	T: 2,281,	G: 12,	avg_r: 1,	avg_G: 21.8,	t: 12,	dt: 24.467ms,	SimpleTD/loss: 0.502,	VanillaPG/loss: -0.0943


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 120,	T: 2,313,	G: 31,	avg_r: 1,	avg_G: 22.7,	t: 31,	dt: 24.649ms,	SimpleTD/loss: 0.249,	VanillaPG/loss: 0.103


Epoch reward 31.0


[a2c|TrainMonitor|INFO] ep: 121,	T: 2,332,	G: 18,	avg_r: 1,	avg_G: 22.3,	t: 18,	dt: 25.925ms,	SimpleTD/loss: 0.377,	VanillaPG/loss: -0.0166


Epoch reward 18.0


[a2c|TrainMonitor|INFO] ep: 122,	T: 2,353,	G: 20,	avg_r: 1,	avg_G: 22,	t: 20,	dt: 25.715ms,	SimpleTD/loss: 0.563,	VanillaPG/loss: 0.00675


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 123,	T: 2,380,	G: 26,	avg_r: 1,	avg_G: 22.4,	t: 26,	dt: 24.180ms,	SimpleTD/loss: 0.52,	VanillaPG/loss: -0.073


Epoch reward 26.0


[a2c|TrainMonitor|INFO] ep: 124,	T: 2,409,	G: 28,	avg_r: 1,	avg_G: 23,	t: 28,	dt: 24.613ms,	SimpleTD/loss: 0.631,	VanillaPG/loss: -0.0452


Epoch reward 28.0


[a2c|TrainMonitor|INFO] ep: 125,	T: 2,429,	G: 19,	avg_r: 1,	avg_G: 22.6,	t: 19,	dt: 23.460ms,	SimpleTD/loss: 0.809,	VanillaPG/loss: -0.129


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 126,	T: 2,444,	G: 14,	avg_r: 1,	avg_G: 21.7,	t: 14,	dt: 22.914ms,	SimpleTD/loss: 0.405,	VanillaPG/loss: -0.142


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 127,	T: 2,464,	G: 19,	avg_r: 1,	avg_G: 21.5,	t: 19,	dt: 25.289ms,	SimpleTD/loss: 0.33,	VanillaPG/loss: 0.00761


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 128,	T: 2,505,	G: 40,	avg_r: 1,	avg_G: 23.3,	t: 40,	dt: 23.914ms,	SimpleTD/loss: 0.694,	VanillaPG/loss: -0.0306


Epoch reward 40.0


[a2c|TrainMonitor|INFO] ep: 129,	T: 2,529,	G: 23,	avg_r: 1,	avg_G: 23.3,	t: 23,	dt: 24.799ms,	SimpleTD/loss: 0.406,	VanillaPG/loss: -0.117


Epoch reward 23.0


[a2c|TrainMonitor|INFO] ep: 130,	T: 2,546,	G: 16,	avg_r: 1,	avg_G: 22.6,	t: 16,	dt: 25.043ms,	SimpleTD/loss: 1.04,	VanillaPG/loss: -0.145


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 131,	T: 2,567,	G: 20,	avg_r: 1,	avg_G: 22.3,	t: 20,	dt: 26.275ms,	SimpleTD/loss: 1.59,	VanillaPG/loss: -0.0684


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 132,	T: 2,618,	G: 50,	avg_r: 1,	avg_G: 25.1,	t: 50,	dt: 24.759ms,	SimpleTD/loss: 0.539,	VanillaPG/loss: 0.00952


Epoch reward 50.0


[a2c|TrainMonitor|INFO] ep: 133,	T: 2,652,	G: 33,	avg_r: 1,	avg_G: 25.9,	t: 33,	dt: 24.687ms,	SimpleTD/loss: 0.669,	VanillaPG/loss: -0.0152


Epoch reward 33.0


[a2c|TrainMonitor|INFO] ep: 134,	T: 2,701,	G: 48,	avg_r: 1,	avg_G: 28.1,	t: 48,	dt: 24.900ms,	SimpleTD/loss: 0.594,	VanillaPG/loss: -0.0253


Epoch reward 48.0


[a2c|TrainMonitor|INFO] ep: 135,	T: 2,755,	G: 53,	avg_r: 1,	avg_G: 30.6,	t: 53,	dt: 24.855ms,	SimpleTD/loss: 0.37,	VanillaPG/loss: -0.0428


Epoch reward 53.0


[a2c|TrainMonitor|INFO] ep: 136,	T: 2,791,	G: 35,	avg_r: 1,	avg_G: 31,	t: 35,	dt: 25.420ms,	SimpleTD/loss: 0.476,	VanillaPG/loss: -0.0484


Epoch reward 35.0


[a2c|TrainMonitor|INFO] ep: 137,	T: 2,826,	G: 34,	avg_r: 1,	avg_G: 31.3,	t: 34,	dt: 24.795ms,	SimpleTD/loss: 0.972,	VanillaPG/loss: -0.225


Epoch reward 34.0


[a2c|TrainMonitor|INFO] ep: 138,	T: 2,842,	G: 15,	avg_r: 1,	avg_G: 29.7,	t: 15,	dt: 24.583ms,	SimpleTD/loss: 1.46,	VanillaPG/loss: -0.314


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 139,	T: 2,878,	G: 35,	avg_r: 1,	avg_G: 30.2,	t: 35,	dt: 25.061ms,	SimpleTD/loss: 0.307,	VanillaPG/loss: -0.0503


Epoch reward 35.0


[a2c|TrainMonitor|INFO] ep: 140,	T: 2,946,	G: 67,	avg_r: 1,	avg_G: 33.9,	t: 67,	dt: 24.655ms,	SimpleTD/loss: 0.702,	VanillaPG/loss: -0.00973


Epoch reward 67.0


[a2c|TrainMonitor|INFO] ep: 141,	T: 2,985,	G: 38,	avg_r: 1,	avg_G: 34.3,	t: 38,	dt: 24.068ms,	SimpleTD/loss: 0.384,	VanillaPG/loss: -0.0255


Epoch reward 38.0


[a2c|TrainMonitor|INFO] ep: 142,	T: 3,017,	G: 31,	avg_r: 1,	avg_G: 34,	t: 31,	dt: 25.479ms,	SimpleTD/loss: 0.298,	VanillaPG/loss: -0.178


Epoch reward 31.0


[a2c|TrainMonitor|INFO] ep: 143,	T: 3,071,	G: 53,	avg_r: 1,	avg_G: 35.9,	t: 53,	dt: 24.774ms,	SimpleTD/loss: 0.648,	VanillaPG/loss: -0.169


Epoch reward 53.0


[a2c|TrainMonitor|INFO] ep: 144,	T: 3,170,	G: 98,	avg_r: 1,	avg_G: 42.1,	t: 98,	dt: 24.618ms,	SimpleTD/loss: 0.567,	VanillaPG/loss: -0.268


Epoch reward 98.0


[a2c|TrainMonitor|INFO] ep: 145,	T: 3,303,	G: 132,	avg_r: 1,	avg_G: 51.1,	t: 132,	dt: 25.244ms,	SimpleTD/loss: 0.998,	VanillaPG/loss: -0.0589


Epoch reward 132.0


[a2c|TrainMonitor|INFO] ep: 146,	T: 3,462,	G: 158,	avg_r: 1,	avg_G: 61.8,	t: 158,	dt: 25.231ms,	SimpleTD/loss: 0.32,	VanillaPG/loss: -0.0575


Epoch reward 158.0


[a2c|TrainMonitor|INFO] ep: 147,	T: 3,569,	G: 106,	avg_r: 1,	avg_G: 66.2,	t: 106,	dt: 23.889ms,	SimpleTD/loss: 0.223,	VanillaPG/loss: -0.058


Epoch reward 106.0


[a2c|TrainMonitor|INFO] ep: 148,	T: 3,674,	G: 104,	avg_r: 1,	avg_G: 70,	t: 104,	dt: 22.983ms,	SimpleTD/loss: 0.224,	VanillaPG/loss: -0.134


Epoch reward 104.0


[a2c|TrainMonitor|INFO] ep: 149,	T: 3,722,	G: 47,	avg_r: 1,	avg_G: 67.7,	t: 47,	dt: 22.549ms,	SimpleTD/loss: 0.301,	VanillaPG/loss: -0.0933


Epoch reward 47.0


[a2c|TrainMonitor|INFO] ep: 150,	T: 3,793,	G: 70,	avg_r: 1,	avg_G: 67.9,	t: 70,	dt: 24.221ms,	SimpleTD/loss: 0.174,	VanillaPG/loss: -0.047


Epoch reward 70.0


[a2c|TrainMonitor|INFO] ep: 151,	T: 3,846,	G: 52,	avg_r: 1,	avg_G: 66.3,	t: 52,	dt: 25.055ms,	SimpleTD/loss: 0.283,	VanillaPG/loss: -0.139


Epoch reward 52.0


[a2c|TrainMonitor|INFO] ep: 152,	T: 3,937,	G: 90,	avg_r: 1,	avg_G: 68.7,	t: 90,	dt: 24.686ms,	SimpleTD/loss: 0.142,	VanillaPG/loss: -0.0549


Epoch reward 90.0


[a2c|TrainMonitor|INFO] ep: 153,	T: 3,991,	G: 53,	avg_r: 1,	avg_G: 67.1,	t: 53,	dt: 25.153ms,	SimpleTD/loss: 0.239,	VanillaPG/loss: -0.114


Epoch reward 53.0


[a2c|TrainMonitor|INFO] ep: 154,	T: 4,074,	G: 82,	avg_r: 1,	avg_G: 68.6,	t: 82,	dt: 24.977ms,	SimpleTD/loss: 0.178,	VanillaPG/loss: -0.137


Epoch reward 82.0


[a2c|TrainMonitor|INFO] ep: 155,	T: 4,265,	G: 190,	avg_r: 1,	avg_G: 80.7,	t: 190,	dt: 25.017ms,	SimpleTD/loss: 0.355,	VanillaPG/loss: -0.0578


Epoch reward 190.0


[a2c|TrainMonitor|INFO] ep: 156,	T: 4,379,	G: 113,	avg_r: 1,	avg_G: 84,	t: 113,	dt: 24.935ms,	SimpleTD/loss: 0.273,	VanillaPG/loss: 0.117


Epoch reward 113.0


[a2c|TrainMonitor|INFO] ep: 157,	T: 4,580,	G: 200,	avg_r: 1,	avg_G: 95.6,	t: 200,	dt: 24.657ms,	SimpleTD/loss: 0.0358,	VanillaPG/loss: -0.0218


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 158,	T: 4,705,	G: 124,	avg_r: 1,	avg_G: 98.4,	t: 124,	dt: 23.947ms,	SimpleTD/loss: 0.372,	VanillaPG/loss: 0.0287


Epoch reward 124.0


[a2c|TrainMonitor|INFO] ep: 159,	T: 4,784,	G: 78,	avg_r: 1,	avg_G: 96.4,	t: 78,	dt: 24.318ms,	SimpleTD/loss: 0.322,	VanillaPG/loss: 0.00258


Epoch reward 78.0


[a2c|TrainMonitor|INFO] ep: 160,	T: 4,871,	G: 86,	avg_r: 1,	avg_G: 95.3,	t: 86,	dt: 24.728ms,	SimpleTD/loss: 0.41,	VanillaPG/loss: -0.000757


Epoch reward 86.0


[a2c|TrainMonitor|INFO] ep: 161,	T: 4,924,	G: 52,	avg_r: 1,	avg_G: 91,	t: 52,	dt: 25.148ms,	SimpleTD/loss: 0.413,	VanillaPG/loss: -0.319


Epoch reward 52.0


[a2c|TrainMonitor|INFO] ep: 162,	T: 4,995,	G: 70,	avg_r: 1,	avg_G: 88.9,	t: 70,	dt: 25.327ms,	SimpleTD/loss: 0.336,	VanillaPG/loss: 0.0468


Epoch reward 70.0


[a2c|TrainMonitor|INFO] ep: 163,	T: 5,063,	G: 67,	avg_r: 1,	avg_G: 86.7,	t: 67,	dt: 24.842ms,	SimpleTD/loss: 0.322,	VanillaPG/loss: -0.0112


Epoch reward 67.0


[a2c|TrainMonitor|INFO] ep: 164,	T: 5,127,	G: 63,	avg_r: 1,	avg_G: 84.3,	t: 63,	dt: 25.093ms,	SimpleTD/loss: 0.335,	VanillaPG/loss: -0.00889


Epoch reward 63.0


[a2c|TrainMonitor|INFO] ep: 165,	T: 5,231,	G: 103,	avg_r: 1,	avg_G: 86.2,	t: 103,	dt: 24.510ms,	SimpleTD/loss: 0.229,	VanillaPG/loss: -0.129


Epoch reward 103.0


[a2c|TrainMonitor|INFO] ep: 166,	T: 5,287,	G: 55,	avg_r: 1,	avg_G: 83.1,	t: 55,	dt: 24.402ms,	SimpleTD/loss: 0.26,	VanillaPG/loss: 0.0173


Epoch reward 55.0


[a2c|TrainMonitor|INFO] ep: 167,	T: 5,372,	G: 84,	avg_r: 1,	avg_G: 83.2,	t: 84,	dt: 25.440ms,	SimpleTD/loss: 0.215,	VanillaPG/loss: 0.0213


Epoch reward 84.0


[a2c|TrainMonitor|INFO] ep: 168,	T: 5,507,	G: 134,	avg_r: 1,	avg_G: 88.3,	t: 134,	dt: 25.087ms,	SimpleTD/loss: 0.148,	VanillaPG/loss: 0.0102


Epoch reward 134.0


[a2c|TrainMonitor|INFO] ep: 169,	T: 5,626,	G: 118,	avg_r: 1,	avg_G: 91.2,	t: 118,	dt: 24.718ms,	SimpleTD/loss: 0.236,	VanillaPG/loss: 0.0179


Epoch reward 118.0


[a2c|TrainMonitor|INFO] ep: 170,	T: 5,797,	G: 170,	avg_r: 1,	avg_G: 99.1,	t: 170,	dt: 24.574ms,	SimpleTD/loss: 0.116,	VanillaPG/loss: -0.0229


Epoch reward 170.0


[a2c|TrainMonitor|INFO] ep: 171,	T: 5,952,	G: 154,	avg_r: 1,	avg_G: 105,	t: 154,	dt: 24.481ms,	SimpleTD/loss: 0.312,	VanillaPG/loss: -0.0648


Epoch reward 154.0


[a2c|TrainMonitor|INFO] ep: 172,	T: 6,111,	G: 158,	avg_r: 1,	avg_G: 110,	t: 158,	dt: 24.326ms,	SimpleTD/loss: 0.287,	VanillaPG/loss: -0.00181


Epoch reward 158.0


[a2c|TrainMonitor|INFO] ep: 173,	T: 6,246,	G: 134,	avg_r: 1,	avg_G: 112,	t: 134,	dt: 25.033ms,	SimpleTD/loss: 0.397,	VanillaPG/loss: -0.0243


Epoch reward 134.0


[a2c|TrainMonitor|INFO] ep: 174,	T: 6,379,	G: 132,	avg_r: 1,	avg_G: 114,	t: 132,	dt: 24.177ms,	SimpleTD/loss: 0.276,	VanillaPG/loss: 0.0451


Epoch reward 132.0


[a2c|TrainMonitor|INFO] ep: 175,	T: 6,426,	G: 46,	avg_r: 1,	avg_G: 107,	t: 46,	dt: 24.462ms,	SimpleTD/loss: 0.756,	VanillaPG/loss: -0.0767


Epoch reward 46.0


[a2c|TrainMonitor|INFO] ep: 176,	T: 6,478,	G: 51,	avg_r: 1,	avg_G: 102,	t: 51,	dt: 24.894ms,	SimpleTD/loss: 0.391,	VanillaPG/loss: -0.00758


Epoch reward 51.0


[a2c|TrainMonitor|INFO] ep: 177,	T: 6,547,	G: 68,	avg_r: 1,	avg_G: 98.4,	t: 68,	dt: 24.952ms,	SimpleTD/loss: 0.387,	VanillaPG/loss: 0.0195


Epoch reward 68.0


[a2c|TrainMonitor|INFO] ep: 178,	T: 6,607,	G: 59,	avg_r: 1,	avg_G: 94.5,	t: 59,	dt: 24.677ms,	SimpleTD/loss: 0.364,	VanillaPG/loss: -0.0215


Epoch reward 59.0


[a2c|TrainMonitor|INFO] ep: 179,	T: 6,671,	G: 63,	avg_r: 1,	avg_G: 91.4,	t: 63,	dt: 24.376ms,	SimpleTD/loss: 0.374,	VanillaPG/loss: -0.00874


Epoch reward 63.0


[a2c|TrainMonitor|INFO] ep: 180,	T: 6,709,	G: 37,	avg_r: 1,	avg_G: 85.9,	t: 37,	dt: 24.341ms,	SimpleTD/loss: 0.688,	VanillaPG/loss: 0.0254


Epoch reward 37.0


[a2c|TrainMonitor|INFO] ep: 181,	T: 6,752,	G: 42,	avg_r: 1,	avg_G: 81.5,	t: 42,	dt: 25.280ms,	SimpleTD/loss: 0.461,	VanillaPG/loss: -0.228


Epoch reward 42.0


[a2c|TrainMonitor|INFO] ep: 182,	T: 6,813,	G: 60,	avg_r: 1,	avg_G: 79.4,	t: 60,	dt: 25.134ms,	SimpleTD/loss: 0.235,	VanillaPG/loss: 0.0734


Epoch reward 60.0


[a2c|TrainMonitor|INFO] ep: 183,	T: 6,893,	G: 79,	avg_r: 1,	avg_G: 79.3,	t: 79,	dt: 24.458ms,	SimpleTD/loss: 0.23,	VanillaPG/loss: 0.0146


Epoch reward 79.0


[a2c|TrainMonitor|INFO] ep: 184,	T: 6,951,	G: 57,	avg_r: 1,	avg_G: 77.1,	t: 57,	dt: 24.279ms,	SimpleTD/loss: 0.341,	VanillaPG/loss: 0.0138


Epoch reward 57.0


[a2c|TrainMonitor|INFO] ep: 185,	T: 7,015,	G: 63,	avg_r: 1,	avg_G: 75.7,	t: 63,	dt: 25.457ms,	SimpleTD/loss: 0.231,	VanillaPG/loss: 0.00572


Epoch reward 63.0


[a2c|TrainMonitor|INFO] ep: 186,	T: 7,085,	G: 69,	avg_r: 1,	avg_G: 75,	t: 69,	dt: 24.532ms,	SimpleTD/loss: 0.208,	VanillaPG/loss: 0.0565


Epoch reward 69.0


[a2c|TrainMonitor|INFO] ep: 187,	T: 7,163,	G: 77,	avg_r: 1,	avg_G: 75.2,	t: 77,	dt: 25.279ms,	SimpleTD/loss: 0.179,	VanillaPG/loss: -0.166


Epoch reward 77.0


[a2c|TrainMonitor|INFO] ep: 188,	T: 7,302,	G: 138,	avg_r: 1,	avg_G: 81.5,	t: 138,	dt: 24.422ms,	SimpleTD/loss: 0.348,	VanillaPG/loss: -0.0276


Epoch reward 138.0


[a2c|TrainMonitor|INFO] ep: 189,	T: 7,458,	G: 155,	avg_r: 1,	avg_G: 88.8,	t: 155,	dt: 24.899ms,	SimpleTD/loss: 0.164,	VanillaPG/loss: 0.00325


Epoch reward 155.0


[a2c|TrainMonitor|INFO] ep: 190,	T: 7,553,	G: 94,	avg_r: 1,	avg_G: 89.4,	t: 94,	dt: 24.840ms,	SimpleTD/loss: 0.229,	VanillaPG/loss: -0.00887


Epoch reward 94.0


[a2c|TrainMonitor|INFO] ep: 191,	T: 7,672,	G: 118,	avg_r: 1,	avg_G: 92.2,	t: 118,	dt: 24.589ms,	SimpleTD/loss: 0.145,	VanillaPG/loss: -0.00671


Epoch reward 118.0


[a2c|TrainMonitor|INFO] ep: 192,	T: 7,873,	G: 200,	avg_r: 1,	avg_G: 103,	t: 200,	dt: 24.943ms,	SimpleTD/loss: 0.0899,	VanillaPG/loss: -0.013


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 193,	T: 7,938,	G: 64,	avg_r: 1,	avg_G: 99.1,	t: 64,	dt: 24.477ms,	SimpleTD/loss: 0.636,	VanillaPG/loss: -0.0745


Epoch reward 64.0


[a2c|TrainMonitor|INFO] ep: 194,	T: 8,014,	G: 75,	avg_r: 1,	avg_G: 96.7,	t: 75,	dt: 24.943ms,	SimpleTD/loss: 0.29,	VanillaPG/loss: 0.0164


Epoch reward 75.0


[a2c|TrainMonitor|INFO] ep: 195,	T: 8,115,	G: 100,	avg_r: 1,	avg_G: 97,	t: 100,	dt: 25.507ms,	SimpleTD/loss: 0.31,	VanillaPG/loss: 0.0261


Epoch reward 100.0


[a2c|TrainMonitor|INFO] ep: 196,	T: 8,316,	G: 200,	avg_r: 1,	avg_G: 107,	t: 200,	dt: 24.623ms,	SimpleTD/loss: 0.00594,	VanillaPG/loss: 0.0156


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 197,	T: 8,459,	G: 142,	avg_r: 1,	avg_G: 111,	t: 142,	dt: 22.785ms,	SimpleTD/loss: 0.284,	VanillaPG/loss: -0.0288


Epoch reward 142.0


[a2c|TrainMonitor|INFO] ep: 198,	T: 8,660,	G: 200,	avg_r: 1,	avg_G: 120,	t: 200,	dt: 24.641ms,	SimpleTD/loss: 0.00686,	VanillaPG/loss: 0.00417


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 199,	T: 8,861,	G: 200,	avg_r: 1,	avg_G: 128,	t: 200,	dt: 24.379ms,	SimpleTD/loss: 0.00563,	VanillaPG/loss: -0.00486


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 200,	T: 9,062,	G: 200,	avg_r: 1,	avg_G: 135,	t: 200,	dt: 23.140ms,	SimpleTD/loss: 0.00409,	VanillaPG/loss: -0.0162


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 201,	T: 9,263,	G: 200,	avg_r: 1,	avg_G: 141,	t: 200,	dt: 23.989ms,	SimpleTD/loss: 0.00692,	VanillaPG/loss: -0.0271


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 202,	T: 9,464,	G: 200,	avg_r: 1,	avg_G: 147,	t: 200,	dt: 23.978ms,	SimpleTD/loss: 0.00631,	VanillaPG/loss: 0.0117


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 203,	T: 9,644,	G: 179,	avg_r: 1,	avg_G: 150,	t: 179,	dt: 24.770ms,	SimpleTD/loss: 0.185,	VanillaPG/loss: -0.00598


Epoch reward 179.0


[a2c|TrainMonitor|INFO] ep: 204,	T: 9,845,	G: 200,	avg_r: 1,	avg_G: 155,	t: 200,	dt: 23.626ms,	SimpleTD/loss: 0.00938,	VanillaPG/loss: 0.00832


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 205,	T: 10,046,	G: 200,	avg_r: 1,	avg_G: 160,	t: 200,	dt: 24.512ms,	SimpleTD/loss: 0.00495,	VanillaPG/loss: -0.0268


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 206,	T: 10,247,	G: 200,	avg_r: 1,	avg_G: 164,	t: 200,	dt: 24.271ms,	SimpleTD/loss: 0.00146,	VanillaPG/loss: -0.0135


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 207,	T: 10,448,	G: 200,	avg_r: 1,	avg_G: 168,	t: 200,	dt: 24.325ms,	SimpleTD/loss: 0.00935,	VanillaPG/loss: -0.0236


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 208,	T: 10,649,	G: 200,	avg_r: 1,	avg_G: 171,	t: 200,	dt: 24.641ms,	SimpleTD/loss: 0.00365,	VanillaPG/loss: -0.021


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 209,	T: 10,850,	G: 200,	avg_r: 1,	avg_G: 174,	t: 200,	dt: 24.243ms,	SimpleTD/loss: 0.0298,	VanillaPG/loss: -0.00213


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 210,	T: 11,027,	G: 176,	avg_r: 1,	avg_G: 174,	t: 176,	dt: 24.426ms,	SimpleTD/loss: 0.198,	VanillaPG/loss: -0.00269


Epoch reward 176.0


[a2c|TrainMonitor|INFO] ep: 211,	T: 11,228,	G: 200,	avg_r: 1,	avg_G: 177,	t: 200,	dt: 24.719ms,	SimpleTD/loss: 0.0194,	VanillaPG/loss: 0.0137


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 212,	T: 11,412,	G: 183,	avg_r: 1,	avg_G: 177,	t: 183,	dt: 22.687ms,	SimpleTD/loss: 0.242,	VanillaPG/loss: -0.0436


Epoch reward 183.0


[a2c|TrainMonitor|INFO] ep: 213,	T: 11,606,	G: 193,	avg_r: 1,	avg_G: 179,	t: 193,	dt: 24.315ms,	SimpleTD/loss: 0.21,	VanillaPG/loss: -0.00417


Epoch reward 193.0


[a2c|TrainMonitor|INFO] ep: 214,	T: 11,807,	G: 200,	avg_r: 1,	avg_G: 181,	t: 200,	dt: 24.586ms,	SimpleTD/loss: 0.0294,	VanillaPG/loss: 0.0109


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 215,	T: 12,008,	G: 200,	avg_r: 1,	avg_G: 183,	t: 200,	dt: 24.546ms,	SimpleTD/loss: 0.000358,	VanillaPG/loss: -0.00545


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 216,	T: 12,209,	G: 200,	avg_r: 1,	avg_G: 185,	t: 200,	dt: 24.888ms,	SimpleTD/loss: 0.000289,	VanillaPG/loss: 0.00142


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 217,	T: 12,410,	G: 200,	avg_r: 1,	avg_G: 186,	t: 200,	dt: 24.631ms,	SimpleTD/loss: 0.000396,	VanillaPG/loss: -0.00588


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 218,	T: 12,611,	G: 200,	avg_r: 1,	avg_G: 187,	t: 200,	dt: 24.100ms,	SimpleTD/loss: 0.000251,	VanillaPG/loss: -0.00189


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 219,	T: 12,812,	G: 200,	avg_r: 1,	avg_G: 189,	t: 200,	dt: 24.654ms,	SimpleTD/loss: 0.000185,	VanillaPG/loss: -0.00222


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 220,	T: 13,013,	G: 200,	avg_r: 1,	avg_G: 190,	t: 200,	dt: 24.510ms,	SimpleTD/loss: 0.000401,	VanillaPG/loss: 0.00188


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 221,	T: 13,214,	G: 200,	avg_r: 1,	avg_G: 191,	t: 200,	dt: 24.873ms,	SimpleTD/loss: 0.000559,	VanillaPG/loss: -0.00644


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 222,	T: 13,415,	G: 200,	avg_r: 1,	avg_G: 192,	t: 200,	dt: 22.943ms,	SimpleTD/loss: 0.00023,	VanillaPG/loss: 0.000652


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 223,	T: 13,616,	G: 200,	avg_r: 1,	avg_G: 193,	t: 200,	dt: 24.840ms,	SimpleTD/loss: 8.13e-05,	VanillaPG/loss: -0.000253


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 224,	T: 13,817,	G: 200,	avg_r: 1,	avg_G: 193,	t: 200,	dt: 24.640ms,	SimpleTD/loss: 0.00141,	VanillaPG/loss: -0.000934


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 225,	T: 14,018,	G: 200,	avg_r: 1,	avg_G: 194,	t: 200,	dt: 24.688ms,	SimpleTD/loss: 0.000692,	VanillaPG/loss: -0.0022


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 226,	T: 14,219,	G: 200,	avg_r: 1,	avg_G: 195,	t: 200,	dt: 22.953ms,	SimpleTD/loss: 0.000393,	VanillaPG/loss: -0.00264


Epoch reward 209.0
Epoch reward 209.0


In [10]:
coax.utils.dump(pi, "./mlp_pi.lz4")

In [16]:
pi.params['lin']

FlatMapping({
  'linear': FlatMapping({
              'b': DeviceArray([-0.00078749,  0.22243674,  0.24990653,  0.25380114,
                                 0.01685529,  0.02420999,  0.35115007,  0.05400022,
                                 0.21136887,  0.29219517,  0.18837172,  0.22474289,
                                 0.00078776,  0.11020071,  0.35213456,  0.19622795],            dtype=float32),
              'w': DeviceArray([[-0.03512973,  0.05438015, -0.36982128,  0.09777525,
                                 -0.20791632,  0.21831983,  0.05427854,  0.16727945,
                                  0.2172411 ,  0.3122285 , -0.5020807 ,  0.42083606,
                                  0.40071788, -0.6656895 , -0.8764311 , -0.28065702],
                                [-0.27200034, -0.08180515,  0.194785  ,  0.78121173,
                                 -0.86986077, -0.45019484,  0.62767553, -0.7282137 ,
                                 -0.26940435, -0.37256098, -0.19365811, -0.56482714,


In [11]:
loaded_pi = coax.utils.load("./mlp_pi.lz4")

In [13]:
loaded_pi.params

FlatMapping({
  'linear': FlatMapping({
              'b': array([-0.00078749,  0.22243674,  0.24990653,  0.25380114,  0.01685529,
                           0.02420999,  0.35115007,  0.05400022,  0.21136887,  0.29219517,
                           0.18837172,  0.22474289,  0.00078776,  0.11020071,  0.35213456,
                           0.19622795], dtype=float32),
              'w': array([[-0.03512973,  0.05438015, -0.36982128,  0.09777525, -0.20791632,
                            0.21831983,  0.05427854,  0.16727945,  0.2172411 ,  0.3122285 ,
                           -0.5020807 ,  0.42083606,  0.40071788, -0.6656895 , -0.8764311 ,
                           -0.28065702],
                          [-0.27200034, -0.08180515,  0.194785  ,  0.78121173, -0.86986077,
                           -0.45019484,  0.62767553, -0.7282137 , -0.26940435, -0.37256098,
                           -0.19365811, -0.56482714,  1.0171323 , -0.08690273,  0.15433046,
                            0.18701562