In [1]:
import coax
import gym
import haiku as hk
import jax
import jax.numpy as jnp
import optax
from coax.value_losses import mse



# the name of this script
name = 'a2c'

# the cart-pole MDP
env = gym.make('CartPole-v0')
env = coax.wrappers.TrainMonitor(env, name=name, tensorboard_dir=f"./data/tensorboard/{name}")

In [2]:
# ## Baseline MLPS

# def func_pi(S, is_training):
#     logits = hk.Sequential((
#         hk.Linear(8), jax.nn.relu,
#         hk.Linear(8), jax.nn.relu,
#         hk.Linear(8), jax.nn.relu,
#         hk.Linear(env.action_space.n, w_init=jnp.zeros)
#     ))
#     return {'logits': logits(S)}


# def func_v(S, is_training):
#     value = hk.Sequential((
#         hk.Linear(8), jax.nn.relu,
#         hk.Linear(8), jax.nn.relu,
#         hk.Linear(8), jax.nn.relu,
#         hk.Linear(1, w_init=jnp.zeros), jnp.ravel
#     ))
#     return value(S)



In [3]:
from emlp import T, Scalar
from emlp.groups import SO, S, O, Trivial,Z
from emlp_haiku import EMLPBlock, Sequential, Linear,EMLP
from emlp.reps import Rep
from emlp.nn import gated,gate_indices,uniform_rep
from math import prod
from representations import PseudoScalar

## Trivial
# group=Trivial(2)
# rep_in = T(0)*prod(env.observation_space.shape)
# rep_out = T(0)*env.action_space.n#prod(env.action_space.shape)

## Reflection
group=Z(2)
rep_in = PseudoScalar()*prod(env.observation_space.shape)
rep_out = PseudoScalar()*env.action_space.n#prod(env.action_space.shape)

nn_pi = EMLP(rep_in,rep_out,group,ch=100,num_layers=2)
nn_v = EMLP(rep_in,T(0),group,ch=100,num_layers=2)

def func_pi(S, is_training):
    return {'logits': nn_pi(S)}


def func_v(S, is_training):
    return nn_v(S).reshape(-1)

[a2c|root|INFO] Initing EMLP
[a2c|root|INFO] Linear W components:400 rep:96P+48P⊗V+20P⊗V²+8P⊗V³+4P⊗V⁴
[a2c|root|INFO] P cache miss
[a2c|root|INFO] Solving basis for P, for G=Z(2)


(P+P+P+P) (P+P) 4P 2P
[4P, 24V⁰+12V+5V²+2V³+V⁴, 24V⁰+12V+5V²+2V³+V⁴]


[a2c|root|INFO] P⊗V cache miss
[a2c|root|INFO] Solving basis for P⊗V, for G=Z(2)
[a2c|root|INFO] P⊗V² cache miss
[a2c|root|INFO] Solving basis for P⊗V², for G=Z(2)
[a2c|root|INFO] P⊗V³ cache miss
[a2c|root|INFO] Solving basis for P⊗V³, for G=Z(2)
[a2c|root|INFO] P⊗V⁴ cache miss
[a2c|root|INFO] Solving basis for P⊗V⁴, for G=Z(2)
[a2c|root|INFO] V cache miss
[a2c|root|INFO] Solving basis for V, for G=Z(2)
[a2c|root|INFO] V² cache miss
[a2c|root|INFO] Solving basis for V², for G=Z(2)
[a2c|root|INFO] V³ cache miss
[a2c|root|INFO] Solving basis for V³, for G=Z(2)
[a2c|root|INFO] V⁴ cache miss
[a2c|root|INFO] Solving basis for V⁴, for G=Z(2)
[a2c|root|INFO] Linear W components:10000 rep:576V⁰+576V+384V²+216V³+121V⁴+44V⁵+14V⁶+4V⁷+V⁸
[a2c|root|INFO] V⁵ cache miss
[a2c|root|INFO] Solving basis for V⁵, for G=Z(2)
[a2c|root|INFO] V⁶ cache miss
[a2c|root|INFO] Solving basis for V⁶, for G=Z(2)
[a2c|root|INFO] V⁷ cache miss
[a2c|root|INFO] Solving basis for V⁷, for G=Z(2)
[a2c|root|INFO] V⁸ cache mi

[4P, 24V⁰+12V+5V²+2V³+V⁴, 24V⁰+12V+5V²+2V³+V⁴]


[a2c|root|INFO] Linear W components:100 rep:24V⁰+12V+5V²+2V³+V⁴


In [4]:

# these optimizers collect batches of grads before applying updates
optimizer_v = optax.chain(optax.apply_every(k=32), optax.adam(0.002))
optimizer_pi = optax.chain(optax.apply_every(k=32), optax.adam(0.001))


# value function and its derived policy
v = coax.V(func_v, env)
pi = coax.Policy(func_pi, env)

# experience tracer
tracer = coax.reward_tracing.NStep(n=1, gamma=0.9)

# updaters
vanilla_pg = coax.policy_objectives.VanillaPG(pi, optimizer=optimizer_pi)
simple_td = coax.td_learning.SimpleTD(v, loss_function=mse, optimizer=optimizer_v)


# train
for ep in range(1000):
    s = env.reset()
    er = 0
    for t in range(env.spec.max_episode_steps):
        a = pi(s)
        s_next, r, done, info = env.step(a)
        
        if done and (t == env.spec.max_episode_steps - 1):
            r = 1 / (1 - tracer.gamma)
        er+=r
        tracer.add(s, a, r, done)
        while tracer:
            transition_batch = tracer.pop()
            metrics_v, td_error = simple_td.update(transition_batch, return_td_error=True)
            metrics_pi = vanilla_pg.update(transition_batch, td_error)
            env.record_metrics(metrics_v)
            env.record_metrics(metrics_pi)

        if done:
            break

        s = s_next
    
    print("Epoch reward",er)
    # early stopping
    if env.avg_G > env.spec.reward_threshold:
        break


# run env one more time to render
#coax.utils.generate_gif(env, policy=pi, filepath=f"./data/{name}.gif", duration=25)

[a2c|TrainMonitor|INFO] ep: 1,	T: 28,	G: 27,	avg_r: 1,	avg_G: 27,	t: 27,	dt: 309.307ms,	SimpleTD/loss: 0.469,	VanillaPG/loss: 0.665
[a2c|TrainMonitor|INFO] ep: 2,	T: 40,	G: 11,	avg_r: 1,	avg_G: 19,	t: 11,	dt: 13.902ms,	SimpleTD/loss: 0.419,	VanillaPG/loss: 0.616


Epoch reward 27.0
Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 3,	T: 56,	G: 15,	avg_r: 1,	avg_G: 17.7,	t: 15,	dt: 14.357ms,	SimpleTD/loss: 0.366,	VanillaPG/loss: 0.532
[a2c|TrainMonitor|INFO] ep: 4,	T: 69,	G: 12,	avg_r: 1,	avg_G: 16.2,	t: 12,	dt: 13.843ms,	SimpleTD/loss: 0.356,	VanillaPG/loss: 0.55


Epoch reward 15.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 5,	T: 91,	G: 21,	avg_r: 1,	avg_G: 17.2,	t: 21,	dt: 14.643ms,	SimpleTD/loss: 0.369,	VanillaPG/loss: 0.406


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 6,	T: 111,	G: 19,	avg_r: 1,	avg_G: 17.5,	t: 19,	dt: 14.556ms,	SimpleTD/loss: 0.6,	VanillaPG/loss: 0.324
[a2c|TrainMonitor|INFO] ep: 7,	T: 125,	G: 13,	avg_r: 1,	avg_G: 16.9,	t: 13,	dt: 14.332ms,	SimpleTD/loss: 0.955,	VanillaPG/loss: 0.14


Epoch reward 19.0
Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 8,	T: 148,	G: 22,	avg_r: 1,	avg_G: 17.5,	t: 22,	dt: 14.437ms,	SimpleTD/loss: 0.907,	VanillaPG/loss: 0.102


Epoch reward 22.0


[a2c|TrainMonitor|INFO] ep: 9,	T: 170,	G: 21,	avg_r: 1,	avg_G: 17.9,	t: 21,	dt: 14.138ms,	SimpleTD/loss: 1.01,	VanillaPG/loss: -0.0192


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 10,	T: 212,	G: 41,	avg_r: 1,	avg_G: 20.2,	t: 41,	dt: 13.915ms,	SimpleTD/loss: 0.607,	VanillaPG/loss: -0.037
[a2c|TrainMonitor|INFO] ep: 11,	T: 226,	G: 13,	avg_r: 1,	avg_G: 19.5,	t: 13,	dt: 14.438ms,	SimpleTD/loss: 1.98,	VanillaPG/loss: -0.286


Epoch reward 41.0
Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 12,	T: 238,	G: 11,	avg_r: 1,	avg_G: 18.6,	t: 11,	dt: 14.084ms,	SimpleTD/loss: 1.87,	VanillaPG/loss: -0.288


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 13,	T: 256,	G: 17,	avg_r: 1,	avg_G: 18.5,	t: 17,	dt: 13.978ms,	SimpleTD/loss: 0.322,	VanillaPG/loss: 0.168


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 14,	T: 288,	G: 31,	avg_r: 1,	avg_G: 19.7,	t: 31,	dt: 14.516ms,	SimpleTD/loss: 0.252,	VanillaPG/loss: 0.261
[a2c|TrainMonitor|INFO] ep: 15,	T: 301,	G: 12,	avg_r: 1,	avg_G: 18.9,	t: 12,	dt: 13.908ms,	SimpleTD/loss: 0.479,	VanillaPG/loss: 0.129


Epoch reward 31.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 16,	T: 316,	G: 14,	avg_r: 1,	avg_G: 18.5,	t: 14,	dt: 13.631ms,	SimpleTD/loss: 0.574,	VanillaPG/loss: 0.0389


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 17,	T: 333,	G: 16,	avg_r: 1,	avg_G: 18.2,	t: 16,	dt: 13.432ms,	SimpleTD/loss: 0.985,	VanillaPG/loss: -0.0252


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 18,	T: 351,	G: 17,	avg_r: 1,	avg_G: 18.1,	t: 17,	dt: 13.755ms,	SimpleTD/loss: 0.324,	VanillaPG/loss: -0.0265


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 19,	T: 371,	G: 19,	avg_r: 1,	avg_G: 18.2,	t: 19,	dt: 13.945ms,	SimpleTD/loss: 0.843,	VanillaPG/loss: -0.0178


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 20,	T: 388,	G: 16,	avg_r: 1,	avg_G: 18,	t: 16,	dt: 13.895ms,	SimpleTD/loss: 0.361,	VanillaPG/loss: 0.0253
[a2c|TrainMonitor|INFO] ep: 21,	T: 399,	G: 10,	avg_r: 1,	avg_G: 17.2,	t: 10,	dt: 14.116ms,	SimpleTD/loss: 1,	VanillaPG/loss: -0.129


Epoch reward 16.0
Epoch reward 10.0


[a2c|TrainMonitor|INFO] ep: 22,	T: 419,	G: 19,	avg_r: 1,	avg_G: 17.3,	t: 19,	dt: 14.589ms,	SimpleTD/loss: 0.36,	VanillaPG/loss: 0.0111
[a2c|TrainMonitor|INFO] ep: 23,	T: 432,	G: 12,	avg_r: 1,	avg_G: 16.8,	t: 12,	dt: 14.581ms,	SimpleTD/loss: 0.359,	VanillaPG/loss: 0.00218


Epoch reward 19.0
Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 24,	T: 446,	G: 13,	avg_r: 1,	avg_G: 16.4,	t: 13,	dt: 15.539ms,	SimpleTD/loss: 0.352,	VanillaPG/loss: -0.132


Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 25,	T: 481,	G: 34,	avg_r: 1,	avg_G: 18.2,	t: 34,	dt: 14.542ms,	SimpleTD/loss: 1.15,	VanillaPG/loss: -0.0677


Epoch reward 34.0


[a2c|TrainMonitor|INFO] ep: 26,	T: 511,	G: 29,	avg_r: 1,	avg_G: 19.3,	t: 29,	dt: 14.068ms,	SimpleTD/loss: 0.943,	VanillaPG/loss: -0.198


Epoch reward 29.0


[a2c|TrainMonitor|INFO] ep: 27,	T: 534,	G: 22,	avg_r: 1,	avg_G: 19.5,	t: 22,	dt: 13.941ms,	SimpleTD/loss: 0.457,	VanillaPG/loss: 0.0267


Epoch reward 22.0


[a2c|TrainMonitor|INFO] ep: 28,	T: 613,	G: 78,	avg_r: 1,	avg_G: 25.4,	t: 78,	dt: 13.854ms,	SimpleTD/loss: 0.641,	VanillaPG/loss: 0.133


Epoch reward 78.0


[a2c|TrainMonitor|INFO] ep: 29,	T: 634,	G: 20,	avg_r: 1,	avg_G: 24.9,	t: 20,	dt: 13.554ms,	SimpleTD/loss: 2.48,	VanillaPG/loss: -0.997


Epoch reward 20.0


[a2c|TrainMonitor|INFO] ep: 30,	T: 676,	G: 41,	avg_r: 1,	avg_G: 26.5,	t: 41,	dt: 14.745ms,	SimpleTD/loss: 0.32,	VanillaPG/loss: -0.0461


Epoch reward 41.0


[a2c|TrainMonitor|INFO] ep: 31,	T: 754,	G: 77,	avg_r: 1,	avg_G: 31.5,	t: 77,	dt: 14.414ms,	SimpleTD/loss: 0.54,	VanillaPG/loss: 0.0462


Epoch reward 77.0


[a2c|TrainMonitor|INFO] ep: 32,	T: 863,	G: 108,	avg_r: 1,	avg_G: 39.2,	t: 108,	dt: 14.179ms,	SimpleTD/loss: 0.385,	VanillaPG/loss: -0.0025


Epoch reward 108.0


[a2c|TrainMonitor|INFO] ep: 33,	T: 929,	G: 65,	avg_r: 1,	avg_G: 41.8,	t: 65,	dt: 14.331ms,	SimpleTD/loss: 0.454,	VanillaPG/loss: 0.00854


Epoch reward 65.0


[a2c|TrainMonitor|INFO] ep: 34,	T: 998,	G: 68,	avg_r: 1,	avg_G: 44.4,	t: 68,	dt: 14.193ms,	SimpleTD/loss: 0.543,	VanillaPG/loss: -0.0493


Epoch reward 68.0


[a2c|TrainMonitor|INFO] ep: 35,	T: 1,106,	G: 107,	avg_r: 1,	avg_G: 50.6,	t: 107,	dt: 14.096ms,	SimpleTD/loss: 0.33,	VanillaPG/loss: -0.057


Epoch reward 107.0


[a2c|TrainMonitor|INFO] ep: 36,	T: 1,257,	G: 150,	avg_r: 1,	avg_G: 60.6,	t: 150,	dt: 14.057ms,	SimpleTD/loss: 0.239,	VanillaPG/loss: -0.0279


Epoch reward 150.0


[a2c|TrainMonitor|INFO] ep: 37,	T: 1,435,	G: 177,	avg_r: 1,	avg_G: 72.2,	t: 177,	dt: 14.154ms,	SimpleTD/loss: 0.184,	VanillaPG/loss: 0.00619


Epoch reward 177.0


[a2c|TrainMonitor|INFO] ep: 38,	T: 1,571,	G: 135,	avg_r: 1,	avg_G: 78.5,	t: 135,	dt: 14.134ms,	SimpleTD/loss: 0.202,	VanillaPG/loss: -0.0416


Epoch reward 135.0


[a2c|TrainMonitor|INFO] ep: 39,	T: 1,674,	G: 102,	avg_r: 1,	avg_G: 80.8,	t: 102,	dt: 14.141ms,	SimpleTD/loss: 0.244,	VanillaPG/loss: -0.0152


Epoch reward 102.0


[a2c|TrainMonitor|INFO] ep: 40,	T: 1,752,	G: 77,	avg_r: 1,	avg_G: 80.5,	t: 77,	dt: 13.875ms,	SimpleTD/loss: 0.168,	VanillaPG/loss: 0.000549


Epoch reward 77.0


[a2c|TrainMonitor|INFO] ep: 41,	T: 1,860,	G: 107,	avg_r: 1,	avg_G: 83.1,	t: 107,	dt: 14.157ms,	SimpleTD/loss: 0.0803,	VanillaPG/loss: -0.0309


Epoch reward 107.0


[a2c|TrainMonitor|INFO] ep: 42,	T: 1,997,	G: 136,	avg_r: 1,	avg_G: 88.4,	t: 136,	dt: 13.943ms,	SimpleTD/loss: 0.281,	VanillaPG/loss: -0.0713


Epoch reward 136.0


[a2c|TrainMonitor|INFO] ep: 43,	T: 2,198,	G: 200,	avg_r: 1,	avg_G: 99.6,	t: 200,	dt: 13.784ms,	SimpleTD/loss: 0.0574,	VanillaPG/loss: -0.0167


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 44,	T: 2,262,	G: 63,	avg_r: 1,	avg_G: 95.9,	t: 63,	dt: 14.394ms,	SimpleTD/loss: 0.458,	VanillaPG/loss: -0.0154


Epoch reward 63.0


[a2c|TrainMonitor|INFO] ep: 45,	T: 2,349,	G: 86,	avg_r: 1,	avg_G: 94.9,	t: 86,	dt: 13.678ms,	SimpleTD/loss: 0.261,	VanillaPG/loss: 0.00405


Epoch reward 86.0


[a2c|TrainMonitor|INFO] ep: 46,	T: 2,456,	G: 106,	avg_r: 1,	avg_G: 96,	t: 106,	dt: 14.411ms,	SimpleTD/loss: 0.245,	VanillaPG/loss: -0.00967


Epoch reward 106.0


[a2c|TrainMonitor|INFO] ep: 47,	T: 2,657,	G: 200,	avg_r: 1,	avg_G: 106,	t: 200,	dt: 14.191ms,	SimpleTD/loss: 0.0024,	VanillaPG/loss: -0.0072


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 48,	T: 2,824,	G: 166,	avg_r: 1,	avg_G: 112,	t: 166,	dt: 14.247ms,	SimpleTD/loss: 0.105,	VanillaPG/loss: -0.00813


Epoch reward 166.0


[a2c|TrainMonitor|INFO] ep: 49,	T: 2,908,	G: 83,	avg_r: 1,	avg_G: 109,	t: 83,	dt: 14.234ms,	SimpleTD/loss: 0.205,	VanillaPG/loss: 0.00275


Epoch reward 83.0


[a2c|TrainMonitor|INFO] ep: 50,	T: 3,109,	G: 200,	avg_r: 1,	avg_G: 118,	t: 200,	dt: 14.195ms,	SimpleTD/loss: 0.0027,	VanillaPG/loss: -0.00708


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 51,	T: 3,214,	G: 104,	avg_r: 1,	avg_G: 117,	t: 104,	dt: 14.211ms,	SimpleTD/loss: 0.0841,	VanillaPG/loss: 0.000642


Epoch reward 104.0


[a2c|TrainMonitor|INFO] ep: 52,	T: 3,415,	G: 200,	avg_r: 1,	avg_G: 125,	t: 200,	dt: 14.221ms,	SimpleTD/loss: 0.00152,	VanillaPG/loss: -0.0111


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 53,	T: 3,569,	G: 153,	avg_r: 1,	avg_G: 128,	t: 153,	dt: 14.265ms,	SimpleTD/loss: 0.0134,	VanillaPG/loss: 0.00169


Epoch reward 153.0


[a2c|TrainMonitor|INFO] ep: 54,	T: 3,770,	G: 200,	avg_r: 1,	avg_G: 135,	t: 200,	dt: 14.178ms,	SimpleTD/loss: 0.00592,	VanillaPG/loss: 0.00119


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 55,	T: 3,971,	G: 200,	avg_r: 1,	avg_G: 142,	t: 200,	dt: 14.319ms,	SimpleTD/loss: 0.000301,	VanillaPG/loss: -0.00436


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 56,	T: 4,172,	G: 200,	avg_r: 1,	avg_G: 148,	t: 200,	dt: 14.259ms,	SimpleTD/loss: 0.000303,	VanillaPG/loss: -0.00525


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 57,	T: 4,373,	G: 200,	avg_r: 1,	avg_G: 153,	t: 200,	dt: 14.304ms,	SimpleTD/loss: 0.000292,	VanillaPG/loss: -0.00549


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 58,	T: 4,574,	G: 200,	avg_r: 1,	avg_G: 158,	t: 200,	dt: 13.903ms,	SimpleTD/loss: 0.000366,	VanillaPG/loss: -0.00666


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 59,	T: 4,748,	G: 173,	avg_r: 1,	avg_G: 159,	t: 173,	dt: 14.352ms,	SimpleTD/loss: 0.115,	VanillaPG/loss: 0.000649


Epoch reward 173.0


[a2c|TrainMonitor|INFO] ep: 60,	T: 4,918,	G: 169,	avg_r: 1,	avg_G: 160,	t: 169,	dt: 14.323ms,	SimpleTD/loss: 0.029,	VanillaPG/loss: 0.00335


Epoch reward 169.0


[a2c|TrainMonitor|INFO] ep: 61,	T: 5,119,	G: 200,	avg_r: 1,	avg_G: 164,	t: 200,	dt: 14.216ms,	SimpleTD/loss: 0.0129,	VanillaPG/loss: -0.00635


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 62,	T: 5,320,	G: 200,	avg_r: 1,	avg_G: 168,	t: 200,	dt: 14.087ms,	SimpleTD/loss: 0.000229,	VanillaPG/loss: -0.00266


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 63,	T: 5,521,	G: 200,	avg_r: 1,	avg_G: 171,	t: 200,	dt: 13.825ms,	SimpleTD/loss: 0.000679,	VanillaPG/loss: -0.000324


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 64,	T: 5,697,	G: 175,	avg_r: 1,	avg_G: 171,	t: 175,	dt: 14.180ms,	SimpleTD/loss: 0.166,	VanillaPG/loss: -0.000734


Epoch reward 175.0


[a2c|TrainMonitor|INFO] ep: 65,	T: 5,898,	G: 200,	avg_r: 1,	avg_G: 174,	t: 200,	dt: 13.633ms,	SimpleTD/loss: 0.00965,	VanillaPG/loss: 0.000935


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 66,	T: 6,099,	G: 200,	avg_r: 1,	avg_G: 177,	t: 200,	dt: 13.583ms,	SimpleTD/loss: 0.000416,	VanillaPG/loss: -0.00693


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 67,	T: 6,300,	G: 200,	avg_r: 1,	avg_G: 179,	t: 200,	dt: 13.750ms,	SimpleTD/loss: 0.00156,	VanillaPG/loss: -0.00158


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 68,	T: 6,501,	G: 200,	avg_r: 1,	avg_G: 181,	t: 200,	dt: 13.967ms,	SimpleTD/loss: 0.000386,	VanillaPG/loss: -0.00588


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 69,	T: 6,702,	G: 200,	avg_r: 1,	avg_G: 183,	t: 200,	dt: 13.981ms,	SimpleTD/loss: 0.000785,	VanillaPG/loss: -0.00588


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 70,	T: 6,903,	G: 200,	avg_r: 1,	avg_G: 185,	t: 200,	dt: 14.606ms,	SimpleTD/loss: 0.000231,	VanillaPG/loss: -0.00421


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 71,	T: 7,104,	G: 200,	avg_r: 1,	avg_G: 186,	t: 200,	dt: 13.575ms,	SimpleTD/loss: 0.000274,	VanillaPG/loss: -0.00618


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 72,	T: 7,285,	G: 180,	avg_r: 1,	avg_G: 186,	t: 180,	dt: 13.799ms,	SimpleTD/loss: 0.113,	VanillaPG/loss: 7e-05


Epoch reward 180.0


[a2c|TrainMonitor|INFO] ep: 73,	T: 7,486,	G: 200,	avg_r: 1,	avg_G: 187,	t: 200,	dt: 14.362ms,	SimpleTD/loss: 0.0109,	VanillaPG/loss: 0.00319


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 74,	T: 7,687,	G: 200,	avg_r: 1,	avg_G: 188,	t: 200,	dt: 14.339ms,	SimpleTD/loss: 0.000679,	VanillaPG/loss: -0.00303


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 75,	T: 7,850,	G: 162,	avg_r: 1,	avg_G: 186,	t: 162,	dt: 14.217ms,	SimpleTD/loss: 0.0355,	VanillaPG/loss: 0.00358


Epoch reward 162.0


[a2c|TrainMonitor|INFO] ep: 76,	T: 8,051,	G: 200,	avg_r: 1,	avg_G: 187,	t: 200,	dt: 13.917ms,	SimpleTD/loss: 0.0246,	VanillaPG/loss: -0.00133


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 77,	T: 8,252,	G: 200,	avg_r: 1,	avg_G: 188,	t: 200,	dt: 14.122ms,	SimpleTD/loss: 0.000373,	VanillaPG/loss: -0.00311


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 78,	T: 8,453,	G: 200,	avg_r: 1,	avg_G: 190,	t: 200,	dt: 14.327ms,	SimpleTD/loss: 0.000165,	VanillaPG/loss: -0.00383


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 79,	T: 8,654,	G: 200,	avg_r: 1,	avg_G: 191,	t: 200,	dt: 13.978ms,	SimpleTD/loss: 0.00278,	VanillaPG/loss: 0.000705


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 80,	T: 8,809,	G: 154,	avg_r: 1,	avg_G: 187,	t: 154,	dt: 13.923ms,	SimpleTD/loss: 0.0492,	VanillaPG/loss: 0.000432


Epoch reward 154.0


[a2c|TrainMonitor|INFO] ep: 81,	T: 9,010,	G: 200,	avg_r: 1,	avg_G: 188,	t: 200,	dt: 13.662ms,	SimpleTD/loss: 0.00568,	VanillaPG/loss: -0.0025


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 82,	T: 9,208,	G: 197,	avg_r: 1,	avg_G: 189,	t: 197,	dt: 13.769ms,	SimpleTD/loss: 0.0595,	VanillaPG/loss: 0.0168


Epoch reward 197.0


[a2c|TrainMonitor|INFO] ep: 83,	T: 9,409,	G: 200,	avg_r: 1,	avg_G: 190,	t: 200,	dt: 14.267ms,	SimpleTD/loss: 0.0125,	VanillaPG/loss: 0.00667


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 84,	T: 9,610,	G: 200,	avg_r: 1,	avg_G: 191,	t: 200,	dt: 14.075ms,	SimpleTD/loss: 2.88e-05,	VanillaPG/loss: -0.000837


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 85,	T: 9,811,	G: 200,	avg_r: 1,	avg_G: 192,	t: 200,	dt: 13.796ms,	SimpleTD/loss: 4.14e-06,	VanillaPG/loss: -0.000494


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 86,	T: 10,012,	G: 200,	avg_r: 1,	avg_G: 193,	t: 200,	dt: 14.098ms,	SimpleTD/loss: 0.000377,	VanillaPG/loss: 5.22e-06


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 87,	T: 10,213,	G: 200,	avg_r: 1,	avg_G: 194,	t: 200,	dt: 13.836ms,	SimpleTD/loss: 6.59e-05,	VanillaPG/loss: -0.00155


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 88,	T: 10,414,	G: 200,	avg_r: 1,	avg_G: 194,	t: 200,	dt: 13.609ms,	SimpleTD/loss: 4.3e-05,	VanillaPG/loss: -0.000171


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 89,	T: 10,615,	G: 200,	avg_r: 1,	avg_G: 195,	t: 200,	dt: 13.985ms,	SimpleTD/loss: 1.88e-05,	VanillaPG/loss: -0.000526


Epoch reward 209.0
Epoch reward 209.0
