In [1]:
import coax
import gym
import haiku as hk
import jax
import jax.numpy as jnp
import optax
from coax.value_losses import mse

import matplotlib.pyplot as plt

# the name of this script
name = 'a2c'

# the cart-pole MDP
# env = gym.make('CartPole-v0')
env = gym.make("rpp_gym:InclinedCartpole-v0")

In [2]:
"TEST".lower()

'test'

In [3]:
env = coax.wrappers.TrainMonitor(env, name=name, tensorboard_dir=f"./data/tensorboard/{name}")

In [4]:
# env.spec.max_episode_steps = 200
# env.spec.reward_threshold = 195.0

In [5]:
from emlp import T, Scalar
from emlp.groups import SO, S, O, Trivial,Z
import emlp.nn.haiku as ehk
from emlp.reps import Rep
from emlp.nn import gated,gate_indices,uniform_rep
from math import prod
from representations import PseudoScalar
from mixed_emlp_haiku import MixedEMLP

## Trivial
# group=Trivial(2)
# rep_in = T(0)*prod(env.observation_space.shape)
# rep_out = T(0)*env.action_space.n#prod(env.action_space.shape)

## Reflection
group=Z(2)
rep_in = PseudoScalar()*prod(env.observation_space.shape)
rep_out = T(1)#*env.action_space.n#prod(env.action_space.shape)


ch = 250
num_layers = 4

nn_pi = ehk.EMLP(rep_in,rep_out,group,ch=100,num_layers=2)
nn_v = ehk.EMLP(rep_in,T(0),group,ch=100,num_layers=2)

# nn_pi = ehk.MLP(rep_in,rep_out(group),group,ch=250,num_layers=num_layers)
# nn_v = ehk.MLP(rep_in,T(0),group,ch=ch,num_layers=num_layers)

# nn_pi = MixedEMLP(rep_in,rep_out(group),group,ch=250,num_layers=5)
# nn_v = MixedEMLP(rep_in,T(0),group,ch=250,num_layers=5)


def func_pi(S, is_training):
    return {'logits': nn_pi(S)}


def func_v(S, is_training):
    return nn_v(S).reshape(-1)



# def func_pi(S, is_training):
#     logits = hk.Sequential((
#         hk.Linear(16), jax.nn.relu,
#         hk.Linear(16), jax.nn.relu,
#         hk.Linear(16), jax.nn.relu,
#         hk.Linear(env.action_space.n, w_init=jnp.zeros)
#     ))
#     return {'logits': logits(S)}



# def func_v(S, is_training):
#     value = hk.Sequential((
#         hk.Linear(32), jax.nn.relu,
#         hk.Linear(32), jax.nn.relu,
#         hk.Linear(32), jax.nn.relu,
#         hk.Linear(32), jax.nn.relu,
#         hk.Linear(32), jax.nn.relu,
#         hk.Linear(1, w_init=jnp.zeros), jnp.ravel
#     ))
#     return value(S)


[a2c|root|INFO] Initing EMLP (Haiku)
[a2c|root|INFO] Linear W components:400 rep:96P+48P⊗V+20P⊗V²+8P⊗V³+4P⊗V⁴
[a2c|root|INFO] P cache miss
[a2c|root|INFO] Solving basis for P, for G=Z(2)
[a2c|root|INFO] P⊗V cache miss
[a2c|root|INFO] Solving basis for P⊗V, for G=Z(2)
[a2c|root|INFO] P⊗V² cache miss
[a2c|root|INFO] Solving basis for P⊗V², for G=Z(2)
[a2c|root|INFO] P⊗V³ cache miss
[a2c|root|INFO] Solving basis for P⊗V³, for G=Z(2)
[a2c|root|INFO] P⊗V⁴ cache miss
[a2c|root|INFO] Solving basis for P⊗V⁴, for G=Z(2)
[a2c|root|INFO] V cache miss
[a2c|root|INFO] Solving basis for V, for G=Z(2)
[a2c|root|INFO] V² cache miss
[a2c|root|INFO] Solving basis for V², for G=Z(2)
[a2c|root|INFO] V³ cache miss
[a2c|root|INFO] Solving basis for V³, for G=Z(2)
[a2c|root|INFO] V⁴ cache miss
[a2c|root|INFO] Solving basis for V⁴, for G=Z(2)
[a2c|root|INFO] Linear W components:10000 rep:576V⁰+576V+384V²+216V³+121V⁴+44V⁵+14V⁶+4V⁷+V⁸
[a2c|root|INFO] V⁵ cache miss
[a2c|root|INFO] Solving basis for V⁵, for G=Z(2

In [6]:
# these optimizers collect batches of grads before applying updates
optimizer_v = optax.chain(optax.apply_every(k=32), optax.adam(0.002))
optimizer_pi = optax.chain(optax.apply_every(k=32), optax.adam(0.001))

# value function and its derived policy
v = coax.V(func_v, env)
pi = coax.Policy(func_pi, env)

In [7]:
store = v.params

In [8]:
# experience tracer
tracer = coax.reward_tracing.NStep(n=1, gamma=0.9)

# updaters
vanilla_pg = coax.policy_objectives.VanillaPG(pi, optimizer=optimizer_pi)
simple_td = coax.td_learning.SimpleTD(v, loss_function=mse, optimizer=optimizer_v)

epoch_rewards = []

# train
for ep in range(1000):
    s = env.reset()
    er = 0
    for t in range(env.spec.max_episode_steps):
        a = pi(s)
        s_next, r, done, info = env.step(a)
        
        if done and (t == env.spec.max_episode_steps - 1):
            r = 1 / (1 - tracer.gamma)
        er+=r
        tracer.add(s, a, r, done)
        while tracer:
            transition_batch = tracer.pop()
            metrics_v, td_error = simple_td.update(transition_batch, return_td_error=True)
            metrics_pi = vanilla_pg.update(transition_batch, td_error)
            env.record_metrics(metrics_v)
            env.record_metrics(metrics_pi)

        if done:
            break

        s = s_next
    
    print("Epoch reward",er)
    epoch_rewards.append(er)
    # early stopping
    if env.avg_G > env.spec.reward_threshold:
        break


# run env one more time to render
#coax.utils.generate_gif(env, policy=pi, filepath=f"./data/{name}.gif", duration=25)

[a2c|TrainMonitor|INFO] ep: 1,	T: 12,	G: 11,	avg_r: 1,	avg_G: 11,	t: 11,	dt: 973.330ms,	SimpleTD/loss: 0.639,	VanillaPG/loss: 0.774


Epoch reward 11.0


[a2c|TrainMonitor|INFO] ep: 2,	T: 49,	G: 36,	avg_r: 1,	avg_G: 23.5,	t: 36,	dt: 28.061ms,	SimpleTD/loss: 0.524,	VanillaPG/loss: 0.714


Epoch reward 36.0


[a2c|TrainMonitor|INFO] ep: 3,	T: 64,	G: 14,	avg_r: 1,	avg_G: 20.3,	t: 14,	dt: 28.742ms,	SimpleTD/loss: 0.396,	VanillaPG/loss: 0.596


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 4,	T: 80,	G: 15,	avg_r: 1,	avg_G: 19,	t: 15,	dt: 28.372ms,	SimpleTD/loss: 0.437,	VanillaPG/loss: 0.496


Epoch reward 15.0


[a2c|TrainMonitor|INFO] ep: 5,	T: 97,	G: 16,	avg_r: 1,	avg_G: 18.4,	t: 16,	dt: 29.613ms,	SimpleTD/loss: 0.424,	VanillaPG/loss: 0.4


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 6,	T: 111,	G: 13,	avg_r: 1,	avg_G: 17.5,	t: 13,	dt: 29.448ms,	SimpleTD/loss: 1.06,	VanillaPG/loss: 0.253


Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 7,	T: 125,	G: 13,	avg_r: 1,	avg_G: 16.9,	t: 13,	dt: 27.702ms,	SimpleTD/loss: 1.5,	VanillaPG/loss: 0.146


Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 8,	T: 139,	G: 13,	avg_r: 1,	avg_G: 16.4,	t: 13,	dt: 29.213ms,	SimpleTD/loss: 1.27,	VanillaPG/loss: -0.0088


Epoch reward 13.0


[a2c|TrainMonitor|INFO] ep: 9,	T: 156,	G: 16,	avg_r: 1,	avg_G: 16.3,	t: 16,	dt: 29.353ms,	SimpleTD/loss: 1.28,	VanillaPG/loss: -0.119


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 10,	T: 178,	G: 21,	avg_r: 1,	avg_G: 16.8,	t: 21,	dt: 28.104ms,	SimpleTD/loss: 0.698,	VanillaPG/loss: 0.0376


Epoch reward 21.0


[a2c|TrainMonitor|INFO] ep: 11,	T: 195,	G: 16,	avg_r: 1,	avg_G: 16.7,	t: 16,	dt: 30.011ms,	SimpleTD/loss: 0.687,	VanillaPG/loss: 0.0999


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 12,	T: 215,	G: 19,	avg_r: 1,	avg_G: 16.9,	t: 19,	dt: 28.882ms,	SimpleTD/loss: 0.707,	VanillaPG/loss: 0.105


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 13,	T: 234,	G: 18,	avg_r: 1,	avg_G: 17.1,	t: 18,	dt: 29.239ms,	SimpleTD/loss: 0.477,	VanillaPG/loss: 0.0619


Epoch reward 18.0


[a2c|TrainMonitor|INFO] ep: 14,	T: 251,	G: 16,	avg_r: 1,	avg_G: 16.9,	t: 16,	dt: 28.887ms,	SimpleTD/loss: 0.207,	VanillaPG/loss: 3.56e-05


Epoch reward 16.0


[a2c|TrainMonitor|INFO] ep: 15,	T: 261,	G: 9,	avg_r: 1,	avg_G: 16.2,	t: 9,	dt: 29.963ms,	SimpleTD/loss: 0.32,	VanillaPG/loss: -0.278


Epoch reward 9.0


[a2c|TrainMonitor|INFO] ep: 16,	T: 288,	G: 26,	avg_r: 1,	avg_G: 17.1,	t: 26,	dt: 27.781ms,	SimpleTD/loss: 0.293,	VanillaPG/loss: 0.0144


Epoch reward 26.0


[a2c|TrainMonitor|INFO] ep: 17,	T: 303,	G: 14,	avg_r: 1,	avg_G: 16.8,	t: 14,	dt: 28.455ms,	SimpleTD/loss: 0.523,	VanillaPG/loss: -0.105


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 18,	T: 318,	G: 14,	avg_r: 1,	avg_G: 16.5,	t: 14,	dt: 28.498ms,	SimpleTD/loss: 0.682,	VanillaPG/loss: -0.115


Epoch reward 14.0


[a2c|TrainMonitor|INFO] ep: 19,	T: 331,	G: 12,	avg_r: 1,	avg_G: 16.1,	t: 12,	dt: 28.060ms,	SimpleTD/loss: 0.683,	VanillaPG/loss: -0.044


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 20,	T: 344,	G: 12,	avg_r: 1,	avg_G: 15.7,	t: 12,	dt: 30.011ms,	SimpleTD/loss: 0.901,	VanillaPG/loss: 0.104


Epoch reward 12.0


[a2c|TrainMonitor|INFO] ep: 21,	T: 364,	G: 19,	avg_r: 1,	avg_G: 16,	t: 19,	dt: 28.699ms,	SimpleTD/loss: 0.384,	VanillaPG/loss: 0.119


Epoch reward 19.0


[a2c|TrainMonitor|INFO] ep: 22,	T: 422,	G: 57,	avg_r: 1,	avg_G: 20.1,	t: 57,	dt: 28.832ms,	SimpleTD/loss: 1.2,	VanillaPG/loss: -0.0515


Epoch reward 57.0


[a2c|TrainMonitor|INFO] ep: 23,	T: 449,	G: 26,	avg_r: 1,	avg_G: 20.7,	t: 26,	dt: 28.369ms,	SimpleTD/loss: 0.417,	VanillaPG/loss: -0.446


Epoch reward 26.0


[a2c|TrainMonitor|INFO] ep: 24,	T: 489,	G: 39,	avg_r: 1,	avg_G: 22.5,	t: 39,	dt: 28.605ms,	SimpleTD/loss: 0.308,	VanillaPG/loss: 0.331


Epoch reward 39.0


[a2c|TrainMonitor|INFO] ep: 25,	T: 517,	G: 27,	avg_r: 1,	avg_G: 23,	t: 27,	dt: 29.525ms,	SimpleTD/loss: 0.554,	VanillaPG/loss: 0.19


Epoch reward 27.0


[a2c|TrainMonitor|INFO] ep: 26,	T: 535,	G: 17,	avg_r: 1,	avg_G: 22.4,	t: 17,	dt: 28.380ms,	SimpleTD/loss: 0.972,	VanillaPG/loss: -0.231


Epoch reward 17.0


[a2c|TrainMonitor|INFO] ep: 27,	T: 578,	G: 42,	avg_r: 1,	avg_G: 24.3,	t: 42,	dt: 28.712ms,	SimpleTD/loss: 0.639,	VanillaPG/loss: 0.0768


Epoch reward 42.0


[a2c|TrainMonitor|INFO] ep: 28,	T: 636,	G: 57,	avg_r: 1,	avg_G: 27.6,	t: 57,	dt: 27.949ms,	SimpleTD/loss: 0.494,	VanillaPG/loss: -0.141


Epoch reward 57.0


[a2c|TrainMonitor|INFO] ep: 29,	T: 669,	G: 32,	avg_r: 1,	avg_G: 28,	t: 32,	dt: 27.774ms,	SimpleTD/loss: 0.649,	VanillaPG/loss: -0.241


Epoch reward 32.0


[a2c|TrainMonitor|INFO] ep: 30,	T: 736,	G: 66,	avg_r: 1,	avg_G: 31.8,	t: 66,	dt: 27.469ms,	SimpleTD/loss: 0.602,	VanillaPG/loss: -0.0115


Epoch reward 66.0


[a2c|TrainMonitor|INFO] ep: 31,	T: 822,	G: 85,	avg_r: 1,	avg_G: 37.2,	t: 85,	dt: 27.029ms,	SimpleTD/loss: 0.276,	VanillaPG/loss: -0.004


Epoch reward 85.0


[a2c|TrainMonitor|INFO] ep: 32,	T: 865,	G: 42,	avg_r: 1,	avg_G: 37.6,	t: 42,	dt: 28.094ms,	SimpleTD/loss: 0.526,	VanillaPG/loss: 0.0126


Epoch reward 42.0


[a2c|TrainMonitor|INFO] ep: 33,	T: 933,	G: 67,	avg_r: 1,	avg_G: 40.6,	t: 67,	dt: 28.297ms,	SimpleTD/loss: 0.329,	VanillaPG/loss: -0.0227


Epoch reward 67.0


[a2c|TrainMonitor|INFO] ep: 34,	T: 1,038,	G: 104,	avg_r: 1,	avg_G: 46.9,	t: 104,	dt: 28.809ms,	SimpleTD/loss: 0.256,	VanillaPG/loss: -0.0235


Epoch reward 104.0


[a2c|TrainMonitor|INFO] ep: 35,	T: 1,151,	G: 112,	avg_r: 1,	avg_G: 53.4,	t: 112,	dt: 28.488ms,	SimpleTD/loss: 0.15,	VanillaPG/loss: -0.00529


Epoch reward 112.0


[a2c|TrainMonitor|INFO] ep: 36,	T: 1,213,	G: 61,	avg_r: 1,	avg_G: 54.2,	t: 61,	dt: 27.981ms,	SimpleTD/loss: 0.0206,	VanillaPG/loss: 0.00981


Epoch reward 61.0


[a2c|TrainMonitor|INFO] ep: 37,	T: 1,300,	G: 86,	avg_r: 1,	avg_G: 57.4,	t: 86,	dt: 27.979ms,	SimpleTD/loss: 0.0338,	VanillaPG/loss: -0.0392


Epoch reward 86.0


[a2c|TrainMonitor|INFO] ep: 38,	T: 1,412,	G: 111,	avg_r: 1,	avg_G: 62.7,	t: 111,	dt: 27.264ms,	SimpleTD/loss: 0.0806,	VanillaPG/loss: -0.0331


Epoch reward 111.0


[a2c|TrainMonitor|INFO] ep: 39,	T: 1,479,	G: 66,	avg_r: 1,	avg_G: 63.1,	t: 66,	dt: 27.973ms,	SimpleTD/loss: 0.247,	VanillaPG/loss: -0.0591


Epoch reward 66.0


[a2c|TrainMonitor|INFO] ep: 40,	T: 1,680,	G: 200,	avg_r: 1,	avg_G: 76.8,	t: 200,	dt: 28.862ms,	SimpleTD/loss: 0.0152,	VanillaPG/loss: 0.00642


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 41,	T: 1,783,	G: 102,	avg_r: 1,	avg_G: 79.3,	t: 102,	dt: 27.854ms,	SimpleTD/loss: 0.303,	VanillaPG/loss: -0.0628


Epoch reward 102.0


[a2c|TrainMonitor|INFO] ep: 42,	T: 1,971,	G: 187,	avg_r: 1,	avg_G: 90,	t: 187,	dt: 28.441ms,	SimpleTD/loss: 0.241,	VanillaPG/loss: 0.0122


Epoch reward 187.0


[a2c|TrainMonitor|INFO] ep: 43,	T: 2,172,	G: 200,	avg_r: 1,	avg_G: 101,	t: 200,	dt: 28.684ms,	SimpleTD/loss: 0.0364,	VanillaPG/loss: 0.0135


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 44,	T: 2,373,	G: 200,	avg_r: 1,	avg_G: 111,	t: 200,	dt: 28.467ms,	SimpleTD/loss: 0.00417,	VanillaPG/loss: -0.0177


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 45,	T: 2,537,	G: 163,	avg_r: 1,	avg_G: 116,	t: 163,	dt: 28.106ms,	SimpleTD/loss: 0.118,	VanillaPG/loss: 0.000405


Epoch reward 163.0


[a2c|TrainMonitor|INFO] ep: 46,	T: 2,738,	G: 200,	avg_r: 1,	avg_G: 125,	t: 200,	dt: 28.081ms,	SimpleTD/loss: 0.00279,	VanillaPG/loss: -0.00583


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 47,	T: 2,939,	G: 200,	avg_r: 1,	avg_G: 132,	t: 200,	dt: 28.142ms,	SimpleTD/loss: 0.000975,	VanillaPG/loss: -0.00822


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 48,	T: 3,139,	G: 199,	avg_r: 1,	avg_G: 139,	t: 199,	dt: 28.249ms,	SimpleTD/loss: 0.104,	VanillaPG/loss: 0.00231


Epoch reward 199.0


[a2c|TrainMonitor|INFO] ep: 49,	T: 3,305,	G: 165,	avg_r: 1,	avg_G: 141,	t: 165,	dt: 27.649ms,	SimpleTD/loss: 0.0792,	VanillaPG/loss: -0.0072


Epoch reward 165.0


[a2c|TrainMonitor|INFO] ep: 50,	T: 3,395,	G: 89,	avg_r: 1,	avg_G: 136,	t: 89,	dt: 27.760ms,	SimpleTD/loss: 0.209,	VanillaPG/loss: -0.00927


Epoch reward 89.0


[a2c|TrainMonitor|INFO] ep: 51,	T: 3,514,	G: 118,	avg_r: 1,	avg_G: 134,	t: 118,	dt: 27.660ms,	SimpleTD/loss: 0.115,	VanillaPG/loss: 0.000759


Epoch reward 118.0


[a2c|TrainMonitor|INFO] ep: 52,	T: 3,682,	G: 167,	avg_r: 1,	avg_G: 138,	t: 167,	dt: 28.016ms,	SimpleTD/loss: 0.023,	VanillaPG/loss: 0.00692


Epoch reward 167.0


[a2c|TrainMonitor|INFO] ep: 53,	T: 3,876,	G: 193,	avg_r: 1,	avg_G: 143,	t: 193,	dt: 27.365ms,	SimpleTD/loss: 0.0392,	VanillaPG/loss: -0.0227


Epoch reward 193.0


[a2c|TrainMonitor|INFO] ep: 54,	T: 4,077,	G: 200,	avg_r: 1,	avg_G: 149,	t: 200,	dt: 27.710ms,	SimpleTD/loss: 0.0191,	VanillaPG/loss: -0.000488


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 55,	T: 4,278,	G: 200,	avg_r: 1,	avg_G: 154,	t: 200,	dt: 28.371ms,	SimpleTD/loss: 0.0211,	VanillaPG/loss: -0.00848


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 56,	T: 4,479,	G: 200,	avg_r: 1,	avg_G: 159,	t: 200,	dt: 27.865ms,	SimpleTD/loss: 0.00118,	VanillaPG/loss: -0.00136


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 57,	T: 4,664,	G: 184,	avg_r: 1,	avg_G: 161,	t: 184,	dt: 28.473ms,	SimpleTD/loss: 0.164,	VanillaPG/loss: -0.00382


Epoch reward 184.0


[a2c|TrainMonitor|INFO] ep: 58,	T: 4,865,	G: 200,	avg_r: 1,	avg_G: 165,	t: 200,	dt: 28.678ms,	SimpleTD/loss: 0.0053,	VanillaPG/loss: 0.000925


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 59,	T: 5,066,	G: 200,	avg_r: 1,	avg_G: 168,	t: 200,	dt: 28.865ms,	SimpleTD/loss: 0.00123,	VanillaPG/loss: -0.00093


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 60,	T: 5,267,	G: 200,	avg_r: 1,	avg_G: 172,	t: 200,	dt: 27.926ms,	SimpleTD/loss: 0.00309,	VanillaPG/loss: -0.00117


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 61,	T: 5,468,	G: 200,	avg_r: 1,	avg_G: 174,	t: 200,	dt: 28.093ms,	SimpleTD/loss: 0.00349,	VanillaPG/loss: -0.00274


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 62,	T: 5,669,	G: 200,	avg_r: 1,	avg_G: 177,	t: 200,	dt: 27.876ms,	SimpleTD/loss: 0.0028,	VanillaPG/loss: 0.000799


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 63,	T: 5,870,	G: 200,	avg_r: 1,	avg_G: 179,	t: 200,	dt: 28.085ms,	SimpleTD/loss: 0.00199,	VanillaPG/loss: -0.00128


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 64,	T: 6,071,	G: 200,	avg_r: 1,	avg_G: 181,	t: 200,	dt: 27.741ms,	SimpleTD/loss: 0.00043,	VanillaPG/loss: -0.00124


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 65,	T: 6,272,	G: 200,	avg_r: 1,	avg_G: 183,	t: 200,	dt: 27.554ms,	SimpleTD/loss: 0.00128,	VanillaPG/loss: -0.000935


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 66,	T: 6,473,	G: 200,	avg_r: 1,	avg_G: 185,	t: 200,	dt: 27.990ms,	SimpleTD/loss: 0.000233,	VanillaPG/loss: -0.000825


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 67,	T: 6,674,	G: 200,	avg_r: 1,	avg_G: 186,	t: 200,	dt: 28.469ms,	SimpleTD/loss: 9.96e-05,	VanillaPG/loss: -6.93e-05


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 68,	T: 6,875,	G: 200,	avg_r: 1,	avg_G: 188,	t: 200,	dt: 28.454ms,	SimpleTD/loss: 0.000196,	VanillaPG/loss: -0.000481


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 69,	T: 7,076,	G: 200,	avg_r: 1,	avg_G: 189,	t: 200,	dt: 28.063ms,	SimpleTD/loss: 0.00075,	VanillaPG/loss: 0.00138


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 70,	T: 7,277,	G: 200,	avg_r: 1,	avg_G: 190,	t: 200,	dt: 28.350ms,	SimpleTD/loss: 0.00384,	VanillaPG/loss: 0.00114


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 71,	T: 7,478,	G: 200,	avg_r: 1,	avg_G: 191,	t: 200,	dt: 27.695ms,	SimpleTD/loss: 0.00206,	VanillaPG/loss: -0.00119


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 72,	T: 7,679,	G: 200,	avg_r: 1,	avg_G: 192,	t: 200,	dt: 28.361ms,	SimpleTD/loss: 0.000758,	VanillaPG/loss: -0.000531


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 73,	T: 7,880,	G: 200,	avg_r: 1,	avg_G: 193,	t: 200,	dt: 27.694ms,	SimpleTD/loss: 0.00031,	VanillaPG/loss: -0.00219


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 74,	T: 8,081,	G: 200,	avg_r: 1,	avg_G: 194,	t: 200,	dt: 27.238ms,	SimpleTD/loss: 0.000842,	VanillaPG/loss: -0.000858


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 75,	T: 8,282,	G: 200,	avg_r: 1,	avg_G: 194,	t: 200,	dt: 27.388ms,	SimpleTD/loss: 0.00173,	VanillaPG/loss: 7.55e-05


Epoch reward 209.0


[a2c|TrainMonitor|INFO] ep: 76,	T: 8,483,	G: 200,	avg_r: 1,	avg_G: 195,	t: 200,	dt: 27.632ms,	SimpleTD/loss: 0.00115,	VanillaPG/loss: -0.0011


Epoch reward 209.0
Epoch reward 209.0


In [9]:
# coax.utils.dump(pi.params, "./emlp_pi_params.lz4")
# coax.utils.dump(v.params, "./emlp_v_params.lz4")