In [1]:
import jax
import tax
import tqdm
import haiku as hk
import numpy as np
import collections 
import jax.numpy as jnp
import matplotlib as mpl
import matplotlib.pyplot as plt
import mbrl

from jax import jit
from functools import partial
from deluca.envs.classic._acrobot import reset, env_params, dynamics, step, render
from mbrl.algs.rs import forecast
from mbrl.algs.rs import plan
from mbrl.algs.rs import score

rng = jax.random.PRNGKey(42)
Environment = collections.namedtuple('Environment', ['step', 'reset'])

env_params

{'dt': 0.2,
 'LINK_LENGTH_1': 1.0,
 'LINK_LENGTH_2': 1.0,
 'LINK_MASS_1': 1.0,
 'LINK_MASS_2': 1.0,
 'LINK_COM_POS_1': 0.5,
 'LINK_COM_POS_2': 0.5,
 'LINK_MOI': 1.0,
 'MAX_VEL_1': 12.566370614359172,
 'MAX_VEL_2': 28.274333882308138,
 'AVAIL_TORQUE': DeviceArray([-1.,  0.,  1.], dtype=float32),
 'torque_noise_max': 0.0,
 'book_or_nips': 'book',
 'action_arrow': None,
 'domain_fig': None,
 'actions_num': 3,
 'action_max': DeviceArray([ 1.      ,  1.      ,  1.      ,  1.      , 12.566371,
              28.274334], dtype=float32),
 'action_min': DeviceArray([ -1.      ,  -1.      ,  -1.      ,  -1.      , -12.566371,
              -28.274334], dtype=float32)}

In [2]:
step     = jit(partial(step, env_params=env_params))
reset    = jit(partial(reset, env_params=env_params))
dynamics = jit(partial(dynamics, env_params=env_params))
env      = Environment(step=step, reset=reset)

In [3]:
state0 = reset(rng)
u = 1      # 0, 1, 2

In [4]:
dynamics(state0, u)

  lax._check_user_dtype_supported(dtype, "zeros")


DeviceArray([-0.01590787,  0.08213443, -0.21250808,  0.18861379], dtype=float32)

In [5]:
step(rng, state0, u)

(DeviceArray([-0.00265431,  0.04787853, -0.08390957, -0.1455917 ], dtype=float32),
 DeviceArray([ 0.9999965 , -0.00265431,  0.99885404,  0.04786024,
              -0.08390957, -0.1455917 ], dtype=float32),
 DeviceArray(-1., dtype=float32),
 DeviceArray(0., dtype=float32),
 {})

In [6]:
@jit
def world(carry, t):
    keys, state, trajectory = carry
    action = trajectory[t]
    rng = keys[t]
    state_next, observation_next, reward, done, info = env.step(rng, state, action)
    reward = reward.astype(jnp.float32)
    carry = keys, state_next, trajectory
    return carry, {
        "observation": state,
        "observation_next": state_next,
        "reward": reward, "action": action, "terminal": 1. - done,
    }

In [7]:
score_    = jit(score)
forecast_ = partial(forecast, 
                    step_fn=world, 
                    horizon=250,
                    action_dim=None, 
                    minval=None, 
                    maxval=3,     # Number of discrete actions possible
                    action_type='discrete')

In [8]:
rng, rng_reset = jax.random.split(rng, 2)
state_0 = env.reset(rng_reset)
action, _ = plan(rng, state_0, jit(forecast_), jit(score))
action[0]

DeviceArray(1, dtype=int32)

In [9]:
%%time
# RS:Model.
score = 0
rng, rng_reset = jax.random.split(rng, 2)
state = env.reset(rng_reset)
list_states = []
for _ in tqdm.notebook.trange(200):
    rng, rng_plan = jax.random.split(rng, 2)
    list_states.append(state)
    action = plan(rng_plan, state, forecast_, score_,  population=75000)[0][0]
    state, _, reward, terminal, info = env.step(rng_plan, state, action)
    score += reward
print(score)

  0%|          | 0/200 [00:00<?, ?it/s]

-172.0
CPU times: user 51.5 s, sys: 7.18 s, total: 58.7 s
Wall time: 57.2 s


In [10]:
_, info = render(list_states[0], {})

In [11]:
for s in list_states:
    render(s, info)