In [1]:
 %matplotlib tk
from kusanagi.shell import cartpole
from kusanagi.ghost import control
from kusanagi.ghost import regression
from kusanagi.base import apply_controller, train_dynamics, ExperienceDataset
from kusanagi.ghost.optimizers import ScipyOptimizer
from kusanagi.ghost.algorithms import pilco_, mc_pilco_
from kusanagi import utils
import numpy as np
from functools import partial

Using cuDNN version 6021 on context None
Mapped name None to device cuda0: TITAN Xp (0000:01:00.0)


In [2]:
# experiment parameters
params = cartpole.default_params()
angle_dims = params['angle_dims']
n_random = 4
n_polopt = 20
max_steps = 40
# initial state distribution
p0 = params['state0_dist']
D = p0.mean.size

In [3]:
# environment
env = cartpole.Cartpole(**params['plant'])

# policy
policy = control.RBFPolicy(**params['policy'])
randpol = control.RandPolicy(maxU=policy.maxU, random_walk=True)

# dynamics model, inputs are state + action, outputs are changes in state
dynmodel = regression.SSGP_UI(**params['dynamics_model'])

# cost function model
cost = partial(cartpole.cartpole_loss, **params['cost'])

# experience dataset
exp = ExperienceDataset()

# optimizers
dynopt = ScipyOptimizer(**params['optimizer'])
polopt = ScipyOptimizer(**params['optimizer'])

[2017-07-04 10:29:55.590162] RBFPolicy_sat > Initializing parameters
[2017-07-04 10:29:55.600540] RBFPolicy_sat > Initialising expression graph for full GP training loss function
[2017-07-04 10:29:55.648513] RBFPolicy_sat > Initialising expression graph for prediction
[2017-07-04 10:29:55.663472] RBFPolicy_sat > Compiling mean and variance of prediction
[2017-07-04 10:29:56.483038] RBFPolicy_sat > Done compiling
[2017-07-04 10:29:56.496639] Experience > Initialising new experience dataset


In [4]:
# callback executed after every call to env.step
def step_cb(state, action, cost, info):
    exp.add_sample(state, action, cost, info)
    env.render()
    
# function to execute before applying policy
def gTrig(state):
    return utils.gTrig_np(state, angle_dims).flatten()

# initial data collection runs with random controls
for i in range(n_random):
    exp.new_episode()
    states, actions, costs, infos = apply_controller(env, policy, max_steps,
                                                     preprocess=gTrig, callback=step_cb)

[2017-07-04 10:29:56.582885] apply_controller > Starting run
[2017-07-04 10:29:56.585381] apply_controller > Running for 4.000000 seconds




[2017-07-04 10:29:58.956347] apply_controller > Done. Stopping robot. Value of run [39.868665]
[2017-07-04 10:29:58.957924] Cartpole > Stopping robot
[2017-07-04 10:29:58.958622] apply_controller > Starting run
[2017-07-04 10:29:58.962546] apply_controller > Running for 4.000000 seconds
[2017-07-04 10:30:01.288938] apply_controller > Done. Stopping robot. Value of run [39.999393]
[2017-07-04 10:30:01.305763] Cartpole > Stopping robot
[2017-07-04 10:30:01.308037] apply_controller > Starting run
[2017-07-04 10:30:01.309965] apply_controller > Running for 4.000000 seconds
[2017-07-04 10:30:03.446801] apply_controller > Done. Stopping robot. Value of run [39.999460]
[2017-07-04 10:30:03.463140] Cartpole > Stopping robot
[2017-07-04 10:30:03.465378] apply_controller > Starting run
[2017-07-04 10:30:03.467278] apply_controller > Running for 4.000000 seconds
[2017-07-04 10:30:05.601202] apply_controller > Done. Stopping robot. Value of run [39.860530]
[2017-07-04 10:30:05.617529] Cartpole > S

In [5]:
# learning loop
for i in range(n_polopt): 
    utils.print_with_stamp('Iteration %d, total experience: %d steps'%(i+1, sum([len(st) for st in exp.states])))
    # train dynamics model
    #train_dynamics(dynmodel, exp, angle_dims=angle_dims, init_episode=0)
    X, Y = exp.get_dynmodel_dataset(deltas=True, angle_dims=angle_dims)
    dynmodel.set_dataset(X, Y)
    if dynopt.loss_fn is None:
        loss_gp, inps, updts = dynmodel.get_loss()
        dynopt.set_objective(loss_gp, dynmodel.get_params(symbolic=True), inps, updts)
    dynopt.minimize()
    
    # train policy
    if polopt.loss_fn is None:
        loss_pol, inps, updts = pilco_.get_loss(policy, dynmodel, cost, D, angle_dims)
        polopt.set_objective(loss_pol, policy.get_params(symbolic=True), inps, updts)
    polopt.minimize(p0.mean, p0.cov, 40, 1)
    
    # apply controller
    exp.new_episode(policy_params=policy.get_params())
    states, actions, costs, infos = apply_controller(env, policy, max_steps,
                                                     preprocess=gTrig, callback=step_cb)

[2017-07-04 10:30:05.733518] Iteration 1, total experience: 160 steps
[2017-07-04 10:30:05.736889] SSGP_UI > Initialising parameters
[2017-07-04 10:30:05.742412] SSGP_UI > Initialising expression graph for full GP training loss function
[2017-07-04 10:30:05.787306] ScipyOptimizer > Building computation graph for gradients
[2017-07-04 10:30:06.031563] ScipyOptimizer > Compiling function for loss
[2017-07-04 10:30:06.554745] ScipyOptimizer > Compiling function for loss+gradients
[2017-07-04 10:30:10.223542] ScipyOptimizer > Optimizing parameters
[2017-07-04 10:30:10.244801] ScipyOptimizer > Initial loss [303.13458279479397]
[2017-07-04 10:30:10.245588] ScipyOptimizer > Using BFGS optimizer
[2K[2017-07-04 10:30:10.419054] ScipyOptimizer > Current value: -398.03754391416976, Total evaluations: 6,         Avg. time per call: 0.031891   

  'gtol': 1.0e-6}


[2K[2017-07-04 10:30:13.530043] ScipyOptimizer > Current value: -1241.603170525901, Total evaluations: 138,         Avg. time per call: 0.019742    
[2017-07-04 10:30:13.533816] ScipyOptimizer > Done training. New value [-1241.603171] iter: [136]
[2017-07-04 10:30:13.537197] pilco.rollout > Building computation graph for belief state propagation


AttributeError: 'NoneType' object has no attribute 'shape'

In [9]:
dynopt.grads_fn()

[array(924.1811476478766),
 array([[ -306.01761488,  -202.17335583,  -215.72509143,  -119.86554306,
          -276.31764957,   -67.52521367, -1667.62789131,   -16.79423829],
        [  -45.49039863,   -52.60383272,   -41.9701915 ,   -15.58153214,
           -45.7787077 ,    -9.85335037,  -240.07397572,     1.93446688],
        [  -54.84919972,   -45.99877261,   -54.86776936,   -11.95147382,
           -56.88412831,   -11.30721787,  -166.43947718,     2.66633356],
        [  -70.63741594,   -50.67703573,   -34.74237804,   -42.54835666,
           -54.95024117,    -4.01160943,  -162.12062864,     2.64143425]])]

In [10]:
dynopt.params

[SSGP_UI>loghyp]