In [1]:
%matplotlib tk
import os
import numpy as np
from kusanagi.ghost import control
from kusanagi.ghost import regression
from kusanagi.shell import cartpole
from kusanagi.ghost.algorithms import pilco_, mc_pilco_
from kusanagi.ghost.optimizers import ScipyOptimizer, SGDOptimizer
from kusanagi.base import apply_controller, train_dynamics, ExperienceDataset
from kusanagi import utils
from functools import partial
# NOTE(review): seeding is disabled, so NumPy's global RNG is left unseeded
# and the random trials / training below will differ between runs.
#np.random.seed(1337)
# allow up to 500 characters per printed line so long arrays are not wrapped
np.set_printoptions(linewidth=500)

from matplotlib import pyplot as plt
import theano

In [2]:
# Experiment setup: every name defined in this cell is a notebook-level global
# that the cells below read (env, pol, randpol, dyn, cost, exp, polopt, H, ...).

# setup output directory: use the 'cartpole' sub-directory of the current output dir
utils.set_output_dir(os.path.join(utils.get_output_dir(), 'cartpole'))

# default cartpole parameters; indexed below by the keys 'discount', 'angle_dims',
# 'state0_dist', 'plant', 'policy', 'dynamics_model', 'cost' and 'optimizer'
params = cartpole.default_params()
n_rnd = 1                                                # number of random initial trials
# NOTE(review): n_opt is not referenced by any of the cells visible here
n_opt = 100                                              #learning iterations
# horizon (number of steps) passed to apply_controller, the rollout and the optimizer;
# hard-coded to 26 instead of params['max_steps']
H = 26#params['max_steps']
gamma = params['discount']
angle_dims = params['angle_dims']

# initial state distribution
p0 = params['state0_dist']
# D is the number of entries in the initial-state mean
D = p0.mean.size

# init environment
env = cartpole.Cartpole(**params['plant'])

# init policy (pol is the learned policy; randpol is used for the first n_rnd
# trials and shares pol's control limit maxU)
pol = control.NNPolicy(p0.mean,**params['policy'])
randpol = control.RandPolicy(maxU=pol.maxU)

# init dynmodel (the SSGP_UI alternative is kept commented out; BNN is the one in use)
#dyn = regression.SSGP_UI(**params['dynamics_model'])
dyn = regression.BNN(**params['dynamics_model'])

# init cost model: cartpole_loss with the keyword arguments from params['cost'] pre-bound
cost = partial(cartpole.cartpole_loss, **params['cost'])

# create experience dataset
exp = ExperienceDataset()

# init policy optimizer
# NOTE: the two assignments below modify the params dict in place before it is
# unpacked into SGDOptimizer, overriding whatever the defaults were.
params['optimizer']['min_method'] = 'adam'
params['optimizer']['max_evals'] = 1000
polopt = SGDOptimizer(**params['optimizer'])

# callback executed after every call to env.step
def step_cb(state, action, cost, info):
    """Store one environment transition in the experience dataset.

    Passed as the ``callback`` argument of ``apply_controller`` so that it is
    executed after every call to ``env.step``. The four arguments are forwarded
    unchanged to ``exp.add_sample``.

    Note: the ``cost`` parameter shadows the notebook-level ``cost`` function
    inside this callback.
    """
    transition = (state, action, cost, info)
    exp.add_sample(*transition)
    # env.render()  # uncomment to draw the environment at every step

# function to execute before applying policy
def gTrig(state):
    """Preprocess a state before it is handed to the policy.

    Applies ``utils.gTrig_np`` to ``state`` for the dimensions listed in the
    notebook-level ``angle_dims`` and returns the flattened result.
    (Presumably this replaces each angle by its sine/cosine -- confirm
    against ``utils.gTrig_np``.)
    """
    augmented = utils.gTrig_np(state, angle_dims)
    return augmented.flatten()

(<class 'kusanagi.ghost.control.NNPolicy.NNPolicy'>, True)
[2017-07-11 11:09:37.953141] Experience > Initialising new experience dataset


In [3]:
# during first n_rnd trials, apply randomized controls
# Each iteration opens a new episode in the experience dataset and runs the
# random policy for H steps; step_cb records every transition into `exp`.
for i in range(n_rnd):
    exp.new_episode()
    apply_controller(env, randpol, H,
                     preprocess=gTrig,
                     callback=step_cb)

[2017-07-11 11:09:38.024797] apply_controller > Starting run
[2017-07-11 11:09:38.025624] apply_controller > Running for 2.600000 seconds
[2017-07-11 11:09:38.265119] apply_controller > Done. Stopping robot. Value of run [25.961014]
[2017-07-11 11:09:38.265992] Cartpole > Stopping robot


In [66]:
# Run one episode of H steps with the current learned policy `pol`; step_cb
# records every transition into a fresh episode of `exp`.
# NOTE(review): every re-execution of this cell appends another episode (the
# execution count shows it was run repeatedly), so the dataset size depends on
# how often it was run.
exp.new_episode()
apply_controller(env, pol, H,
                 preprocess=gTrig,
                 callback=step_cb)
# presumably only here so the cell does not echo apply_controller's return value
print('')

[2017-07-11 20:29:25.750499] apply_controller > Starting run
[2017-07-11 20:29:25.751351] apply_controller > Running for 2.600000 seconds
[2017-07-11 20:29:26.047231] apply_controller > Done. Stopping robot. Value of run [25.158579]
[2017-07-11 20:29:26.048095] Cartpole > Stopping robot



In [73]:
# train dynamics model
# Set the dynamics optimizer's evaluation budget to 5000 before fitting.
dyn.optimizer.max_evals = 5000
# Fit `dyn` on the recorded experience (init_episode=0: presumably "use
# episodes starting from the first one" -- confirm against train_dynamics).
train_dynamics(dyn, exp, angle_dims=angle_dims, init_episode=0)

[2017-07-11 20:32:45.578205] train_dynamics > Training dynamics model
[2017-07-11 20:32:45.581939] train_dynamics > Dataset size:: Inputs: [ (225, 6) ], Targets: [ (225, 4) ]  
[2017-07-11 20:32:45.583056] BNN_opt > Optimizing parameters via mini batches
[2017-07-11 20:32:45.585932] BNN_opt > Initial loss [81.5391260006]
[2K[2017-07-11 20:33:18.494740] BNN_opt > Current value: -3.552428E+00, Total evaluations: 5000, Avg. time per updt: 0.006524
[2017-07-11 20:33:18.498776] train_dynamics > Done training dynamics model


<kusanagi.ghost.regression.NN.BNN at 0x7f3783b82fd0>

In [76]:
# Build the rollout function for the current policy, dynamics model and cost,
# using 100 samples (particles) with particle resampling enabled.
rollout_fn = mc_pilco_.build_rollout(pol, dyn, cost, D, angle_dims, n_samples=100, resample_particles=True)
# Close any open figures and reset the handle so that the plotting cell
# (which checks `if not state_fig`) creates a fresh figure.
plt.close('all')
state_fig = None

[2017-07-11 20:33:39.111646] mc_pilco.rollout > Building computation graph for state particles propagation


In [77]:
# Simulate the policy on the learned dynamics model and compare the predicted
# state trajectories with the most recent recorded episode.
dyn.update()
# the last argument (1) is presumably the discount factor -- confirm against build_rollout
loss, costs, trajectories = rollout_fn(p0.mean, p0.cov, H, 1)
n_samples, T, dims = trajectories.shape

# create the figure (one subplot per state dimension) only if none exists yet
if not state_fig:
    state_fig, state_axarr = plt.subplots(dims, sharex=True)

exp_states = np.array(exp.states)
steps = np.arange(T-1)
for d in range(dims):
    st = trajectories[:,:,d]
    ax = state_axarr[d]
    # predictive distribution: a 2-D y makes matplotlib draw one line per
    # column, i.e. one line per sampled trajectory
    ax.plot(steps, st[:,:-1].T, color='steelblue', alpha=0.3)
    # (to overlay every recorded episode instead of only the last one,
    #  plot exp_states[i,1:,d] for each episode i in orange)
    # experience: latest recorded episode, skipping its first state
    ax.plot(steps, exp_states[-1,1:,d], color='red')
    # mean over the sampled trajectories
    ax.plot(steps, st[:,:-1].mean(0), color='purple')
plt.show()

In [79]:
# init policy optimizer
# Symbolic scalar for the learning rate, so its value can be supplied as an
# input when the objective is evaluated.
learning_rate = theano.tensor.scalar('lr')
# Symbolic loss for the policy under the learned dynamics (100 samples with
# resampling), together with its input variables and updates.
loss, inps, updts = mc_pilco_.get_loss(pol, dyn, cost, D, angle_dims, n_samples=100, resample_particles=True)
# The learning rate becomes the last input of the objective.
inps.append(learning_rate)
# clip=1000.0 is presumably a gradient-clipping threshold -- confirm against SGDOptimizer.set_objective
polopt.set_objective(loss, pol.get_params(symbolic=True), inps, updts, clip=1000.0, learning_rate=learning_rate)

[2017-07-11 20:35:11.434974] mc_pilco.rollout > Building computation graph for state particles propagation
[2017-07-11 20:35:11.756249] SGDOptimizer > Building computation graph for gradients
[2017-07-11 20:35:12.533739] SGDOptimizer > Computing parameter update rules
[2017-07-11 20:35:12.553664] SGDOptimizer > Compiling function for loss
[2017-07-11 20:35:15.186482] SGDOptimizer > Compiling parameter updates


In [80]:
# Optimise the policy, starting from the empirical initial state distribution.
polopt.max_evals = 2000

# first state of every recorded episode
x0 = np.array([episode[0] for episode in exp.states])
m0 = x0.mean(0)
if len(x0) > 2:
    # sample covariance of the initial states, plus a small jitter on the diagonal
    S0 = np.cov(x0, rowvar=False, ddof=1) + 1e-7*np.eye(x0.shape[1])
else:
    # too few episodes for a covariance estimate: fall back to the prior covariance
    S0 = p0.cov

# The trailing 1e-2 fills the learning-rate input that was appended to the
# objective's inputs when it was built.
# Fix: the callback used to evaluate `dynmodel.update` -- a name this notebook
# never defines (the model is `dyn`) and a method reference that was never
# called. It now actually calls dyn.update() each time the optimizer invokes it.
polopt.minimize(m0, S0, H, gamma, 1e-2, callback=lambda *args, **kwargs: dyn.update())

[2017-07-11 20:35:36.405261] SGDOptimizer > Optimizing parameters
[2017-07-11 20:35:36.441510] SGDOptimizer > Initial loss [0.91068711824]
[2K[2017-07-11 20:38:45.944667] SGDOptimizer > Current value: 9.571826E-01, Total evaluations: 2000, Avg. time per updt: 0.074791
[2017-07-11 20:38:45.945742] SGDOptimizer > Done training. New value [0.887812] iter: [439]


In [3]:
# Inspect the keyword arguments that the setup cell unpacks into control.NNPolicy.
params['policy']

{'angle_dims': [3],
 'maxU': [10],
 'n_inducing': 30,
 'state0_dist': <kusanagi.utils.distributions.Gaussian at 0x7fb7865d9250>}

In [56]:
# NOTE(review): debugging leftover. Called without arguments this fails with
# "TypeError: Missing required input: BNN>train_inputs" (see the recorded
# output), so this cell aborts a top-to-bottom run of the notebook.
dyn.optimizer.loss_fn()

TypeError: Missing required input: BNN>train_inputs

In [71]:
# Report the episode count of the experience dataset (the random trials plus
# every execution of the policy-rollout cell add one episode each).
print(exp.n_episodes())

9
