In [1]:
 %matplotlib tk
from kusanagi.shell import cartpole
from kusanagi.ghost import control
from kusanagi.ghost import regression
from kusanagi.base import apply_controller, train_dynamics, ExperienceDataset
from kusanagi.ghost.optimizers import ScipyOptimizer, SGDOptimizer
from kusanagi.ghost.algorithms import pilco_, mc_pilco_
from kusanagi import utils
import numpy as np
from functools import partial

In [2]:
# experiment parameters
params = cartpole.default_params()
angle_dims = params['angle_dims']
n_random = 4
n_polopt = 20
max_steps = 40
# initial state distribution
p0 = params['state0_dist']
D = p0.mean.size

In [3]:
# environment
env = cartpole.Cartpole(**params['plant'])

# policy
policy = control.NNPolicy(p0.mean, **params['policy'])
randpol = control.RandPolicy(maxU=policy.maxU, random_walk=True)

# dynamics model, inputs are state + action, outputs are changes in state
dynmodel = regression.BNN(**params['dynamics_model'])

# cost function model
cost = partial(cartpole.cartpole_loss, **params['cost'])

# experience dataset
exp = ExperienceDataset()

# optimizers
dynopt = SGDOptimizer(**params['optimizer'])
polopt = ScipyOptimizer(**params['optimizer'])

<class 'kusanagi.ghost.control.NNPolicy.NNPolicy'> True
[2017-07-12 10:26:01.678367] Experience > Initialising new experience dataset


In [4]:
# callback executed after every call to env.step
def step_cb(state, action, cost, info):
    exp.add_sample(state, action, cost, info)
    env.render()
    
# function to execute before applying policy
def gTrig(state):
    return utils.gTrig_np(state, angle_dims).flatten()

# initial data collection runs with random controls
for i in range(n_random):
    exp.new_episode()
    states, actions, costs, infos = apply_controller(env, policy, max_steps,
                                                     preprocess=gTrig, callback=step_cb)

[2017-07-12 10:26:01.750406] apply_controller > Starting run
[2017-07-12 10:26:01.751344] apply_controller > Running for 4.000000 seconds
[2017-07-12 10:26:01.752990] NNPolicy > Initialising expression graph for prediction
[2017-07-12 10:26:01.754572] NNPolicy > Building network
InputLayer {'shape': [None, 5], 'name': 'NNPolicy_input'}
DenseLayer {'num_units': 50, 'nonlinearity': <function selu at 0x7f0ca4dc1378>, 'W': <lasagne.init.HeNormal object at 0x7f0ce572ce10>, 'name': 'NNPolicy_fc0'}
DenseLayer {'num_units': 1, 'nonlinearity': <function linear at 0x7f0cb8b86268>, 'W': <lasagne.init.HeUniform object at 0x7f0caabdaef0>, 'name': 'NNPolicy_output'}
[2017-07-12 10:26:01.771485] NNPolicy > Compiling mean and variance of prediction
[2017-07-12 10:26:01.873897] NNPolicy > Done compiling


	fixed_dropout_masks
  % "\n\t".join(suggestions))


[2017-07-12 10:26:04.231036] apply_controller > Done. Stopping robot. Value of run [39.997108]
[2017-07-12 10:26:04.233418] Cartpole > Stopping robot
[2017-07-12 10:26:04.236168] apply_controller > Starting run
[2017-07-12 10:26:04.238839] apply_controller > Running for 4.000000 seconds
[2017-07-12 10:26:06.301606] apply_controller > Done. Stopping robot. Value of run [39.999891]
[2017-07-12 10:26:06.318671] Cartpole > Stopping robot
[2017-07-12 10:26:06.321728] apply_controller > Starting run
[2017-07-12 10:26:06.324566] apply_controller > Running for 4.000000 seconds
[2017-07-12 10:26:08.476400] apply_controller > Done. Stopping robot. Value of run [39.994798]
[2017-07-12 10:26:08.477186] Cartpole > Stopping robot
[2017-07-12 10:26:08.482604] apply_controller > Starting run
[2017-07-12 10:26:08.484637] apply_controller > Running for 4.000000 seconds
[2017-07-12 10:26:10.572178] apply_controller > Done. Stopping robot. Value of run [39.998278]
[2017-07-12 10:26:10.572953] Cartpole > S

In [None]:
# learning loop
for i in range(n_polopt): 
    utils.print_with_stamp('Iteration %d, total experience: %d steps'%(i+1, sum([len(st) for st in exp.states])))
    # train dynamics model
    train_dynamics(dynmodel, exp, angle_dims=angle_dims, init_episode=0)
    #X, Y = exp.get_dynmodel_dataset(deltas=True, angle_dims=angle_dims)
    #dynmodel.set_dataset(X, Y)
    #if dynopt.loss_fn is None:
    #    loss_gp, inps, updts = dynmodel.get_loss()
    #    dynopt.set_objective(loss_gp, dynmodel.get_params(symbolic=True), inps, updts)
    #dynopt.minimize()
    
    # train policy
    if polopt.loss_fn is None:
        loss_pol, inps, updts = mc_pilco_.get_loss(policy, dynmodel, cost, D, angle_dims)
        polopt.set_objective(loss_pol, policy.get_params(symbolic=True), inps, updts)
    polopt.minimize(p0.mean, p0.cov, 40, 1)
    
    # apply controller
    exp.new_episode(policy_params=policy.get_params())
    states, actions, costs, infos = apply_controller(env, policy, max_steps,
                                                     preprocess=gTrig, callback=step_cb)

[2017-07-12 10:26:10.661516] Iteration 1, total experience: 160 steps
[2017-07-12 10:26:10.665997] train_dynamics > Training dynamics model
[2017-07-12 10:26:10.669865] train_dynamics > Dataset size:: Inputs: [ (156, 6) ], Targets: [ (156, 4) ]  
[2017-07-12 10:26:10.672193] BNN > Building network
InputLayer {'shape': (None, 6), 'name': 'BNN_input'}
DenseLayer {'num_units': 200, 'nonlinearity': <function sigmoid at 0x7f0cb8be8950>, 'name': 'BNN_fc0'}
DropoutLayer {'p': 0.05, 'rescale': False, 'name': 'BNN_drop0', 'dropout_samples': array(25, dtype=int32)}
DenseLayer {'num_units': 200, 'nonlinearity': <function sigmoid at 0x7f0cb8be8950>, 'name': 'BNN_fc1'}
DropoutLayer {'p': 0.05, 'rescale': False, 'name': 'BNN_drop1', 'dropout_samples': array(25, dtype=int32)}
DenseLayer {'num_units': 4, 'nonlinearity': <function linear at 0x7f0cb8b86268>, 'name': 'BNN_output'}
[2017-07-12 10:26:10.678767] BNN > Initialising loss function
[2017-07-12 10:26:10.797675] BNN_opt > Building computation gra

	fixed_dropout_masks
  % "\n\t".join(suggestions))


[2017-07-12 10:26:25.693684] ScipyOptimizer > Building computation graph for gradients
[2017-07-12 10:26:26.467329] ScipyOptimizer > Compiling function for loss
[2017-07-12 10:26:31.851410] ScipyOptimizer > Compiling function for loss+gradients
[2017-07-12 10:26:58.158806] ScipyOptimizer > Optimizing parameters
[2017-07-12 10:26:58.228704] ScipyOptimizer > Initial loss [0.9999962783209636]
[2017-07-12 10:26:58.229600] ScipyOptimizer > Using L-BFGS-B optimizer
[2K[2017-07-12 10:26:59.147949] ScipyOptimizer > Current loss: 0.9999959931162289, Total evaluations: 9, Avg. time per call: 0.107220   
[2017-07-12 10:26:59.151710] ScipyOptimizer > Done training. New value [0.999992] iter: [7]
[2017-07-12 10:26:59.155181] apply_controller > Starting run
[2017-07-12 10:26:59.157873] apply_controller > Running for 4.000000 seconds




[2017-07-12 10:27:01.387523] apply_controller > Done. Stopping robot. Value of run [39.999422]
[2017-07-12 10:27:01.397272] Cartpole > Stopping robot
[2017-07-12 10:27:01.399597] Iteration 2, total experience: 200 steps
[2017-07-12 10:27:01.401472] train_dynamics > Training dynamics model
[2017-07-12 10:27:01.404400] train_dynamics > Dataset size:: Inputs: [ (195, 6) ], Targets: [ (195, 4) ]  
[2017-07-12 10:27:01.406314] BNN_opt > Optimizing parameters via mini batches
[2017-07-12 10:27:01.414149] BNN_opt > Initial loss [17.196826602347112]
[2K[2017-07-12 10:27:12.876903] BNN_opt > Current value: 1.147638E+00, Total evaluations: 979, Avg. time per updt: 0.005107

In [9]:
dynopt.grads_fn()

[array(924.1811476478766),
 array([[ -306.01761488,  -202.17335583,  -215.72509143,  -119.86554306,
          -276.31764957,   -67.52521367, -1667.62789131,   -16.79423829],
        [  -45.49039863,   -52.60383272,   -41.9701915 ,   -15.58153214,
           -45.7787077 ,    -9.85335037,  -240.07397572,     1.93446688],
        [  -54.84919972,   -45.99877261,   -54.86776936,   -11.95147382,
           -56.88412831,   -11.30721787,  -166.43947718,     2.66633356],
        [  -70.63741594,   -50.67703573,   -34.74237804,   -42.54835666,
           -54.95024117,    -4.01160943,  -162.12062864,     2.64143425]])]

In [10]:
dynopt.params

[SSGP_UI>loghyp]