In [None]:
 %matplotlib qt
import os
import numpy as np
import lasagne
import theano

from functools import partial

from kusanagi import utils
from kusanagi.ghost import control, regression
from kusanagi.shell import cartpole, arduino
from kusanagi.shell.cost import gaussian_kl_loss, convert_angle_dimensions
from kusanagi.shell.experiment_utils import run_pilco_experiment, setup_mc_pilco_experiment

# NOTE(review): the seed call below is commented out, so NumPy's global RNG is
# not seeded by this notebook; uncomment it if repeatable runs are needed.
# np.random.seed(1337)
# widen NumPy's print width so long arrays are printed on a single line
np.set_printoptions(linewidth=500)

In [None]:
# init params
# start from the cartpole defaults and override the entries below
params = cartpole.default_params()
params['optimizer']['min_method'] = 'adam'
params['optimizer']['max_evals'] = 1000
params['learning_rate'] = 1e-4
params['crn_dropout'] = True
params['min_steps'] = 40
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 40                # learning iterations
n_samples = 100                     # number of MC samples for bayesian nn

# shorthand names used by the cells below
# H: the shared target buffers defined later hold H+1 time steps
H = params['min_steps']
gamma = params['discount']
# angle_dims: passed to gTrig / convert_angle_dimensions in later cells
angle_dims = params['angle_dims']

# initial state distribution
p0 = params['state0_dist']
# D: number of entries in the initial state mean
D = p0.mean.size

In [None]:
# dynamics model: BNN regressor built from a dropout MLP specification
dyn_spec = {
    'hidden_dims': [200, 200],
    'p': True,
    'p_input': True,
    'nonlinearities': regression.nonlinearities.rectify,
    'W_init': lasagne.init.GlorotNormal(gain='relu'),
    'dropout_class': regression.layers.DenseLogNormalDropoutLayer,
    'build_fn': regression.dropout_mlp,
}
dyn = regression.BNN(network_spec=dyn_spec, **params['dynamics_model'])

# policy: neural network policy; its first argument is taken from dyn.E
pol_spec = {
    'hidden_dims': [200, 200],
    'p': 0.1,
    'p_input': 0.0,
    'nonlinearities': regression.nonlinearities.rectify,
    'W_init': lasagne.init.GlorotNormal(gain='relu'),
    'dropout_class': regression.layers.DenseDropoutLayer,
    'build_fn': regression.dropout_mlp,
}
pol = control.NNPolicy(
    dyn.E, network_spec=pol_spec, heteroscedastic=False, **params['policy'])

# task cost: cartpole loss with the configured cost parameters bound
task_cost = partial(cartpole.cartpole_loss, **params['cost'])

# environment: simulated cartpole (alternative serial plant kept below for reference)
env = cartpole.Cartpole(**params['plant'])
#env = arduino.SerialPlant(maxU=pol.maxU, loss_func= task_cost, name='target',**params['plant'])

In [None]:
# kl regularization
# Shared buffers holding the target trajectory distribution. The state width is
# dyn.E plus one extra entry per angle dimension (the trajectories written into
# these buffers are passed through utils.gTrig first).
state_dims = dyn.E + len(angle_dims)

# sampled trajectories: (n_samples, H+1, state_dims), initialised to zero
trajs = theano.shared(
    np.zeros((n_samples, H+1, state_dims), dtype=theano.config.floatX),
    name='trajs')

# per-time-step target mean: (H+1, state_dims), initialised to zero
target_mean = theano.shared(
    np.zeros((H+1, state_dims), dtype=theano.config.floatX),
    name='target_mean')

# per-time-step target covariance: (H+1, state_dims, state_dims), identity at every step
target_cov = theano.shared(
    np.tile(np.eye(state_dims), (H+1, 1, 1)).astype(theano.config.floatX),
    name='target_cov')

# define cost as a weighted sum of the task cost and the deviation from the target distribution
def task_plus_il_cost(t, mx, Sx, weights=[1, 1e-4], loss_type=utils.ImitationLossType.KLQP):
    '''
        Weighted sum of the task cost and an imitation (KL) penalty.

        The penalty compares the state distribution (mx, Sx), after
        convert_angle_dimensions, with the target distribution stored at
        index t of the shared target_mean / target_cov buffers, so rollout
        predictive distributions that differ too much from the target are
        penalized.

        t:         index into target_mean / target_cov
        mx, Sx:    state mean and covariance, passed to task_cost and
                   convert_angle_dimensions
        weights:   weights[0] scales the task cost, weights[1] the penalty
        loss_type: selects the argument order of gaussian_kl_loss (KLQP,
                   KLPQ) or the average of both orders (KLSYM); any other
                   value leaves the penalty at 0

        Returns weights[0]*task_cost(mx, Sx)[0] + weights[1]*penalty.
    '''
    loss_types = utils.ImitationLossType
    mx_conv, Sx_conv = convert_angle_dimensions(mx, Sx, angle_dims)
    m_target = target_mean[t]
    S_target = target_cov[t]

    if loss_type == loss_types.KLQP:
        penalty = gaussian_kl_loss(mx_conv, Sx_conv, m_target, S_target)
    elif loss_type == loss_types.KLPQ:
        penalty = gaussian_kl_loss(m_target, S_target, mx_conv, Sx_conv)
    elif loss_type == loss_types.KLSYM:
        kl_target_first = gaussian_kl_loss(m_target, S_target, mx_conv, Sx_conv)
        kl_state_first = gaussian_kl_loss(mx_conv, Sx_conv, m_target, S_target)
        penalty = 0.5*(kl_target_first + kl_state_first)
    else:
        # unrecognised loss type: no imitation penalty
        penalty = 0

    task_term = task_cost(mx, Sx)[0]
    return weights[0]*task_term + weights[1]*penalty

In [None]:
# module-level state shared by the callbacks defined below:
#   rollout_fn / target_exp are filled in by learning_iteration_cb,
#   fig / axarr are reused by minimize_cb between plot refreshes
rollout_fn, target_exp = None, None
fig, axarr = None, None

# shared variables handed to the loss construction (see loss_kwargs)
extra_shared = [trajs, target_mean, target_cov]

def update_target_traj(loss, costs, trajectories):
    '''
        Build the updates that refresh the shared target distribution
        (trajs, target_mean, target_cov) from a batch of sampled rollouts.

        Target distribution will be the rollout predictive
        distribution from the previous optimization iterations;
        i.e. the KL term in the cost will be
                 KL(rollout_pred(params_{i}) || rollout_pred(params_{i-1}) )
             when using the reverse KL loss

        loss, costs:  unused here
        trajectories: rank-3 symbolic tensor; axis 0 is averaged over to get
                      the per-time-step moments (assumed layout
                      (n_samples, H+1, state dims), matching the `trajs`
                      buffer -- TODO confirm against the caller)

        Returns a theano OrderedUpdates mapping each shared buffer to its
        new value.
    '''
    # flatten to 2D so the angle dimensions can be expanded by gTrig,
    # then restore the (samples, steps, dims) layout with the new width
    tr_shape = trajectories.shape
    flat = trajectories.reshape((tr_shape[0]*tr_shape[1], tr_shape[2]))
    flat = utils.gTrig(flat, angle_dims)
    trajectories = flat.reshape((tr_shape[0], tr_shape[1], flat.shape[-1]))

    # per-time-step sample mean over the sample axis
    trajm = trajectories.mean(0)

    # per-time-step unbiased sample covariance. The samples are centred
    # before forming the outer products: dividing the raw second moment by
    # N-1 and then subtracting mean*mean^T (as was done previously) mixes
    # normalisations and inflates the covariance by about E[x x^T]/(N-1).
    centred = (trajectories - trajm[None, :, :])[:, :, :, None]
    N = (trajectories.shape[0]-1.0).astype(theano.config.floatX)
    traj_cov = (centred*centred.swapaxes(2, 3)).sum(0)/N

    updates = theano.updates.OrderedUpdates()
    updates[trajs] = trajectories
    updates[target_mean] = trajm
    updates[target_cov] = traj_cov

    return updates

def learning_iteration_cb(exp, dyn, pol, polopt, params, rollout_fn_in):
    '''
        Per-learning-iteration callback: checkpoint the current state of the
        experiment and remember the rollout function and experience so that
        minimize_cb can use them.

        exp:           experience object; exp.curr_episode indexes the files
        dyn, pol:      dynamics model and policy to save
        polopt:        unused here
        params:        experiment parameters, dumped to config.dill
        rollout_fn_in: rollout function stored in the global rollout_fn
    '''
    global rollout_fn
    global target_exp
    # dill is used below but is not imported at the top of the notebook;
    # import it here so this callback does not fail with a NameError
    import dill

    i = exp.curr_episode
    # save experience, policy and dynamics, tagged with the episode index
    exp.save(None, 'experience_%d' % (i))
    pol.save(None, 'policy_%d' % (i))
    dyn.save(None, 'dynamics_%d' % (i))
    # save the parameters next to them in the current output directory
    with open(os.path.join(utils.get_output_dir(), 'config.dill'), 'wb') as f:
        dill.dump(params, f)
    # expose the latest rollout function and experience to minimize_cb
    rollout_fn = rollout_fn_in
    target_exp = exp

counter = 0
def minimize_cb(*args, **kwargs):
    '''
        Optimizer callback: on every 500th call (starting with the first)
        redraw the rollout plot, then increment the call counter.

        All arguments are ignored. Plotting is skipped until
        learning_iteration_cb has provided rollout_fn and target_exp.
    '''
    global fig
    global axarr
    global counter
    ready = rollout_fn is not None and target_exp is not None
    if counter % 500 == 0 and ready:
        # these names are not imported at the top of the notebook; import
        # them here so the callback does not fail with a NameError
        import matplotlib.pyplot as plt
        # NOTE(review): plot_rollout is assumed to live in
        # kusanagi.shell.experiment_utils -- confirm the import location
        from kusanagi.shell.experiment_utils import plot_rollout

        p0 = params['state0_dist']
        m0, S0 = p0.mean, p0.cov
        # The original call used the undefined names `source_exp` and
        # `n_demo`. target_exp (set by learning_iteration_cb) is used as the
        # experience to plot.
        # NOTE(review): n_exp=0 assumes there are no demonstration episodes
        # in this notebook -- confirm against plot_rollout.
        fig, axarr = plot_rollout(rollout_fn, target_exp, m0, S0, H, 1.0,
                                  fig=fig, axarr=axarr, n_exp=0, name='Rollout during optimization')
        # short pause so the GUI event loop can redraw the figure
        plt.waitforbuttonpress(0.01)
    counter += 1
    
# options passed to run_pilco_experiment as loss_kwargs; extra_shared and
# extra_updts_init hand over the shared target buffers and update_target_traj
loss_kwargs = {
    'n_samples': n_samples,
    'mm_state': True,
    'mm_cost': True,
    'noisy_policy_input': True,
    'crn': True,
    'time_varying_cost': True,
    'extra_shared': extra_shared,
    'extra_updts_init': update_target_traj,
    'intermediate_outs': False,
}

# options passed to run_pilco_experiment as polopt_kwargs
polopt_kwargs = {'clip': 1.0, 'polyak_averaging': None}

# bind the policy and dynamics model created above to the experiment setup
setup_experiment = partial(setup_mc_pilco_experiment, pol=pol, dyn=dyn)
# task cost plus the symmetric KL penalty
cost = partial(task_plus_il_cost, loss_type=utils.ImitationLossType.KLSYM)

# setup output directory
output_dir = os.path.join(utils.get_output_dir(), env.name + '_kl_reg')
utils.set_output_dir(utils.unique_path(output_dir))

run_pilco_experiment(
    env, cost, setup_experiment, params, loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb,
    learning_iteration_cb=learning_iteration_cb,
    debug_plot=2,
    render=False)