In [None]:
 %matplotlib qt
import copy
import dill
import os
import numpy as np
import lasagne
import theano

from functools import partial

from kusanagi import utils
from kusanagi.base import apply_controller, ExperienceDataset
from kusanagi.ghost import control, regression
from kusanagi.shell import cartpole, arduino
from kusanagi.shell.cost import gaussian_kl_loss, convert_angle_dimensions
from kusanagi.shell.experiment_utils import run_pilco_experiment, setup_mc_pilco_experiment, plot_rollout

from matplotlib import pyplot as plt
# np.random.seed(1337)
np.set_printoptions(linewidth=500)

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# Directory kusanagi is currently configured to write to; the pretrained
# source dynamics/policy files below are looked up inside it.
output_dir = utils.get_output_dir()
# Root directory for the sim2real results. The default is the original
# hard-coded location; set the SIM2REAL_OUTPUT_DIR environment variable to
# redirect the results without editing the notebook.
sim2real_output_dir = os.environ.get(
    'SIM2REAL_OUTPUT_DIR', '/localdata/juan/sim2real_results')


# start from the cartpole defaults and override the optimizer settings
params = cartpole.default_params()
params['optimizer']['min_method'] = 'adam'
params['optimizer']['max_evals'] = 1000
params['learning_rate'] = 1e-4
params['crn_dropout'] = True
params['min_steps'] = 30
n_samples = 100                     # number of MC samples for bayesian nn
n_demo = 10                          # number of example trajectories
pol_adjustment = False               # notebook-level flag read by init_pol

# shorthands used by the cells below
H = params['min_steps']              # rollout horizon (in steps)
gamma = params['discount']
angle_dims = params['angle_dims']

# initial state distribution
p0 = params['state0_dist']
D = p0.mean.size

# files holding the pretrained source dynamics model and policy
dyn_path = os.path.join(output_dir, 'cartpole_kl_loss/dynamics_21')
pol_path = os.path.join(output_dir, 'cartpole_kl_loss/policy_21')
# set to a saved experience file to reuse demonstrations instead of
# collecting new ones, e.g.
# os.path.join(output_dir, 'cartpole_kl_loss/experience_29')
exp_path = None

In [None]:
def init_dyn(params, dyn_path=None, copy_params=True):
    """Create the source and target dynamics models.

    Args:
        params: experiment parameters; ``params['dynamics_model']`` is
            forwarded as keyword arguments to every ``regression.BNN``.
        dyn_path: file to load a saved model from. When None, both models
            are built from the network spec defined in this function.
        copy_params: when true (and ``dyn_path`` is given) the target
            model is loaded from ``dyn_path`` as well; otherwise the target
            model is built from the network spec.

    Returns:
        Tuple ``(source_dyn, target_dyn)``.
    """
    # architecture used whenever a model is not loaded from file
    dyn_spec = {
        'hidden_dims': [200]*2,
        'p': True,
        'p_input': True,
        'nonlinearities': regression.nonlinearities.rectify,
        'W_init': lasagne.init.GlorotNormal(gain='relu'),
        'dropout_class': regression.layers.DenseLogNormalDropoutLayer,
        'build_fn': regression.dropout_mlp,
    }
    bnn_kwargs = params['dynamics_model']
    have_file = dyn_path is not None

    def _make(name, from_file):
        # one place that decides between "load from disk" and "fresh net"
        if from_file:
            return regression.BNN(filename=dyn_path, name=name, **bnn_kwargs)
        return regression.BNN(network_spec=dyn_spec, name=name, **bnn_kwargs)

    source_dyn = _make('source_dyn', have_file)
    target_dyn = _make('target_dyn', copy_params and have_file)
    return source_dyn, target_dyn

def init_pol(params, pol_path=None, adjustment=False, copy_params=True):
    """Create the source policy and the policy to be optimised on the target.

    Args:
        params: experiment parameters; ``params['dynamics_model']['odims']``
            is passed as the first positional argument of every
            ``control.NNPolicy`` and ``params['policy']`` is forwarded as
            keyword arguments.
        pol_path: file to load a saved policy from. When None the source
            policy is built from the network spec defined in this function.
        adjustment: when true, the target policy is a
            ``control.AdjustedPolicy`` wrapping the source policy. The
            notebook-level ``pol_adjustment`` flag enables the same branch,
            so calls that relied on the flag keep working.
        copy_params: when true (and ``pol_path`` is given) a non-adjusted
            target policy is loaded from ``pol_path``; otherwise it is
            built from the network spec.

    Returns:
        Tuple ``(source_pol, target_pol)``.
    """
    # architecture used whenever a policy is not loaded from file
    pol_spec = dict(
        hidden_dims=[200]*2,
        p=0.1, p_input=0.0,
        nonlinearities=regression.nonlinearities.rectify,
        W_init=lasagne.init.GlorotNormal(gain='relu'),
        dropout_class=regression.layers.DenseDropoutLayer,
        build_fn=regression.dropout_mlp)
    odims = params['dynamics_model']['odims']

    if pol_path is not None:
        # load policy
        source_pol = control.NNPolicy(
            odims, filename=pol_path, **params['policy'])
    else:
        # init policy
        source_pol = control.NNPolicy(
            odims, network_spec=pol_spec, heteroscedastic=False,
            **params['policy'])

    # The explicit argument used to be ignored in favour of the global flag;
    # honour it, and keep the flag as a fallback for existing callers.
    use_adjustment = adjustment or pol_adjustment
    if use_adjustment:
        # init adjustment model
        target_pol = control.AdjustedPolicy(
            source_pol, maxU=source_pol.maxU, angle_dims=source_pol.angle_dims,
            adjustment_model_class=regression.BNN)
        target_pol.adjustment_model.trained = True
    elif copy_params and pol_path is not None:
        target_pol = control.NNPolicy(
            odims, filename=pol_path, **params['policy'])
    else:
        target_pol = control.NNPolicy(
            odims, network_spec=pol_spec, heteroscedastic=False,
            **params['policy'])

    return source_pol, target_pol

# init task cost
task_cost = partial(cartpole.cartpole_loss, **params['cost'])

# init source environment
# Use a private copy of the plant parameters (as the target cells do), so
# that renaming the source plant does not also rename params['plant'].
params['source'] = copy.deepcopy(params['plant'])
params['source']['name'] = 'Cartpole_src'
source_env = cartpole.Cartpole(**params['source'])

In [None]:
# Gather demonstration trajectories of the source policy on the simulated
# (source) environment.
source_pol = init_pol(params, pol_path)[0]
source_exp = (
    ExperienceDataset(filename=exp_path) if exp_path is not None
    else ExperienceDataset())

# Only roll out the episodes that are still missing to reach n_demo.
n_episodes = source_exp.n_episodes()
if n_episodes < n_demo:
    def gTrig(state):
        """State pre-processing executed before applying the policy."""
        return utils.gTrig_np(state, angle_dims).flatten()

    def step_cb(state, action, cost, info, env=None):
        """Hook executed after applying the policy: render the environment."""
        env.render()

    # bind the source environment to the per-step hook
    callback = partial(step_cb, env=source_env)

    # apply controller and store each resulting episode
    for i in range(n_demo - n_episodes):
        ret = apply_controller(
            source_env, source_pol, H+1, gTrig, callback)
        source_exp.append_episode(*ret)

In [None]:
# source trajectory
# Stack the recorded states; assumed layout is
# (episodes, timesteps, state dims) -- TODO confirm against ExperienceDataset.
trajs = np.array(source_exp.states)
tr_shape = trajs.shape

# gTrig_np is applied on a flattened (episodes*timesteps, dims) view and the
# result is folded back into one trajectory per episode.
trajs = utils.gTrig_np(trajs.reshape((tr_shape[0]*tr_shape[1], tr_shape[2])), angle_dims)
trajectories = trajs.reshape((tr_shape[0], tr_shape[1], trajs.shape[-1])).astype(theano.config.floatX)

# per-timestep mean over the demonstrations
traj_mean = trajectories.mean(0)
trajc = trajectories[:, :, :, None]
trajmm = traj_mean[:, :, None]
# Per-timestep sample covariance, normalised by (n - 1). The deviations are
# centred BEFORE taking outer products: the previous form,
# sum(x x^T)/(n-1) - m m^T, mixed normalisations and overestimated the
# covariance by m m^T/(n-1). The max() guard avoids a division by zero when
# only a single demonstration is available.
N = max(trajc.shape[0]-1.0, 1.0)
traj_dev = trajc - trajmm[None]
traj_cov = (traj_dev*traj_dev.swapaxes(2,3)).sum(0)/N

# expose the demonstrations and their statistics as theano shared variables
trajs = theano.shared(trajectories, name='trajs')
target_mean = theano.shared(traj_mean, name='target_mean')
target_cov = theano.shared(traj_cov, name='target_cov')

# define cost as sum of task cost and deviation from expert demonstration
def task_plus_il_cost(t, mx, Sx, weights=(1, 1e-4), loss_type=utils.ImitationLossType.KLQP):
    '''
        Weighted sum of the task cost and an imitation (IL) term.

        The IL term will penalize rollout predictive distributions that
        are too different from the target distribution

        Args:
            t: index used to select the target distribution
                (target_mean[t], target_cov[t]).
            mx, Sx: mean and covariance of the rollout state distribution.
            weights: pair (task cost weight, imitation loss weight).
            loss_type: one of utils.ImitationLossType.KLQP, KLPQ or KLSYM;
                selects the argument order of gaussian_kl_loss (KLSYM
                averages both orders).

        Returns:
            weights[0]*task_cost(mx, Sx)[0] + weights[1]*imitation_loss

        Raises:
            ValueError: if loss_type is not one of the supported values.
    '''
    mxa, Sxa = convert_angle_dimensions(mx, Sx, angle_dims)
    mt, St = target_mean[t], target_cov[t]

    if loss_type == utils.ImitationLossType.KLQP:
        imitation_loss = gaussian_kl_loss(mxa, Sxa, mt, St)
    elif loss_type == utils.ImitationLossType.KLPQ:
        imitation_loss = gaussian_kl_loss(mt, St, mxa, Sxa)
    elif loss_type == utils.ImitationLossType.KLSYM:
        imitation_loss = 0.5*(gaussian_kl_loss(mt, St, mxa, Sxa) + gaussian_kl_loss(mxa, Sxa, mt, St))
    else:
        # fail with a clear message instead of an unbound-local error below
        raise ValueError('Unsupported imitation loss type: %r' % (loss_type,))
    return weights[0]*task_cost(mx, Sx)[0] + weights[1]*imitation_loss

In [None]:
# shared variables handed to the loss via loss_kwargs['extra_shared']
extra_shared = [trajs, target_mean, target_cov]
# Module-level state filled in by the callbacks defined below:
#   rollout_fn / target_exp -> set by learning_iteration_cb
#   fig / axarr             -> plot handles reused by minimize_cb
rollout_fn = target_exp = fig = axarr = None


def learning_iteration_cb(exp, dyn, pol, polopt, params, rollout_fn_in):
    """Checkpoint the current experiment state.

    Saves the experience dataset, the policy and the dynamics model under
    names suffixed with ``exp.curr_episode``, dumps ``params`` to
    ``config.dill`` in the current kusanagi output directory, and publishes
    ``rollout_fn_in`` and ``exp`` through the module-level ``rollout_fn``
    and ``target_exp`` variables. ``polopt`` is accepted but not used.
    """
    global rollout_fn, target_exp
    episode = exp.curr_episode

    # same save order as always: experience, policy, dynamics
    checkpoints = (
        (exp, 'experience_%d'),
        (pol, 'policy_%d'),
        (dyn, 'dynamics_%d'),
    )
    for obj, name_fmt in checkpoints:
        obj.save(None, name_fmt % episode)

    # store the configuration next to the checkpoints
    config_path = os.path.join(utils.get_output_dir(), 'config.dill')
    with open(config_path, 'wb') as f:
        dill.dump(params, f)

    rollout_fn, target_exp = rollout_fn_in, exp

# number of times minimize_cb has been invoked so far
counter = 0
def minimize_cb(*args, **kwargs):
    """Optimizer callback: periodically plot rollouts against the demos.

    On every 500th call (starting with the first) the rollout produced by
    the module-level ``rollout_fn`` is plotted via ``plot_rollout`` together
    with ``source_exp``, reusing the ``fig``/``axarr`` handles. All
    positional and keyword arguments are ignored.
    """
    global fig, axarr, counter
    # rollout_fn starts out as None and is only set by
    # learning_iteration_cb; until then there is nothing to plot.
    if counter % 500 == 0 and rollout_fn is not None:
        p0 = params['state0_dist']
        m0, S0 = p0.mean, p0.cov
        fig, axarr = plot_rollout(rollout_fn, source_exp, m0, S0, H, 1.0,
                                  fig=fig, axarr=axarr, n_exp=n_demo, name='Rollout during optimization')
        # give the GUI event loop a moment to redraw the figure
        plt.waitforbuttonpress(0.01)
    counter += 1

In [None]:
# Target environment: a private copy of the plant parameters with the pole
# mass doubled.
params['target'] = copy.deepcopy(params['plant'])
params['target'].update(
    pole_mass=params['target']['pole_mass'] * 2,
    name='target_2x_mass')
target_env = cartpole.Cartpole(**params['target'])

In [None]:
# Target environment: a private copy of the plant parameters with the pole
# length doubled. Replaces any target_env created by an earlier cell.
params['target'] = copy.deepcopy(params['plant'])
params['target'].update(
    pole_length=params['target']['pole_length'] * 2,
    name='target_2x_length')
target_env = cartpole.Cartpole(**params['target'])

In [None]:
# Experiment 1: learn on the target environment from scratch (no transfer),
# optimising the plain task cost.
output_dir = os.path.join(
    sim2real_output_dir, target_env.name + '001_no_transfer')
utils.set_output_dir(utils.unique_path(output_dir))

params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations

# copy_params=False: the target models are built from the network specs
# rather than loaded from the source files
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)

# options for the policy optimizer and for the loss
polopt_kwargs = {'clip': 1.0, 'polyak_averaging': None}
loss_kwargs = {
    'n_samples': n_samples,
    'mm_state': True,
    'mm_cost': True,
    'noisy_policy_input': True,
    'crn': True,
    'time_varying_cost': False,
    'extra_shared': extra_shared,
    'intermediate_outs': False,
}

# the experiment is set up around the target policy / dynamics built above
setup_experiment = partial(
    setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb,
    learning_iteration_cb=learning_iteration_cb,
    debug_plot=2,
    render=False)

In [None]:
# Experiment 2: optimise the plain task cost on the target environment,
# starting from the source policy and dynamics.
output_dir = os.path.join(
    sim2real_output_dir, target_env.name + '002_task_cost_from_source')
utils.set_output_dir(utils.unique_path(output_dir))

params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations

# copy_params=True: the target models are loaded from the source files
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)

# options for the policy optimizer and for the loss
polopt_kwargs = {'clip': 1.0, 'polyak_averaging': None}
loss_kwargs = {
    'n_samples': n_samples,
    'mm_state': True,
    'mm_cost': True,
    'noisy_policy_input': True,
    'crn': True,
    'time_varying_cost': False,
    'extra_shared': extra_shared,
    'intermediate_outs': False,
}

# the experiment is set up around the target policy / dynamics built above
setup_experiment = partial(
    setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb,
    learning_iteration_cb=learning_iteration_cb,
    debug_plot=2,
    render=False)

In [None]:
# experiment 3 learn starting from scratch, using klqp imitation loss
output_dir = os.path.join(sim2real_output_dir, target_env.name + '003_il_klqp_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
# copy_params=False: target models are built from the network specs
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
# time_varying_cost=True: presumably makes the cost receive the timestep as
# its first argument, matching task_plus_il_cost(t, mx, Sx) -- TODO confirm
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

# pure imitation: task cost weight 0, imitation loss weight 1
cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLQP)

# optimise the imitation cost built above; passing task_cost here would
# leave `cost` unused and never apply the imitation loss
run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 4 learn starting from scratch, using klpq imitation loss
output_dir = os.path.join(sim2real_output_dir, target_env.name + '004_il_klpq_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
# copy_params=False: target models are built from the network specs
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
# time_varying_cost=True: presumably makes the cost receive the timestep as
# its first argument, matching task_plus_il_cost(t, mx, Sx) -- TODO confirm
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

# pure imitation: task cost weight 0, imitation loss weight 1
cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLPQ)

# optimise the imitation cost built above; passing task_cost here would
# leave `cost` unused and never apply the imitation loss
run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 5 learn starting from source params, using klqp imitation loss
output_dir = os.path.join(sim2real_output_dir, target_env.name + '005_il_klqp_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
# copy_params=True: target models are loaded from the source files
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
# time_varying_cost=True: presumably makes the cost receive the timestep as
# its first argument, matching task_plus_il_cost(t, mx, Sx) -- TODO confirm
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

# pure imitation: task cost weight 0, imitation loss weight 1
cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLQP)

# optimise the imitation cost built above; passing task_cost here would
# leave `cost` unused and never apply the imitation loss
run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 6 learn starting from source, using klpq imitation loss
# (directory prefix is 006 to match the experiment number; it used to
# repeat experiment 4's '004' prefix)
output_dir = os.path.join(sim2real_output_dir, target_env.name + '006_il_klpq_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
# copy_params=True: target models are loaded from the source files
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
# time_varying_cost=True: presumably makes the cost receive the timestep as
# its first argument, matching task_plus_il_cost(t, mx, Sx) -- TODO confirm
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

# pure imitation: task cost weight 0, imitation loss weight 1
cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLPQ)

# optimise the imitation cost built above; passing task_cost here would
# leave `cost` unused and never apply the imitation loss
run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 7 learn starting from scratch, using klqp imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '007_taskplusil_klqp_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
# copy_params=False for BOTH models: this is a from-scratch run, so the
# target policy must not be loaded from the source policy file either
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
# time_varying_cost=True: presumably makes the cost receive the timestep as
# its first argument, matching task_plus_il_cost(t, mx, Sx) -- TODO confirm
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

# task cost (weight 1) plus a small imitation term (weight 1e-3)
cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLQP)

# optimise the combined cost built above; passing task_cost here would
# leave `cost` unused and never apply the imitation loss
run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 8 learn starting from scratch, using klpq imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '008_taskplusil_klpq_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
# copy_params=False: target models are built from the network specs
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
# time_varying_cost=True: presumably makes the cost receive the timestep as
# its first argument, matching task_plus_il_cost(t, mx, Sx) -- TODO confirm
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

# task cost (weight 1) plus a small imitation term (weight 1e-3)
cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLPQ)

# optimise the combined cost built above; passing task_cost here would
# leave `cost` unused and never apply the imitation loss
run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 9 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '009_taskplusil_klqp_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
# NOTE(review): the other from-source experiments (2, 5, 6) use n_rnd = 0;
# confirm that one random initial trial is intended here.
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
# copy_params=True: target models are loaded from the source files
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
# time_varying_cost=True: presumably makes the cost receive the timestep as
# its first argument, matching task_plus_il_cost(t, mx, Sx) -- TODO confirm
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

# task cost (weight 1) plus a small imitation term (weight 1e-3)
cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLQP)

# optimise the combined cost built above; passing task_cost here would
# leave `cost` unused and never apply the imitation loss
run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 10 learn starting from source params, using klpq imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '010_taskplusil_klpq_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
# NOTE(review): the other from-source experiments (2, 5, 6) use n_rnd = 0;
# confirm that one random initial trial is intended here.
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
# copy_params=True: target models are loaded from the source files
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
# time_varying_cost=True: presumably makes the cost receive the timestep as
# its first argument, matching task_plus_il_cost(t, mx, Sx) -- TODO confirm
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

# task cost (weight 1) plus a small imitation term (weight 1e-3)
cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLPQ)

# optimise the combined cost built above; passing task_cost here would
# leave `cost` unused and never apply the imitation loss
run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)