In [1]:
 %matplotlib qt
import copy
import dill
import os
import numpy as np
import lasagne
import theano

from functools import partial

from kusanagi import utils
from kusanagi.base import apply_controller, ExperienceDataset
from kusanagi.ghost import control, regression
from kusanagi.shell import cartpole, arduino
from kusanagi.shell.cost import gaussian_kl_loss, convert_angle_dimensions
from kusanagi.shell.experiment_utils import run_pilco_experiment, setup_mc_pilco_experiment, plot_rollout

from matplotlib import pyplot as plt
# np.random.seed(1337)
np.set_printoptions(linewidth=500)

In [2]:
# init params
params = cartpole.default_params()
params['optimizer']['min_method'] = 'adam'
params['optimizer']['max_evals'] = 1000
params['learning_rate'] = 1e-4
params['crn_dropout'] = True
params['min_steps'] = 30
n_samples = 100                     # number of MC samples for bayesian nn
n_demo = 10                          # number of example trajectories
pol_adjustment = False

H = params['min_steps']
gamma = params['discount']
angle_dims = params['angle_dims']

# initial state distribution
p0 = params['state0_dist']
D = p0.mean.size

dyn_path = '/home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21'
pol_path = '/home/juancamilog/.kusanagi/output/cartpole_kl_loss/policy_21'
exp_path = None #'/home/juancamilog/.kusanagi/output/cartpole_kl_loss/experience_29'

In [3]:
def init_dyn(params, dyn_path=None, copy_params=True):

    dyn_spec = dict(
        hidden_dims=[200]*2,
        p=True, p_input=True,
        nonlinearities=regression.nonlinearities.rectify,
        W_init=lasagne.init.GlorotNormal(gain='relu'),
        dropout_class=regression.layers.DenseLogNormalDropoutLayer,
        build_fn=regression.dropout_mlp)
    
    if dyn_path is not None:
        # load dynamics model
        source_dyn = regression.BNN(
            filename=dyn_path, name='source_dyn', **params['dynamics_model'])
    else:
        # init dynamics model
        source_dyn = regression.BNN(network_spec=dyn_spec, name='source_dyn', **params['dynamics_model'])
        
    if copy_params and dyn_path is not None:
        target_dyn = regression.BNN(
            filename=dyn_path, name='target_dyn', **params['dynamics_model'])
    else:
        target_dyn = regression.BNN(network_spec=dyn_spec, name='target_dyn', **params['dynamics_model'])

    return source_dyn, target_dyn

def init_pol(params,  pol_path=None, adjustment=False, copy_params=True):
    pol_spec = dict(
        hidden_dims=[200]*2,
        p=0.1, p_input=0.0,
        nonlinearities=regression.nonlinearities.rectify,
        W_init=lasagne.init.GlorotNormal(gain='relu'),
        dropout_class=regression.layers.DenseDropoutLayer,
        build_fn=regression.dropout_mlp)

    if pol_path is not None:
        # load policy
        source_pol = control.NNPolicy(params['dynamics_model']['odims'], filename=pol_path, **params['policy'])
    else:
        # init policy
        source_pol = control.NNPolicy(
            params['dynamics_model']['odims'], network_spec=pol_spec, heteroscedastic=False, **params['policy'])
    if pol_adjustment:
        # init adjustment model
        target_pol = control.AdjustedPolicy(
            source_pol, maxU=source_pol.maxU, angle_dims=source_pol.angle_dims,
            adjustment_model_class=regression.BNN)
        target_pol.adjustment_model.trained = True
    else:
        if copy_params and pol_path is not None:
            target_pol = control.NNPolicy(
                params['dynamics_model']['odims'], filename=pol_path, **params['policy'])
        else:
            target_pol = control.NNPolicy(
                params['dynamics_model']['odims'], network_spec=pol_spec, heteroscedastic=False, **params['policy'])
            
    return source_pol, target_pol

# init task cost
task_cost = partial(cartpole.cartpole_loss, **params['cost'])

# init source environment
params['source'] = params['plant']
params['source']['name'] = 'Cartpole_src'
source_env = cartpole.Cartpole(**params['source'])

In [4]:
# collect example trajectory data on sim environment
source_pol = init_pol(params, pol_path)[0]
if exp_path is not None:
    source_exp = ExperienceDataset(filename=exp_path)
else:
    source_exp = ExperienceDataset()

# init expert trajectory variables
n_episodes = source_exp.n_episodes()
if n_demo > n_episodes:
    # function to execute before applying policy
    def gTrig(state):
        return utils.gTrig_np(state, angle_dims).flatten()

    # function to execute after applying policy
    def step_cb(state, action, cost, info, env=None):
        env.render()

    # apply controller
    callback = partial(step_cb, env=source_env)

    for i in range(n_demo-n_episodes):
        ret = apply_controller(source_env, source_pol, H+1, gTrig, callback)
        source_exp.append_episode(*ret)

[2018-05-12 13:48:06.942303] NNPolicy > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/policy_21.zip
[2018-05-12 13:48:06.957485] NNPolicy > Building network
('InputLayer', {'shape': (None, 5), 'name': 'NNPolicy_input'})
('DenseLayer', {'W': NNPolicy_fc0>W, 'b': NNPolicy_fc0>b, 'name': 'NNPolicy_fc0', 'nonlinearity': <function rectify at 0x7f12bebeeb90>, 'num_units': 200})
('DenseDropoutLayer', {'b': NNPolicy_fc1>b, 'name': 'NNPolicy_fc1', 'nonlinearity': <function rectify at 0x7f12bebeeb90>, 'noise_samples': NNPolicy_fc1>noise_samples, 'p': 0.1, 'num_units': 200, 'W': NNPolicy_fc1>W})
('DenseDropoutLayer', {'b': NNPolicy_output>b, 'name': 'NNPolicy_output', 'nonlinearity': <function linear at 0x7f12bebf6050>, 'noise_samples': NNPolicy_output>noise_samples, 'p': 0.1, 'num_units': 1, 'W': NNPolicy_output>W})
[2018-05-12 13:48:06.962923] NNPolicy > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/policy_21.zip
[2018-05-12 13:48:06.975962] NNP

In [10]:
# source trajectory
trajs = np.array(source_exp.states)
tr_shape = trajs.shape

trajs = utils.gTrig_np(trajs.reshape((tr_shape[0]*tr_shape[1], tr_shape[2])), angle_dims)
trajectories = trajs.reshape((tr_shape[0], tr_shape[1], trajs.shape[-1])).astype(theano.config.floatX)

traj_mean = trajectories.mean(0)
trajc = trajectories[:, :, :, None]
trajmm = traj_mean[:, :, None]
N = (trajc.shape[0]-1.0)
traj_cov = (trajc*trajc.swapaxes(2,3)).sum(0)/N
traj_cov -= (trajmm*trajmm.swapaxes(1,2))

trajs = theano.shared(trajectories, name='trajs')
target_mean = theano.shared(traj_mean, name='target_mean')
target_cov = theano.shared(traj_cov, name='target_cov')

# define cost as sum of task cost and deviation form expert demonstration
def task_plus_il_cost(t, mx, Sx, weights=[1, 1e-4], loss_type=utils.ImitationLossType.KLQP):
    '''
        The IL term will penalize rollout predictive distributions that 
        are too different from the target distribution
    '''
    mxa, Sxa = convert_angle_dimensions(mx, Sx, angle_dims)
    mt, St = target_mean[t], target_cov[t]

    if loss_type == utils.ImitationLossType.KLQP:
        imitation_loss = gaussian_kl_loss(mxa, Sxa, mt, St)
    elif loss_type == utils.ImitationLossType.KLPQ:
        imitation_loss = gaussian_kl_loss(mt, St, mxa, Sxa)
    elif loss_type == utils.ImitationLossType.KLSYM:
        imitation_loss = 0.5*(gaussian_kl_loss(mt, St, mxa, Sxa) + gaussian_kl_loss(mxa, Sxa, mt, St))
    return weights[0]*task_cost(mx, Sx)[0] + weights[1]*imitation_loss

In [6]:
extra_shared = [trajs, target_mean, target_cov]
rollout_fn = None
target_exp = None
fig = None
axarr = None


def learning_iteration_cb(exp, dyn, pol, polopt, params, rollout_fn_in):
    global rollout_fn
    global target_exp
    i = exp.curr_episode
    # setup output directory
    exp.save(None, 'experience_%d' % (i))
    pol.save(None, 'policy_%d' % (i))
    dyn.save(None, 'dynamics_%d' % (i))
    with open(os.path.join(utils.get_output_dir(), 'config.dill'), 'wb') as f:
        dill.dump(params, f)
    rollout_fn = rollout_fn_in
    target_exp = exp

counter = 0
def minimize_cb(*args, **kwargs):
    global fig
    global axarr
    global counter
    if counter % 500 == 0:
        p0 = params['state0_dist']
        m0, S0 = p0.mean, p0.cov
        fig, axarr = plot_rollout(rollout_fn, source_exp, m0, S0, H, 1.0,
                                  fig=fig, axarr=axarr, n_exp=n_demo, name='Rollout during optimization')
        plt.waitforbuttonpress(0.01)
    counter += 1

In [7]:
params['target'] = copy.deepcopy(params['plant'])
params['target']['pole_mass'] *= 2
params['target']['name'] = 'target_2x_mass'
target_env = cartpole.Cartpole(**params['target'])

In [None]:
params['target'] = copy.deepcopy(params['plant'])
params['target']['pole_length'] *= 2
params['target']['name'] = 'target_2x_length'
target_env = cartpole.Cartpole(**params['target'])

In [8]:
# experiment 1 learn from scratch
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '001_no_transfer')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=False,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-12 13:48:27.820705] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-12 13:48:27.852473] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f12bebeeb90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f12bebeeb90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f12bebf6050>, 'n

[2018-05-12 13:50:41.665506] target_dyn_opt > Initial loss [133.31895743475843]
[2K[2018-05-12 13:50:54.693628] target_dyn_opt > Curr loss: 2.148382E+01 [1941: 2.060586E+01], n_evals: 1999, Avg. time per updt: 0.004992
[2018-05-12 13:50:54.703835] target_dyn_opt > Done training. New loss [24.196404] iter: [2000]
[2018-05-12 13:50:54.706563] train_dynamics > Done training dynamics model
[2018-05-12 13:50:54.707951] Experience > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/experience_1.zip
[2018-05-12 13:50:54.827774] NNPolicy > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/policy_1.zip
[2018-05-12 13:50:54.903440] target_dyn > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/dynamics_1.zip
[2018-05-12 13:50:56.574932] ==== Iteration [2], experience: [60 steps] ====
[2018-05-12 13:50:56.579818] SGDOptimizer > Optimizing parameters
[2018-05-12 13:50:56.708188] SGDOptimizer > Initial los

[2K[2018-05-12 13:59:54.301906] SGDOptimizer > Curr loss: 5.898086E-01, n_evals: 999, Avg. time per updt: 0.091044
[2018-05-12 13:59:54.330819] SGDOptimizer > Done training. New loss [0.578723] iter: [999]
[2018-05-12 13:59:54.332452] apply_controller > Starting run
[2018-05-12 13:59:54.333659] apply_controller > Running for 3.000000 seconds
[2018-05-12 13:59:54.572054] apply_controller > Done. Stopping robot. Value of run [28.041107]
[2018-05-12 13:59:54.573540] target_2x_mass > Stopping robot
[2018-05-12 13:59:54.574839] train_dynamics > Training dynamics model
[2018-05-12 13:59:54.577669] train_dynamics > Dataset size:: Inputs: [ (203, 6) ], Targets: [ (203, 4) ] 
[2018-05-12 13:59:54.579039] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 13:59:54.594188] target_dyn_opt > Initial loss [3.9307521544006914]
[2K[2018-05-12 14:00:12.384877] target_dyn_opt > Curr loss: -1.884772E+00 [1909: -2.553112E+00], n_evals: 1999, Avg. time per updt: 0.007386
[2018-05-12 14:0

[2018-05-12 14:07:53.725578] train_dynamics > Done training dynamics model
[2018-05-12 14:07:53.727253] Experience > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/experience_10.zip
[2018-05-12 14:07:54.285185] NNPolicy > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/policy_10.zip
[2018-05-12 14:07:54.371614] target_dyn > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/dynamics_10.zip
[2018-05-12 14:07:56.066236] ==== Iteration [11], experience: [330 steps] ====
[2018-05-12 14:07:56.071409] SGDOptimizer > Optimizing parameters
[2018-05-12 14:07:56.185833] SGDOptimizer > Initial loss [0.5162771344184875]
[2K[2018-05-12 14:09:28.329044] SGDOptimizer > Curr loss: 4.539379E-01, n_evals: 999, Avg. time per updt: 0.090816
[2018-05-12 14:09:28.355669] SGDOptimizer > Done training. New loss [0.454654] iter: [999]
[2018-05-12 14:09:28.357362] apply_controller > Starting run
[2018-05-12 14:09:2

[2018-05-12 14:16:56.757993] apply_controller > Done. Stopping robot. Value of run [12.591006]
[2018-05-12 14:16:56.759208] target_2x_mass > Stopping robot
[2018-05-12 14:16:56.760530] train_dynamics > Training dynamics model
[2018-05-12 14:16:56.764369] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-12 14:16:56.765777] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 14:16:56.779575] target_dyn_opt > Initial loss [-8.021506384207772]
[2K[2018-05-12 14:17:14.599769] target_dyn_opt > Curr loss: -8.853582E+00 [1530: -9.528332E+00], n_evals: 1999, Avg. time per updt: 0.007412
[2018-05-12 14:17:14.614084] target_dyn_opt > Done training. New loss [-9.181263] iter: [2000]
[2018-05-12 14:17:14.616825] train_dynamics > Done training dynamics model
[2018-05-12 14:17:14.618370] Experience > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/experience_15.zip
[2018-05-12 14:17:15.386064] NNPolicy > Saving sta

[2018-05-12 14:24:33.695322] target_dyn > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/dynamics_19.zip
[2018-05-12 14:24:35.357790] ==== Iteration [20], experience: [600 steps] ====
[2018-05-12 14:24:35.362856] SGDOptimizer > Optimizing parameters
[2018-05-12 14:24:35.481892] SGDOptimizer > Initial loss [0.42585092782974243]
[2K[2018-05-12 14:26:04.747751] SGDOptimizer > Curr loss: 4.073306E-01, n_evals: 999, Avg. time per updt: 0.087962
[2018-05-12 14:26:04.774754] SGDOptimizer > Done training. New loss [0.409185] iter: [999]
[2018-05-12 14:26:04.776715] apply_controller > Starting run
[2018-05-12 14:26:04.777917] apply_controller > Running for 3.000000 seconds
[2018-05-12 14:26:04.962077] apply_controller > Done. Stopping robot. Value of run [25.213491]
[2018-05-12 14:26:04.963437] target_2x_mass > Stopping robot
[2018-05-12 14:26:04.964738] train_dynamics > Training dynamics model
[2018-05-12 14:26:04.968480] train_dynamics > Dataset size:: Input

[2018-05-12 14:33:26.914956] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 14:33:26.930791] target_dyn_opt > Initial loss [-9.375177718359849]
[2K[2018-05-12 14:33:44.861823] target_dyn_opt > Curr loss: -1.104130E+01 [1715: -1.175312E+01], n_evals: 1999, Avg. time per updt: 0.007459
[2018-05-12 14:33:44.877446] target_dyn_opt > Done training. New loss [-11.465817] iter: [2000]
[2018-05-12 14:33:44.880090] train_dynamics > Done training dynamics model
[2018-05-12 14:33:44.881748] Experience > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/experience_24.zip
[2018-05-12 14:33:46.048106] NNPolicy > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/policy_24.zip
[2018-05-12 14:33:46.129061] target_dyn > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/dynamics_24.zip
[2018-05-12 14:33:47.802791] ==== Iteration [25], experience: [750 steps] ====
[2018-05-12 14:33:47.807852]

[2018-05-12 14:41:16.533173] SGDOptimizer > Initial loss [0.4186825752258301]
[2K[2018-05-12 14:42:46.431938] SGDOptimizer > Curr loss: 3.934833E-01, n_evals: 999, Avg. time per updt: 0.088571
[2018-05-12 14:42:46.457588] SGDOptimizer > Done training. New loss [0.398087] iter: [999]
[2018-05-12 14:42:46.459452] apply_controller > Starting run
[2018-05-12 14:42:46.461329] apply_controller > Running for 3.000000 seconds
[2018-05-12 14:42:46.689373] apply_controller > Done. Stopping robot. Value of run [10.834880]
[2018-05-12 14:42:46.690735] target_2x_mass > Stopping robot
[2018-05-12 14:42:46.691964] train_dynamics > Training dynamics model
[2018-05-12 14:42:46.698671] train_dynamics > Dataset size:: Inputs: [ (870, 6) ], Targets: [ (870, 4) ] 
[2018-05-12 14:42:46.700143] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 14:42:46.714189] target_dyn_opt > Initial loss [-10.753128531643714]
[2K[2018-05-12 14:43:04.900845] target_dyn_opt > Curr loss: -1.197830E+01 [148

In [None]:
# experiment 2 learn starting from source policy and dynamics
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '002_task_cost_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=False,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-12 15:14:27.592466] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-12 15:14:27.619675] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f12bebeeb90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f12bebeeb90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f12bebf6050>, 'n

[2018-05-12 15:17:02.805738] apply_controller > Starting run
[2018-05-12 15:17:02.807052] apply_controller > Running for 3.000000 seconds
[2018-05-12 15:17:02.981100] apply_controller > Done. Stopping robot. Value of run [29.993176]
[2018-05-12 15:17:02.982710] target_2x_mass > Stopping robot
[2018-05-12 15:17:02.984065] train_dynamics > Training dynamics model
[2018-05-12 15:17:02.988161] train_dynamics > Dataset size:: Inputs: [ (58, 6) ], Targets: [ (58, 4) ] 
[2018-05-12 15:17:02.989722] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 15:17:03.023278] target_dyn_opt > Initial loss [161.2184887225258]
[2K[2018-05-12 15:17:16.365105] target_dyn_opt > Curr loss: 1.417724E+01 [1985: 1.410008E+01], n_evals: 1999, Avg. time per updt: 0.005131
[2018-05-12 15:17:16.375706] target_dyn_opt > Done training. New loss [14.388128] iter: [2000]
[2018-05-12 15:17:16.378255] train_dynamics > Done training dynamics model
[2018-05-12 15:17:16.382286] Experience > Saving state to 

[2018-05-12 15:25:01.220639] apply_controller > Done. Stopping robot. Value of run [19.265255]
[2018-05-12 15:25:01.221863] target_2x_mass > Stopping robot
[2018-05-12 15:25:01.223164] train_dynamics > Training dynamics model
[2018-05-12 15:25:01.225567] train_dynamics > Dataset size:: Inputs: [ (174, 6) ], Targets: [ (174, 4) ] 
[2018-05-12 15:25:01.227149] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 15:25:01.240362] target_dyn_opt > Initial loss [4.656178794886873]
[2K[2018-05-12 15:25:19.072032] target_dyn_opt > Curr loss: -4.712666E+00 [1561: -5.065705E+00], n_evals: 1999, Avg. time per updt: 0.007370
[2018-05-12 15:25:19.084668] target_dyn_opt > Done training. New loss [-4.947109] iter: [2000]
[2018-05-12 15:25:19.087254] train_dynamics > Done training dynamics model
[2018-05-12 15:25:19.089176] Experience > Saving state to /home/juancamilog/.kusanagi/output/target_2x_mass001_no_transfer/target_2x_mass002_task_cost_from_source/target_2x_mass002_task_cost_f

In [None]:
# experiment 3 learn starting from scratch, using klqp imitation loss
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '003_il_klqp_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLQP)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 4 learn starting from scratch, using klpq imitation loss
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '004_il_klpq_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLPQ)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 5 learn starting from source params, using klqp imitation loss
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '005_il_klqp_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLQP)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 6 learn starting from source, using klpq imitation loss
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '004_il_klpq_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLPQ)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 7 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '007_taskplusil_klqp_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLQP)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 8 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '008_taskplusil_klpq_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLPQ)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 9 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '009_taskplusil_klqp_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLQP)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

In [None]:
# experiment 8 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(utils.get_output_dir(), target_env.name + '010_taskplusil_klpq_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLPQ)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)