In [1]:
 %matplotlib qt
import copy
import dill
import os
import numpy as np
import lasagne
import theano

from functools import partial

from kusanagi import utils
from kusanagi.base import apply_controller, ExperienceDataset
from kusanagi.ghost import control, regression
from kusanagi.shell import cartpole, arduino
from kusanagi.shell.cost import gaussian_kl_loss, mmd_loss, convert_angle_dimensions
from kusanagi.shell.experiment_utils import run_pilco_experiment, setup_mc_pilco_experiment, plot_rollout

from matplotlib import pyplot as plt
# np.random.seed(1337)
np.set_printoptions(linewidth=500)

In [2]:
# init params
output_dir = utils.get_output_dir()
sim2real_output_dir = '/localdata/juan/sim2real_results'


params = cartpole.default_params()
params['optimizer']['min_method'] = 'adam'
params['optimizer']['max_evals'] = 1000
params['learning_rate'] = 1e-4
params['crn_dropout'] = True
params['min_steps'] = 30
n_samples = 100                     # number of MC samples for bayesian nn
n_demo = 10                          # number of example trajectories
pol_adjustment = False

H = params['min_steps']
gamma = params['discount']
angle_dims = params['angle_dims']

# initial state distribution
p0 = params['state0_dist']
D = p0.mean.size

dyn_path = os.path.join(output_dir, 'cartpole_kl_loss/dynamics_21')
pol_path = os.path.join(output_dir, 'cartpole_kl_loss/policy_21')
exp_path = None #os.path.join(output_dir, 'cartpole_kl_loss/experience_29')

In [3]:
def init_dyn(params, dyn_path=None, copy_params=True):

    dyn_spec = dict(
        hidden_dims=[200]*2,
        p=True, p_input=True,
        nonlinearities=regression.nonlinearities.rectify,
        W_init=lasagne.init.GlorotNormal(gain='relu'),
        dropout_class=regression.layers.DenseLogNormalDropoutLayer,
        build_fn=regression.dropout_mlp)
    
    if dyn_path is not None:
        # load dynamics model
        source_dyn = regression.BNN(
            filename=dyn_path, name='source_dyn', **params['dynamics_model'])
    else:
        # init dynamics model
        source_dyn = regression.BNN(network_spec=dyn_spec, name='source_dyn', **params['dynamics_model'])
        
    if copy_params and dyn_path is not None:
        target_dyn = regression.BNN(
            filename=dyn_path, name='target_dyn', **params['dynamics_model'])
    else:
        target_dyn = regression.BNN(network_spec=dyn_spec, name='target_dyn', **params['dynamics_model'])

    return source_dyn, target_dyn

def init_pol(params,  pol_path=None, adjustment=False, copy_params=True):
    pol_spec = dict(
        hidden_dims=[200]*2,
        p=0.1, p_input=0.0,
        nonlinearities=regression.nonlinearities.rectify,
        W_init=lasagne.init.GlorotNormal(gain='relu'),
        dropout_class=regression.layers.DenseDropoutLayer,
        build_fn=regression.dropout_mlp)

    if pol_path is not None:
        # load policy
        source_pol = control.NNPolicy(params['dynamics_model']['odims'], filename=pol_path, **params['policy'])
    else:
        # init policy
        source_pol = control.NNPolicy(
            params['dynamics_model']['odims'], network_spec=pol_spec, heteroscedastic=False, **params['policy'])
    if pol_adjustment:
        # init adjustment model
        target_pol = control.AdjustedPolicy(
            source_pol, maxU=source_pol.maxU, angle_dims=source_pol.angle_dims,
            adjustment_model_class=regression.BNN)
        target_pol.adjustment_model.trained = True
    else:
        if copy_params and pol_path is not None:
            target_pol = control.NNPolicy(
                params['dynamics_model']['odims'], filename=pol_path, **params['policy'])
        else:
            target_pol = control.NNPolicy(
                params['dynamics_model']['odims'], network_spec=pol_spec, heteroscedastic=False, **params['policy'])
            
    return source_pol, target_pol

# init task cost
task_cost = partial(cartpole.cartpole_loss, **params['cost'])

# init source environment
params['source'] = params['plant']
params['source']['name'] = 'Cartpole_src'
source_env = cartpole.Cartpole(**params['source'])

In [4]:
# collect example trajectory data on sim environment
source_pol = init_pol(params, pol_path)[0]
if exp_path is not None:
    source_exp = ExperienceDataset(filename=exp_path)
else:
    source_exp = ExperienceDataset()

# init expert trajectory variables
n_episodes = source_exp.n_episodes()
if n_demo > n_episodes:
    # function to execute before applying policy
    def gTrig(state):
        return utils.gTrig_np(state, angle_dims).flatten()

    # function to execute after applying policy
    def step_cb(state, action, cost, info, env=None):
        env.render()

    # apply controller
    callback = partial(step_cb, env=source_env)

    for i in range(n_demo-n_episodes):
        ret = apply_controller(source_env, source_pol, H+1, gTrig, callback)
        source_exp.append_episode(*ret)

[2018-05-14 23:39:54.609339] NNPolicy > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/policy_21.zip
[2018-05-14 23:39:54.624844] NNPolicy > Building network
('InputLayer', {'shape': (None, 5), 'name': 'NNPolicy_input'})
('DenseLayer', {'W': NNPolicy_fc0>W, 'b': NNPolicy_fc0>b, 'name': 'NNPolicy_fc0', 'nonlinearity': <function rectify at 0x7f7fbc4b6b90>, 'num_units': 200})
('DenseDropoutLayer', {'b': NNPolicy_fc1>b, 'name': 'NNPolicy_fc1', 'nonlinearity': <function rectify at 0x7f7fbc4b6b90>, 'noise_samples': NNPolicy_fc1>noise_samples, 'p': 0.1, 'num_units': 200, 'W': NNPolicy_fc1>W})
('DenseDropoutLayer', {'b': NNPolicy_output>b, 'name': 'NNPolicy_output', 'nonlinearity': <function linear at 0x7f7fbc4ba050>, 'noise_samples': NNPolicy_output>noise_samples, 'p': 0.1, 'num_units': 1, 'W': NNPolicy_output>W})
[2018-05-14 23:39:55.267093] NNPolicy > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/policy_21.zip
[2018-05-14 23:39:55.280389] NNP

In [5]:
# source trajectory
trajs = np.array(source_exp.states)
tr_shape = trajs.shape

trajs = utils.gTrig_np(trajs.reshape((tr_shape[0]*tr_shape[1], tr_shape[2])), angle_dims)
trajectories = trajs.reshape((tr_shape[0], tr_shape[1], trajs.shape[-1])).astype(theano.config.floatX)

traj_mean = trajectories.mean(0)
trajc = trajectories[:, :, :, None]
trajmm = traj_mean[:, :, None]
N = (trajc.shape[0]-1.0)
traj_cov = (trajc*trajc.swapaxes(2,3)).sum(0)/N
traj_cov -= (trajmm*trajmm.swapaxes(1,2))

trajs = theano.shared(trajectories, name='trajs')
target_mean = theano.shared(traj_mean, name='target_mean')
target_cov = theano.shared(traj_cov, name='target_cov')

In [6]:
extra_shared = [trajs, target_mean, target_cov]
rollout_fn = None
target_exp = None
fig = None
axarr = None


def learning_iteration_cb(exp, dyn, pol, polopt, params, rollout_fn_in):
    global rollout_fn
    global target_exp
    i = exp.curr_episode
    # setup output directory
    exp.save(None, 'experience_%d' % (i))
    pol.save(None, 'policy_%d' % (i))
    dyn.save(None, 'dynamics_%d' % (i))
    with open(os.path.join(utils.get_output_dir(), 'config.dill'), 'wb') as f:
        dill.dump(params, f)
    rollout_fn = rollout_fn_in
    target_exp = exp

counter = 0
def minimize_cb(*args, **kwargs):
    global fig
    global axarr
    global counter
    if counter % 500 == 0:
        p0 = params['state0_dist']
        m0, S0 = p0.mean, p0.cov
        fig, axarr = plot_rollout(rollout_fn, source_exp, m0, S0, H, 1.0,
                                  fig=fig, axarr=axarr, n_exp=n_demo, name='Rollout during optimization')
        plt.waitforbuttonpress(0.01)
    counter += 1
    

# define cost as sum of task cost and deviation form expert demonstration
def task_plus_il_cost(t, mx, Sx, weights=[1, 1e-4], loss_type=utils.ImitationLossType.KLQP):
    '''
        The IL term will penalize rollout predictive distributions that 
        are too different from the target distribution
    '''
    mxa, Sxa = convert_angle_dimensions(mx, Sx, angle_dims)
    mt, St = target_mean[t], target_cov[t]

    if loss_type == utils.ImitationLossType.KLQP:
        imitation_loss = gaussian_kl_loss(mxa, Sxa, mt, St)
    elif loss_type == utils.ImitationLossType.KLPQ:
        imitation_loss = gaussian_kl_loss(mt, St, mxa, Sxa)
    elif loss_type == utils.ImitationLossType.KLSYM:
        imitation_loss = 0.5*(gaussian_kl_loss(mt, St, mxa, Sxa) + gaussian_kl_loss(mxa, Sxa, mt, St))
    elif loss_type == utils.ImitationLossType.MMD:
        mmd = mmd_loss(mxa, Sxa, trajs[:, t, :])
        imitation_loss = theano.tensor.sqrt(mmd)
    return weights[0]*task_cost(mx, Sx)[0] + weights[1]*imitation_loss

In [7]:
params['target'] = copy.deepcopy(params['plant'])
params['target']['pole_mass'] *= 2
params['target']['name'] = 'target_2x_mass'
target_env = cartpole.Cartpole(**params['target'])

In [7]:
params['target'] = copy.deepcopy(params['plant'])
params['target']['pole_length'] *= 2
params['target']['name'] = 'target_2x_length'
target_env = cartpole.Cartpole(**params['target'])

In [8]:
# experiment 1 learn from scratch
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_001_no_transfer')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=False,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-12 15:54:22.429103] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-12 15:54:22.451849] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f8d1f48b050>, 'n

[2018-05-12 15:56:33.598963] target_dyn_opt > Initial loss [196.87817812073425]
[2K[2018-05-12 15:56:46.483913] target_dyn_opt > Curr loss: 2.487258E+01 [1720: 2.168714E+01], n_evals: 1999, Avg. time per updt: 0.004921
[2018-05-12 15:56:46.492488] target_dyn_opt > Done training. New loss [23.101507] iter: [2000]
[2018-05-12 15:56:46.495014] train_dynamics > Done training dynamics model
[2018-05-12 15:56:46.496376] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/experience_1.zip
[2018-05-12 15:56:46.613623] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/policy_1.zip
[2018-05-12 15:56:46.689021] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/dynamics_1.zip
[2018-05-12 15:56:48.088281] ==== Iteration [2], experience: [60 steps] ====
[2018-05-12 15:56:48.093368] SGDOptimizer > Optimizing parameters
[2018-05-12 15:56:48.199887] SGDOptimizer > Initial loss [

[2K[2018-05-12 16:05:21.333019] SGDOptimizer > Curr loss: 6.946740E-01, n_evals: 999, Avg. time per updt: 0.084595
[2018-05-12 16:05:21.356317] SGDOptimizer > Done training. New loss [0.714920] iter: [999]
[2018-05-12 16:05:21.358078] apply_controller > Starting run
[2018-05-12 16:05:21.359390] apply_controller > Running for 3.000000 seconds
[2018-05-12 16:05:21.582548] apply_controller > Done. Stopping robot. Value of run [22.796093]
[2018-05-12 16:05:21.584227] target_2x_mass > Stopping robot
[2018-05-12 16:05:21.585468] train_dynamics > Training dynamics model
[2018-05-12 16:05:21.588266] train_dynamics > Dataset size:: Inputs: [ (203, 6) ], Targets: [ (203, 4) ] 
[2018-05-12 16:05:21.589721] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 16:05:21.603583] target_dyn_opt > Initial loss [3.0791926721473377]
[2K[2018-05-12 16:05:39.782419] target_dyn_opt > Curr loss: -2.215102E+00 [1806: -2.733445E+00], n_evals: 1999, Avg. time per updt: 0.007560
[2018-05-12 16:0

[2018-05-12 16:12:45.465630] train_dynamics > Done training dynamics model
[2018-05-12 16:12:45.467167] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/experience_10.zip
[2018-05-12 16:12:46.000982] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/policy_10.zip
[2018-05-12 16:12:46.085452] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/dynamics_10.zip
[2018-05-12 16:12:47.950014] ==== Iteration [11], experience: [330 steps] ====
[2018-05-12 16:12:47.955059] SGDOptimizer > Optimizing parameters
[2018-05-12 16:12:48.087515] SGDOptimizer > Initial loss [0.5894615650177002]
[2K[2018-05-12 16:14:14.796930] SGDOptimizer > Curr loss: 4.647201E-01, n_evals: 999, Avg. time per updt: 0.085369
[2018-05-12 16:14:14.820512] SGDOptimizer > Done training. New loss [0.461924] iter: [999]
[2018-05-12 16:14:14.822207] apply_controller > Starting run
[2018-05-12 16:14:14.8

[2018-05-12 16:21:33.370308] apply_controller > Done. Stopping robot. Value of run [23.538347]
[2018-05-12 16:21:33.371786] target_2x_mass > Stopping robot
[2018-05-12 16:21:33.373087] train_dynamics > Training dynamics model
[2018-05-12 16:21:33.376251] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-12 16:21:33.377630] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 16:21:33.392611] target_dyn_opt > Initial loss [-6.725470505590582]
[2K[2018-05-12 16:21:55.476629] target_dyn_opt > Curr loss: -9.423353E+00 [1672: -9.679023E+00], n_evals: 1999, Avg. time per updt: 0.009358
[2018-05-12 16:21:55.489449] target_dyn_opt > Done training. New loss [-9.322177] iter: [2000]
[2018-05-12 16:21:55.491991] train_dynamics > Done training dynamics model
[2018-05-12 16:21:55.493929] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/experience_15.zip
[2018-05-12 16:21:56.224672] NNPolicy > Saving stat

[2018-05-12 16:29:21.915939] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/dynamics_19.zip
[2018-05-12 16:29:23.690466] ==== Iteration [20], experience: [600 steps] ====
[2018-05-12 16:29:23.695736] SGDOptimizer > Optimizing parameters
[2018-05-12 16:29:23.814728] SGDOptimizer > Initial loss [0.6612314581871033]
[2K[2018-05-12 16:30:53.828263] SGDOptimizer > Curr loss: 4.069581E-01, n_evals: 999, Avg. time per updt: 0.088693
[2018-05-12 16:30:53.853326] SGDOptimizer > Done training. New loss [0.403314] iter: [999]
[2018-05-12 16:30:53.855186] apply_controller > Starting run
[2018-05-12 16:30:53.856561] apply_controller > Running for 3.000000 seconds
[2018-05-12 16:30:54.031195] apply_controller > Done. Stopping robot. Value of run [12.916805]
[2018-05-12 16:30:54.032896] target_2x_mass > Stopping robot
[2018-05-12 16:30:54.034244] train_dynamics > Training dynamics model
[2018-05-12 16:30:54.038782] train_dynamics > Dataset size:: Inputs:

[2018-05-12 16:38:32.093783] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 16:38:32.108932] target_dyn_opt > Initial loss [-10.190879085063933]
[2K[2018-05-12 16:38:50.328376] target_dyn_opt > Curr loss: -1.164239E+01 [1642: -1.206825E+01], n_evals: 1999, Avg. time per updt: 0.007624
[2018-05-12 16:38:50.341468] target_dyn_opt > Done training. New loss [-11.568586] iter: [2000]
[2018-05-12 16:38:50.344299] train_dynamics > Done training dynamics model
[2018-05-12 16:38:50.350676] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/experience_24.zip
[2018-05-12 16:38:51.478282] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/policy_24.zip
[2018-05-12 16:38:51.560647] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_001_no_transfer/dynamics_24.zip
[2018-05-12 16:38:53.318351] ==== Iteration [25], experience: [750 steps] ====
[2018-05-12 16:38:53.323465] S

[2018-05-12 16:47:16.933906] SGDOptimizer > Initial loss [0.817743182182312]
[2K[2018-05-12 16:48:53.999075] SGDOptimizer > Curr loss: 3.867545E-01, n_evals: 999, Avg. time per updt: 0.095736
[2018-05-12 16:48:54.031433] SGDOptimizer > Done training. New loss [0.729472] iter: [999]
[2018-05-12 16:48:54.033400] apply_controller > Starting run
[2018-05-12 16:48:54.034705] apply_controller > Running for 3.000000 seconds
[2018-05-12 16:48:54.222518] apply_controller > Done. Stopping robot. Value of run [12.085616]
[2018-05-12 16:48:54.224002] target_2x_mass > Stopping robot
[2018-05-12 16:48:54.225281] train_dynamics > Training dynamics model
[2018-05-12 16:48:54.229871] train_dynamics > Dataset size:: Inputs: [ (870, 6) ], Targets: [ (870, 4) ] 
[2018-05-12 16:48:54.231179] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 16:48:54.248269] target_dyn_opt > Initial loss [-10.706662858525911]
[2K[2018-05-12 16:49:13.598108] target_dyn_opt > Curr loss: -1.233316E+01 [1634

In [9]:
# experiment 2 learn starting from source policy and dynamics
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_002_task_cost_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=False,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

run_pilco_experiment(
    target_env, task_cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-12 16:51:20.497790] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-12 16:51:20.534702] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f8d1f48b050>, 'n

[2018-05-12 16:54:09.371597] target_2x_mass > Stopping robot
[2018-05-12 16:54:09.373750] train_dynamics > Training dynamics model
[2018-05-12 16:54:09.376520] train_dynamics > Dataset size:: Inputs: [ (58, 6) ], Targets: [ (58, 4) ] 
[2018-05-12 16:54:09.377812] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 16:54:09.390318] target_dyn_opt > Initial loss [75.53877804659564]
[2K[2018-05-12 16:54:23.136133] target_dyn_opt > Curr loss: 1.404247E+01 [1977: 1.374613E+01], n_evals: 1999, Avg. time per updt: 0.005315
[2018-05-12 16:54:23.146592] target_dyn_opt > Done training. New loss [14.228373] iter: [2000]
[2018-05-12 16:54:23.149583] train_dynamics > Done training dynamics model
[2018-05-12 16:54:23.150953] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/experience_1.zip
[2018-05-12 16:54:23.280492] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/policy_1.zip

[2018-05-12 17:03:26.738427] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/dynamics_5.zip
[2018-05-12 17:03:28.906829] ==== Iteration [6], experience: [180 steps] ====
[2018-05-12 17:03:28.911590] SGDOptimizer > Optimizing parameters
[2018-05-12 17:03:29.100702] SGDOptimizer > Initial loss [0.4596257507801056]
[2K[2018-05-12 17:05:42.244139] SGDOptimizer > Curr loss: 4.473623E-01, n_evals: 999, Avg. time per updt: 0.131875
[2018-05-12 17:05:42.288190] SGDOptimizer > Done training. New loss [0.442795] iter: [999]
[2018-05-12 17:05:42.289929] apply_controller > Starting run
[2018-05-12 17:05:42.291144] apply_controller > Running for 3.000000 seconds
[2018-05-12 17:05:42.482667] apply_controller > Done. Stopping robot. Value of run [13.155147]
[2018-05-12 17:05:42.483926] target_2x_mass > Stopping robot
[2018-05-12 17:05:42.485186] train_dynamics > Training dynamics model
[2018-05-12 17:05:42.488359] train_dynamics > Dataset size::

[2018-05-12 17:15:24.216665] train_dynamics > Dataset size:: Inputs: [ (319, 6) ], Targets: [ (319, 4) ] 
[2018-05-12 17:15:24.217917] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 17:15:24.238609] target_dyn_opt > Initial loss [-8.527793052428194]
[2K[2018-05-12 17:15:45.492051] target_dyn_opt > Curr loss: -9.501811E+00 [1678: -1.011010E+01], n_evals: 1999, Avg. time per updt: 0.009128
[2018-05-12 17:15:45.534376] target_dyn_opt > Done training. New loss [-9.623624] iter: [2000]
[2018-05-12 17:15:45.540950] train_dynamics > Done training dynamics model
[2018-05-12 17:15:45.543022] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/experience_10.zip
[2018-05-12 17:15:46.166306] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/policy_10.zip
[2018-05-12 17:15:46.260506] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_fro

[2018-05-12 17:25:54.723339] ==== Iteration [15], experience: [450 steps] ====
[2018-05-12 17:25:54.728714] SGDOptimizer > Optimizing parameters
[2018-05-12 17:25:54.934577] SGDOptimizer > Initial loss [0.440027117729187]
[2K[2018-05-12 17:28:19.372399] SGDOptimizer > Curr loss: 4.150839E-01, n_evals: 999, Avg. time per updt: 0.143137
[2018-05-12 17:28:19.422012] SGDOptimizer > Done training. New loss [0.413236] iter: [999]
[2018-05-12 17:28:19.423858] apply_controller > Starting run
[2018-05-12 17:28:19.425226] apply_controller > Running for 3.000000 seconds
[2018-05-12 17:28:19.607488] apply_controller > Done. Stopping robot. Value of run [12.009007]
[2018-05-12 17:28:19.608740] target_2x_mass > Stopping robot
[2018-05-12 17:28:19.610076] train_dynamics > Training dynamics model
[2018-05-12 17:28:19.614198] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-12 17:28:19.615983] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 17:2

[2018-05-12 17:37:40.661451] target_dyn_opt > Initial loss [-10.131616319047993]
[2K[2018-05-12 17:38:01.001045] target_dyn_opt > Curr loss: -1.200429E+01 [1485: -1.287330E+01], n_evals: 1999, Avg. time per updt: 0.008657
[2018-05-12 17:38:01.016656] target_dyn_opt > Done training. New loss [-11.990581] iter: [2000]
[2018-05-12 17:38:01.019278] train_dynamics > Done training dynamics model
[2018-05-12 17:38:01.020661] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/experience_19.zip
[2018-05-12 17:38:01.943593] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/policy_19.zip
[2018-05-12 17:38:02.036684] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/dynamics_19.zip
[2018-05-12 17:38:04.244616] ==== Iteration [20], experience: [600 steps] ====
[2018-05-12 17:38:04.249311] SGDOptimizer > Optimizing parameters
[2018-05-12 17:38:0

[2018-05-12 17:48:22.780171] SGDOptimizer > Optimizing parameters
[2018-05-12 17:48:22.952205] SGDOptimizer > Initial loss [0.38807886838912964]
[2K[2018-05-12 17:50:27.951149] SGDOptimizer > Curr loss: 3.800575E-01, n_evals: 999, Avg. time per updt: 0.123710
[2018-05-12 17:50:27.992171] SGDOptimizer > Done training. New loss [0.383806] iter: [999]
[2018-05-12 17:50:27.994084] apply_controller > Starting run
[2018-05-12 17:50:27.995567] apply_controller > Running for 3.000000 seconds
[2018-05-12 17:50:28.183770] apply_controller > Done. Stopping robot. Value of run [18.834042]
[2018-05-12 17:50:28.185118] target_2x_mass > Stopping robot
[2018-05-12 17:50:28.186364] train_dynamics > Training dynamics model
[2018-05-12 17:50:28.190992] train_dynamics > Dataset size:: Inputs: [ (725, 6) ], Targets: [ (725, 4) ] 
[2018-05-12 17:50:28.192269] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 17:50:28.207996] target_dyn_opt > Initial loss [-7.73263591207307]
[2K[2018-05-1

[2K[2018-05-12 18:01:02.860788] target_dyn_opt > Curr loss: -1.340658E+01 [1896: -1.396489E+01], n_evals: 1999, Avg. time per updt: 0.010010
[2018-05-12 18:01:02.881740] target_dyn_opt > Done training. New loss [-13.573767] iter: [2000]
[2018-05-12 18:01:02.885062] train_dynamics > Done training dynamics model
[2018-05-12 18:01:02.886479] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/experience_28.zip
[2018-05-12 18:01:04.528831] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/policy_28.zip
[2018-05-12 18:01:04.648673] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_002_task_cost_from_source/dynamics_28.zip
[2018-05-12 18:01:07.500020] ==== Iteration [29], experience: [870 steps] ====
[2018-05-12 18:01:07.505604] SGDOptimizer > Optimizing parameters
[2018-05-12 18:01:07.696139] SGDOptimizer > Initial loss [0.42210742831230164]
[2K[2018-05-12 18:03

In [11]:
# experiment 3 learn starting from scratch, using klqp imitation loss
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_003_il_klqp_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLQP)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-12 18:07:01.672530] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-12 18:07:01.696574] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f8d1f48b050>, 'n

[2018-05-12 18:09:41.461214] target_dyn_opt > Initial loss [86.12310002248228]
[2K[2018-05-12 18:09:54.839105] target_dyn_opt > Curr loss: 2.913741E+01 [1648: 2.204935E+01], n_evals: 1999, Avg. time per updt: 0.005158
[2018-05-12 18:09:54.849303] target_dyn_opt > Done training. New loss [24.445112] iter: [2000]
[2018-05-12 18:09:54.851855] train_dynamics > Done training dynamics model
[2018-05-12 18:09:54.853476] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/experience_1.zip
[2018-05-12 18:09:55.000338] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/policy_1.zip
[2018-05-12 18:09:55.104843] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/dynamics_1.zip
[2018-05-12 18:09:57.776343] ==== Iteration [2], experience: [60 steps] ====
[2018-05-12 18:09:57.781382] SGDOptimizer > Optimizing parameters
[2018-05-12 18:09:57.913607] SGD

[2018-05-12 18:18:55.728596] SGDOptimizer > Initial loss [16146.7490234375]
[2K[2018-05-12 18:20:47.453708] SGDOptimizer > Curr loss: 1.852543E+04, n_evals: 999, Avg. time per updt: 0.110363
[2018-05-12 18:20:47.478936] SGDOptimizer > Done training. New loss [25032.130859] iter: [999]
[2018-05-12 18:20:47.480988] apply_controller > Starting run
[2018-05-12 18:20:47.482785] apply_controller > Running for 3.000000 seconds
[2018-05-12 18:20:47.656831] apply_controller > Done. Stopping robot. Value of run [29.985237]
[2018-05-12 18:20:47.658165] target_2x_mass > Stopping robot
[2018-05-12 18:20:47.659386] train_dynamics > Training dynamics model
[2018-05-12 18:20:47.662451] train_dynamics > Dataset size:: Inputs: [ (203, 6) ], Targets: [ (203, 4) ] 
[2018-05-12 18:20:47.663823] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 18:20:47.680779] target_dyn_opt > Initial loss [9.682417066577418]
[2K[2018-05-12 18:21:05.893852] target_dyn_opt > Curr loss: -2.607559E+00 [197

[2K[2018-05-12 18:30:09.534704] target_dyn_opt > Curr loss: -6.387366E+00 [1939: -6.906607E+00], n_evals: 1999, Avg. time per updt: 0.007931
[2018-05-12 18:30:09.550512] target_dyn_opt > Done training. New loss [-6.493181] iter: [2000]
[2018-05-12 18:30:09.553331] train_dynamics > Done training dynamics model
[2018-05-12 18:30:09.554765] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/experience_10.zip
[2018-05-12 18:30:10.120019] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/policy_10.zip
[2018-05-12 18:30:10.225679] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/dynamics_10.zip
[2018-05-12 18:30:12.855644] ==== Iteration [11], experience: [330 steps] ====
[2018-05-12 18:30:12.861508] SGDOptimizer > Optimizing parameters
[2018-05-12 18:30:12.999015] SGDOptimizer > Initial loss [16508.91796875]
[2K[2018-05-12 18:31:57.38498

[2K[2018-05-12 18:40:25.483579] SGDOptimizer > Curr loss: 1.258405E+04, n_evals: 999, Avg. time per updt: 0.107769
[2018-05-12 18:40:25.508662] SGDOptimizer > Done training. New loss [11852.201172] iter: [999]
[2018-05-12 18:40:25.510430] apply_controller > Starting run
[2018-05-12 18:40:25.511905] apply_controller > Running for 3.000000 seconds
[2018-05-12 18:40:25.684192] apply_controller > Done. Stopping robot. Value of run [28.420656]
[2018-05-12 18:40:25.685524] target_2x_mass > Stopping robot
[2018-05-12 18:40:25.686885] train_dynamics > Training dynamics model
[2018-05-12 18:40:25.690083] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-12 18:40:25.691584] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 18:40:25.706010] target_dyn_opt > Initial loss [-7.930185625377785]
[2K[2018-05-12 18:40:43.822502] target_dyn_opt > Curr loss: -9.295877E+00 [1970: -9.622886E+00], n_evals: 1999, Avg. time per updt: 0.007581
[2018-05-12 

[2K[2018-05-12 18:49:22.005316] target_dyn_opt > Curr loss: -1.073867E+01 [1839: -1.108668E+01], n_evals: 1999, Avg. time per updt: 0.007419
[2018-05-12 18:49:22.018121] target_dyn_opt > Done training. New loss [-10.245923] iter: [2000]
[2018-05-12 18:49:22.021046] train_dynamics > Done training dynamics model
[2018-05-12 18:49:22.022408] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/experience_19.zip
[2018-05-12 18:49:22.927869] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/policy_19.zip
[2018-05-12 18:49:23.029309] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/dynamics_19.zip
[2018-05-12 18:49:25.549078] ==== Iteration [20], experience: [600 steps] ====
[2018-05-12 18:49:25.554235] SGDOptimizer > Optimizing parameters
[2018-05-12 18:49:25.684238] SGDOptimizer > Initial loss [18763.236328125]
[2K[2018-05-12 18:51:09.051

[2K[2018-05-12 19:00:00.537035] SGDOptimizer > Curr loss: 2.311229E+04, n_evals: 999, Avg. time per updt: 0.109729
[2018-05-12 19:00:00.563348] SGDOptimizer > Done training. New loss [14469.771484] iter: [999]
[2018-05-12 19:00:00.565282] apply_controller > Starting run
[2018-05-12 19:00:00.566809] apply_controller > Running for 3.000000 seconds
[2018-05-12 19:00:00.735036] apply_controller > Done. Stopping robot. Value of run [28.099049]
[2018-05-12 19:00:00.736433] target_2x_mass > Stopping robot
[2018-05-12 19:00:00.737978] train_dynamics > Training dynamics model
[2018-05-12 19:00:00.742253] train_dynamics > Dataset size:: Inputs: [ (725, 6) ], Targets: [ (725, 4) ] 
[2018-05-12 19:00:00.743542] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 19:00:00.757698] target_dyn_opt > Initial loss [-9.43703396492991]
[2K[2018-05-12 19:00:18.373511] target_dyn_opt > Curr loss: -1.205195E+01 [1556: -1.218129E+01], n_evals: 1999, Avg. time per updt: 0.007344
[2018-05-12 1

[2K[2018-05-12 19:09:08.939753] target_dyn_opt > Curr loss: -1.243961E+01 [1791: -1.305509E+01], n_evals: 1999, Avg. time per updt: 0.007441
[2018-05-12 19:09:08.952958] target_dyn_opt > Done training. New loss [-12.592446] iter: [2000]
[2018-05-12 19:09:08.955653] train_dynamics > Done training dynamics model
[2018-05-12 19:09:08.957064] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/experience_28.zip
[2018-05-12 19:09:10.251176] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/policy_28.zip
[2018-05-12 19:09:10.354843] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_003_il_klqp_from_scratch/dynamics_28.zip
[2018-05-12 19:09:12.904812] ==== Iteration [29], experience: [870 steps] ====
[2018-05-12 19:09:12.911156] SGDOptimizer > Optimizing parameters
[2018-05-12 19:09:13.066821] SGDOptimizer > Initial loss [17912.044921875]
[2K[2018-05-12 19:11:02.820

In [13]:
# experiment 4 learn starting from scratch, using klpq imitation loss
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_004_il_klpq_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLPQ)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-12 20:13:07.022654] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-12 20:13:07.045435] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f8d1f48b050>, 'n

[2018-05-12 20:15:36.098548] target_dyn_opt > Initial loss [156.75030500114923]
[2K[2018-05-12 20:15:48.759590] target_dyn_opt > Curr loss: 2.502083E+01 [1983: 2.290967E+01], n_evals: 1999, Avg. time per updt: 0.004833
[2018-05-12 20:15:48.768349] target_dyn_opt > Done training. New loss [24.500931] iter: [2000]
[2018-05-12 20:15:48.771099] train_dynamics > Done training dynamics model
[2018-05-12 20:15:48.772927] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/experience_1.zip
[2018-05-12 20:15:48.916921] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/policy_1.zip
[2018-05-12 20:15:49.023452] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/dynamics_1.zip
[2018-05-12 20:15:51.819620] ==== Iteration [2], experience: [60 steps] ====
[2018-05-12 20:15:51.824978] SGDOptimizer > Optimizing parameters
[2018-05-12 20:15:51.956453] SG

[2018-05-12 20:23:47.752621] SGDOptimizer > Initial loss [183.74156188964844]
[2K[2018-05-12 20:25:35.594984] SGDOptimizer > Curr loss: 9.030720E+01, n_evals: 999, Avg. time per updt: 0.106446
[2018-05-12 20:25:35.620236] SGDOptimizer > Done training. New loss [88.412155] iter: [999]
[2018-05-12 20:25:35.622412] apply_controller > Starting run
[2018-05-12 20:25:35.623761] apply_controller > Running for 3.000000 seconds
[2018-05-12 20:25:35.806487] apply_controller > Done. Stopping robot. Value of run [29.997684]
[2018-05-12 20:25:35.807713] target_2x_mass > Stopping robot
[2018-05-12 20:25:35.808982] train_dynamics > Training dynamics model
[2018-05-12 20:25:35.812052] train_dynamics > Dataset size:: Inputs: [ (203, 6) ], Targets: [ (203, 4) ] 
[2018-05-12 20:25:35.813794] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 20:25:35.830479] target_dyn_opt > Initial loss [5.547990425266188]
[2K[2018-05-12 20:25:53.594466] target_dyn_opt > Curr loss: 7.135745E-01 [1566:

[2K[2018-05-12 20:33:46.041886] target_dyn_opt > Curr loss: -3.845105E+00 [1941: -4.612096E+00], n_evals: 1999, Avg. time per updt: 0.008042
[2018-05-12 20:33:46.054923] target_dyn_opt > Done training. New loss [-3.908500] iter: [2000]
[2018-05-12 20:33:46.057763] train_dynamics > Done training dynamics model
[2018-05-12 20:33:46.059285] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/experience_10.zip
[2018-05-12 20:33:46.578953] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/policy_10.zip
[2018-05-12 20:33:46.677389] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/dynamics_10.zip
[2018-05-12 20:33:49.482247] ==== Iteration [11], experience: [330 steps] ====
[2018-05-12 20:33:49.487934] SGDOptimizer > Optimizing parameters
[2018-05-12 20:33:49.614617] SGDOptimizer > Initial loss [85.52481079101562]
[2K[2018-05-12 20:35:25.28

[2K[2018-05-12 20:43:26.496291] SGDOptimizer > Curr loss: 9.777536E+01, n_evals: 999, Avg. time per updt: 0.098778
[2018-05-12 20:43:26.522299] SGDOptimizer > Done training. New loss [89.084068] iter: [999]
[2018-05-12 20:43:26.524153] apply_controller > Starting run
[2018-05-12 20:43:26.525455] apply_controller > Running for 3.000000 seconds
[2018-05-12 20:43:26.709600] apply_controller > Done. Stopping robot. Value of run [29.969034]
[2018-05-12 20:43:26.710988] target_2x_mass > Stopping robot
[2018-05-12 20:43:26.712875] train_dynamics > Training dynamics model
[2018-05-12 20:43:26.716548] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-12 20:43:26.717837] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 20:43:26.734353] target_dyn_opt > Initial loss [-5.8900237881161015]
[2K[2018-05-12 20:43:44.551802] target_dyn_opt > Curr loss: -7.737148E+00 [1270: -7.983390E+00], n_evals: 1999, Avg. time per updt: 0.007422
[2018-05-12 20

[2K[2018-05-12 20:52:20.171100] target_dyn_opt > Curr loss: -8.872998E+00 [1169: -9.725033E+00], n_evals: 1999, Avg. time per updt: 0.009548
[2018-05-12 20:52:20.184300] target_dyn_opt > Done training. New loss [-9.178159] iter: [2000]
[2018-05-12 20:52:20.186897] train_dynamics > Done training dynamics model
[2018-05-12 20:52:20.188462] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/experience_19.zip
[2018-05-12 20:52:21.092353] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/policy_19.zip
[2018-05-12 20:52:21.191756] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/dynamics_19.zip
[2018-05-12 20:52:24.028651] ==== Iteration [20], experience: [600 steps] ====
[2018-05-12 20:52:24.034044] SGDOptimizer > Optimizing parameters
[2018-05-12 20:52:24.156794] SGDOptimizer > Initial loss [123.30403900146484]
[2K[2018-05-12 20:54:11.5

[2K[2018-05-12 21:02:57.038370] SGDOptimizer > Curr loss: 1.110953E+02, n_evals: 999, Avg. time per updt: 0.104143
[2018-05-12 21:02:57.066327] SGDOptimizer > Done training. New loss [109.132835] iter: [999]
[2018-05-12 21:02:57.068489] apply_controller > Starting run
[2018-05-12 21:02:57.069875] apply_controller > Running for 3.000000 seconds
[2018-05-12 21:02:57.260728] apply_controller > Done. Stopping robot. Value of run [28.661507]
[2018-05-12 21:02:57.261949] target_2x_mass > Stopping robot
[2018-05-12 21:02:57.263204] train_dynamics > Training dynamics model
[2018-05-12 21:02:57.267930] train_dynamics > Dataset size:: Inputs: [ (725, 6) ], Targets: [ (725, 4) ] 
[2018-05-12 21:02:57.269470] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 21:02:57.284307] target_dyn_opt > Initial loss [-9.025384691131526]
[2K[2018-05-12 21:03:15.225636] target_dyn_opt > Curr loss: -1.077499E+01 [1967: -1.117132E+01], n_evals: 1999, Avg. time per updt: 0.007520
[2018-05-12 21

[2K[2018-05-12 21:11:35.978731] target_dyn_opt > Curr loss: -1.182010E+01 [1972: -1.222982E+01], n_evals: 1999, Avg. time per updt: 0.007493
[2018-05-12 21:11:35.992255] target_dyn_opt > Done training. New loss [-11.796480] iter: [2000]
[2018-05-12 21:11:35.994960] train_dynamics > Done training dynamics model
[2018-05-12 21:11:35.996588] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/experience_28.zip
[2018-05-12 21:11:37.287260] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/policy_28.zip
[2018-05-12 21:11:37.387881] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_004_il_klpq_from_scratch/dynamics_28.zip
[2018-05-12 21:11:40.273831] ==== Iteration [29], experience: [870 steps] ====
[2018-05-12 21:11:40.279270] SGDOptimizer > Optimizing parameters
[2018-05-12 21:11:40.408820] SGDOptimizer > Initial loss [112.98040008544922]
[2K[2018-05-12 21:13:20.

In [14]:
# experiment 5 learn starting from source params, using klqp imitation loss
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_005_il_klqp_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLQP)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-12 21:24:03.382105] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-12 21:24:03.409119] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f8d1f48b050>, 'n

[2018-05-12 21:26:46.796804] target_2x_mass > Stopping robot
[2018-05-12 21:26:46.798151] train_dynamics > Training dynamics model
[2018-05-12 21:26:46.804463] train_dynamics > Dataset size:: Inputs: [ (58, 6) ], Targets: [ (58, 4) ] 
[2018-05-12 21:26:46.806269] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 21:26:46.866516] target_dyn_opt > Initial loss [81.07482053442249]
[2K[2018-05-12 21:26:59.851637] target_dyn_opt > Curr loss: 1.292845E+01 [1894: 1.268465E+01], n_evals: 1999, Avg. time per updt: 0.004991
[2018-05-12 21:26:59.862250] target_dyn_opt > Done training. New loss [13.446337] iter: [2000]
[2018-05-12 21:26:59.865439] train_dynamics > Done training dynamics model
[2018-05-12 21:26:59.868321] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/experience_1.zip
[2018-05-12 21:27:00.043281] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/policy_1.zip
[20

[2018-05-12 21:36:06.970343] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/dynamics_5.zip
[2018-05-12 21:36:10.222628] ==== Iteration [6], experience: [180 steps] ====
[2018-05-12 21:36:10.227787] SGDOptimizer > Optimizing parameters
[2018-05-12 21:36:10.369849] SGDOptimizer > Initial loss [23754.146484375]
[2K[2018-05-12 21:37:47.037477] SGDOptimizer > Curr loss: 1.906376E+04, n_evals: 999, Avg. time per updt: 0.095388
[2018-05-12 21:37:47.064257] SGDOptimizer > Done training. New loss [18178.949219] iter: [999]
[2018-05-12 21:37:47.066485] apply_controller > Starting run
[2018-05-12 21:37:47.067986] apply_controller > Running for 3.000000 seconds
[2018-05-12 21:37:47.248966] apply_controller > Done. Stopping robot. Value of run [21.561527]
[2018-05-12 21:37:47.250315] target_2x_mass > Stopping robot
[2018-05-12 21:37:47.251356] train_dynamics > Training dynamics model
[2018-05-12 21:37:47.254348] train_dynamics > Dataset size:: 

[2018-05-12 21:46:55.455635] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 21:46:55.476536] target_dyn_opt > Initial loss [-5.33886449585514]
[2K[2018-05-12 21:47:13.648922] target_dyn_opt > Curr loss: -1.017741E+01 [1998: -1.017741E+01], n_evals: 1999, Avg. time per updt: 0.007595
[2018-05-12 21:47:13.662690] target_dyn_opt > Done training. New loss [-9.802204] iter: [2000]
[2018-05-12 21:47:13.665414] train_dynamics > Done training dynamics model
[2018-05-12 21:47:13.666812] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/experience_10.zip
[2018-05-12 21:47:14.222221] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/policy_10.zip
[2018-05-12 21:47:14.346260] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/dynamics_10.zip
[2018-05-12 21:47:17.652118] ==== Iteration [11], experience: [330 steps] ====
[2018-05-

[2018-05-12 21:56:13.326715] SGDOptimizer > Optimizing parameters
[2018-05-12 21:56:13.456017] SGDOptimizer > Initial loss [9152.763671875]
[2K[2018-05-12 21:57:56.272324] SGDOptimizer > Curr loss: 1.183951E+04, n_evals: 999, Avg. time per updt: 0.101494
[2018-05-12 21:57:56.301087] SGDOptimizer > Done training. New loss [11353.252930] iter: [999]
[2018-05-12 21:57:56.303169] apply_controller > Starting run
[2018-05-12 21:57:56.304496] apply_controller > Running for 3.000000 seconds
[2018-05-12 21:57:56.488817] apply_controller > Done. Stopping robot. Value of run [29.678513]
[2018-05-12 21:57:56.490318] target_2x_mass > Stopping robot
[2018-05-12 21:57:56.491753] train_dynamics > Training dynamics model
[2018-05-12 21:57:56.495404] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-12 21:57:56.497092] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 21:57:56.513367] target_dyn_opt > Initial loss [-5.984004893436515]
[2K[2018-05-1

[2K[2018-05-12 22:07:33.662843] target_dyn_opt > Curr loss: -1.305463E+01 [1752: -1.318502E+01], n_evals: 1999, Avg. time per updt: 0.008287
[2018-05-12 22:07:33.677745] target_dyn_opt > Done training. New loss [-12.711449] iter: [2000]
[2018-05-12 22:07:33.680693] train_dynamics > Done training dynamics model
[2018-05-12 22:07:33.682602] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/experience_19.zip
[2018-05-12 22:07:34.647471] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/policy_19.zip
[2018-05-12 22:07:34.784304] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/dynamics_19.zip
[2018-05-12 22:07:38.371488] ==== Iteration [20], experience: [600 steps] ====
[2018-05-12 22:07:38.376785] SGDOptimizer > Optimizing parameters
[2018-05-12 22:07:38.520877] SGDOptimizer > Initial loss [16394.966796875]
[2K[2018-05-12 22:09:24.816820

[2K[2018-05-12 22:18:32.546978] SGDOptimizer > Curr loss: 2.181756E+04, n_evals: 999, Avg. time per updt: 0.121214
[2018-05-12 22:18:32.581876] SGDOptimizer > Done training. New loss [25214.322266] iter: [999]
[2018-05-12 22:18:32.583874] apply_controller > Starting run
[2018-05-12 22:18:32.585258] apply_controller > Running for 3.000000 seconds
[2018-05-12 22:18:32.761881] apply_controller > Done. Stopping robot. Value of run [29.497971]
[2018-05-12 22:18:32.763335] target_2x_mass > Stopping robot
[2018-05-12 22:18:32.764867] train_dynamics > Training dynamics model
[2018-05-12 22:18:32.769642] train_dynamics > Dataset size:: Inputs: [ (725, 6) ], Targets: [ (725, 4) ] 
[2018-05-12 22:18:32.771052] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 22:18:32.788590] target_dyn_opt > Initial loss [-8.119942300812951]
[2K[2018-05-12 22:18:51.980628] target_dyn_opt > Curr loss: -1.346883E+01 [1696: -1.394797E+01], n_evals: 1999, Avg. time per updt: 0.008105
[2018-05-12 

[2K[2018-05-12 22:28:58.190506] target_dyn_opt > Curr loss: -1.398672E+01 [1321: -1.450596E+01], n_evals: 1999, Avg. time per updt: 0.008540
[2018-05-12 22:28:58.204607] target_dyn_opt > Done training. New loss [-13.992560] iter: [2000]
[2018-05-12 22:28:58.207376] train_dynamics > Done training dynamics model
[2018-05-12 22:28:58.209185] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/experience_28.zip
[2018-05-12 22:28:59.552877] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/policy_28.zip
[2018-05-12 22:28:59.683260] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_005_il_klqp_from_source/dynamics_28.zip
[2018-05-12 22:29:03.052390] ==== Iteration [29], experience: [870 steps] ====
[2018-05-12 22:29:03.057899] SGDOptimizer > Optimizing parameters
[2018-05-12 22:29:03.235752] SGDOptimizer > Initial loss [11164.6904296875]
[2K[2018-05-12 22:30:55.53277

In [15]:
# experiment 6 learn starting from source, using klpq imitation loss
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_006_il_klpq_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[0.0, 1.0], loss_type=utils.ImitationLossType.KLPQ)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-12 22:33:46.557839] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-12 22:33:46.586640] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f8d1f485b90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f8d1f48b050>, 'n

[2018-05-12 22:36:35.271989] target_2x_mass > Stopping robot
[2018-05-12 22:36:35.273311] train_dynamics > Training dynamics model
[2018-05-12 22:36:35.277899] train_dynamics > Dataset size:: Inputs: [ (58, 6) ], Targets: [ (58, 4) ] 
[2018-05-12 22:36:35.279402] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 22:36:35.323002] target_dyn_opt > Initial loss [145.800531711044]
[2K[2018-05-12 22:36:48.520767] target_dyn_opt > Curr loss: 1.427014E+01 [1997: 1.401361E+01], n_evals: 1999, Avg. time per updt: 0.005071
[2018-05-12 22:36:48.530135] target_dyn_opt > Done training. New loss [14.547424] iter: [2000]
[2018-05-12 22:36:48.532962] train_dynamics > Done training dynamics model
[2018-05-12 22:36:48.535975] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/experience_1.zip
[2018-05-12 22:36:48.616391] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/policy_1.zip
[201

[2018-05-12 22:46:10.946261] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/dynamics_5.zip
[2018-05-12 22:46:14.430147] ==== Iteration [6], experience: [180 steps] ====
[2018-05-12 22:46:14.435592] SGDOptimizer > Optimizing parameters
[2018-05-12 22:46:14.584460] SGDOptimizer > Initial loss [81.24007415771484]
[2K[2018-05-12 22:48:01.631533] SGDOptimizer > Curr loss: 1.514444E+02, n_evals: 999, Avg. time per updt: 0.105706
[2018-05-12 22:48:01.659496] SGDOptimizer > Done training. New loss [143.600677] iter: [999]
[2018-05-12 22:48:01.661632] apply_controller > Starting run
[2018-05-12 22:48:01.663096] apply_controller > Running for 3.000000 seconds
[2018-05-12 22:48:01.839135] apply_controller > Done. Stopping robot. Value of run [28.081289]
[2018-05-12 22:48:01.840584] target_2x_mass > Stopping robot
[2018-05-12 22:48:01.841823] train_dynamics > Training dynamics model
[2018-05-12 22:48:01.844413] train_dynamics > Dataset size:: 

[2018-05-12 22:57:18.078900] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 22:57:18.092273] target_dyn_opt > Initial loss [-4.237832629755373]
[2K[2018-05-12 22:57:36.121767] target_dyn_opt > Curr loss: -8.656337E+00 [1672: -9.448584E+00], n_evals: 1999, Avg. time per updt: 0.007538
[2018-05-12 22:57:36.137298] target_dyn_opt > Done training. New loss [-8.661101] iter: [2000]
[2018-05-12 22:57:36.140306] train_dynamics > Done training dynamics model
[2018-05-12 22:57:36.141897] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/experience_10.zip
[2018-05-12 22:57:36.612245] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/policy_10.zip
[2018-05-12 22:57:36.649563] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/dynamics_10.zip
[2018-05-12 22:57:40.146833] ==== Iteration [11], experience: [330 steps] ====
[2018-05

[2018-05-12 23:07:08.423880] SGDOptimizer > Optimizing parameters
[2018-05-12 23:07:08.591373] SGDOptimizer > Initial loss [124.55823516845703]
[2K[2018-05-12 23:09:09.416131] SGDOptimizer > Curr loss: 1.206766E+02, n_evals: 999, Avg. time per updt: 0.119558
[2018-05-12 23:09:09.452660] SGDOptimizer > Done training. New loss [89.575188] iter: [999]
[2018-05-12 23:09:09.454545] apply_controller > Starting run
[2018-05-12 23:09:09.456003] apply_controller > Running for 3.000000 seconds
[2018-05-12 23:09:09.664946] apply_controller > Done. Stopping robot. Value of run [27.065147]
[2018-05-12 23:09:09.666290] target_2x_mass > Stopping robot
[2018-05-12 23:09:09.667580] train_dynamics > Training dynamics model
[2018-05-12 23:09:09.671439] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-12 23:09:09.672837] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 23:09:09.688297] target_dyn_opt > Initial loss [2.6805963020386367]
[2K[2018-05-

[2K[2018-05-12 23:19:12.242814] target_dyn_opt > Curr loss: -1.160376E+01 [1464: -1.225831E+01], n_evals: 1999, Avg. time per updt: 0.008016
[2018-05-12 23:19:12.257803] target_dyn_opt > Done training. New loss [-11.901140] iter: [2000]
[2018-05-12 23:19:12.260908] train_dynamics > Done training dynamics model
[2018-05-12 23:19:12.262252] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/experience_19.zip
[2018-05-12 23:19:13.093271] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/policy_19.zip
[2018-05-12 23:19:13.130492] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/dynamics_19.zip
[2018-05-12 23:19:16.584182] ==== Iteration [20], experience: [600 steps] ====
[2018-05-12 23:19:16.589262] SGDOptimizer > Optimizing parameters
[2018-05-12 23:19:16.745941] SGDOptimizer > Initial loss [70.28131866455078]
[2K[2018-05-12 23:21:16.4412

[2K[2018-05-12 23:30:53.565697] SGDOptimizer > Curr loss: 9.519038E+01, n_evals: 999, Avg. time per updt: 0.123999
[2018-05-12 23:30:53.604947] SGDOptimizer > Done training. New loss [117.856613] iter: [999]
[2018-05-12 23:30:53.606570] apply_controller > Starting run
[2018-05-12 23:30:53.608567] apply_controller > Running for 3.000000 seconds
[2018-05-12 23:30:53.798344] apply_controller > Done. Stopping robot. Value of run [29.897913]
[2018-05-12 23:30:53.799702] target_2x_mass > Stopping robot
[2018-05-12 23:30:53.801033] train_dynamics > Training dynamics model
[2018-05-12 23:30:53.805966] train_dynamics > Dataset size:: Inputs: [ (725, 6) ], Targets: [ (725, 4) ] 
[2018-05-12 23:30:53.807345] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-12 23:30:53.823760] target_dyn_opt > Initial loss [-9.387204790115357]
[2K[2018-05-12 23:31:13.036648] target_dyn_opt > Curr loss: -1.269185E+01 [1375: -1.332868E+01], n_evals: 1999, Avg. time per updt: 0.008134
[2018-05-12 23

[2K[2018-05-12 23:41:55.165306] target_dyn_opt > Curr loss: -1.334860E+01 [1236: -1.393041E+01], n_evals: 1999, Avg. time per updt: 0.008191
[2018-05-12 23:41:55.179270] target_dyn_opt > Done training. New loss [-13.495827] iter: [2000]
[2018-05-12 23:41:55.181875] train_dynamics > Done training dynamics model
[2018-05-12 23:41:55.184209] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/experience_28.zip
[2018-05-12 23:41:56.428823] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/policy_28.zip
[2018-05-12 23:41:56.467274] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_006_il_klpq_from_source/dynamics_28.zip
[2018-05-12 23:41:59.976262] ==== Iteration [29], experience: [870 steps] ====
[2018-05-12 23:41:59.981424] SGDOptimizer > Optimizing parameters
[2018-05-12 23:42:00.143362] SGDOptimizer > Initial loss [64.0837173461914]
[2K[2018-05-12 23:44:02.16639

In [8]:
# experiment 7 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_007_taskplusil_klqp_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1e-4], loss_type=utils.ImitationLossType.KLQP)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-13 00:19:30.180849] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-13 00:19:30.202610] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7fe79839cb90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7fe79839cb90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7fe798322050>, 'n

[2018-05-13 00:21:57.119110] target_dyn_opt > Initial loss [137.52225553021054]
[2K[2018-05-13 00:22:09.738147] target_dyn_opt > Curr loss: 2.104207E+01 [1969: 1.966144E+01], n_evals: 1999, Avg. time per updt: 0.004783
[2018-05-13 00:22:09.748039] target_dyn_opt > Done training. New loss [20.975352] iter: [2000]
[2018-05-13 00:22:09.750711] train_dynamics > Done training dynamics model
[2018-05-13 00:22:09.752047] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/experience_1.zip
[2018-05-13 00:22:09.864166] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/policy_1.zip
[2018-05-13 00:22:09.940657] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/dynamics_1.zip
[2018-05-13 00:22:11.583432] ==== Iteration [2], experience: [60 steps] ====
[2018-05-13 00:22:11.588906] SGDOptimizer > Optimizing parameters
[201

[2018-05-13 00:30:05.683855] SGDOptimizer > Optimizing parameters
[2018-05-13 00:30:05.802176] SGDOptimizer > Initial loss [1.6083914041519165]
[2K[2018-05-13 00:31:42.755441] SGDOptimizer > Curr loss: 2.648129E+00, n_evals: 999, Avg. time per updt: 0.095675
[2018-05-13 00:31:42.784443] SGDOptimizer > Done training. New loss [2.694020] iter: [999]
[2018-05-13 00:31:42.786105] apply_controller > Starting run
[2018-05-13 00:31:42.787425] apply_controller > Running for 3.000000 seconds
[2018-05-13 00:31:42.970216] apply_controller > Done. Stopping robot. Value of run [29.957670]
[2018-05-13 00:31:42.971713] target_2x_mass > Stopping robot
[2018-05-13 00:31:42.972993] train_dynamics > Training dynamics model
[2018-05-13 00:31:42.975892] train_dynamics > Dataset size:: Inputs: [ (203, 6) ], Targets: [ (203, 4) ] 
[2018-05-13 00:31:42.977432] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 00:31:42.992257] target_dyn_opt > Initial loss [-1.689326590838328]
[2K[2018-05-1

[2018-05-13 00:39:26.420667] target_dyn_opt > Initial loss [-4.522779218712384]
[2K[2018-05-13 00:39:43.527187] target_dyn_opt > Curr loss: -7.090064E+00 [1527: -7.330358E+00], n_evals: 1999, Avg. time per updt: 0.007080
[2018-05-13 00:39:43.541082] target_dyn_opt > Done training. New loss [-6.791544] iter: [2000]
[2018-05-13 00:39:43.543784] train_dynamics > Done training dynamics model
[2018-05-13 00:39:43.545193] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/experience_10.zip
[2018-05-13 00:39:44.022469] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/policy_10.zip
[2018-05-13 00:39:44.083364] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/dynamics_10.zip
[2018-05-13 00:39:45.640216] ==== Iteration [11], experience: [330 steps] ====
[2018-05-13 00:39:45.645298] SGDOptimizer > Optimizing paramete

[2018-05-13 00:47:43.413566] ==== Iteration [15], experience: [450 steps] ====
[2018-05-13 00:47:43.418573] SGDOptimizer > Optimizing parameters
[2018-05-13 00:47:43.561048] SGDOptimizer > Initial loss [2.1772866249084473]
[2K[2018-05-13 00:49:24.399609] SGDOptimizer > Curr loss: 2.284377E+00, n_evals: 999, Avg. time per updt: 0.099508
[2018-05-13 00:49:24.424572] SGDOptimizer > Done training. New loss [2.028311] iter: [999]
[2018-05-13 00:49:24.426454] apply_controller > Starting run
[2018-05-13 00:49:24.427769] apply_controller > Running for 3.000000 seconds
[2018-05-13 00:49:24.607096] apply_controller > Done. Stopping robot. Value of run [29.970339]
[2018-05-13 00:49:24.608581] target_2x_mass > Stopping robot
[2018-05-13 00:49:24.610003] train_dynamics > Training dynamics model
[2018-05-13 00:49:24.613409] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-13 00:49:24.615146] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 00:

[2018-05-13 00:57:42.345425] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 00:57:42.360013] target_dyn_opt > Initial loss [-8.616694450121502]
[2K[2018-05-13 00:58:01.400663] target_dyn_opt > Curr loss: -1.080140E+01 [903: -1.104414E+01], n_evals: 1999, Avg. time per updt: 0.007909
[2018-05-13 00:58:01.413465] target_dyn_opt > Done training. New loss [-10.741351] iter: [2000]
[2018-05-13 00:58:01.416055] train_dynamics > Done training dynamics model
[2018-05-13 00:58:01.417377] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/experience_19.zip
[2018-05-13 00:58:02.281756] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/policy_19.zip
[2018-05-13 00:58:02.344475] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_007_taskplusil_klqp_from_scratch2/dynamics_19.zip
[2018-05-13 00:58:03.950455] ==== Iteration [20], experie

KeyboardInterrupt: 

In [9]:
# experiment 8 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_008_taskplusil_klpq_from_scratch')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 1                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=False)
source_pol, target_pol = init_pol(params, pol_path, copy_params=False)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLPQ)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-13 01:00:56.715649] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-13 01:00:56.739815] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7fe79839cb90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7fe79839cb90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7fe798322050>, 'n

[2018-05-13 01:03:18.087371] target_dyn_opt > Initial loss [132.50352635352414]
[2K[2018-05-13 01:03:30.551298] target_dyn_opt > Curr loss: 2.142717E+01 [1857: 1.898543E+01], n_evals: 1999, Avg. time per updt: 0.004730
[2018-05-13 01:03:30.560129] target_dyn_opt > Done training. New loss [21.989043] iter: [2000]
[2018-05-13 01:03:30.562822] train_dynamics > Done training dynamics model
[2018-05-13 01:03:30.564250] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/experience_1.zip
[2018-05-13 01:03:30.681010] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/policy_1.zip
[2018-05-13 01:03:30.758144] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/dynamics_1.zip
[2018-05-13 01:03:34.537630] ==== Iteration [2], experience: [60 steps] ====
[2018-05-13 01:03:34.542479] SGDOptimizer > Optimizing parameters
[2018-0

[2018-05-13 01:11:34.023441] SGDOptimizer > Optimizing parameters
[2018-05-13 01:11:34.157871] SGDOptimizer > Initial loss [1.167551040649414]
[2K[2018-05-13 01:13:09.535985] SGDOptimizer > Curr loss: 1.054263E+00, n_evals: 999, Avg. time per updt: 0.094081
[2018-05-13 01:13:09.566345] SGDOptimizer > Done training. New loss [1.054760] iter: [999]
[2018-05-13 01:13:09.568308] apply_controller > Starting run
[2018-05-13 01:13:09.569517] apply_controller > Running for 3.000000 seconds
[2018-05-13 01:13:09.774016] apply_controller > Done. Stopping robot. Value of run [23.127647]
[2018-05-13 01:13:09.775330] target_2x_mass > Stopping robot
[2018-05-13 01:13:09.776814] train_dynamics > Training dynamics model
[2018-05-13 01:13:09.779719] train_dynamics > Dataset size:: Inputs: [ (203, 6) ], Targets: [ (203, 4) ] 
[2018-05-13 01:13:09.780952] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 01:13:09.795403] target_dyn_opt > Initial loss [3.8809792004775696]
[2K[2018-05-13

[2018-05-13 01:20:56.552925] target_dyn_opt > Initial loss [-2.7549199911307394]
[2K[2018-05-13 01:21:13.877555] target_dyn_opt > Curr loss: -5.660141E+00 [1880: -5.917906E+00], n_evals: 1999, Avg. time per updt: 0.007183
[2018-05-13 01:21:13.890939] target_dyn_opt > Done training. New loss [-5.191447] iter: [2000]
[2018-05-13 01:21:13.893499] train_dynamics > Done training dynamics model
[2018-05-13 01:21:13.894877] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/experience_10.zip
[2018-05-13 01:21:14.398341] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/policy_10.zip
[2018-05-13 01:21:14.474276] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/dynamics_10.zip
[2018-05-13 01:21:16.362011] ==== Iteration [11], experience: [330 steps] ====
[2018-05-13 01:21:16.367029] SGDOptimizer > Optimizing parameters

[2018-05-13 01:29:40.714788] ==== Iteration [15], experience: [450 steps] ====
[2018-05-13 01:29:40.720672] SGDOptimizer > Optimizing parameters
[2018-05-13 01:29:40.869563] SGDOptimizer > Initial loss [1.1475088596343994]
[2K[2018-05-13 01:31:21.466747] SGDOptimizer > Curr loss: 1.028686E+00, n_evals: 999, Avg. time per updt: 0.099308
[2018-05-13 01:31:21.496747] SGDOptimizer > Done training. New loss [1.107759] iter: [999]
[2018-05-13 01:31:21.498414] apply_controller > Starting run
[2018-05-13 01:31:21.499722] apply_controller > Running for 3.000000 seconds
[2018-05-13 01:31:21.701235] apply_controller > Done. Stopping robot. Value of run [25.096731]
[2018-05-13 01:31:21.702580] target_2x_mass > Stopping robot
[2018-05-13 01:31:21.703928] train_dynamics > Training dynamics model
[2018-05-13 01:31:21.707730] train_dynamics > Dataset size:: Inputs: [ (464, 6) ], Targets: [ (464, 4) ] 
[2018-05-13 01:31:21.709173] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 01:

[2018-05-13 01:39:37.000914] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 01:39:37.014627] target_dyn_opt > Initial loss [-7.4378865450620655]
[2K[2018-05-13 01:39:54.764564] target_dyn_opt > Curr loss: -9.145403E+00 [1357: -9.838453E+00], n_evals: 1999, Avg. time per updt: 0.007406
[2018-05-13 01:39:54.779703] target_dyn_opt > Done training. New loss [-8.824719] iter: [2000]
[2018-05-13 01:39:54.782203] train_dynamics > Done training dynamics model
[2018-05-13 01:39:54.783584] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/experience_19.zip
[2018-05-13 01:39:55.647273] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/policy_19.zip
[2018-05-13 01:39:55.726431] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/dynamics_19.zip
[2018-05-13 01:39:57.647582] ==== Iteration [20], experienc

[2018-05-13 01:48:31.890786] ==== Iteration [24], experience: [720 steps] ====
[2018-05-13 01:48:31.896557] SGDOptimizer > Optimizing parameters
[2018-05-13 01:48:32.042561] SGDOptimizer > Initial loss [1.0677735805511475]
[2K[2018-05-13 01:50:18.508482] SGDOptimizer > Curr loss: 1.100039E+00, n_evals: 999, Avg. time per updt: 0.105183
[2018-05-13 01:50:18.537798] SGDOptimizer > Done training. New loss [1.070559] iter: [999]
[2018-05-13 01:50:18.539464] apply_controller > Starting run
[2018-05-13 01:50:18.540750] apply_controller > Running for 3.000000 seconds
[2018-05-13 01:50:18.793354] apply_controller > Done. Stopping robot. Value of run [28.138481]
[2018-05-13 01:50:18.794660] target_2x_mass > Stopping robot
[2018-05-13 01:50:18.796051] train_dynamics > Training dynamics model
[2018-05-13 01:50:18.800273] train_dynamics > Dataset size:: Inputs: [ (725, 6) ], Targets: [ (725, 4) ] 
[2018-05-13 01:50:18.801937] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 01:

[2018-05-13 01:58:55.871890] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 01:58:55.887725] target_dyn_opt > Initial loss [-8.980866091784343]
[2K[2018-05-13 01:59:14.138615] target_dyn_opt > Curr loss: -1.133806E+01 [1625: -1.187059E+01], n_evals: 1999, Avg. time per updt: 0.007655
[2018-05-13 01:59:14.151751] target_dyn_opt > Done training. New loss [-11.293425] iter: [2000]
[2018-05-13 01:59:14.154488] train_dynamics > Done training dynamics model
[2018-05-13 01:59:14.155845] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/experience_28.zip
[2018-05-13 01:59:15.406967] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/policy_28.zip
[2018-05-13 01:59:15.487327] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_008_taskplusil_klpq_from_scratch/dynamics_28.zip
[2018-05-13 01:59:17.410780] ==== Iteration [29], experienc

In [10]:
# experiment 9 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_009_taskplusil_klqp_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1e-3], loss_type=utils.ImitationLossType.KLQP)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-13 02:03:36.903388] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-13 02:03:36.926568] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7fe79839cb90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7fe79839cb90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7fe798322050>, 'n

[2018-05-13 02:06:22.639399] target_2x_mass > Stopping robot
[2018-05-13 02:06:22.640740] train_dynamics > Training dynamics model
[2018-05-13 02:06:22.642787] train_dynamics > Dataset size:: Inputs: [ (58, 6) ], Targets: [ (58, 4) ] 
[2018-05-13 02:06:22.644441] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 02:06:22.655862] target_dyn_opt > Initial loss [99.72602818032792]
[2K[2018-05-13 02:06:35.776619] target_dyn_opt > Curr loss: 1.479556E+01 [1970: 1.431731E+01], n_evals: 1999, Avg. time per updt: 0.005063
[2018-05-13 02:06:35.787017] target_dyn_opt > Done training. New loss [14.485334] iter: [2000]
[2018-05-13 02:06:35.789621] train_dynamics > Done training dynamics model
[2018-05-13 02:06:35.791100] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/experience_1.zip
[2018-05-13 02:06:35.934660] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/

[2018-05-13 02:15:07.563417] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/dynamics_5.zip
[2018-05-13 02:15:09.845913] ==== Iteration [6], experience: [180 steps] ====
[2018-05-13 02:15:09.851050] SGDOptimizer > Optimizing parameters
[2018-05-13 02:15:10.011215] SGDOptimizer > Initial loss [13.905003547668457]
[2K[2018-05-13 02:17:14.361346] SGDOptimizer > Curr loss: 1.262811E+01, n_evals: 999, Avg. time per updt: 0.123079
[2018-05-13 02:17:14.401666] SGDOptimizer > Done training. New loss [11.802900] iter: [999]
[2018-05-13 02:17:14.403380] apply_controller > Starting run
[2018-05-13 02:17:14.404239] apply_controller > Running for 3.000000 seconds
[2018-05-13 02:17:14.569234] apply_controller > Done. Stopping robot. Value of run [29.954182]
[2018-05-13 02:17:14.570603] target_2x_mass > Stopping robot
[2018-05-13 02:17:14.571648] train_dynamics > Training dynamics model
[2018-05-13 02:17:14.574355] train_dynamics > Dataset

[2018-05-13 02:25:59.146637] train_dynamics > Training dynamics model
[2018-05-13 02:25:59.149789] train_dynamics > Dataset size:: Inputs: [ (319, 6) ], Targets: [ (319, 4) ] 
[2018-05-13 02:25:59.151275] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 02:25:59.167647] target_dyn_opt > Initial loss [1.7783040894796853]
[2K[2018-05-13 02:26:17.814538] target_dyn_opt > Curr loss: -9.717578E+00 [1697: -1.052650E+01], n_evals: 1999, Avg. time per updt: 0.007849
[2018-05-13 02:26:17.830262] target_dyn_opt > Done training. New loss [-9.912127] iter: [2000]
[2018-05-13 02:26:17.832899] train_dynamics > Done training dynamics model
[2018-05-13 02:26:17.834328] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/experience_10.zip
[2018-05-13 02:26:18.356811] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/policy_10.zip
[2018-05-13 02:26:18.463105] target_dyn >

[2018-05-13 02:35:33.459636] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/dynamics_14.zip
[2018-05-13 02:35:35.742015] ==== Iteration [15], experience: [450 steps] ====
[2018-05-13 02:35:35.747278] SGDOptimizer > Optimizing parameters
[2018-05-13 02:35:35.895080] SGDOptimizer > Initial loss [9.228636741638184]
[2K[2018-05-13 02:37:30.957785] SGDOptimizer > Curr loss: 1.190965E+01, n_evals: 999, Avg. time per updt: 0.113755
[2018-05-13 02:37:30.989836] SGDOptimizer > Done training. New loss [12.381735] iter: [999]
[2018-05-13 02:37:30.991492] apply_controller > Starting run
[2018-05-13 02:37:30.992817] apply_controller > Running for 3.000000 seconds
[2018-05-13 02:37:31.164462] apply_controller > Done. Stopping robot. Value of run [29.918121]
[2018-05-13 02:37:31.166052] target_2x_mass > Stopping robot
[2018-05-13 02:37:31.167365] train_dynamics > Training dynamics model
[2018-05-13 02:37:31.170796] train_dynamics > Datase

[2018-05-13 02:47:30.013383] train_dynamics > Training dynamics model
[2018-05-13 02:47:30.018113] train_dynamics > Dataset size:: Inputs: [ (580, 6) ], Targets: [ (580, 4) ] 
[2018-05-13 02:47:30.019708] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 02:47:30.035698] target_dyn_opt > Initial loss [-0.6191536333026555]
[2K[2018-05-13 02:47:49.202387] target_dyn_opt > Curr loss: -1.317009E+01 [1620: -1.335932E+01], n_evals: 1999, Avg. time per updt: 0.008116
[2018-05-13 02:47:49.216027] target_dyn_opt > Done training. New loss [-12.980149] iter: [2000]
[2018-05-13 02:47:49.218791] train_dynamics > Done training dynamics model
[2018-05-13 02:47:49.220187] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/experience_19.zip
[2018-05-13 02:47:50.116493] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/policy_19.zip
[2018-05-13 02:47:50.226545] target_dyn

[2018-05-13 02:56:51.322519] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/dynamics_23.zip
[2018-05-13 02:56:53.623256] ==== Iteration [24], experience: [720 steps] ====
[2018-05-13 02:56:53.628990] SGDOptimizer > Optimizing parameters
[2018-05-13 02:56:53.811266] SGDOptimizer > Initial loss [11.75406551361084]
[2K[2018-05-13 02:59:06.853847] SGDOptimizer > Curr loss: 1.607053E+01, n_evals: 999, Avg. time per updt: 0.131792
[2018-05-13 02:59:06.896371] SGDOptimizer > Done training. New loss [16.514425] iter: [999]
[2018-05-13 02:59:06.898053] apply_controller > Starting run
[2018-05-13 02:59:06.899574] apply_controller > Running for 3.000000 seconds
[2018-05-13 02:59:07.102420] apply_controller > Done. Stopping robot. Value of run [25.759817]
[2018-05-13 02:59:07.103756] target_2x_mass > Stopping robot
[2018-05-13 02:59:07.105080] train_dynamics > Training dynamics model
[2018-05-13 02:59:07.110182] train_dynamics > Datase

[2018-05-13 03:08:40.340408] train_dynamics > Training dynamics model
[2018-05-13 03:08:40.352196] train_dynamics > Dataset size:: Inputs: [ (841, 6) ], Targets: [ (841, 4) ] 
[2018-05-13 03:08:40.353735] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 03:08:40.368548] target_dyn_opt > Initial loss [0.30490078873640014]
[2K[2018-05-13 03:08:59.171728] target_dyn_opt > Curr loss: -1.364790E+01 [1857: -1.505479E+01], n_evals: 1999, Avg. time per updt: 0.007929
[2018-05-13 03:08:59.186476] target_dyn_opt > Done training. New loss [-13.974410] iter: [2000]
[2018-05-13 03:08:59.189134] train_dynamics > Done training dynamics model
[2018-05-13 03:08:59.190613] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/experience_28.zip
[2018-05-13 03:09:00.469477] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_009_taskplusil_klqp_from_source/policy_28.zip
[2018-05-13 03:09:00.582215] target_dyn

In [10]:
# experiment 10 learn starting from source params, using klqp imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_010_taskplusil_klpq_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=True,
    noisy_policy_input=True, crn=True, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1e-5], loss_type=utils.ImitationLossType.KLPQ)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-13 09:45:18.416468] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-13 09:45:18.440944] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f40e52c3b90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f40e52c3b90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f40e52cb050>, 'n

[2018-05-13 09:47:58.339273] target_2x_mass > Stopping robot
[2018-05-13 09:47:58.340557] train_dynamics > Training dynamics model
[2018-05-13 09:47:58.342465] train_dynamics > Dataset size:: Inputs: [ (58, 6) ], Targets: [ (58, 4) ] 
[2018-05-13 09:47:58.343724] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 09:47:58.353838] target_dyn_opt > Initial loss [176.71504588733458]
[2K[2018-05-13 09:48:11.148741] target_dyn_opt > Curr loss: 1.603806E+01 [1984: 1.563805E+01], n_evals: 1999, Avg. time per updt: 0.004894
[2018-05-13 09:48:11.158999] target_dyn_opt > Done training. New loss [15.988953] iter: [2000]
[2018-05-13 09:48:11.161598] train_dynamics > Done training dynamics model
[2018-05-13 09:48:11.163007] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/experience_1.zip
[2018-05-13 09:48:11.305424] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_sourc

[2018-05-13 09:56:26.371022] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/policy_5.zip
[2018-05-13 09:56:26.472094] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/dynamics_5.zip
[2018-05-13 09:56:28.409091] ==== Iteration [6], experience: [180 steps] ====
[2018-05-13 09:56:28.414099] SGDOptimizer > Optimizing parameters
[2018-05-13 09:56:28.565375] SGDOptimizer > Initial loss [0.9112875461578369]
[2K[2018-05-13 09:58:04.290138] SGDOptimizer > Curr loss: 7.850891E-01, n_evals: 999, Avg. time per updt: 0.094420
[2018-05-13 09:58:04.317262] SGDOptimizer > Done training. New loss [0.757923] iter: [999]
[2018-05-13 09:58:04.319105] apply_controller > Starting run
[2018-05-13 09:58:04.320638] apply_controller > Running for 3.000000 seconds
[2018-05-13 09:58:04.520581] apply_controller > Done. Stopping robot. Value of run [22.754665]
[2018-05-13 09:58:04.521949] targe

[2018-05-13 10:05:55.412180] apply_controller > Done. Stopping robot. Value of run [22.255943]
[2018-05-13 10:05:55.413563] target_2x_mass > Stopping robot
[2018-05-13 10:05:55.414877] train_dynamics > Training dynamics model
[2018-05-13 10:05:55.418552] train_dynamics > Dataset size:: Inputs: [ (319, 6) ], Targets: [ (319, 4) ] 
[2018-05-13 10:05:55.420472] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 10:05:55.438769] target_dyn_opt > Initial loss [-2.633302568922222]
[2K[2018-05-13 10:06:13.402036] target_dyn_opt > Curr loss: -9.429442E+00 [1737: -9.840206E+00], n_evals: 1999, Avg. time per updt: 0.007506
[2018-05-13 10:06:13.415251] target_dyn_opt > Done training. New loss [-9.204387] iter: [2000]
[2018-05-13 10:06:13.418096] train_dynamics > Done training dynamics model
[2018-05-13 10:06:13.419633] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/experience_10.zip
[2018-05-13 10:06:13.943811] NNPol

[2018-05-13 10:14:42.711111] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/policy_14.zip
[2018-05-13 10:14:42.820130] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/dynamics_14.zip
[2018-05-13 10:14:44.774098] ==== Iteration [15], experience: [450 steps] ====
[2018-05-13 10:14:44.779708] SGDOptimizer > Optimizing parameters
[2018-05-13 10:14:44.935457] SGDOptimizer > Initial loss [0.7014590501785278]
[2K[2018-05-13 10:16:32.222571] SGDOptimizer > Curr loss: 6.442209E-01, n_evals: 999, Avg. time per updt: 0.105984
[2018-05-13 10:16:32.255041] SGDOptimizer > Done training. New loss [0.581761] iter: [999]
[2018-05-13 10:16:32.256627] apply_controller > Starting run
[2018-05-13 10:16:32.258104] apply_controller > Running for 3.000000 seconds
[2018-05-13 10:16:32.430895] apply_controller > Done. Stopping robot. Value of run [15.190462]
[2018-05-13 10:16:32.433262] ta

[2018-05-13 10:25:15.238270] apply_controller > Done. Stopping robot. Value of run [14.580030]
[2018-05-13 10:25:15.239637] target_2x_mass > Stopping robot
[2018-05-13 10:25:15.240925] train_dynamics > Training dynamics model
[2018-05-13 10:25:15.244622] train_dynamics > Dataset size:: Inputs: [ (580, 6) ], Targets: [ (580, 4) ] 
[2018-05-13 10:25:15.246149] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 10:25:15.259756] target_dyn_opt > Initial loss [-4.248172493227597]
[2K[2018-05-13 10:25:33.506092] target_dyn_opt > Curr loss: -1.256274E+01 [1511: -1.292397E+01], n_evals: 1999, Avg. time per updt: 0.007662
[2018-05-13 10:25:33.521792] target_dyn_opt > Done training. New loss [-12.150626] iter: [2000]
[2018-05-13 10:25:33.524417] train_dynamics > Done training dynamics model
[2018-05-13 10:25:33.525768] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/experience_19.zip
[2018-05-13 10:25:34.423378] NNPo

[2018-05-13 10:34:26.177098] NNPolicy > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/policy_23.zip
[2018-05-13 10:34:26.284194] target_dyn > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/dynamics_23.zip
[2018-05-13 10:34:28.263013] ==== Iteration [24], experience: [720 steps] ====
[2018-05-13 10:34:28.268572] SGDOptimizer > Optimizing parameters
[2018-05-13 10:34:28.439285] SGDOptimizer > Initial loss [0.6959244608879089]
[2K[2018-05-13 10:36:20.063571] SGDOptimizer > Curr loss: 5.813130E-01, n_evals: 999, Avg. time per updt: 0.110303
[2018-05-13 10:36:20.097814] SGDOptimizer > Done training. New loss [0.566720] iter: [999]
[2018-05-13 10:36:20.099405] apply_controller > Starting run
[2018-05-13 10:36:20.100578] apply_controller > Running for 3.000000 seconds
[2018-05-13 10:36:20.272342] apply_controller > Done. Stopping robot. Value of run [16.387569]
[2018-05-13 10:36:20.273576] ta

[2018-05-13 10:45:27.969125] apply_controller > Done. Stopping robot. Value of run [15.167965]
[2018-05-13 10:45:27.970703] target_2x_mass > Stopping robot
[2018-05-13 10:45:27.971928] train_dynamics > Training dynamics model
[2018-05-13 10:45:27.978537] train_dynamics > Dataset size:: Inputs: [ (841, 6) ], Targets: [ (841, 4) ] 
[2018-05-13 10:45:27.979840] target_dyn_opt > Optimizing parameters via mini batches
[2018-05-13 10:45:27.994103] target_dyn_opt > Initial loss [-9.81950121373824]
[2K[2018-05-13 10:45:46.555913] target_dyn_opt > Curr loss: -1.369124E+01 [1705: -1.429661E+01], n_evals: 1999, Avg. time per updt: 0.007810
[2018-05-13 10:45:46.570532] target_dyn_opt > Done training. New loss [-13.529571] iter: [2000]
[2018-05-13 10:45:46.573314] train_dynamics > Done training dynamics model
[2018-05-13 10:45:46.574957] Experience > Saving state to /localdata/juan/sim2real_results/target_2x_mass_010_taskplusil_klpq_from_source2/experience_28.zip
[2018-05-13 10:45:47.850491] NNPol

In [None]:
# experiment 11 learn starting from source params, using mmd imitation loss + task cost
output_dir = os.path.join(sim2real_output_dir, target_env.name + '_011_il_mmd_from_source')
utils.set_output_dir(utils.unique_path(output_dir))
params['n_rnd'] = 0                 # number of random initial trials
params['n_opt'] = 30                # learning iterations
source_dyn, target_dyn = init_dyn(params, dyn_path, copy_params=True)
source_pol, target_pol = init_pol(params, pol_path, copy_params=True)
loss_kwargs = dict(
    n_samples=n_samples, mm_state=True, mm_cost=False,
    noisy_policy_input=True, crn=10, time_varying_cost=True,
    extra_shared=extra_shared,
    intermediate_outs=False)

polopt_kwargs = dict(clip=1.0, polyak_averaging=None)

setup_experiment = partial(setup_mc_pilco_experiment, pol=target_pol, dyn=target_dyn)

cost = partial(task_plus_il_cost, weights=[1.0, 1.0], loss_type=utils.ImitationLossType.MMD)

run_pilco_experiment(
    target_env, cost, setup_experiment, params,
    loss_kwargs, polopt_kwargs,
    minimize_cb=minimize_cb, learning_iteration_cb=learning_iteration_cb,
    debug_plot=2, render=False)

[2018-05-15 00:02:40.615236] source_dyn > Loading state from /home/juancamilog/.kusanagi/output/cartpole_kl_loss/dynamics_21.zip
[2018-05-15 00:02:40.639754] source_dyn > Building network
('InputLayer', {'shape': (None, 6), 'name': 'BNN_input'})
('DenseLogNormalDropoutLayer', {'b': BNN_fc0>b, 'name': 'BNN_fc0', 'nonlinearity': <function rectify at 0x7f7fbc4b6b90>, 'noise_samples': BNN_fc0>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc0>W, 'logit_posterior_mean': BNN_fc0>logit_posterior_mean, 'logit_posterior_std': BNN_fc0>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_fc1>b, 'name': 'BNN_fc1', 'nonlinearity': <function rectify at 0x7f7fbc4b6b90>, 'noise_samples': BNN_fc1>noise_samples, 'p': True, 'num_units': 200, 'W': BNN_fc1>W, 'logit_posterior_mean': BNN_fc1>logit_posterior_mean, 'logit_posterior_std': BNN_fc1>logit_posterior_std})
('DenseLogNormalDropoutLayer', {'b': BNN_output>b, 'name': 'BNN_output', 'nonlinearity': <function linear at 0x7f7fbc4ba050>, 'n