In [1]:
%load_ext autoreload
%autoreload 2
'''
Example of how to use the library for learning using the PILCO learner on the cartpole task
'''
# pylint: disable=C0103
import atexit
import sys
import os
import numpy as np
import kusanagi.ghost.regression as kreg

from kusanagi import utils
from kusanagi.shell.cartpole import default_params
from kusanagi.ghost.learners.PILCO import PILCO, MC_PILCO
from kusanagi.ghost.control import NNPolicy
from kusanagi.utils import plot_results

# NOTE(review): the seed below is disabled, so NumPy's global RNG is not seeded
# by this notebook; uncomment it if repeatable random trials are wanted.
#np.random.seed(31337)
# Allow up to 500 characters per printed line so wide arrays are not wrapped.
np.set_printoptions(linewidth=500)

In [None]:
# Configure the cartpole experiment, build one PILCO and one MC-PILCO learner
# that share experience and policy, and collect the initial random trials.

# setup output directory
utils.set_output_dir(os.path.join(utils.get_output_dir(), 'cartpole'))

use_bnn = True
J = 4                                                       # number of random initial trials
N = 100                                                     # learning iterations
pilco_config = default_params()
# initialize learner params for PILCO (these settings are common to both learners)
pilco_config['params']['use_empirical_x0'] = True
pilco_config['params']['realtime'] = False
pilco_config['params']['H'] = 4.0
pilco_config['params']['plant']['dt'] = 0.1
pilco_config['params']['plant']['params']['l'] = .6
pilco_config['params']['cost']['pendulum_length'] = .6

# create copy of params for MC-PILCO
# dict.copy() is shallow: without also copying the nested 'params' dict, the
# MC-PILCO-only optimizer settings below would be written into the very same
# dict that is later passed to PILCO(**pilco_config).
# The deeper 'plant' and 'cost' dicts are intentionally left shared; they are
# not modified after this point.
mcpilco_config = pilco_config.copy()
mcpilco_config['params'] = dict(pilco_config['params'])
mcpilco_config['params']['min_method'] = 'ADAM'
mcpilco_config['params']['learning_rate'] = 1e-4
mcpilco_config['params']['max_evals'] = 1000
mcpilco_config['params']['clip'] = 10.0
mcpilco_config['n_samples'] = 25
mcpilco_config['dynmodel_class'] = kreg.BNN
mcpilco_config['policy_class'] = NNPolicy

# init learners
pilco = PILCO(**pilco_config)
mcpilco = MC_PILCO(**mcpilco_config)
mcpilco.resample = False

# share experience and policy between the two learners
mcpilco.experience = pilco.experience
mcpilco.policy = pilco.policy

# gather data with random trials on the pilco learner
# (the loop variable is kept as `i`: rebinding `_` would shadow IPython's
# last-output variable)
for i in range(J):
    pilco.plant.reset_state()
    pilco.apply_controller(random_controls=True)
#pilco.plant.reset_state()
#pilco.apply_controller()

[2017-05-18 19:50:39.986612] RBFPolicy_sat > Initializing parameters
[2017-05-18 19:50:40.000461] RBFPolicy_sat > Initialising expression graph for full GP training loss function
[2017-05-18 19:50:40.244307] RBFPolicy_sat > Initialising expression graph for prediction
[2017-05-18 19:50:40.264631] RBFPolicy_sat > Compiling mean and variance of prediction
[2017-05-18 19:50:43.326503] RBFPolicy_sat > Done compiling
[2017-05-18 19:50:43.331933] Experience > Initialising new experience dataset
[2017-05-18 19:50:43.334386] PILCO > Initialising new PILCO learner
<class 'kusanagi.ghost.control.control_.NNPolicy'> True
[2017-05-18 19:50:43.341667] Experience > Initialising new experience dataset
[2017-05-18 19:50:43.342650] MC_PILCO > Initialising new MC_PILCO learner
[2017-05-18 19:50:43.346530] Cartpole > Reset to inital state
[2017-05-18 19:50:43.347631] PILCO > Compiling cost function
[2017-05-18 19:50:43.348472] PILCO > Cost parameters: {'target': [0, 0, 0, 3.141592653589793], 'width': 0.2

In [None]:
# Train the dynamics model of the PILCO learner.
# NOTE(review): presumably fits on the experience collected by the random
# trials in the setup cell — confirm against PILCO.train_dynamics.
pilco.train_dynamics()

[2017-05-18 19:50:44.872390] PILCO > Training dynamics model
[2017-05-18 19:50:44.874439] GP_UI > Unable to load state from /home/thalassa/gamboa/.kusanagi/output/cartpole/PILCO_GP_UI_6_4_Cartpole_RBFPolicy_sat_dynamics.zip
[2017-05-18 19:50:44.877316] GP_UI > Initialising parameters
[2017-05-18 19:50:44.879210] PILCO > Dataset size:: Inputs: [ (160, 6) ], Targets: [ (160, 4) ]  
[2017-05-18 19:50:44.879995] GP_UI > Initialising expression graph for full GP training loss function
[2017-05-18 19:50:45.174715] GP_UI > Compiling full GP training loss function
[2017-05-18 19:50:45.896486] GP_UI > Compiling gradient of full GP training loss function


In [None]:
# Train the dynamics model of the MC-PILCO learner (dynmodel_class was set to
# kreg.BNN in the setup cell).
# NOTE(review): max_evals/learning-rate aside, the literal 4 matches J (the
# number of random trials); presumably it limits training to those episodes —
# confirm against MC_PILCO.train_dynamics and consider passing J instead.
mcpilco.train_dynamics(max_episodes=4)

In [None]:
# Plot MC-PILCO results with `resample` switched off, 100 samples, and
# plot_samples=True.
mcpilco.resample = False
# NOTE(review): clearing rollout_fn presumably forces the rollout function to
# be rebuilt with the new resample / n_samples settings — confirm in MC_PILCO.
mcpilco.rollout_fn = None
mcpilco.set_n_samples(100)
plot_results(mcpilco,plot_samples=True)

In [None]:
# Same plot as the previous cell but with `resample` switched on and
# plot_samples=False, for comparison.
mcpilco.resample = True
# NOTE(review): clearing rollout_fn presumably forces the rollout function to
# be rebuilt with the new resample / n_samples settings — confirm in MC_PILCO.
mcpilco.rollout_fn = None
mcpilco.set_n_samples(100)
plot_results(mcpilco,plot_samples=False)

In [None]:
# Plot results for the PILCO learner, using plot_results' default options.
plot_results(pilco)

In [None]:
# One MC-PILCO iteration: optimise the policy, then run it on the plant.
#
# Disabled variant kept for reference: it only changed the resample flag /
# sample count when they differed from the current values, and cleared both
# rollout_fn and train_fn afterwards.
#resample = True
#n_samples = 25
#mcpilco.learning_rate = 1e-4
#if mcpilco.trajectory_samples.get_value() != n_samples or mcpilco.resample != resample:
#    mcpilco.resample = resample
#    mcpilco.set_n_samples(n_samples)
#    mcpilco.rollout_fn = None
#    mcpilco.train_fn = None
# NOTE(review): the plotting cells above set n_samples to 100; here it is set
# back to 25 without clearing rollout_fn / train_fn as the disabled code did —
# confirm that set_n_samples alone is sufficient.
mcpilco.set_n_samples(25)
mcpilco.learning_rate = 1e-4
# train the policy
mcpilco.train_policy()
# reset the plant, then apply the controller
mcpilco.plant.reset_state()
mcpilco.apply_controller()

In [9]:
#mcpilco.save()