In [1]:
import atexit
import datetime
import numpy as np
import os
import torch
import tensorboardX

from prob_mbrl import utils, models, algorithms, envs
from functools import partial
# Numerical/runtime setup for torch: one thread, denormals flushed to zero.
torch.set_num_threads(1)
torch.set_flush_denormal(True)
# Seed both random number generators used in this notebook so that a fresh
# "Restart & Run All" draws the same numbers.
torch.manual_seed(seed=0)
np.random.seed(seed=0)

In [2]:
# parameters

# -- data collection and horizons --
n_rnd = 4  # multiplied by control_H to size the initial random-control dataset
control_H = 40  # steps per rollout on the environment (may be overridden below)
pred_H = 25  # horizon handed to mc_pilco
N_particles = 100  # passed to train_regressor and to exp.sample_states

# -- optimization budgets --
ps_iters = 100  # outer policy-search iterations
dyn_opt_iters = 2000  # dynamics-model training iterations per outer iteration
pol_opt_iters = 1000  # policy optimization iterations per outer iteration

# -- network architectures --
dyn_components = 1  # > 1 selects a Gaussian mixture output density
dyn_shape = [200, 200]  # hidden layer sizes of the dynamics network
pol_shape = [200, 200]  # hidden layer sizes of the policy network

# -- switches --
use_cuda = False  # move the models to the GPU when one is available
learn_reward = False  # learn the reward with the dynamics model
keep_best = False  # not referenced anywhere else in the visible notebook

# initialize environment
# env = envs.Pendulum() # this works better with learning the reward function
env = envs.Cartpole()

# The experience dataset is saved to a per-run file named after the
# environment class and the current timestamp.
results_filename = os.path.expanduser(
    "~/.prob_mbrl/results_%s_%s.pth.tar" %
    (env.__class__.__name__,
     datetime.datetime.now().strftime("%Y%m%d%H%M%S.%f")))
# Make sure the target folder exists before anything is saved there; on a
# fresh machine ~/.prob_mbrl may not have been created yet.
results_dir = os.path.dirname(results_filename)
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)

# observation / action dimensionality and the action bounds
D = env.observation_space.shape[0]
U = env.action_space.shape[0]
maxU = env.action_space.high
minU = env.action_space.low

# initialize reward/cost function
# No reward function is handed to the dynamics model when the reward is to be
# learned or when the environment does not provide one; otherwise the
# environment's own reward function is used.
reward_func = (None if learn_reward or env.reward_func is None else
               env.reward_func)

# initialize to max episode steps if available
# Only override the default horizon when the environment spec actually
# carries a step limit: a missing spec, a missing attribute or a value of None
# all keep the control_H configured above (a None horizon would otherwise
# break the multiplication below).
_max_episode_steps = getattr(getattr(env, 'spec', None), 'max_episode_steps',
                             None)
if _max_episode_steps is not None:
    control_H = _max_episode_steps
# number of samples to gather with random controls before learning starts
initial_experience = control_H * n_rnd

# initialize dynamics model
# dynE is the size of the network's output layer: two outputs per predicted
# dimension (presumably mean and scale -- confirm against the density
# classes), where the predicted dimensions are the D state dimensions plus
# one for the reward when it is learned.
dynE = 2 * (D + 1) if learn_reward else 2 * D
# Floor division keeps the dimension count an int; `/` would hand a float
# (e.g. 5.0) to the density constructors under Python 3.
if dyn_components > 1:
    output_density = models.GaussianMixtureDensity(dynE // 2,
                                                   dyn_components)
    # the mixture density needs a larger output layer
    dynE = (dynE + 1) * dyn_components + 1
else:
    output_density = models.DiagGaussianDensity(dynE // 2)

# One concrete-dropout module per hidden layer of the dynamics network.
dyn_dropout = [models.modules.CDropout(0.25, 0.1) for _ in dyn_shape]

# MLP mapping a (state, action) vector of size D + U to dynE outputs.
dyn_model = models.mlp(
    D + U,
    dynE,
    dyn_shape,
    dropout_layers=dyn_dropout,
    nonlin=torch.nn.ReLU,
)

# Wrap the network together with the reward function and output density.
dyn = models.DynamicsModel(
    dyn_model,
    reward_func=reward_func,
    output_density=output_density,
).float()

# initialize policy
# One dropout module per hidden layer of the policy network.
pol_dropout = [models.modules.BDropout(0.1) for _ in pol_shape]

# MLP mapping a state of size D to 2 * U outputs, which are passed through a
# diagonal Gaussian density over the U action dimensions.
pol_model = models.mlp(
    D,
    2 * U,
    pol_shape,
    dropout_layers=pol_dropout,
    nonlin=torch.nn.ReLU,
    output_nonlin=partial(models.DiagGaussianDensity, U),
)

# The policy wrapper also receives the action bounds.
pol = models.Policy(pol_model, maxU, minU).float()

# Show both architectures in the cell output.
print(dyn)
print(pol)

# initialize experience dataset
exp = utils.ExperienceDataset()

# Separate Adam optimizers, both with learning rate 1e-4:
# opt1 updates the dynamics model, opt2 updates the policy.
opt1 = torch.optim.Adam(dyn.parameters(), lr=1e-4)
opt2 = torch.optim.Adam(pol.parameters(), lr=1e-4)

DynamicsModel(
  (model): BSequential(
    (fc0): Linear(in_features=6, out_features=200, bias=True)
    (nonlin0): ReLU()
    (drop0): CDropout(rate=0.25, temperature=0.10000000149011612, regularizer_scale=0.5)
    (fc1): Linear(in_features=200, out_features=200, bias=True)
    (nonlin1): ReLU()
    (drop1): CDropout(rate=0.25, temperature=0.10000000149011612, regularizer_scale=0.5)
    (fc_out): Linear(in_features=200, out_features=10, bias=True)
  )
  (output_density): DiagGaussianDensity(output_dims=5)
  (reward_func): CartpoleReward()
)
Policy(
  (model): BSequential(
    (fc0): Linear(in_features=5, out_features=200, bias=True)
    (nonlin0): ReLU()
    (drop0): BDropout(rate=0.10000000149011612, regularizer_scale=0.5)
    (fc1): Linear(in_features=200, out_features=200, bias=True)
    (nonlin1): ReLU()
    (drop1): BDropout(rate=0.10000000149011612, regularizer_scale=0.5)
    (fc_out): Linear(in_features=200, out_features=2, bias=True)
    (fc_nonlin): DiagGaussianDensity(output

In [None]:
%matplotlib qt5
# Move both models to the GPU only when requested and a device is available.
if use_cuda and torch.cuda.is_available():
    dyn, pol = dyn.cuda(), pol.cuda()

# Summary writer used for all the logging done during training.
writer = tensorboardX.SummaryWriter()


# callbacks
def on_close():
    """Close the summary writer; registered below to run at interpreter exit."""
    writer.close()


atexit.register(on_close)

# initial experience data collection
scale = maxU - minU
bias = minU


def rnd(x, t):
    """Random controller: ignore both arguments and draw an action uniformly
    between minU and maxU."""
    return scale * np.random.rand(U, ) + bias


# Keep running random-control episodes until the dataset holds at least
# initial_experience samples; the dataset is saved after every episode.
while exp.n_samples() < initial_experience:
    ret = utils.apply_controller(
        env,
        rnd,
        control_H,
        callback=lambda *args, **kwargs: env.render(),
        realtime=False,
        stop_when_done=False)
    # store a copy of the current policy parameters with the episode
    params_ = [p.clone() for p in pol.parameters()]
    exp.append_episode(*ret, policy_params=params_)
    exp.save(results_filename)

# policy learning loop
for ps_it in range(ps_iters):
    # The first pass reuses the data gathered above; every later pass (or a
    # pass that starts with an empty dataset) first rolls out the current
    # policy on the environment and stores the episode.
    if ps_it > 0 or exp.n_samples() == 0:
        # apply policy
        ret = utils.apply_controller(
            env,
            pol,
            control_H,
            realtime=False,
            stop_when_done=False,
            callback=lambda *args, **kwargs: env.render())
        params_ = [p.clone() for p in pol.parameters()]
        exp.append_episode(*ret, policy_params=params_)
        exp.save(results_filename)

    # train dynamics
    X, Y = exp.get_dynmodel_dataset(deltas=True, return_costs=learn_reward)
    dyn.set_dataset(X.to(dyn.X.device).float(), Y.to(dyn.X.device).float())
    utils.train_regressor(
        dyn,
        dyn_opt_iters,
        N_particles,
        True,
        opt1,
        log_likelihood=dyn.output_density.log_prob,
        summary_writer=writer,
        summary_scope='model_learning/episode_%d' % ps_it)

    # sample initial states for policy optimization, perturb them with
    # Gaussian noise and cut them off from the autograd graph
    x0 = exp.sample_states(N_particles, timestep=0).to(dyn.X.device).float()
    x0 = (x0 + 1e-1 * torch.randn_like(x0)).detach()

    # rollout plot before the policy update
    utils.plot_rollout(x0, dyn, pol, control_H)

    # train policy
    def on_iteration(i, loss, states, actions, rewards, discount):
        """Callback handed to mc_pilco: log the loss for iteration ``i`` and
        flush the summary writer on every 100th iteration.

        ``states``, ``actions``, ``rewards`` and ``discount`` are accepted but
        not used while the trajectory plotting below is disabled.
        """
        writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                          loss, i)
        if i % 100 != 0:
            return
        # Trajectory plotting, currently disabled:
        # states = states.transpose(0, 1).cpu().detach().numpy()
        # actions = actions.transpose(0, 1).cpu().detach().numpy()
        # rewards = rewards.transpose(0, 1).cpu().detach().numpy()
        # utils.plot_trajectories(states,
        #                         actions,
        #                         rewards,
        #                         plot_samples=True)
        writer.flush()

    print("Policy search iteration %d" % (ps_it + 1))
    algorithms.mc_pilco(
        x0,
        dyn,
        pol,
        pred_H,
        opt2,
        exp,
        pol_opt_iters,
        pegasus=True,
        mm_states=True,
        mm_rewards=True,
        maximize=True,
        clip_grad=1.0,
        on_iteration=on_iteration,
        step_idx_to_sample=0,
        init_state_noise=1e-1 * x0.std(0))

    # rollout plot after the policy update
    utils.plot_rollout(x0, dyn, pol, control_H)

    # Log the sum over the third element of the most recent rollout result
    # (presumably the per-step rewards -- confirm against apply_controller).
    writer.add_scalar('robot/evaluation_loss',
                      torch.tensor(ret[2]).sum(), ps_it + 1)


apply_controller Starting run
apply_controller Running for 4.000000 seconds
apply_controller Done after [40] steps. Stopping robot. Value of run [0.009175]
apply_controller Starting run
apply_controller Running for 4.000000 seconds
apply_controller Done after [40] steps. Stopping robot. Value of run [0.572404]
apply_controller Starting run
apply_controller Running for 4.000000 seconds
apply_controller Done after [40] steps. Stopping robot. Value of run [0.006426]
apply_controller Starting run
apply_controller Running for 4.000000 seconds
apply_controller Done after [40] steps. Stopping robot. Value of run [0.025665]


log-likelihood of data: -7.943673:   1%|          | 14/2000 [00:00<00:14, 135.28it/s]

train_regressor > Dataset size [156]


log-likelihood of data: -0.029115: 100%|█████████▉| 1996/2000 [00:12<00:00, 170.74it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Policy search iteration 1


Pred. Cumm. rewards: 3.800731 [25]: 100%|██████████| 1000/1000 [01:51<00:00,  9.66it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: -0.215226:   1%|          | 16/2000 [00:00<00:13, 151.45it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [1.958003]
train_regressor > Dataset size [195]


log-likelihood of data: 1.654886: 100%|█████████▉| 1998/2000 [00:12<00:00, 157.80it/s]
Pred. Cumm. rewards: 1.067301 [25]:   0%|          | 1/1000 [00:00<01:59,  8.34it/s]

Policy search iteration 2


Pred. Cumm. rewards: 7.496438 [25]: 100%|██████████| 1000/1000 [01:43<00:00,  9.27it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 0.920200:   1%|          | 17/2000 [00:00<00:12, 162.41it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [11.374109]
train_regressor > Dataset size [234]


log-likelihood of data: 2.813802:  99%|█████████▉| 1986/2000 [00:12<00:00, 163.58it/s]
Pred. Cumm. rewards: 3.125003 [25]:   0%|          | 1/1000 [00:00<01:43,  9.66it/s]

Policy search iteration 3


Pred. Cumm. rewards: 7.434040 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.83it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 2.049290:   1%|          | 16/2000 [00:00<00:12, 159.38it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [10.344432]
train_regressor > Dataset size [273]


log-likelihood of data: 2.775593: 100%|█████████▉| 1997/2000 [00:12<00:00, 156.08it/s]
Pred. Cumm. rewards: 4.749218 [25]:   0%|          | 1/1000 [00:00<01:43,  9.62it/s]

Policy search iteration 4


Pred. Cumm. rewards: 6.825605 [25]: 100%|██████████| 1000/1000 [01:45<00:00,  9.56it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 2.678162:   1%|          | 16/2000 [00:00<00:12, 154.67it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [5.977985]
train_regressor > Dataset size [312]


log-likelihood of data: 3.355496: 100%|█████████▉| 1996/2000 [00:12<00:00, 162.39it/s]
Pred. Cumm. rewards: 4.069183 [25]:   0%|          | 0/1000 [00:00<?, ?it/s]

Policy search iteration 5


Pred. Cumm. rewards: 6.353509 [25]: 100%|██████████| 1000/1000 [01:43<00:00,  9.70it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 3.251989:   1%|          | 16/2000 [00:00<00:12, 156.18it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [9.977594]
train_regressor > Dataset size [351]


log-likelihood of data: 4.562339: 100%|█████████▉| 1998/2000 [00:12<00:00, 160.43it/s]
Pred. Cumm. rewards: 4.943288 [25]:   0%|          | 1/1000 [00:00<01:42,  9.77it/s]

Policy search iteration 6


Pred. Cumm. rewards: 6.858777 [25]: 100%|██████████| 1000/1000 [01:45<00:00,  9.49it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 3.870353:   1%|          | 16/2000 [00:00<00:13, 149.54it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [10.828394]
train_regressor > Dataset size [390]


log-likelihood of data: 5.380519: 100%|█████████▉| 1999/2000 [00:13<00:00, 150.43it/s]
Pred. Cumm. rewards: 4.559573 [25]:   0%|          | 1/1000 [00:00<01:48,  9.18it/s]

Policy search iteration 7


Pred. Cumm. rewards: 6.050838 [25]: 100%|██████████| 1000/1000 [01:45<00:00,  9.78it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 5.490703:   1%|          | 15/2000 [00:00<00:13, 147.90it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [9.875975]
train_regressor > Dataset size [429]


log-likelihood of data: 5.804075: 100%|█████████▉| 1998/2000 [00:12<00:00, 149.93it/s]
Pred. Cumm. rewards: 5.038359 [25]:   0%|          | 1/1000 [00:00<01:45,  9.45it/s]

Policy search iteration 8


Pred. Cumm. rewards: 6.614202 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.05it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 5.542390:   1%|          | 15/2000 [00:00<00:13, 141.99it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [9.766203]
train_regressor > Dataset size [468]


log-likelihood of data: 6.396270:  99%|█████████▉| 1986/2000 [00:12<00:00, 160.32it/s]
Pred. Cumm. rewards: 5.009095 [25]:   0%|          | 2/1000 [00:00<01:35, 10.47it/s]

Policy search iteration 9


Pred. Cumm. rewards: 8.403789 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.76it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 6.405999:   1%|          | 16/2000 [00:00<00:12, 156.07it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [6.155827]
train_regressor > Dataset size [507]


log-likelihood of data: 6.576864: 100%|█████████▉| 1991/2000 [00:12<00:00, 159.05it/s]
Pred. Cumm. rewards: 4.414504 [25]:   0%|          | 1/1000 [00:00<01:46,  9.41it/s]

Policy search iteration 10


Pred. Cumm. rewards: 9.068583 [25]: 100%|██████████| 1000/1000 [01:45<00:00,  9.46it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 6.374048:   1%|          | 17/2000 [00:00<00:11, 165.57it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [18.865135]
train_regressor > Dataset size [546]


log-likelihood of data: 7.658630: 100%|█████████▉| 1995/2000 [00:12<00:00, 168.03it/s]
Pred. Cumm. rewards: 6.258204 [25]:   0%|          | 2/1000 [00:00<01:37, 10.29it/s]

Policy search iteration 11


Pred. Cumm. rewards: 16.094427 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.53it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 6.727750:   1%|          | 16/2000 [00:00<00:12, 157.18it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [5.804806]
train_regressor > Dataset size [585]


log-likelihood of data: 8.120251:  99%|█████████▉| 1984/2000 [00:12<00:00, 154.57it/s]
Pred. Cumm. rewards: 4.405800 [25]:   0%|          | 2/1000 [00:00<01:36, 10.39it/s]

Policy search iteration 12


Pred. Cumm. rewards: 16.438950 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.55it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 7.035192:   0%|          | 0/2000 [00:00<?, ?it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [30.615314]
train_regressor > Dataset size [624]


log-likelihood of data: 8.469097: 100%|█████████▉| 1996/2000 [00:12<00:00, 160.32it/s]
Pred. Cumm. rewards: 4.171557 [25]:   0%|          | 0/1000 [00:00<?, ?it/s]

Policy search iteration 13


Pred. Cumm. rewards: 17.038742 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.64it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 7.769278:   0%|          | 0/2000 [00:00<?, ?it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [31.781195]
train_regressor > Dataset size [663]


log-likelihood of data: 9.246945: 100%|█████████▉| 1992/2000 [00:12<00:00, 158.02it/s]
Pred. Cumm. rewards: 4.866514 [25]:   0%|          | 1/1000 [00:00<01:51,  8.99it/s]

Policy search iteration 14


Pred. Cumm. rewards: 17.507059 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.91it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 8.057062:   1%|          | 17/2000 [00:00<00:12, 163.04it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [32.368610]
train_regressor > Dataset size [702]


log-likelihood of data: 9.202250: 100%|█████████▉| 1997/2000 [00:12<00:00, 156.43it/s] 
Pred. Cumm. rewards: 3.817450 [25]:   0%|          | 1/1000 [00:00<01:41,  9.85it/s]

Policy search iteration 15


Pred. Cumm. rewards: 17.507593 [25]: 100%|██████████| 1000/1000 [01:43<00:00,  9.12it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 9.539936:   1%|          | 16/2000 [00:00<00:12, 159.13it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [31.940863]
train_regressor > Dataset size [741]


log-likelihood of data: 9.680073: 100%|█████████▉| 1997/2000 [00:12<00:00, 156.34it/s] 
Pred. Cumm. rewards: 5.758907 [25]:   0%|          | 1/1000 [00:00<01:46,  9.39it/s]

Policy search iteration 16


Pred. Cumm. rewards: 17.689659 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.61it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 9.633832:   1%|          | 15/2000 [00:00<00:13, 142.40it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [32.487495]
train_regressor > Dataset size [780]


log-likelihood of data: 10.581008: 100%|█████████▉| 1994/2000 [00:12<00:00, 153.55it/s]
Pred. Cumm. rewards: 2.599709 [25]:   0%|          | 1/1000 [00:00<01:53,  8.78it/s]

Policy search iteration 17


Pred. Cumm. rewards: 17.726728 [25]: 100%|██████████| 1000/1000 [01:44<00:00,  9.61it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds
apply_controller Done after [40] steps. Stopping robot. Value of run [32.873203]


log-likelihood of data: 10.220001:   1%|          | 16/2000 [00:00<00:13, 151.48it/s]

train_regressor > Dataset size [819]


log-likelihood of data: 10.972873:  99%|█████████▉| 1985/2000 [00:12<00:00, 156.80it/s]
Pred. Cumm. rewards: 6.075938 [25]:   0%|          | 1/1000 [00:00<01:48,  9.17it/s]

Policy search iteration 18


Pred. Cumm. rewards: 17.802050 [25]: 100%|██████████| 1000/1000 [01:46<00:00,  9.07it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 10.808597:   0%|          | 0/2000 [00:00<?, ?it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [33.014568]
train_regressor > Dataset size [858]


log-likelihood of data: 12.225194:  99%|█████████▉| 1987/2000 [00:11<00:00, 173.79it/s]
Pred. Cumm. rewards: 4.181459 [25]:   0%|          | 1/1000 [00:00<01:49,  9.13it/s]

Policy search iteration 19


Pred. Cumm. rewards: 17.770880 [25]: 100%|██████████| 1000/1000 [01:39<00:00,  9.19it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 12.590409:   0%|          | 0/2000 [00:00<?, ?it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [32.674847]
train_regressor > Dataset size [897]


log-likelihood of data: 12.773203:  99%|█████████▉| 1988/2000 [00:12<00:00, 159.45it/s]
Pred. Cumm. rewards: 4.142340 [25]:   0%|          | 1/1000 [00:00<01:51,  8.94it/s]

Policy search iteration 20


Pred. Cumm. rewards: 17.944530 [25]: 100%|██████████| 1000/1000 [01:41<00:00, 10.54it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 11.522026:   0%|          | 0/2000 [00:00<?, ?it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [33.002983]
train_regressor > Dataset size [936]


log-likelihood of data: 13.655376:  99%|█████████▉| 1989/2000 [00:12<00:00, 160.44it/s]
Pred. Cumm. rewards: 3.662322 [25]:   0%|          | 1/1000 [00:00<01:50,  9.04it/s]

Policy search iteration 21


Pred. Cumm. rewards: 18.035997 [25]: 100%|██████████| 1000/1000 [01:38<00:00, 10.15it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds


log-likelihood of data: 12.243679:   0%|          | 0/2000 [00:00<?, ?it/s]

apply_controller Done after [40] steps. Stopping robot. Value of run [32.837471]
train_regressor > Dataset size [975]


log-likelihood of data: 13.260921: 100%|█████████▉| 1992/2000 [00:12<00:00, 160.97it/s]
Pred. Cumm. rewards: 3.975950 [25]:   0%|          | 1/1000 [00:00<01:56,  8.56it/s]

Policy search iteration 22


Pred. Cumm. rewards: 17.972471 [25]: 100%|██████████| 1000/1000 [01:41<00:00, 11.42it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds
apply_controller Done after [40] steps. Stopping robot. Value of run [33.013786]


log-likelihood of data: 12.329764:   1%|          | 14/2000 [00:00<00:14, 138.51it/s]

train_regressor > Dataset size [1014]


log-likelihood of data: 14.467130:  99%|█████████▉| 1988/2000 [00:14<00:00, 134.70it/s]
Pred. Cumm. rewards: 11.993865 [25]:   0%|          | 2/1000 [00:00<01:34, 10.58it/s]

Policy search iteration 23


Pred. Cumm. rewards: 18.055079 [25]: 100%|██████████| 1000/1000 [01:30<00:00, 11.02it/s]


apply_controller Starting run
apply_controller Running for 4.000000 seconds
apply_controller Done after [40] steps. Stopping robot. Value of run [32.969631]


log-likelihood of data: 13.322544:   1%|          | 14/2000 [00:00<00:15, 131.16it/s]

train_regressor > Dataset size [1053]


log-likelihood of data: 13.291028:  56%|█████▋    | 1129/2000 [00:08<00:06, 132.01it/s]