In [1]:
from IPython import display
%matplotlib inline
import gym
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import torch
import omegaconf
import time
import torch

from torch import nn as nn
import torch.nn.functional as F

import mbrl.env.cartpole_continuous as cartpole_env
import mbrl.env.reward_fns as reward_fns
import mbrl.env.termination_fns as termination_fns
import mbrl.models as models
import mbrl.planning as planning
import mbrl.util.common as common_util
import mbrl.util as util

import tactile_gym.rl_envs
from tactile_gym.sb3_helpers.params import import_parameters

%load_ext autoreload
%autoreload 2

mpl.rcParams.update({"font.size": 16})

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [2]:
# Load the default parameters for the pushing environment, then override
# the settings this experiment cares about.
algo_name = 'ppo'
env_name = 'object_push-v0'
rl_params, algo_params, augmentations = import_parameters(env_name, algo_name)
rl_params["max_ep_len"] = 1000

rl_params["env_modes"].update({
    # --- observation / control interface and task definition ---
    'observation_mode': 'tactile_pose_relative_data',
    'control_mode': 'TCP_position_control',
    'movement_mode': 'TyRz',
    'traj_type': 'point',
    'task': "goal_pos",
    'planar_states': True,
    'use_contact': True,
    'terminate_early': True,
    # NOTE(review): the doubled "terminate_" looks like a typo -- confirm the
    # environment actually reads this key before relying on it.
    'terminate_terminate_early': True,

    # --- randomisation of the initial conditions ---
    'rand_init_orn': True,
    # 'rand_init_pos_y': True,
    # 'rand_obj_mass': True,

    # --- reward shaping and early-termination thresholds ---
    'additional_reward_settings': 'john_guide_off_normal',
    'terminated_early_penalty': -500,
    'reached_goal_reward': 100,
    'max_no_contact_steps': 40,
    'max_tcp_to_obj_orn': 30/180 * np.pi,
    'importance_obj_goal_pos': 1.0,
    'importance_obj_goal_orn': 1.0,
    'importance_tip_obj_orn': 1.0,

    # --- goal orientation updates ---
    'mpc_goal_orn_update': True,
    'goal_orn_update_freq': 'every_step',
})


# set limits and goals: one [min, max] row per TCP axis
TCP_lims = np.array(
    [
        [-0.1, 0.4],  # x lims
        [-0.3, 0.3],  # y lims
        [-0.0, 0.0],  # z lims
        [-0.0, 0.0],  # roll lims
        [-0.0, 0.0],  # pitch lims
        [-180 * np.pi / 180, 180 * np.pi / 180],  # yaw lims
    ]
)

# goal parameter
goal_edges = [(0, -1), (0, 1), (1, 0)]  # top, bottom and straight
# goal_edges = [(1, 0)]

# Goal ranges are fractions of the x / y limits, stored as plain Python floats.
x_upper = TCP_lims[0, 1]
y_lower, y_upper = TCP_lims[1]
goal_x_min = 0.0  # np.float64(TCP_lims[0, 0] * 0.6).item()
goal_x_max = float(x_upper * 0.8)
goal_y_min = float(y_lower * 0.6)
goal_y_max = float(y_upper * 0.6)
goal_ranges = [goal_x_min, goal_x_max, goal_y_min, goal_y_max]

# Pass the workspace limits and goal settings to the environment.
rl_params["env_modes"]['tcp_lims'] = TCP_lims.tolist()
rl_params["env_modes"]['goal_edges'] = goal_edges
rl_params["env_modes"]['goal_ranges'] = goal_ranges

env_kwargs = {
    'show_gui': False,
    'show_tactile': False,
    'obs_stacked_len': 1,
    'max_steps': rl_params["max_ep_len"],
    'image_size': rl_params["image_size"],
    'env_modes': rl_params["env_modes"],
}

# training environment
env = gym.make(env_name, **env_kwargs)

# Seed every source of randomness from the single `seed` value so that
# Restart & Run All reproduces the same data.
seed = 0
env.seed(seed)
# The action space draws from its own RNG, which env.seed() may not cover;
# seed it too, since env.action_space.sample() is used for random actions later.
env.action_space.seed(seed)
rng = np.random.default_rng(seed=seed)
generator = torch.Generator(device=device)
generator.manual_seed(seed)

# Shapes used below to size the dynamics model and the replay buffer.
obs_shape = env.observation_space.shape
act_shape = env.action_space.shape

pybullet build time: Mar  8 2021 17:26:24


argv[0]=
Loaded EGL 1.5 after reload.
GL_VENDOR=NVIDIA Corporation
GL_RENDERER=NVIDIA GeForce RTX 3090/PCIe/SSE2
GL_VERSION=4.6.0 NVIDIA 495.29.05
GL_SHADING_LANGUAGE_VERSION=4.60 NVIDIA
Version = 4.6.0 NVIDIA 495.29.05
Vendor = NVIDIA Corporation
Renderer = NVIDIA GeForce RTX 3090/PCIe/SSE2
ven = NVIDIA Corporation
ven = NVIDIA Corporation


In [3]:
# Show the observation / action shapes, then take a few random steps in the
# real environment to see what the observations look like.
print(obs_shape)
print(act_shape)

env.reset()
num_preview_steps = 4
for step_idx in range(num_preview_steps):
    random_action = env.action_space.sample()
    obs, _, done, _ = env.step(random_action)
    print(obs)
    if done:
        # Stop previewing as soon as the episode ends.
        break

(1, 8)
(2,)
[[0.002749 -0.005235 -0.158004 0.987438 -0.001386 0.005535 0.153985
  0.988073]]
[[0.002843 -0.005203 -0.165753 0.986167 -0.000767 0.005098 0.157203
  0.987566]]
[[0.002956 -0.005161 -0.164881 0.986313 -0.000455 0.004742 0.159286
  0.987233]]
[[0.003141 -0.005099 -0.162799 0.986659 0.000168 0.005242 0.156160
  0.987732]]


In [4]:
trial_length = 1000
num_trials = 10
ensemble_size = 5
buffer_size = 10000
target_normalised = True

# Everything with "???" indicates an option with a missing value.
# Our utility functions will fill in these details using the
# environment information

# activation function used by the GaussianMLP (configurable)
activation_fn_cfg = {
    "_target_": "torch.nn.LeakyReLU",
    "negative_slope": 0.01,
}

# dynamics model configuration
dynamics_model_cfg = {
    "_target_": "mbrl.models.GaussianMLP",
    "device": device,
    "num_layers": 3,
    "ensemble_size": ensemble_size,
    "hid_size": 200,
    "in_size": "???",
    "out_size": "???",
    "deterministic": False,
    "propagation_method": "fixed_model",
    "activation_fn_cfg": activation_fn_cfg,
}

# options for training the dynamics model
algorithm_cfg = {
    "learned_rewards": False,
    "target_is_delta": True,
    "normalize": True,
    "target_normalize": target_normalised,
    "dataset_size": buffer_size,
}

# these are experiment specific options
overrides_cfg = {
    "trial_length": trial_length,
    "num_steps": num_trials * trial_length,
    "model_batch_size": 32,
    "validation_ratio": 0.05,
}

cfg_dict = {
    "dynamics_model": dynamics_model_cfg,
    "algorithm": algorithm_cfg,
    "overrides": overrides_cfg,
}
cfg = omegaconf.OmegaConf.create(cfg_dict)

In [5]:
# Bare expression: the notebook renders the observation-space shape as this cell's output.
obs_shape

(1, 8)

In [6]:
# Build the 1-D dynamics model described by `cfg`, sized from the
# environment's observation and action shapes.
dynamics_model = common_util.create_one_dim_tr_model(cfg, obs_shape, act_shape)

# Wrap the model in a gym-like environment. No termination or reward
# function is supplied here; `generator` is the seeded torch generator.
model_env = models.ModelEnvPushing(
    env,
    dynamics_model,
    termination_fn=None,
    reward_fn=None,
    generator=generator,
)

  if OmegaConf.is_none(config):


In [7]:
# Print the module tree of the dynamics model to check layer sizes and ensemble members.
print(dynamics_model)

OneDTransitionRewardModel(
  (model): GaussianMLP(
    (hidden_layers): Sequential(
      (0): Sequential(
        (0): EnsembleLinearLayer(num_members=5, in_size=10, out_size=200, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
      )
      (1): Sequential(
        (0): EnsembleLinearLayer(num_members=5, in_size=200, out_size=200, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
      )
      (2): Sequential(
        (0): EnsembleLinearLayer(num_members=5, in_size=200, out_size=200, bias=True)
        (1): LeakyReLU(negative_slope=0.01)
      )
    )
    (mean_and_logvar): EnsembleLinearLayer(num_members=5, in_size=200, out_size=16, bias=True)
  )
)


In [8]:
# Fill a fresh replay buffer with transitions collected by a RandomAgent
# acting in the real environment.
initial_exploration_steps = 1000

replay_buffer = common_util.create_replay_buffer(cfg, obs_shape, act_shape, rng=rng)
random_agent = planning.RandomAgent(env)
agent_act_kwargs = {}  # keyword arguments to pass to agent.act()

common_util.rollout_agent_trajectories(
    env,
    initial_exploration_steps,
    random_agent,
    agent_act_kwargs,
    replay_buffer=replay_buffer,
    trial_length=trial_length,
)

print("# samples stored", replay_buffer.num_stored)

# samples stored 1000


In [20]:
# Shape check for the model-rollout inputs: tile one buffer state and one
# buffer action sequence into a small batch and print every shape.
planning_horizon = 15
num_particles = 5  # number of identical copies placed in the batch

# Get action sequence from buffer
data = replay_buffer.get_all()
first_actions = data.act[:planning_horizon]
action_sequences = torch.from_numpy(
    np.tile(first_actions, (num_particles, 1, 1)).astype(np.float32)
)

# Initialise state and create model input
initial_state = data.obs[0]
print(initial_state.shape)
initial_obs_batch = np.tile(initial_state, (num_particles, 1)).astype(np.float32)
print(initial_obs_batch.shape)
model_state = model_env.reset(initial_obs_batch, return_as_np=False)

# Model-state observations, the full action tensor, and one time step's slice.
for shape in (
    model_state['obs'].shape,
    action_sequences.shape,
    action_sequences[:, 1, :].shape,
):
    print(shape)

(1, 8)
(5, 8)
torch.Size([5, 8])
torch.Size([5, 15, 2])
torch.Size([5, 2])


In [10]:
# -------- Test the model on one set of action sequences from the buffer to see
# whether exploding states still occur -------

planning_horizon = 15
num_particles = 5  # identical copies of the start state / action sequence in the batch

# Get action sequence from buffer
data = replay_buffer.get_all()
action_sequences = data.act[0:planning_horizon, :]
action_sequences = np.tile(action_sequences, (num_particles, 1, 1)).astype(np.float32)
action_sequences = torch.from_numpy(action_sequences)
# print(action_sequences.shape)

# Initialise state and create model input
initial_state = data.obs[0]
# print(initial_state.shape)
initial_obs_batch = np.tile(initial_state, (num_particles, 1)).astype(np.float32)
# print(initial_obs_batch.shape)
model_state = model_env.reset(initial_obs_batch, return_as_np=False)
# print(model_state['propagation_indices'])

batch_size = initial_obs_batch.shape[0]
# Accumulators used only by the commented-out `model_env.step` variant below.
total_rewards = torch.zeros(batch_size, 1)
terminated = torch.zeros(batch_size, 1, dtype=torch.bool)
model_env.reset_batch_goals(batch_size)

# print(data.obs[1][0:3])
# print(data.next_obs[1][0:3])
# print(data.act[1])

for time_step in range(planning_horizon):
    # Batch mean of the first three observation dimensions *before* this
    # step's action is applied.
    print(torch.mean(model_state["obs"], 0)[0:3])
    # print(model_state["obs"].shape)
    # print(torch.mean(model_state["obs"]))
    action_for_step = action_sequences[:, time_step, :]
    # print(action_for_step[0])

    # Re-initialise model state from data buffer with every time step (1 step rollouts)
    # Comment out to do planning_horizon step rollouts
    # initial_state = data.obs[time_step]
    # initial_obs_batch = np.tile(initial_state, (5,1)).astype(np.float32)
    # initial_obs_batch = torch.from_numpy(initial_obs_batch)
    # model_state.update({'obs': initial_obs_batch})
    # action_batch = torch.repeat_interleave(
    #     action_for_step, 20, dim=0
    # )

    action_batch = action_for_step
    # ---------------- Use model_env.step -----------------
    # _, rewards, dones, model_state = model_env.step(
    #     action_batch, model_state, sample=True
    # )
    # rewards[terminated] = 0
    # terminated |= dones
    # total_rewards += rewards

    # -------------- Use one_dim_tr_model sample -------------
    with torch.no_grad():
        next_observs, _, _, next_model_state = model_env.dynamics_model.sample(
            action_batch, model_state, deterministic=False, rng=model_env._rng,
        )
    # Carry the rollout forward explicitly. Without this rebinding the loop
    # only advances if sample() happens to mutate `model_state` in place.
    model_state = next_model_state

    # -------------- Use model.sample_1d() --------------------
    # with torch.no_grad():
    #     obs = model_state["obs"]
    #     model_in = model_env.dynamics_model._get_model_input(model_state["obs"], action_batch)
    #     next_observs, _ = model_env.dynamics_model.model.sample_1d(
    #         model_in, model_state, rng=model_env._rng, deterministic=False
    #     )
    #     next_observs += obs
    #     model_state["obs"] = next_observs

    # -------------- Use model.forward()-------------------------
    # with torch.no_grad():
    #     obs = model_state["obs"]
    #     model_in = model_env.dynamics_model._get_model_input(model_state["obs"], action_batch)
    #     means, logvars = model_env.dynamics_model.model.forward(
    #         model_in, rng=model_env._rng, propagation_indices=model_state["propagation_indices"]
    #     )
    #     variances = logvars.exp()
    #     stds = torch.sqrt(variances)
    #     # stds = torch.ones((5,30))
    #     next_observs = torch.normal(means, stds, generator=model_env._rng)
    #     # next_observs = means
    #     # print(torch.mean(means))
    #     # print(torch.mean(logvars))
    #     # print(torch.mean(stds))
    #     if dynamics_model.target_normalizer:
    #         next_observs = dynamics_model.target_normalizer.denormalize(next_observs)

    #     if dynamics_model.target_is_delta:
    #         next_observs += obs
    #     model_state["obs"] = next_observs

tensor([ 8.6693e-04,  8.9407e-07, -8.9774e-02], device='cuda:0')
tensor([-0.4178,  0.0146, -0.3882], device='cuda:0')
tensor([-0.3934,  0.1383,  0.1007], device='cuda:0')
tensor([ 0.2892, -0.1566,  0.1400], device='cuda:0')
tensor([ 0.1399, -0.0492, -0.2984], device='cuda:0')
tensor([ 0.3311,  0.4704, -0.3678], device='cuda:0')
tensor([ 0.8319,  0.0420, -0.0200], device='cuda:0')
tensor([ 0.4669,  1.0363, -0.0445], device='cuda:0')
tensor([ 0.7695,  1.3614, -0.2305], device='cuda:0')
tensor([ 0.5526,  2.0521, -0.8765], device='cuda:0')
tensor([ 0.8042,  1.9921, -1.2258], device='cuda:0')
tensor([ 0.8107,  2.5183, -1.1476], device='cuda:0')
tensor([ 0.7837,  2.4401, -0.8553], device='cuda:0')
tensor([ 1.1362,  2.9114, -0.9633], device='cuda:0')
tensor([ 1.2109,  2.0777, -0.9531], device='cuda:0')
