## A notebook containing the training logic for a robot to lift a cube using the Distributional MPO (DMPO) algorithm

In [20]:
# Include all the imports here
from typing import Dict, Sequence

from absl import app
from absl import flags
import acme
from acme import specs
from acme import types
from acme import wrappers
from acme.agents.tf import dmpo
from acme.tf import networks
from acme.tf import utils as tf2_utils

import numpy as np
import sonnet as snt

In [5]:
import robosuite as suite
from robosuite.wrappers import GymWrapper
from robosuite.controllers import load_controller_config

In [11]:
# Prepare the environment
# Settings that are expanded as keyword arguments into robosuite's `suite.make`
# when the environment is built.
env_config = dict(
    control_freq=20,
    env_name="Lift",
    hard_reset=False,
    horizon=500,
    ignore_done=False,
    reward_scale=1.0,
    camera_names="frontview",
    robots=["Panda"],
)
# Load robosuite's default configuration for the "OSC_POSITION" controller; it is
# later passed to `suite.make` as `controller_configs`.
controller_config = load_controller_config(default_controller="OSC_POSITION")

# Observation keys handed to robosuite's GymWrapper: the object state plus one
# proprioceptive-state entry per configured robot. The robot count is derived
# from env_config["robots"] instead of being hard-coded, so adding a robot to
# the config also adds its observations.
robots = env_config["robots"]
# A bare string is treated as a single robot name rather than a sequence.
num_robots = 1 if isinstance(robots, str) else len(robots)

keys = ["object-state"]
for idx in range(num_robots):
    keys.append(f"robot{idx}_proprio-state")

In [14]:
def make_environment(env_config, controller_config, keys):
    """Builds the robosuite task and wraps it for use with acme.

    The robosuite environment is created with on-screen and off-screen
    rendering disabled, camera observations disabled and reward shaping
    enabled. It is then wrapped, in order, by robosuite's `GymWrapper`,
    acme's `GymWrapper` and acme's `SinglePrecisionWrapper`.

    Args:
        env_config: Mapping of keyword arguments expanded into `suite.make`.
        controller_config: Value passed to `suite.make` as `controller_configs`.
        keys: Observation keys passed to robosuite's `GymWrapper`.

    Returns:
        A `(env, spec)` tuple: the fully wrapped environment and the
        environment spec produced by `specs.make_environment_spec(env)`.
    """
    robosuite_env = suite.make(
        **env_config,
        has_renderer=False,
        has_offscreen_renderer=False,
        use_camera_obs=False,
        reward_shaping=True,
        controller_configs=controller_config,
    )

    # robosuite -> gym interface -> acme interface -> single precision.
    gym_env = GymWrapper(robosuite_env, keys=keys)
    acme_env = wrappers.SinglePrecisionWrapper(
        wrappers.gym_wrapper.GymWrapper(gym_env)
    )

    return acme_env, specs.make_environment_spec(acme_env)

In [15]:
# Build the wrapped environment and its spec from the configuration above.
env, spec = make_environment(
    env_config=env_config,
    controller_config=controller_config,
    keys=keys,
)

In [17]:
# Prepare the agent

def make_networks(
    action_spec: specs.BoundedArray,
    policy_layer_sizes: Sequence[int] = (256, 256, 256),
    critic_layer_sizes: Sequence[int] = (512, 512, 256),
    vmin: float = -150.,
    vmax: float = 150.,
    num_atoms: int = 51,
) -> Dict[str, types.TensorTransformation]:
    """Builds the policy, critic and observation networks for the agent.

    Args:
        action_spec: Spec of the action space; its shape sets the size of the
            policy head and it is used to clip actions fed to the critic.
        policy_layer_sizes: Layer sizes of the policy `LayerNormMLP`.
        critic_layer_sizes: Layer sizes of the critic `LayerNormMLP`.
        vmin: First argument of the critic's `DiscreteValuedHead`.
        vmax: Second argument of the critic's `DiscreteValuedHead`.
        num_atoms: Third argument of the critic's `DiscreteValuedHead`.

    Returns:
        A dict with the keys 'policy', 'critic' and 'observation'.
    """
    # Flattened action size: the product of all action-shape dimensions.
    action_size = np.prod(action_spec.shape, dtype=int)

    # The observation "network" is a plain function with no state of its own.
    observation_network = tf2_utils.batch_concat

    # Policy: layer-normalised MLP torso followed by a diagonal-Gaussian head.
    policy_torso = networks.LayerNormMLP(policy_layer_sizes)
    policy_head = networks.MultivariateNormalDiagHead(action_size)
    policy_network = snt.Sequential([policy_torso, policy_head])

    # Critic: the multiplexer combines observations with spec-clipped actions
    # and feeds them to an MLP; a discrete-valued head sits on top.
    critic_network = snt.Sequential([
        networks.CriticMultiplexer(
            critic_network=networks.LayerNormMLP(critic_layer_sizes),
            action_network=networks.ClipToSpec(action_spec),
        ),
        networks.DiscreteValuedHead(vmin, vmax, num_atoms),
    ])

    return dict(
        policy=policy_network,
        critic=critic_network,
        observation=observation_network,
    )

In [18]:
# Instantiate the networks with default sizes for this environment's action spec.
agent_networks = make_networks(action_spec=spec.actions)

In [22]:
# Construct the Distributional MPO agent from the environment spec and the
# three networks built by `make_networks`.
agent = dmpo.DistributionalMPO(
    environment_spec=spec,
    policy_network=agent_networks['policy'],
    critic_network=agent_networks['critic'],
    observation_network=agent_networks['observation'],  # pytype: disable=wrong-arg-types
)

Instructions for updating:
`scale_identity_multiplier` is deprecated; please combine it into `scale_diag` directly instead.


Instructions for updating:
`scale_identity_multiplier` is deprecated; please combine it into `scale_diag` directly instead.


In [23]:
# Start the training process: pair the environment with the agent in an acme
# environment loop.
loop = acme.EnvironmentLoop(environment=env, actor=agent)

In [24]:
# Number of episodes requested from `loop.run` in the next cell.
num_episodes: int = 100

In [25]:
# Run the environment loop for `num_episodes` episodes.
# NOTE: the output recorded below ends in KeyboardInterrupt, i.e. this run was
# stopped manually before it finished.
loop.run(num_episodes=num_episodes)

Instructions for updating:
Please pass an integer value for `reinterpreted_batch_ndims`. The current behavior corresponds to `reinterpreted_batch_ndims=tf.size(distribution.batch_shape_tensor()) - 1`.


Instructions for updating:
Please pass an integer value for `reinterpreted_batch_ndims`. The current behavior corresponds to `reinterpreted_batch_ndims=tf.size(distribution.batch_shape_tensor()) - 1`.


INFO:tensorflow:Assets written to: /home/mohan/acme/05b572d2-c46c-11eb-9e00-d7ad18c91467/snapshots/policy/assets


INFO:tensorflow:Assets written to: /home/mohan/acme/05b572d2-c46c-11eb-9e00-d7ad18c91467/snapshots/policy/assets


KeyboardInterrupt: 