In [None]:
from IPython.display import HTML
import random

def hide_toggle(for_next=False):
    """Return an HTML snippet holding a link that toggles a code cell's input.

    The snippet defines a uniquely named JavaScript function and an anchor
    that invokes it. The function toggles the ``div.input`` element of the
    cell matched by a jQuery selector built below.

    Args:
        for_next: When false (default) the link controls the cell matched by
            the "selected" selector itself. When true it controls the cell
            after that one, and the matched cell's own input is hidden
            as soon as the script runs.

    Returns:
        An ``IPython.display.HTML`` object wrapping the script and the link.
    """
    current_selector = """$('div.cell.code_cell.rendered.selected')"""

    if for_next:
        # Drive the following cell and hide this helper cell's own input for good.
        selector = current_selector + '.next()'
        link_label = 'Toggle show/hide' + ' next cell'
        hide_self_js = current_selector + '.find("div.input").hide();'
    else:
        selector = current_selector
        link_label = 'Toggle show/hide'
        hide_self_js = ''

    # A random suffix keeps the generated JS function names from colliding
    # when several toggle links are rendered in the same page.
    unique_name = 'code_toggle_{}'.format(random.randint(1, 2**64))

    template = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """
    rendered = template.format(
        f_name=unique_name,
        cell_selector=selector,
        js_hide_current=hide_self_js,
        toggle_text=link_label,
    )
    return HTML(rendered)
hide_toggle()

#### All imports

In [None]:
import logging
import functools
import sys

import gym
import gym.wrappers
import numpy as np
import torch
from torch import distributions, nn

import pfrl
from pfrl import experiments, replay_buffers, utils
from pfrl.nn.lmbda import Lambda

from distutils.version import LooseVersion

In [None]:
import robosuite as suite
from robosuite.wrappers import GymWrapper

from robosuite.controllers import load_controller_config, ALL_CONTROLLERS

#### Configuration

In [None]:
class Args(dict):
    """Experiment configuration usable both as attributes and as a mapping.

    Attribute reads and writes are routed to the underlying dict items, so
    ``args.seed`` and ``args["seed"]`` always agree. Previously the values
    were stored as plain instance attributes, which left the dict itself
    empty; anything that serialized the object as a mapping (for example
    ``json.dumps(args)``) therefore recorded ``{}`` instead of the settings.
    """

    def __init__(self):
        super().__init__()
        self.outdir = "results4"
        self.num_envs = 1
        self.seed = 0
        self.gpu = 0
        self.load = "" # directory to load agent from
        self.steps = 2000*2500 # total number of timesteps to train the agent.
        self.eval_n_runs = 2500 # Number of episodes run for each evaluation.
        self.eval_interval = 25000 # Interval in timesteps between evaluations
        self.replay_start_size = 3300 # Minimum replay buffer size before performing gradient updates.
        self.batch_size = 128 # Minibatch size
        self.update_interval = 2500
        self.n_times_update = 1000
        self.render = False
        self.demo = False
        self.load_pretrained = False
        self.pretrained_type = "best"
        self.monitor = False
        self.log_interval = 20000 # Interval in timesteps between outputting log messages during training
        self.log_level = logging.INFO
        self.policy_output_scale = 1.0

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, so dict methods
        # such as ``items`` or ``get`` keep working unchanged.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __setattr__(self, name, value):
        # Store settings as dict items so the mapping view stays in sync.
        self[name] = value

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

#### Utilities

##### Environment utils

In [None]:
# Keyword arguments forwarded to the robosuite environment factory.
env_config = dict(
    control_freq=20,
    env_name="Lift",
    hard_reset=False,
    horizon=500,
    ignore_done=True,
    reward_scale=1.0,
    robots=["Sawyer"],
)

# Load the default robosuite controller configuration named "OSC_POSE".
controller_config = load_controller_config(default_controller="OSC_POSE")

In [None]:
# Print the loaded controller configuration for inspection.
print(controller_config)

In [None]:
### Environment utils

def make_env(process_idx, test):
    """Create one robosuite environment wrapped for use with PFRL.

    Relies on the module-level ``env_config``, ``controller_config``,
    ``process_seeds`` and ``args`` objects being defined before it is called.

    Args:
        process_idx: Index into ``process_seeds`` selecting this env's seed.
        test: When true, the seed is mirrored (``2**32 - 1 - seed``) so that
            evaluation environments never share a seed with training ones.

    Returns:
        The environment after float32 observation casting, action-space
        normalization and the optional monitor / render wrappers.
    """
    env = GymWrapper(
        suite.make(
            **env_config,
            has_renderer=False,
            has_offscreen_renderer=False,
            use_object_obs=True,
            use_camera_obs=False,
            reward_shaping=True,
            controller_configs=controller_config,
        )
    )
    # Unwrap a TimeLimit wrapper only if one is actually present. The
    # environment built above is a robosuite GymWrapper, not a
    # gym.wrappers.TimeLimit, so asserting on the type would always fail and
    # unwrapping unconditionally would discard the Gym adapter itself.
    if isinstance(env, gym.wrappers.TimeLimit):
        env = env.env
    # Use different random seeds for train and test envs
    process_seed = int(process_seeds[process_idx])
    env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
    env.seed(env_seed)
    # Cast observations to float32 because our model uses float32
    env = pfrl.wrappers.CastObservationToFloat32(env)
    # Normalize action space to [-1, 1]^n
    env = pfrl.wrappers.NormalizeActionSpace(env)
    if args.monitor:
        env = gym.wrappers.Monitor(env, args.outdir)
    if args.render:
        env = pfrl.wrappers.Render(env)
    return env

def make_batch_env(test):
    """Build a serial vector environment with ``args.num_envs`` members.

    Each member is created by ``make_env`` with its own process index, so it
    picks up its own entry of ``process_seeds``. With ``args.num_envs == 1``
    this is exactly the single-environment batch built before.

    Args:
        test: Forwarded to ``make_env`` to select train or test seeding.

    Returns:
        A ``pfrl.envs.SerialVectorEnv`` wrapping the created environments.
    """
    return pfrl.envs.SerialVectorEnv(
        [make_env(idx, test) for idx in range(args.num_envs)]
    )
hide_toggle()

##### Autodiff utils

In [None]:
### Autodiff package utils

def squashed_diagonal_gaussian_head(x):
    """Turn raw network outputs into a tanh-squashed diagonal Gaussian.

    The last axis of ``x`` must have size ``action_size * 2``: the first half
    is the mean and the second half the log standard deviation of each action
    dimension. The split is made along the last axis, the same axis the
    size check and the ``Independent`` wrapper operate on, so inputs with any
    number of leading batch axes are handled.

    Args:
        x: Tensor whose last axis has size ``action_size * 2``.

    Returns:
        A ``TransformedDistribution`` of an independent Normal passed through
        a ``TanhTransform``.

    Raises:
        AssertionError: If the last axis of ``x`` has the wrong size.
    """
    assert x.shape[-1] == action_size * 2
    # Split on the last axis to match the assertion above; splitting on axis 1
    # fails for 1-D input and cuts the wrong axis for 3-D input.
    mean, log_scale = torch.chunk(x, 2, dim=-1)
    # Bound the log standard deviation before exponentiating.
    log_scale = torch.clamp(log_scale, -20.0, 2.0)
    var = torch.exp(log_scale * 2)
    base_distribution = distributions.Independent(
        distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1
    )
    # cache_size=1 is required for numerical stability
    return distributions.transformed_distribution.TransformedDistribution(
        base_distribution, [distributions.transforms.TanhTransform(cache_size=1)]
    )

def make_q_func_with_optimizer():
    """Create one Q-function network together with its Adam optimizer.

    The network concatenates observation and action, then applies two hidden
    layers of 256 ReLU units and a scalar output layer. The weights of all
    three linear layers are re-initialized with Xavier-uniform.

    Returns:
        A ``(q_func, q_func_optimizer)`` tuple.
    """
    hidden_units = 256
    layers = [
        pfrl.nn.ConcatObsAndAction(),
        nn.Linear(obs_size + action_size, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
    ]
    q_func = nn.Sequential(*layers)

    # Positions of the linear layers inside the Sequential, in order.
    for linear_idx in (1, 3, 5):
        nn.init.xavier_uniform_(q_func[linear_idx].weight)

    return q_func, torch.optim.Adam(q_func.parameters(), lr=5e-4)

def burnin_action_func():
    """Select random actions until model is updated one or more times.

    Returns:
        A float32 array drawn uniformly between the bounds of the
        module-level ``action_space``.
    """
    lower, upper = action_space.low, action_space.high
    random_action = np.random.uniform(lower, upper)
    return random_action.astype(np.float32)

hide_toggle()

#### Setup

In [None]:
args = Args()

logging.basicConfig(level=args.log_level, stream=sys.stdout, format='')
# NOTE(review): the directory returned here is not referenced by the cells
# visible in this notebook (they pass args.outdir instead) — confirm intended.
outdir = experiments.prepare_output_dir(args, args.outdir, None)

# Set a random seed used in PFRL
utils.set_random_seed(args.seed)
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs

# Throwaway environment used only to read the observation/action spaces.
sample_env = make_env(process_idx=0, test=False)
# Episode length cap, read from the environment configuration so it cannot
# drift away from the horizon the environment is actually built with.
timestep_limit = env_config["horizon"]
obs_space = sample_env.observation_space
action_space = sample_env.action_space
print("Observation space:", obs_space)
print("Action space:", action_space)

obs_size = obs_space.low.size
action_size = action_space.low.size
hide_toggle()

In [None]:
# Print the per-episode step cap passed to training and evaluation as max_episode_len.
print(timestep_limit)

In [None]:
# Fail early when the installed PyTorch is older than the minimum this notebook supports.
if LooseVersion("1.5.0") > LooseVersion(torch.__version__):
    raise Exception("This script requires a PyTorch version >= 1.5.0")

#### Build Networks

In [None]:
# Policy network: two hidden layers of 256 ReLU units, followed by a linear
# layer producing mean and log-scale for every action dimension, which the
# final head turns into a squashed Gaussian distribution.
policy = nn.Sequential(
    nn.Linear(obs_size, 256),
    nn.ReLU(),
    nn.Linear(256, 256),
    nn.ReLU(),
    nn.Linear(256, action_size * 2),
    Lambda(squashed_diagonal_gaussian_head),
)

# Xavier-uniform initialization of the three linear layers; the output layer
# uses the configurable gain.
nn.init.xavier_uniform_(policy[0].weight)
nn.init.xavier_uniform_(policy[2].weight)
nn.init.xavier_uniform_(policy[4].weight, gain=args.policy_output_scale)

policy_optimizer = torch.optim.Adam(policy.parameters(), lr=5e-4)

In [None]:
# Two independently initialized Q-functions, each with its own optimizer.
(q_func1, q_func1_optimizer), (q_func2, q_func2_optimizer) = (
    make_q_func_with_optimizer(),
    make_q_func_with_optimizer(),
)

# Replay buffer holding up to one million transitions.
rbuf = replay_buffers.ReplayBuffer(1000000)

#### Train agent

In [None]:
# Assemble the Soft Actor-Critic agent from the policy, the two Q-functions,
# their optimizers and the replay buffer created in the cells above.
# NOTE(review): `n_times_update` is passed as a constructor keyword — confirm
# that the installed pfrl version's SoftActorCritic accepts it.
agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=0.99,
        replay_start_size=args.replay_start_size,
        update_interval=args.update_interval,
        n_times_update=args.n_times_update,
        gpu=args.gpu,  # Args sets this to 0 — presumably a CUDA device index; verify
        minibatch_size=args.batch_size,
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,  # negative of the action dimensionality
        temperature_optimizer_lr=3e-4,
    )

In [None]:
# Load the saved "best" agent from the configured output directory. The path
# is derived from args.outdir so it follows the configuration instead of
# repeating the directory name; with the default outdir it is "./results4/best".
# This cell needs that directory to exist already.
args.load = "./{}/best".format(args.outdir)
agent.load(args.load)

In [None]:
args.eval_n_runs = 10
# Hold a reference to the evaluation environment so it can be closed once the
# evaluation is over; creating it inline left it open after the call returned.
eval_env = make_batch_env(test=True)
try:
    eval_stats = experiments.eval_performance(
        env=eval_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        max_episode_len=timestep_limit,
    )
finally:
    # Release the environment even when evaluation raises.
    eval_env.close()
print(
    "n_runs: {} mean: {} median: {} stdev {}".format(
        args.eval_n_runs,
        eval_stats["mean"],
        eval_stats["median"],
        eval_stats["stdev"],
    )
)

In [None]:
# Train the agent, evaluating periodically. The training environment is
# created before the evaluation one, as keyword arguments are evaluated
# left to right.
experiments.train_agent_batch_with_evaluation(
    agent=agent,
    env=make_batch_env(test=False),
    eval_env=make_batch_env(test=True),
    # Training budget and per-episode cap.
    steps=args.steps,
    max_episode_len=timestep_limit,
    # Evaluation schedule: episode-based, so no step budget is given.
    eval_interval=args.eval_interval,
    eval_n_episodes=args.eval_n_runs,
    eval_n_steps=None,
    # Logging and output location.
    log_interval=args.log_interval,
    outdir=args.outdir,
)

In [None]:
# Set the number of evaluation episodes to one.
args.eval_n_runs = 1

In [None]:
# Hold a reference to the evaluation environment so it can be closed once the
# evaluation is over; creating it inline left it open after the call returned.
eval_env = make_batch_env(test=True)
try:
    eval_stats = experiments.eval_performance(
        env=eval_env,
        agent=agent,
        n_steps=None,
        n_episodes=args.eval_n_runs,
        max_episode_len=timestep_limit,
    )
finally:
    # Release the environment even when evaluation raises.
    eval_env.close()
print(
    "n_runs: {} mean: {} median: {} stdev {}".format(
        args.eval_n_runs,
        eval_stats["mean"],
        eval_stats["median"],
        eval_stats["stdev"],
    )
)

In [None]:
# Turn rendering on: make_env adds the Render wrapper to environments it
# creates while this flag is true.
args.render = True