# Reinforcement Learning Final Project: Exploration Strategies for Robotic Manipulation

### Import helper libraries

In [9]:
### Important trick to keep JupyterLab from crashing
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import numpy as np
import gym
import sys
import pickle
import traceback
from arguments import get_args
import random
import torch
import tqdm
from datetime import datetime
from mpi4py import MPI
from mpi_utils.mpi_utils import sync_networks, sync_grads
from rl_modules.replay_buffer import replay_buffer
from rl_modules.models import actor, critic, noisy_actor, noisy_critic
from mpi_utils.normalizer import normalizer
from her_modules.her import her_sampler
import copy
import math
from typing import Dict, List, Tuple, Callable
from collections import namedtuple
from copy import deepcopy
import ipywidgets as widgets
import matplotlib.pyplot as plt
import more_itertools as mitt
import pygame
import glfw
from math import floor
from tiling import IHT
from pathlib import Path
from schedule import ExponentialSchedule, OUSchedule

# Plot defaults: ggplot theme on a very wide, short canvas.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [36, 4]
# Absolute path of the current working directory, as a plain string.
fullPath = str(Path('.').absolute())

# Environment variables set for the rest of the session.
os.environ.update({
    'OMP_NUM_THREADS': '1',
    'MKL_NUM_THREADS': '1',
    'IN_MPI': '1',
})

### Create environments and place to store model weights
# One entry per task: a freshly built Gym environment plus an empty slot
# for weights to be filled in later.
envs = {
    task: {'model': gym.make(env_id), 'weights': None}
    for task, env_id in (
        ('push', 'FetchPush-v1'),
        ('reach', 'FetchReach-v1'),
        ('slide', 'FetchSlide-v1'),
        ('pick', 'FetchPickAndPlace-v1'),
    )
}

### Graphically Render Policy

Rendering uses the MuJoCo backend.

In [10]:
def render(env, policy=None):
    """Graphically render one episode using the given policy.

    The episode runs until the environment reports ``done`` or 3000 steps
    have been taken, whichever comes first.  The environment is closed and
    GLFW is terminated even if the policy or the environment raises.

    :param env:  Gym environment
    :param policy:  function which maps state to action.  If None, the random
                    policy is used.
    """
    if policy is None:

        def policy(state):
            return env.action_space.sample()

    glfw.init()
    try:
        state = env.reset()
        env.render()
        for _step in range(3000):
            action = policy(state)
            state, _, done, _ = env.step(action)
            env.render()
            if done:
                break
    finally:
        # Always release the viewer/window resources, including on errors.
        try:
            env.close()
        finally:
            glfw.terminate()

### Helper Functions

In [11]:
def preproc_og(o, g):
    """Clip observations and goals element-wise to the range [-200, 200].

    :param o:  observation array (or array-like)
    :param g:  goal array (or array-like)
    :return:  tuple ``(clipped_o, clipped_g)``
    """
    lower, upper = -200, 200
    clipped_obs = np.clip(o, lower, upper)
    clipped_goal = np.clip(g, lower, upper)
    return clipped_obs, clipped_goal

# pre_process the inputs
def preproc_inputs(obs, g, o_norm=None, g_norm=None):
    """Build the network input tensor from an observation and a goal.

    The previous version of this module-level helper referenced
    ``self.o_norm`` / ``self.g_norm`` although it has no ``self``, so every
    call raised ``NameError``.  The normalizers are now explicit, optional
    arguments.

    :param obs:  1-D observation array
    :param g:  1-D goal array
    :param o_norm:  object exposing ``normalize(array)`` applied to ``obs``.
                    If None, ``obs`` is used unnormalized.
    :param g_norm:  object exposing ``normalize(array)`` applied to ``g``.
                    If None, ``g`` is used unnormalized.
    :return:  float32 tensor of shape ``(1, len(obs) + len(g))``
    """
    obs_norm = obs if o_norm is None else o_norm.normalize(obs)
    goal_norm = g if g_norm is None else g_norm.normalize(g)
    # concatenate observation and goal, then add a leading batch dimension
    inputs = np.concatenate([obs_norm, goal_norm])
    inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
    return inputs

### Define Structure for Deep Deterministic Policy Gradient (DDPG) Agent

In [12]:
"""
ddpg with HER (MPI-version)

"""
class ddpg_agent:
    """DDPG agent with HER replay whose rollout exploration is selectable.

    The exploration rule applied to every rollout action is looked up in
    ``self.strategies`` using ``args.strategy``; each rule has the signature
    ``(action, state, step) -> action``.
    """

    def __init__(self, args, env, env_params):
        """Build networks, optimizers, replay buffer, normalizers and
        exploration state.

        :param args:  parsed arguments; attributes read in this class include
                      ``noisy``, ``cuda``, ``lr_actor``, ``lr_critic``,
                      ``replay_strategy``, ``replay_k``, ``buffer_size``,
                      ``clip_range``, ``epsilon``, ``n_cycles`` and ``strategy``
        :param env:  Gym environment exposing ``compute_reward``
        :param env_params:  dict with keys ``obs``, ``goal``, ``action``,
                            ``action_max`` and ``max_timesteps``
        """
        self.args = args
        self.env = env
        self.env_params = env_params
        # create the networks (noisy variants when args.noisy is truthy)
        self.actor_network = noisy_actor(env_params) if bool(self.args.noisy) else actor(env_params)
        self.critic_network = noisy_critic(env_params) if bool(self.args.noisy) else critic(env_params)
        # sync the networks across the cpus
        sync_networks(self.actor_network)
        sync_networks(self.critic_network)
        # build up the target networks
        # NOTE(review): targets are always the plain actor/critic, even when
        # the online networks are the noisy variants; the load_state_dict
        # calls below assume both variants share parameter names — confirm.
        self.actor_target_network = actor(env_params)
        self.critic_target_network = critic(env_params)
        # load the weights into the target networks
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        self.critic_target_network.load_state_dict(self.critic_network.state_dict())
        # if use gpu
        if self.args.cuda:
            self.actor_network.cuda()
            self.critic_network.cuda()
            self.actor_target_network.cuda()
            self.critic_target_network.cuda()
        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=self.args.lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=self.args.lr_critic)
        # her sampler
        self.her_module = her_sampler(self.args.replay_strategy, self.args.replay_k, self.env.compute_reward)
        # create the replay buffer
        self.buffer = replay_buffer(self.env_params, self.args.buffer_size, self.her_module.sample_her_transitions)
        # create the normalizers
        self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.args.clip_range)
        self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.args.clip_range)
        ########### Setup exploration parameters ########
        # Fixed epsilon for plain e-greedy
        self.epsilon = self.args.epsilon
        # Schedules for decaying e-greedy and for the OU process
        self.e_schedule = ExponentialSchedule(1.0, 0.01, self.args.n_cycles)
        self.ou_schedule = OUSchedule(self.args.n_cycles, 4)
        # Tile-coding store for the count based methods
        self.num_tiles = 10
        self.max_size = 1000
        self.iht = IHT(self.max_size)
        self.weights_fq = np.zeros(self.max_size)
        self.weights_ucb = np.zeros(self.max_size)
        # Strategy name -> bound method
        self.strategies = {
            'no_explore': self.no_explore,
            'standard': self.standard_explore,
            'e_greedy': self.e_greedy,
            'e_greedy_decay': self.e_greedy_decay,
            'ucb': self.ucb,
            'ou': self.ornstein_uhlenbeck,
            'count': self.frequency_explore,
        }
        # Select the exploration strategy (KeyError on an unknown name)
        self.explore_strategy = self.strategies[self.args.strategy]

    @staticmethod
    def _snapshot_state_dict(network):
        """Return an independent CPU copy of ``network.state_dict()``.

        ``state_dict()`` hands back references to the live parameters, so
        storing it directly would make every stored "snapshot" track the
        latest weights.
        """
        snapshot = deepcopy(network.state_dict())
        for name in list(snapshot):
            snapshot[name] = snapshot[name].cpu()
        return snapshot

    @staticmethod
    def _state_to_numpy(state):
        """Flatten a state tensor into a 1-D numpy array (CPU or CUDA input)."""
        return state.detach().cpu().numpy().flatten()

    def learn(self):
        """Train the networks.

        For each epoch: reset the OU schedule and the count stores, run
        ``n_cycles`` cycles of rollout collection followed by ``n_batches``
        gradient updates and one soft target update, then evaluate.

        :return:  tuple ``(models, o_norms, g_norms, accuracies)`` with one
                  entry per epoch.  ``models`` holds independent CPU copies
                  of the actor weights; ``accuracies`` holds the evaluation
                  success rates.
        """
        pbar = tqdm.notebook.trange(self.args.n_epochs)
        models = []
        accuracies = []
        o_norms = []
        g_norms = []
        for epoch in pbar:
            pbar.set_description(f'Executing epoch: {epoch}/{self.args.n_epochs}')
            # Reset process noise
            self.ou_schedule.reset()
            # Reset count based method stores
            self.iht = IHT(self.max_size)
            self.weights_fq = np.zeros(self.max_size)
            self.weights_ucb = np.zeros(self.max_size)
            for cycle_num in range(self.args.n_cycles):
                mb_obs, mb_ag, mb_g, mb_actions = [], [], [], []
                for _ in range(self.args.num_rollouts_per_mpi):
                    # reset the rollouts
                    ep_obs, ep_ag, ep_g, ep_actions = [], [], [], []
                    # reset the environment
                    observation = self.env.reset()
                    obs = observation['observation']
                    ag = observation['achieved_goal']
                    g = observation['desired_goal']
                    # start to collect samples
                    for t in range(self.env_params['max_timesteps']):
                        with torch.no_grad():
                            input_tensor = self._preproc_inputs(obs, g)
                            pi = self.actor_network(input_tensor)
                            action = self._select_actions(pi, input_tensor, cycle_num)
                        # feed the actions into the environment
                        observation_new, _, _, _ = self.env.step(action)
                        obs_new = observation_new['observation']
                        ag_new = observation_new['achieved_goal']
                        # append rollouts
                        ep_obs.append(obs.copy())
                        ep_ag.append(ag.copy())
                        ep_g.append(g.copy())
                        ep_actions.append(action.copy())
                        # re-assign the observation
                        obs = obs_new
                        ag = ag_new
                    # the final observation/achieved goal close the episode,
                    # so obs/ag hold one more entry than g/actions
                    ep_obs.append(obs.copy())
                    ep_ag.append(ag.copy())
                    mb_obs.append(ep_obs)
                    mb_ag.append(ep_ag)
                    mb_g.append(ep_g)
                    mb_actions.append(ep_actions)
                # convert them into arrays
                mb_obs = np.array(mb_obs)
                mb_ag = np.array(mb_ag)
                mb_g = np.array(mb_g)
                mb_actions = np.array(mb_actions)
                # store the episodes
                self.buffer.store_episode([mb_obs, mb_ag, mb_g, mb_actions])
                self._update_normalizer([mb_obs, mb_ag, mb_g, mb_actions])
                for _ in range(self.args.n_batches):
                    # train the network
                    self._update_network()
                # soft update
                self._soft_update_target_network(self.actor_target_network, self.actor_network)
                self._soft_update_target_network(self.critic_target_network, self.critic_network)
            # start to do the evaluation
            success_rate = self._eval_agent()
            if MPI.COMM_WORLD.Get_rank() == 0:
                print('[{}] epoch is: {}, eval success rate is: {:.3f}'.format(datetime.now(), epoch, success_rate))
            # Store a copy: the raw state_dict aliases the live parameters.
            models.append(self._snapshot_state_dict(self.actor_network))
            # NOTE(review): the same normalizer objects are appended every
            # epoch, so all entries refer to the final statistics.  They are
            # not copied here because the normalizer's internals are not
            # visible in this file — confirm whether a copy is safe.
            o_norms.append(self.o_norm)
            g_norms.append(self.g_norm)
            accuracies.append(success_rate)
        return models, o_norms, g_norms, accuracies

    def get_active_tiles(self, state, action):
        """Return the tile indices for the concatenated (state, action) vector."""
        active_tiles = self.tiles(self.iht, self.num_tiles, np.concatenate((state, action)))
        return active_tiles

    def hash_coords(self, coordinates, m, read_only=False):
        """Map tile coordinates to an index.

        :param coordinates:  sequence of tile coordinates
        :param m:  an int table size (hash modulo ``m``), an ``IHT`` (its
                   ``get_index`` is used), or None (coordinates are returned
                   unchanged)
        :param read_only:  forwarded to ``IHT.get_index``
        :return:  the index, the raw coordinates when ``m`` is None, or None
                  for any other ``m``
        """
        # The plain-int case is checked first so that hashing into a
        # fixed-size table does not depend on the IHT class.
        if isinstance(m, int): return hash(tuple(coordinates)) % m
        if isinstance(m, IHT): return m.get_index(tuple(coordinates), read_only)
        if m is None: return coordinates

    def tiles(self, iht_or_size, num_tilings, floats, ints=None, read_only=False):
        """returns num-tilings tile indices corresponding to the floats and ints"""
        if ints is None:
            ints = []
        qfloats = [floor(f * num_tilings) for f in floats]
        tiles = []
        for tiling in range(num_tilings):
            tilingX2 = tiling * 2
            coords = [tiling]
            # each tiling is offset; the offset grows per input dimension
            b = tiling
            for q in qfloats:
                coords.append((q + b) // num_tilings)
                b += tilingX2
            coords.extend(ints)
            tiles.append(self.hash_coords(coords, iht_or_size, read_only))
        return tiles

    # no exploration, returns the action
    def no_explore(self, action, state, step):
        """Return the policy action unchanged."""
        return action

    # run basic e-greedy exploration
    def e_greedy(self, action, state, step):
        """With probability ``self.epsilon`` replace the action by a uniform
        random one in ``[-action_max, action_max]``."""
        if np.random.random() < self.epsilon:
            # Select a random action
            action = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'],
                                       size=self.env_params['action'])
        return action

    # Use decaying e-greedy strategy
    def e_greedy_decay(self, action, state, step):
        """Like ``e_greedy`` but with epsilon taken from ``self.e_schedule``
        at ``step`` (``learn`` passes the cycle index)."""
        if np.random.random() < self.e_schedule.value(step):
            # Select a random action
            action = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'],
                                       size=self.env_params['action'])
        return action

    # Use upper confidence bound
    def ucb(self, action, state, step):
        """Add zero-mean Gaussian noise whose scale comes from tile visit counts.

        :param action:  numpy action array
        :param state:  torch tensor (CPU or CUDA) holding the network input
        :param step:  unused
        :return:  ``action`` plus noise of the same shape
        """
        # State and action are jittered with unit-variance noise before tiling.
        active = self.get_active_tiles(np.random.normal(self._state_to_numpy(state)), np.random.normal(action))
        self.weights_ucb[active] += 1
        # NOTE(review): this is log(active counts) / total count; the textbook
        # UCB bonus is sqrt(log(total) / count) — confirm this is intended.
        additive = np.sqrt(np.log(np.sum(self.weights_ucb[active])) / np.sum(self.weights_ucb))
        # Noise matches the action's shape instead of a hard-coded 4.
        return action + np.random.normal(loc=0.0, scale=additive, size=np.shape(action))

    # Select action as a function of how frequently they have occured
    def frequency_explore(self, action, state, step):
        """Add zero-mean Gaussian noise with scale ``beta / sqrt(visit count)``.

        :param action:  numpy action array
        :param state:  torch tensor (CPU or CUDA) holding the network input
        :param step:  unused
        :return:  ``action`` plus noise of the same shape
        """
        beta = 0.5
        active = self.get_active_tiles(np.random.normal(self._state_to_numpy(state)), np.random.normal(action))
        self.weights_fq[active] += 1
        additive = beta / np.sqrt(np.sum(self.weights_fq[active]))
        return action + np.random.normal(loc=0.0, scale=additive, size=np.shape(action))

    # White noise based exploration
    def standard_explore(self, action, state, step):
        """With probability ``args.random_eps`` replace the action by a
        uniform random one; otherwise return it unchanged."""
        # random actions...
        random_actions = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'],
                                           size=self.env_params['action'])
        # a single Bernoulli draw decides whether the random action is used
        return action + np.random.binomial(1, self.args.random_eps, 1)[0] * (random_actions - action)

    # Explore using OU process
    def ornstein_uhlenbeck(self, action, state, step):
        """Add the value of ``self.ou_schedule`` at ``step`` to the action."""
        return action + self.ou_schedule.value(step)

    # pre_process the inputs
    def _preproc_inputs(self, obs, g):
        """Normalize ``obs`` and ``g``, concatenate them and return a float32
        tensor with a leading batch dimension (on GPU when ``args.cuda``)."""
        obs_norm = self.o_norm.normalize(obs)
        g_norm = self.g_norm.normalize(g)
        # concatenate observation and goal
        inputs = np.concatenate([obs_norm, g_norm])
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        if self.args.cuda:
            inputs = inputs.cuda()
        return inputs

    # this function will choose action for the agent and do the exploration
    def _select_actions(self, pi, state, step):
        """Turn the actor output into an environment action.

        The selected exploration strategy is applied and the result is
        clipped to ``[-action_max, action_max]``.
        """
        action = pi.cpu().numpy().squeeze()
        action = self.explore_strategy(action, state, step)
        action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max'])
        return action

    # update the normalizer
    def _update_normalizer(self, episode_batch):
        """Update the observation/goal normalizers from a batch of episodes,
        using HER-resampled transitions."""
        mb_obs, mb_ag, mb_g, mb_actions = episode_batch
        mb_obs_next = mb_obs[:, 1:, :]
        mb_ag_next = mb_ag[:, 1:, :]
        # get the number of normalization transitions
        num_transitions = mb_actions.shape[1]
        # create the new buffer to store them
        buffer_temp = {'obs': mb_obs,
                       'ag': mb_ag,
                       'g': mb_g,
                       'actions': mb_actions,
                       'obs_next': mb_obs_next,
                       'ag_next': mb_ag_next,
                       }
        transitions = self.her_module.sample_her_transitions(buffer_temp, num_transitions)
        obs, g = transitions['obs'], transitions['g']
        # pre process the obs and g
        transitions['obs'], transitions['g'] = preproc_og(obs, g)
        # update
        self.o_norm.update(transitions['obs'])
        self.g_norm.update(transitions['g'])
        # recompute the stats
        self.o_norm.recompute_stats()
        self.g_norm.recompute_stats()

    # soft update
    def _soft_update_target_network(self, target, source):
        """Polyak-average ``source`` into ``target`` in place:
        ``target = (1 - polyak) * source + polyak * target``."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)

    # update the network
    def _update_network(self):
        """Sample one batch from the replay buffer and take one gradient step
        on the actor and one on the critic."""
        # sample the episodes
        transitions = self.buffer.sample(self.args.batch_size)
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = preproc_og(o_next, g)
        # normalize and build the network inputs
        obs_norm = self.o_norm.normalize(transitions['obs'])
        g_norm = self.g_norm.normalize(transitions['g'])
        inputs_norm = np.concatenate([obs_norm, g_norm], axis=1)
        obs_next_norm = self.o_norm.normalize(transitions['obs_next'])
        g_next_norm = self.g_norm.normalize(transitions['g_next'])
        inputs_next_norm = np.concatenate([obs_next_norm, g_next_norm], axis=1)
        # transfer them into tensors
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm, dtype=torch.float32)
        actions_tensor = torch.tensor(transitions['actions'], dtype=torch.float32)
        r_tensor = torch.tensor(transitions['r'], dtype=torch.float32)
        if self.args.cuda:
            inputs_norm_tensor = inputs_norm_tensor.cuda()
            inputs_next_norm_tensor = inputs_next_norm_tensor.cuda()
            actions_tensor = actions_tensor.cuda()
            r_tensor = r_tensor.cuda()
        # calculate the target Q value; no_grad already keeps it out of the graph
        with torch.no_grad():
            actions_next = self.actor_target_network(inputs_next_norm_tensor)
            q_next_value = self.critic_target_network(inputs_next_norm_tensor, actions_next)
            target_q_value = r_tensor + self.args.gamma * q_next_value
            # clip the target to [-1 / (1 - gamma), 0]
            # (presumably because rewards are non-positive — confirm)
            clip_return = 1 / (1 - self.args.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
        # the q loss
        real_q_value = self.critic_network(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss, with an L2 penalty on the scaled actions
        actions_real = self.actor_network(inputs_norm_tensor)
        actor_loss = -self.critic_network(inputs_norm_tensor, actions_real).mean()
        actor_loss += self.args.action_l2 * (actions_real / self.env_params['action_max']).pow(2).mean()
        # update the actor network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()
        # update the critic network; zero_grad also discards the gradients
        # the actor loss pushed into the critic
        self.critic_optim.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_network)
        self.critic_optim.step()

    # do the evaluation
    def _eval_agent(self):
        """Run ``n_test_rollouts`` episodes without exploration and return the
        success rate at the final timestep, averaged over MPI workers."""
        total_success_rate = []
        for _ in range(self.args.n_test_rollouts):
            per_success_rate = []
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(self.env_params['max_timesteps']):
                with torch.no_grad():
                    input_tensor = self._preproc_inputs(obs, g)
                    pi = self.actor_network(input_tensor)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, _, _, info = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                per_success_rate.append(info['is_success'])
            total_success_rate.append(per_success_rate)
        total_success_rate = np.array(total_success_rate)
        # only the last timestep of each rollout counts
        local_success_rate = np.mean(total_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate, op=MPI.SUM)
        return global_success_rate / MPI.COMM_WORLD.Get_size()


### Define environment parameters

In [13]:
def get_env_params(env):
    """Collect the dimensions and limits the agent needs from an environment.

    The environment is reset once to obtain a sample observation.

    :param env:  Gym environment with dict observations
    :return:  dict with keys ``obs``, ``goal``, ``action`` (first dimension
              of the respective shapes), ``action_max`` (first entry of
              ``action_space.high``) and ``max_timesteps``
    """
    first_obs = env.reset()
    obs_dim = first_obs['observation'].shape[0]
    goal_dim = first_obs['desired_goal'].shape[0]
    action_dim = env.action_space.shape[0]
    action_limit = env.action_space.high[0]
    return {
        'obs': obs_dim,
        'goal': goal_dim,
        'action': action_dim,
        'action_max': action_limit,
        'max_timesteps': env._max_episode_steps,
    }


### Parse arguments and run code

In [14]:
def launch(args, path=None):
    """Train ``args.n_runs`` independent agents and pickle the results.

    :param args:  parsed arguments; ``env_name``, ``strategy`` and ``n_runs``
                  are read here, the rest is consumed by ``ddpg_agent``
    :param path:  root directory for outputs.  If None, the current working
                  directory is used.
    :return:  tuple ``(models, o_norms, g_norms, accuracies)``, each a list
              with one entry per run as returned by ``ddpg_agent.learn``
    """
    if path is None:
        path = os.getcwd()
    # Create the output directory up front so the final write cannot fail
    # on a missing folder after all training has finished.
    save_dir = os.path.join(path, 'fetch_models')
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f'{args.env_name}_{args.strategy}.wts')
    print(save_path)
    # create the environment
    env = gym.make(args.env_name)
    # get the environment parameters
    env_params = get_env_params(env)
    # per-run results
    models = []
    accuracies = []
    o_norms = []
    g_norms = []
    t_runs = tqdm.notebook.trange(args.n_runs)
    for run in t_runs:
        t_runs.set_description(f'Executing run: {run}/{args.n_runs}')
        # a fresh agent per run so nothing carries over between runs
        ddpg_trainer = ddpg_agent(args, env, env_params)
        t_models, t_o_norms, t_g_norms, t_accuracies = ddpg_trainer.learn()
        models.append(t_models)
        accuracies.append(deepcopy(t_accuracies))
        o_norms.append(t_o_norms)
        g_norms.append(t_g_norms)
        # drop the trainer before the next run is built
        del ddpg_trainer
    # Save the accuracies of every run plus the last model of the last run
    # (None when there were no runs or no epochs).
    final_model = models[-1][-1] if models and models[-1] else None
    with open(save_path, 'wb') as f:
        pickle.dump([accuracies, final_model], f)
    return models, o_norms, g_norms, accuracies

### Execute various configurations

##### Run 1: No exploration noise

In [None]:
# Run 1 configuration: FetchSlide, 200 epochs x 5 runs, no exploration, GPU.
args_to_parse = (
    '--env-name FetchSlide-v1 '
    '--n-epochs=200 --n-runs=5 '
    '--strategy=no_explore --noisy=0 --cuda'
)
args = get_args(args_to_parse)
out_no_explore = launch(args, path=fullPath)

/home/nathaniel/Classes/CS5180_Reinforcement_Learning/ReinforcementRobotExploration/fetch_models/FetchSlide-v1_no_explore.wts


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-06 22:28:19.616375] epoch is: 0, eval success rate is: 0.000
[2021-12-06 22:28:35.338606] epoch is: 1, eval success rate is: 0.000
[2021-12-06 22:28:50.715284] epoch is: 2, eval success rate is: 0.000
[2021-12-06 22:29:06.162597] epoch is: 3, eval success rate is: 0.000
[2021-12-06 22:29:21.708419] epoch is: 4, eval success rate is: 0.000
[2021-12-06 22:29:37.065190] epoch is: 5, eval success rate is: 0.000
[2021-12-06 22:29:52.295867] epoch is: 6, eval success rate is: 0.000
[2021-12-06 22:30:07.774909] epoch is: 7, eval success rate is: 0.000
[2021-12-06 22:30:23.359474] epoch is: 8, eval success rate is: 0.000
[2021-12-06 22:30:39.128251] epoch is: 9, eval success rate is: 0.000
[2021-12-06 22:30:54.424761] epoch is: 10, eval success rate is: 0.000
[2021-12-06 22:31:09.934649] epoch is: 11, eval success rate is: 0.100
[2021-12-06 22:31:25.433427] epoch is: 12, eval success rate is: 0.000
[2021-12-06 22:31:40.747129] epoch is: 13, eval success rate is: 0.000
[2021-12-06 22:3

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-06 23:19:50.204250] epoch is: 0, eval success rate is: 0.000
[2021-12-06 23:20:05.766648] epoch is: 1, eval success rate is: 0.000
[2021-12-06 23:20:21.114875] epoch is: 2, eval success rate is: 0.000
[2021-12-06 23:20:36.177847] epoch is: 3, eval success rate is: 0.000
[2021-12-06 23:20:51.323199] epoch is: 4, eval success rate is: 0.000
[2021-12-06 23:21:06.587543] epoch is: 5, eval success rate is: 0.000
[2021-12-06 23:21:22.662291] epoch is: 6, eval success rate is: 0.000
[2021-12-06 23:21:38.036699] epoch is: 7, eval success rate is: 0.000
[2021-12-06 23:21:53.673603] epoch is: 8, eval success rate is: 0.000
[2021-12-06 23:22:09.311768] epoch is: 9, eval success rate is: 0.100
[2021-12-06 23:22:24.691231] epoch is: 10, eval success rate is: 0.000
[2021-12-06 23:22:39.956447] epoch is: 11, eval success rate is: 0.100
[2021-12-06 23:22:55.206828] epoch is: 12, eval success rate is: 0.000
[2021-12-06 23:23:10.516985] epoch is: 13, eval success rate is: 0.000
[2021-12-06 23:2

##### Run 2: OU Scheduling

In [7]:
# Run 2 configuration: FetchSlide, 200 epochs x 5 runs, OU-process exploration.
args_to_parse = '--env-name FetchSlide-v1 --n-epochs=200 --n-runs=5 --strategy=ou --noisy=0'
args = get_args(args_to_parse)
# Results get their own name; this cell previously assigned them to
# `out_no_explore`, overwriting the results of Run 1.
out_ou = launch(args, fullPath)

/home/nathaniel/Classes/CS5180_Reinforcement_Learning/ReinforcementRobotExploration/fetch_models/FetchSlide-v1_ou.wts


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-06 23:44:26.292203] epoch is: 0, eval success rate is: 0.000
[2021-12-06 23:44:47.162507] epoch is: 1, eval success rate is: 0.000
[2021-12-06 23:45:08.263472] epoch is: 2, eval success rate is: 0.000
[2021-12-06 23:45:29.241594] epoch is: 3, eval success rate is: 0.000
[2021-12-06 23:45:49.931334] epoch is: 4, eval success rate is: 0.000
[2021-12-06 23:46:09.660193] epoch is: 5, eval success rate is: 0.000
[2021-12-06 23:46:29.836982] epoch is: 6, eval success rate is: 0.000
[2021-12-06 23:46:51.399430] epoch is: 7, eval success rate is: 0.000
[2021-12-06 23:47:12.235415] epoch is: 8, eval success rate is: 0.000
[2021-12-06 23:47:33.557147] epoch is: 9, eval success rate is: 0.000
[2021-12-06 23:47:54.271158] epoch is: 10, eval success rate is: 0.000
[2021-12-06 23:48:15.628495] epoch is: 11, eval success rate is: 0.000
[2021-12-06 23:48:35.991740] epoch is: 12, eval success rate is: 0.000
[2021-12-06 23:48:57.327124] epoch is: 13, eval success rate is: 0.000
[2021-12-06 23:4

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 01:02:15.573705] epoch is: 0, eval success rate is: 0.000
[2021-12-07 01:02:38.235142] epoch is: 1, eval success rate is: 0.000
[2021-12-07 01:03:00.440169] epoch is: 2, eval success rate is: 0.000
[2021-12-07 01:03:21.065367] epoch is: 3, eval success rate is: 0.000
[2021-12-07 01:03:41.441737] epoch is: 4, eval success rate is: 0.000
[2021-12-07 01:04:01.912245] epoch is: 5, eval success rate is: 0.000
[2021-12-07 01:04:23.074146] epoch is: 6, eval success rate is: 0.000
[2021-12-07 01:04:44.387724] epoch is: 7, eval success rate is: 0.000
[2021-12-07 01:05:06.576104] epoch is: 8, eval success rate is: 0.000
[2021-12-07 01:05:27.540042] epoch is: 9, eval success rate is: 0.000
[2021-12-07 01:05:49.585633] epoch is: 10, eval success rate is: 0.000
[2021-12-07 01:06:10.280953] epoch is: 11, eval success rate is: 0.000
[2021-12-07 01:06:30.919316] epoch is: 12, eval success rate is: 0.000
[2021-12-07 01:06:53.749728] epoch is: 13, eval success rate is: 0.000
[2021-12-07 01:0

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 02:22:12.034131] epoch is: 0, eval success rate is: 0.000
[2021-12-07 02:22:32.712397] epoch is: 1, eval success rate is: 0.000
[2021-12-07 02:22:53.599268] epoch is: 2, eval success rate is: 0.100
[2021-12-07 02:23:15.578586] epoch is: 3, eval success rate is: 0.000
[2021-12-07 02:23:36.108270] epoch is: 4, eval success rate is: 0.000
[2021-12-07 02:23:57.468587] epoch is: 5, eval success rate is: 0.000
[2021-12-07 02:24:19.416503] epoch is: 6, eval success rate is: 0.000
[2021-12-07 02:24:40.516232] epoch is: 7, eval success rate is: 0.000
[2021-12-07 02:25:01.378180] epoch is: 8, eval success rate is: 0.000
[2021-12-07 02:25:22.771960] epoch is: 9, eval success rate is: 0.000
[2021-12-07 02:25:43.361027] epoch is: 10, eval success rate is: 0.000
[2021-12-07 02:26:04.669142] epoch is: 11, eval success rate is: 0.000
[2021-12-07 02:26:26.179681] epoch is: 12, eval success rate is: 0.000
[2021-12-07 02:26:49.569099] epoch is: 13, eval success rate is: 0.000
[2021-12-07 02:2

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 03:37:06.648285] epoch is: 0, eval success rate is: 0.000
[2021-12-07 03:37:22.548201] epoch is: 1, eval success rate is: 0.000
[2021-12-07 03:37:38.893298] epoch is: 2, eval success rate is: 0.000
[2021-12-07 03:37:55.754216] epoch is: 3, eval success rate is: 0.000
[2021-12-07 03:38:13.077678] epoch is: 4, eval success rate is: 0.100
[2021-12-07 03:38:29.963062] epoch is: 5, eval success rate is: 0.000
[2021-12-07 03:38:47.346871] epoch is: 6, eval success rate is: 0.000
[2021-12-07 03:39:03.537168] epoch is: 7, eval success rate is: 0.000
[2021-12-07 03:39:20.606455] epoch is: 8, eval success rate is: 0.000
[2021-12-07 03:39:36.715560] epoch is: 9, eval success rate is: 0.000
[2021-12-07 03:39:53.556674] epoch is: 10, eval success rate is: 0.000
[2021-12-07 03:40:10.083979] epoch is: 11, eval success rate is: 0.000
[2021-12-07 03:40:26.575132] epoch is: 12, eval success rate is: 0.000
[2021-12-07 03:40:42.623580] epoch is: 13, eval success rate is: 0.000
[2021-12-07 03:4

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 04:39:30.405388] epoch is: 0, eval success rate is: 0.000
[2021-12-07 04:39:46.409917] epoch is: 1, eval success rate is: 0.000
[2021-12-07 04:40:03.215380] epoch is: 2, eval success rate is: 0.000
[2021-12-07 04:40:19.823363] epoch is: 3, eval success rate is: 0.000
[2021-12-07 04:40:36.051073] epoch is: 4, eval success rate is: 0.000
[2021-12-07 04:40:52.323072] epoch is: 5, eval success rate is: 0.000
[2021-12-07 04:41:09.307305] epoch is: 6, eval success rate is: 0.000
[2021-12-07 04:41:25.430672] epoch is: 7, eval success rate is: 0.000
[2021-12-07 04:41:41.864525] epoch is: 8, eval success rate is: 0.000
[2021-12-07 04:41:58.243733] epoch is: 9, eval success rate is: 0.000
[2021-12-07 04:42:14.397672] epoch is: 10, eval success rate is: 0.000
[2021-12-07 04:42:30.433637] epoch is: 11, eval success rate is: 0.000
[2021-12-07 04:42:46.538952] epoch is: 12, eval success rate is: 0.000
[2021-12-07 04:43:03.524982] epoch is: 13, eval success rate is: 0.000
[2021-12-07 04:4

##### Run 3: e-greedy

In [8]:
# Run 3 configuration: FetchSlide, 200 epochs x 5 runs, fixed e-greedy.
args_to_parse = (
    '--env-name FetchSlide-v1 '
    '--n-epochs=200 --n-runs=5 '
    '--strategy=e_greedy --noisy=0'
)
args = get_args(args_to_parse)
out_e_greedy = launch(args, path=fullPath)

/home/nathaniel/Classes/CS5180_Reinforcement_Learning/ReinforcementRobotExploration/fetch_models/FetchSlide-v1_e_greedy.wts


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 05:41:26.340712] epoch is: 0, eval success rate is: 0.000
[2021-12-07 05:41:42.553921] epoch is: 1, eval success rate is: 0.000
[2021-12-07 05:41:58.748790] epoch is: 2, eval success rate is: 0.000
[2021-12-07 05:42:15.120411] epoch is: 3, eval success rate is: 0.000
[2021-12-07 05:42:31.642712] epoch is: 4, eval success rate is: 0.000
[2021-12-07 05:42:47.865011] epoch is: 5, eval success rate is: 0.000
[2021-12-07 05:43:04.244447] epoch is: 6, eval success rate is: 0.100
[2021-12-07 05:43:20.855598] epoch is: 7, eval success rate is: 0.000
[2021-12-07 05:43:37.081378] epoch is: 8, eval success rate is: 0.000
[2021-12-07 05:43:53.678801] epoch is: 9, eval success rate is: 0.000
[2021-12-07 05:44:10.306583] epoch is: 10, eval success rate is: 0.000
[2021-12-07 05:44:27.145912] epoch is: 11, eval success rate is: 0.100
[2021-12-07 05:44:44.094147] epoch is: 12, eval success rate is: 0.000
[2021-12-07 05:45:00.838486] epoch is: 13, eval success rate is: 0.000
[2021-12-07 05:4

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 06:42:41.823578] epoch is: 0, eval success rate is: 0.000
[2021-12-07 06:42:57.740104] epoch is: 1, eval success rate is: 0.000
[2021-12-07 06:43:13.578411] epoch is: 2, eval success rate is: 0.000
[2021-12-07 06:43:29.977551] epoch is: 3, eval success rate is: 0.100
[2021-12-07 06:43:46.184939] epoch is: 4, eval success rate is: 0.000
[2021-12-07 06:44:02.308578] epoch is: 5, eval success rate is: 0.000
[2021-12-07 06:44:18.534519] epoch is: 6, eval success rate is: 0.000
[2021-12-07 06:44:34.790087] epoch is: 7, eval success rate is: 0.000
[2021-12-07 06:44:50.954545] epoch is: 8, eval success rate is: 0.000
[2021-12-07 06:45:07.097271] epoch is: 9, eval success rate is: 0.000
[2021-12-07 06:45:23.164086] epoch is: 10, eval success rate is: 0.000
[2021-12-07 06:45:39.452332] epoch is: 11, eval success rate is: 0.000
[2021-12-07 06:45:55.896633] epoch is: 12, eval success rate is: 0.000
[2021-12-07 06:46:12.559272] epoch is: 13, eval success rate is: 0.100
[2021-12-07 06:4

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 07:43:21.287120] epoch is: 0, eval success rate is: 0.000
[2021-12-07 07:43:36.952642] epoch is: 1, eval success rate is: 0.000
[2021-12-07 07:43:52.651425] epoch is: 2, eval success rate is: 0.000
[2021-12-07 07:44:08.421673] epoch is: 3, eval success rate is: 0.000
[2021-12-07 07:44:24.244592] epoch is: 4, eval success rate is: 0.000
[2021-12-07 07:44:40.246661] epoch is: 5, eval success rate is: 0.000
[2021-12-07 07:44:56.218848] epoch is: 6, eval success rate is: 0.000
[2021-12-07 07:45:12.403612] epoch is: 7, eval success rate is: 0.000
[2021-12-07 07:45:28.643613] epoch is: 8, eval success rate is: 0.000
[2021-12-07 07:45:44.819891] epoch is: 9, eval success rate is: 0.000
[2021-12-07 07:46:01.071593] epoch is: 10, eval success rate is: 0.000
[2021-12-07 07:46:17.244159] epoch is: 11, eval success rate is: 0.000
[2021-12-07 07:46:33.440869] epoch is: 12, eval success rate is: 0.000
[2021-12-07 07:46:49.786718] epoch is: 13, eval success rate is: 0.100
[2021-12-07 07:4

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 08:44:33.309357] epoch is: 0, eval success rate is: 0.000
[2021-12-07 08:44:48.785531] epoch is: 1, eval success rate is: 0.100
[2021-12-07 08:45:04.586976] epoch is: 2, eval success rate is: 0.000
[2021-12-07 08:45:20.533956] epoch is: 3, eval success rate is: 0.000
[2021-12-07 08:45:36.403055] epoch is: 4, eval success rate is: 0.000
[2021-12-07 08:45:52.253002] epoch is: 5, eval success rate is: 0.000
[2021-12-07 08:46:08.278769] epoch is: 6, eval success rate is: 0.000
[2021-12-07 08:46:24.414388] epoch is: 7, eval success rate is: 0.000
[2021-12-07 08:46:40.550473] epoch is: 8, eval success rate is: 0.000
[2021-12-07 08:46:56.820748] epoch is: 9, eval success rate is: 0.000
[2021-12-07 08:47:13.881157] epoch is: 10, eval success rate is: 0.000
[2021-12-07 08:47:30.252024] epoch is: 11, eval success rate is: 0.000
[2021-12-07 08:47:46.375138] epoch is: 12, eval success rate is: 0.000
[2021-12-07 08:48:02.587064] epoch is: 13, eval success rate is: 0.000
[2021-12-07 08:4

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 10:09:10.015909] epoch is: 0, eval success rate is: 0.000
[2021-12-07 10:09:36.782222] epoch is: 1, eval success rate is: 0.000
[2021-12-07 10:10:01.170368] epoch is: 2, eval success rate is: 0.000
[2021-12-07 10:10:24.881242] epoch is: 3, eval success rate is: 0.000
[2021-12-07 10:10:53.342272] epoch is: 4, eval success rate is: 0.000
[2021-12-07 10:11:35.434802] epoch is: 5, eval success rate is: 0.000
[2021-12-07 10:12:03.948557] epoch is: 6, eval success rate is: 0.000
[2021-12-07 10:12:29.451184] epoch is: 7, eval success rate is: 0.000
[2021-12-07 10:12:53.935114] epoch is: 8, eval success rate is: 0.000
[2021-12-07 10:13:18.656042] epoch is: 9, eval success rate is: 0.000
[2021-12-07 10:13:47.712582] epoch is: 10, eval success rate is: 0.000
[2021-12-07 10:14:11.949637] epoch is: 11, eval success rate is: 0.000
[2021-12-07 10:14:35.697875] epoch is: 12, eval success rate is: 0.100
[2021-12-07 10:15:01.030663] epoch is: 13, eval success rate is: 0.000
[2021-12-07 10:1

##### Run 5: E-Greedy Decay

In [7]:
# Run configuration, written one flag per line.  Adjacent string literals are
# joined at compile time, so the resulting argument string is unchanged.
args_to_parse = (
    '--env-name FetchSlide-v1 '
    '--n-epochs=200 '
    '--n-runs=5 '
    '--strategy=e_greedy_decay '
    '--noisy=0 '
    '--cuda'
)
# Turn the flag string into an argument object and start the runs; whatever
# `launch` returns is kept in `out_e_greedy`.
args = get_args(args_to_parse)
out_e_greedy = launch(args, fullPath)

/home/nathaniel/Classes/CS5180_Reinforcement_Learning/ReinforcementRobotExploration/fetch_models/FetchSlide-v1_e_greedy_decay.wts


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 16:33:50.353588] epoch is: 0, eval success rate is: 0.000
[2021-12-07 16:34:05.471325] epoch is: 1, eval success rate is: 0.000
[2021-12-07 16:34:20.976617] epoch is: 2, eval success rate is: 0.000
[2021-12-07 16:34:37.609656] epoch is: 3, eval success rate is: 0.000
[2021-12-07 16:34:55.056394] epoch is: 4, eval success rate is: 0.000
[2021-12-07 16:35:14.046988] epoch is: 5, eval success rate is: 0.000
[2021-12-07 16:35:31.714854] epoch is: 6, eval success rate is: 0.000
[2021-12-07 16:35:50.069745] epoch is: 7, eval success rate is: 0.000
[2021-12-07 16:36:07.918518] epoch is: 8, eval success rate is: 0.000
[2021-12-07 16:36:23.151250] epoch is: 9, eval success rate is: 0.000
[2021-12-07 16:36:38.451204] epoch is: 10, eval success rate is: 0.000
[2021-12-07 16:36:56.765109] epoch is: 11, eval success rate is: 0.000
[2021-12-07 16:37:15.034759] epoch is: 12, eval success rate is: 0.100
[2021-12-07 16:37:33.236324] epoch is: 13, eval success rate is: 0.000
[2021-12-07 16:3

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 17:33:28.541520] epoch is: 0, eval success rate is: 0.000
[2021-12-07 17:33:46.018053] epoch is: 1, eval success rate is: 0.000
[2021-12-07 17:34:03.116445] epoch is: 2, eval success rate is: 0.100
[2021-12-07 17:34:21.238933] epoch is: 3, eval success rate is: 0.000
[2021-12-07 17:34:38.502081] epoch is: 4, eval success rate is: 0.000
[2021-12-07 17:34:56.464357] epoch is: 5, eval success rate is: 0.000
[2021-12-07 17:35:14.039601] epoch is: 6, eval success rate is: 0.000
[2021-12-07 17:35:31.601458] epoch is: 7, eval success rate is: 0.000
[2021-12-07 17:35:49.687786] epoch is: 8, eval success rate is: 0.000
[2021-12-07 17:36:07.205246] epoch is: 9, eval success rate is: 0.000
[2021-12-07 17:36:24.729971] epoch is: 10, eval success rate is: 0.000
[2021-12-07 17:36:42.668395] epoch is: 11, eval success rate is: 0.000
[2021-12-07 17:37:01.039938] epoch is: 12, eval success rate is: 0.000
[2021-12-07 17:37:19.514096] epoch is: 13, eval success rate is: 0.000
[2021-12-07 17:3

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 18:33:15.036012] epoch is: 0, eval success rate is: 0.000
[2021-12-07 18:33:32.465054] epoch is: 1, eval success rate is: 0.000
[2021-12-07 18:33:49.732207] epoch is: 2, eval success rate is: 0.000
[2021-12-07 18:34:07.920322] epoch is: 3, eval success rate is: 0.000
[2021-12-07 18:34:25.404209] epoch is: 4, eval success rate is: 0.000
[2021-12-07 18:34:42.831634] epoch is: 5, eval success rate is: 0.000
[2021-12-07 18:35:00.749615] epoch is: 6, eval success rate is: 0.000
[2021-12-07 18:35:18.652059] epoch is: 7, eval success rate is: 0.000
[2021-12-07 18:35:36.491343] epoch is: 8, eval success rate is: 0.000
[2021-12-07 18:35:53.871061] epoch is: 9, eval success rate is: 0.000
[2021-12-07 18:36:11.431891] epoch is: 10, eval success rate is: 0.000
[2021-12-07 18:36:29.419896] epoch is: 11, eval success rate is: 0.000
[2021-12-07 18:36:47.211515] epoch is: 12, eval success rate is: 0.000
[2021-12-07 18:37:05.670585] epoch is: 13, eval success rate is: 0.000
[2021-12-07 18:3

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 19:33:10.763430] epoch is: 0, eval success rate is: 0.000
[2021-12-07 19:33:28.893377] epoch is: 1, eval success rate is: 0.000
[2021-12-07 19:33:46.063010] epoch is: 2, eval success rate is: 0.000
[2021-12-07 19:34:03.118388] epoch is: 3, eval success rate is: 0.100
[2021-12-07 19:34:20.906758] epoch is: 4, eval success rate is: 0.000
[2021-12-07 19:34:37.940961] epoch is: 5, eval success rate is: 0.000
[2021-12-07 19:34:55.706277] epoch is: 6, eval success rate is: 0.000
[2021-12-07 19:35:13.522230] epoch is: 7, eval success rate is: 0.000
[2021-12-07 19:35:30.836893] epoch is: 8, eval success rate is: 0.000
[2021-12-07 19:35:47.729365] epoch is: 9, eval success rate is: 0.000
[2021-12-07 19:36:05.536483] epoch is: 10, eval success rate is: 0.000
[2021-12-07 19:36:23.455474] epoch is: 11, eval success rate is: 0.000
[2021-12-07 19:36:40.744326] epoch is: 12, eval success rate is: 0.000
[2021-12-07 19:36:58.053540] epoch is: 13, eval success rate is: 0.000
[2021-12-07 19:3

  0%|          | 0/200 [00:00<?, ?it/s]

[2021-12-07 20:32:50.512791] epoch is: 0, eval success rate is: 0.000
[2021-12-07 20:33:08.712450] epoch is: 1, eval success rate is: 0.000
[2021-12-07 20:33:26.172552] epoch is: 2, eval success rate is: 0.000
[2021-12-07 20:33:43.596191] epoch is: 3, eval success rate is: 0.000
[2021-12-07 20:34:00.613454] epoch is: 4, eval success rate is: 0.000
[2021-12-07 20:34:19.130474] epoch is: 5, eval success rate is: 0.000
[2021-12-07 20:34:36.981698] epoch is: 6, eval success rate is: 0.000
[2021-12-07 20:34:54.770976] epoch is: 7, eval success rate is: 0.000
[2021-12-07 20:35:13.188944] epoch is: 8, eval success rate is: 0.000
[2021-12-07 20:35:32.169603] epoch is: 9, eval success rate is: 0.000
[2021-12-07 20:35:50.179082] epoch is: 10, eval success rate is: 0.000
[2021-12-07 20:36:07.838206] epoch is: 11, eval success rate is: 0.000
[2021-12-07 20:36:25.498544] epoch is: 12, eval success rate is: 0.000
[2021-12-07 20:36:44.391647] epoch is: 13, eval success rate is: 0.000
[2021-12-07 20:3

##### Run 7: Upper Confidence Bound (UCB)

In [16]:
# Run configuration, written one flag per line.  Adjacent string literals are
# joined at compile time, so the resulting argument string is unchanged.
# Unlike the e_greedy_decay run, no --cuda flag is passed here.
args_to_parse = (
    '--env-name FetchSlide-v1 '
    '--n-epochs=200 '
    '--n-runs=5 '
    '--strategy=ucb '
    '--noisy=0'
)
args = get_args(args_to_parse)
# NOTE(review): the result of this UCB run is bound to `out_e_greedy`, the same
# name the e_greedy_decay cell assigns, so running both cells overwrites the
# earlier value - confirm whether a distinct name was intended.
out_e_greedy = launch(args, fullPath)

/home/nathaniel/Classes/CS5180_Reinforcement_Learning/ReinforcementRobotExploration/fetch_models/FetchSlide-v1_ucb.wts


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

IHT full, starting to allow collisions
[2021-12-07 22:19:46.514623] epoch is: 0, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 22:20:10.335920] epoch is: 1, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 22:20:33.582561] epoch is: 2, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 22:20:56.424484] epoch is: 3, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 22:21:20.787477] epoch is: 4, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 22:21:44.625745] epoch is: 5, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 22:22:08.971862] epoch is: 6, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 22:22:32.897808] epoch is: 7, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 22:22:55.907006] epoch is: 8, eval success rate is: 0.000
IHT full, starting 

  0%|          | 0/200 [00:00<?, ?it/s]

IHT full, starting to allow collisions
[2021-12-07 23:40:23.193255] epoch is: 0, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 23:40:44.591492] epoch is: 1, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 23:41:06.574691] epoch is: 2, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 23:41:28.228425] epoch is: 3, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 23:41:49.776029] epoch is: 4, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 23:42:11.236813] epoch is: 5, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 23:42:32.973599] epoch is: 6, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 23:42:55.242432] epoch is: 7, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-07 23:43:17.103715] epoch is: 8, eval success rate is: 0.000
IHT full, starting 

  0%|          | 0/200 [00:00<?, ?it/s]

IHT full, starting to allow collisions
[2021-12-08 01:00:14.144346] epoch is: 0, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 01:00:35.730624] epoch is: 1, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 01:00:56.184385] epoch is: 2, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 01:01:17.118404] epoch is: 3, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 01:01:38.360967] epoch is: 4, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 01:01:59.376191] epoch is: 5, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 01:02:20.539051] epoch is: 6, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 01:02:41.910739] epoch is: 7, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 01:03:03.148442] epoch is: 8, eval success rate is: 0.000
IHT full, starting 

  0%|          | 0/200 [00:00<?, ?it/s]

IHT full, starting to allow collisions
[2021-12-08 02:19:40.111003] epoch is: 0, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 02:20:01.110103] epoch is: 1, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 02:20:22.485223] epoch is: 2, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 02:20:43.896749] epoch is: 3, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 02:21:05.280083] epoch is: 4, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 02:21:26.652755] epoch is: 5, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 02:21:48.236060] epoch is: 6, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 02:22:09.910455] epoch is: 7, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 02:22:31.228586] epoch is: 8, eval success rate is: 0.000
IHT full, starting 

  0%|          | 0/200 [00:00<?, ?it/s]

IHT full, starting to allow collisions
[2021-12-08 03:39:33.117267] epoch is: 0, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 03:39:54.071972] epoch is: 1, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 03:40:15.872375] epoch is: 2, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 03:40:36.890954] epoch is: 3, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 03:40:58.398926] epoch is: 4, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 03:41:20.225705] epoch is: 5, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 03:41:41.785491] epoch is: 6, eval success rate is: 0.000
IHT full, starting to allow collisions
[2021-12-08 03:42:03.295583] epoch is: 7, eval success rate is: 0.100
IHT full, starting to allow collisions
[2021-12-08 03:42:24.944886] epoch is: 8, eval success rate is: 0.000
IHT full, starting 

In [None]:
#  Jupyter UI - Trigger policies for individual agents
def button_callback(button):
    """Render one episode for the environment named by the clicked button.

    Every button is disabled while the episode runs and is re-enabled
    afterwards, even if building the policy or rendering raises.  When a
    trained policy cannot be built from the saved weights, the error is
    printed and the episode is rendered with the random policy instead
    (``render`` treats ``policy=None`` as the random policy).

    :param button:  the clicked widget; its ``description`` is used as the key
                    into the module-level ``envs`` dictionary.
    """
    for b in buttons:
        b.disabled = True

    try:
        env = envs[button.description]['model']
        final_policy = None
        try:
            env_name = env.env.spec.id
            env_params = get_env_params(env)
            # NOTE(review): weights are read from '<env id>.wts' in the working
            # directory, while the training cells print paths of the form
            # 'fetch_models/<env id>_<strategy>.wts' - confirm the file exists here.
            # map_location lets a checkpoint saved from a CUDA run load on a
            # CPU-only machine; the actor below is evaluated on the CPU anyway.
            model = torch.load(f'{env_name}.wts', map_location='cpu')
            actor_model = actor(env_params)
            actor_model.load_state_dict(model[4])

            # Build the normalizers once instead of on every policy call.
            # NOTE(review): sizes 10 and 3 are hard-coded; the statistics are
            # replaced from the checkpoint right below, so presumably only those
            # matter for normalize() - confirm against the normalizer class.
            o_norm = normalizer(size=10, default_clip_range=200)
            o_norm.mean = model[0]
            o_norm.std = model[1]
            g_norm = normalizer(size=3, default_clip_range=200)
            g_norm.mean = model[2]
            g_norm.std = model[3]

            def policy(state):
                """Map an observation dictionary to a clipped actor action."""
                obs_normed = o_norm.normalize(state['observation'])
                goal_normed = g_norm.normalize(state['desired_goal'])
                # The actor takes the normalized observation and goal as one vector.
                input_tensor = np.concatenate([obs_normed, goal_normed])
                input_tensor = torch.tensor(input_tensor, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    pi = actor_model(input_tensor)
                action = pi.cpu().numpy().squeeze()
                # No exploration noise is added here; the action is only clipped.
                return np.clip(action, -1, 1)

            final_policy = policy
        except Exception as e:
            # Best-effort fallback: report the problem, then continue with
            # final_policy left as None so the random policy is rendered.
            print('Could not create policy - running random policy')
            print(str(e))
            traceback.print_exc()

        try:
            render(env, final_policy)
        finally:
            # Close the environment even when rendering fails part-way.
            env.close()
    finally:
        # Always hand control back to the UI, whatever happened above.
        for b in buttons:
            b.disabled = False

# One button per environment key; the key doubles as the button label, which
# button_callback later uses to look the environment up again.
buttons = [widgets.Button(description=env_id) for env_id in envs]
for button in buttons:
    button.on_click(button_callback)

print('Click a button to evaluate a policy')
# Group the buttons in a single horizontal box and show it in the cell output.
b = widgets.HBox(buttons)
display(b)