Using old autoencoder trained only on colors (models/encoder50.h5) to train on both shapes and colors.

In [1]:
from MuJoCo_Gym.mujoco_rl import MuJoCoRL
from MuJoCo_Gym.wrappers import GymnasiumWrapper, GymWrapper
from gymnasium.wrappers.frame_stack import FrameStack
from gymnasium.experimental.wrappers import NormalizeObservationV0
from dynamics import *
import argparse
import os
import random
import time
from distutils.util import strtobool
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
from torch.utils.tensorboard import SummaryWriter

from wrappers.record_episode_statistics import RecordEpisodeStatistics
from wrappers.frame_stack import FrameStack
from wrappers.normalizeObservation import NormalizeObservation
from wrappers.normalizeRewards import NormalizeReward

from progressbar import progressbar

  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
  VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
  if (distutils.version.LooseVersion(tf.__version__) <
  distutils.version.Loos

# Dynamics

## Vision

In [2]:
class Image:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [0 for _ in range(50)], "high": [1 for _ in range(50)]}
        self.action_space = {"low": [], "high": []}
        self.autoencoder = Autoencoder(latent_dim=50, input_shape=(64, 64, 3))
        self.autoencoder.encoder.load_weights("models/encoder50.h5")
        self.index = 0

    def dynamic(self, agent, actions):
        self.index = self.index + 1
        image = self.environment.get_camera_data(agent + "_camera")
        image = cv2.resize(image, (64, 64))
        result = self.autoencoder.encoder.predict(np.array([image]), verbose=0)[0]
        # cv2.imwrite(f"/Users/imtezcan/Repositories/CogSci/EBIMAS/s.language_experiments/blah/images/{agent}_{self.index}.png", image)
        return 0, result, False, {}

## Communication

In [3]:
class Communication:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [0, 0, 0, 0, 0, 0], "high": [1, 1, 1, 1, 1, 1]}
        self.action_space = {"low": [0, 0, 0, 0, 0, 0], "high": [1, 1, 1, 1, 1, 1]}

    def dynamic(self, agent, actions):
        if "utterance" not in self.environment.data_store[agent].keys():
            self.environment.data_store[agent]["utterance"] = None
        if agent == "receiver":
            utterance = [0, 0, 0, 0, 0, 0]
            if "target_color" in self.environment.data_store.keys() and "target_shape" in self.environment.data_store.keys():
                utterance[np.argmax(self.environment.data_store["target_color"])] = 1
                # index 0, 1, 2 for color, 3, 4 for shape and 5 for silent utterance
                utterance[np.argmax(self.environment.data_store["target_shape"]) + 3] = 1
            observation = utterance
        elif agent == "sender":
            utterance = [0, 0, 0, 0, 0, 0]
            color_action = actions[:3]
            shape_action = actions[3:5]
            utterance[np.argmax(color_action)] = 1
            utterance[np.argmax(shape_action) + 3] = 1
            self.environment.data_store[agent]["utterance"] = actions
            self.environment.data_store[agent]["utterance_max"] = utterance
            observation = [0, 0, 0, 0, 0, 0]
        else:
            print("Unknown agent")
        return 0, observation, False, {}

## Reward Dynamic Function
The reward function for the receiver gives a positive reward for the agent to move closer to the current target object.<br/>
The sender in turn gets a positive reward if it chooses the correct word encoding for the color and shape of the current target object.

In [4]:
from sklearn.metrics import mean_squared_error
from autoencoder import Autoencoder
import math
import mujoco

In [5]:
class Reward:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [], "high": []}
        self.action_space = {"low": [], "high": []}
        self.choices = ["choice_1", "choice_2"]

    def dynamic(self, agent, actions):
        if not "target" in self.environment.data_store.keys():
            color = self.environment.get_data("reference_geom")["color"]
            shape = self.environment.get_data("reference_geom")["shape"]
            for choice in self.choices:
                choice_color = self.environment.get_data(choice + "_geom")["color"]
                choice_shape = self.environment.get_data(choice + "_geom")["shape"]
                # Check if both color and shape match
                if (color == choice_color).all() and (shape == choice_shape).all():
                    self.environment.data_store["target"] = choice
                    self.environment.data_store["target_color"] = choice_color
                    self.environment.data_store["target_shape"] = choice_shape
                    self.environment.data_store["last_distance"] = copy.deepcopy(self.environment.distance("receiver_geom", choice + "_geom"))
        if agent == "receiver":
            target = self.environment.data_store["target"]
            new_distance = self.environment.distance("receiver_geom", target + "_geom")
            reward = (self.environment.data_store["last_distance"] - new_distance) * 10
            self.environment.data_store["last_distance"] = copy.deepcopy(new_distance)
        elif agent == "sender":
            reference = [0, 0, 0, 0, 0, 0]
            color = self.environment.data_store["target_color"]
            shape = self.environment.data_store["target_shape"]
            
            reference[np.argmax(color)] = 1
            reference[np.argmax(shape) + 3] = 1
            reward = 0
            if "utterance" in self.environment.data_store[agent].keys():
                reward = -1 * mean_squared_error(reference, self.environment.data_store[agent]["utterance"])
        return reward, [], False, {}

## Accuracy Measurement Dynamic

In [6]:
class Accuracy:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [], "high": []}
        self.action_space = {"low": [], "high": []}
        self.accuracies = []
        self.variances = []
        self.sendAccuracies = []
        self.sendVariances = []
        self.currentSend = []
        self.report_accuracy = {"sender": 0, "receiver": 0}

    def dynamic(self, agent, actions):
        choices = ["choice_1", "choice_2"]
        variance = {"choice_1":1, "choice_2":-1}
        if "target" in self.environment.data_store.keys():
            if "sendVariances" not in self.environment.data_store.keys():
                self.environment.data_store["sendVariances"] = True
                self.currentSend = [0, 0, 0, 0, 0, 0]
            target = self.environment.data_store["target"]
            # if any(self.environment.collision(ankle, target + "_geom") for ankle in ["left_leg_geom_2", "left_ankle_geom_2", "right_leg_geom_2", "right_ankle_geom_2", "back_leg_geom_2", "third_ankle_geom_2", "rightback_leg_geom_2", "fourth_ankle_geom_2"]):
            if self.environment.collision("receiver_geom", target + "_geom"):
                self.accuracies.append(1)
                self.variances.append(variance[target])

                if len(self.variances) > 50:
                    report_variance = 1 - abs(sum(self.variances[-50:]) / 50)
                    self.report_accuracy["receiver"] = sum(self.accuracies[-50:]) / 50
            # elif any(self.environment.collision(ankle, [choice for choice in choices if choice != target][0] + "_geom") for ankle in ["left_leg_geom_2", "left_ankle_geom_2", "right_leg_geom_2", "right_ankle_geom_2", "back_leg_geom_2", "third_ankle_geom_2", "rightback_leg_geom_2", "fourth_ankle_geom_2"]):
            elif self.environment.collision("receiver_geom", [choice for choice in choices if choice != target][0] + "_geom"):
                self.accuracies.append(0)
                self.variances.append(variance[[choice for choice in choices if choice != target][0]])

                if len(self.variances) > 50:
                    report_variance = 1 - abs(sum(self.variances[-50:]) / 50)
                    self.report_accuracy[0] = sum(self.accuracies[-50:]) / 50
            if "utterance_max" in self.environment.data_store[agent].keys():
                reference = [0, 0, 0, 0, 0, 0]
                color = self.environment.data_store["target_color"]
                shape = self.environment.data_store["target_shape"]
                reference[np.argmax(color)] = 1
                reference[np.argmax(shape) + 3] = 1
                self.currentSend = np.add(self.currentSend, self.environment.data_store[agent]["utterance_max"])

                if self.environment.data_store[agent]["utterance_max"]  == reference:
                    self.sendAccuracies.append(1)
                else:
                    self.sendAccuracies.append(0)
        return 0, [], False, {}

# Reward Functions

In [7]:
def target_reward(mujoco_gym, agent):
    if agent == "receiver":
        choices = ["choice_1", "choice_2"]
        if not "target" in mujoco_gym.data_store.keys():
                color = mujoco_gym.get_data("reference_geom")["color"]
                shape = mujoco_gym.get_data("reference_geom")["shape"]
                for choice in choices:
                    if (color == mujoco_gym.get_data(choice + "_geom")["color"]).all() and (shape == mujoco_gym.get_data(choice + "_geom")["shape"]).all():
                        mujoco_gym.data_store["target"] = choice
                        mujoco_gym.data_store["target_color"] = mujoco_gym.get_data(choice + "_geom")["color"]
                        mujoco_gym.data_store["target_shape"] = mujoco_gym.get_data(choice + "_geom")["shape"]
        
        target = mujoco_gym.data_store["target"]
        # for ankle in ["left_leg_geom_2", "left_ankle_geom_2", "right_leg_geom_2", "right_ankle_geom_2", "back_leg_geom_2", "third_ankle_geom_2", "rightback_leg_geom_2", "fourth_ankle_geom_2"]:
        for ankle in ["receiver_geom"]:
            if mujoco_gym.collision(ankle, target + "_geom"):
                return 1
            elif mujoco_gym.collision(ankle, [choice for choice in choices if choice != target][0] + "_geom"):
                return -1
    return 0

In [8]:
def collision_reward(mujoco_gym, agent):
    for border in ["border1_geom", "border2_geom", "border3_geom", "border4_geom", "border5_geom"]:
        # for ankle in ["left_leg_geom_2", "left_ankle_geom_2", "right_leg_geom_2", "right_ankle_geom_2", "back_leg_geom_2", "third_ankle_geom_2", "rightback_leg_geom_2", "fourth_ankle_geom_2"]:
        for ankle in [agent + "_geom"]:
            if mujoco_gym.collision(border, ankle):
                return -0.1
    return 0

# Environment Setup

In [9]:
def make_env(config_dict):
    def thunk():
        window = 5
        env = MuJoCoRL(config_dict=config_dict)
        # env = GymWrapper(env, "receiver")
        # env = FrameStack(env, 4)
        env = NormalizeObservation(env)
        env = NormalizeReward(env)
        # env = RecordEpisodeStatistics(env)
        return env

    return thunk

In [10]:
# xml_files = ["levels/" + file for file in os.listdir("levels/")]
xml_files = ["levels_shape/" + file for file in os.listdir("levels_shape/")]
agents = ["sender", "receiver"]

config_dict = {"xmlPath":xml_files, 
                   "agents":agents, 
                   "rewardFunctions":[collision_reward, target_reward], 
                   "doneFunctions":[target_done, border_done], 
                   "skipFrames":5,
                   "environmentDynamics":[Image, Reward, Communication, Accuracy],
                   "freeJoint":True,
                   "renderMode":False,
                   "maxSteps":1024,
                   "agentCameras":True}

env = make_env(config_dict)()

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  updates=self.state_updates,


# RL Model

In [11]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    def __init__(self, envs):
        super(Agent, self).__init__()
        self.critic = nn.Sequential(
            nn.Flatten(),
            layer_init(nn.Linear(np.array(envs.observation_space.shape).prod(), 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 1), std=1.0),
        )
        self.actor_mean = nn.Sequential(
            nn.Flatten(),
            layer_init(nn.Linear(np.array(envs.observation_space.shape).prod(), 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, np.prod(envs.action_space.shape)), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.action_space.shape)))

    def get_value(self, x):
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)

# Buffer

In [12]:
class Buffer():
    def __init__(self, num_steps, envs, num_envs, device):
        self.obs = torch.zeros((num_steps, num_envs) + envs.observation_space.shape).to(device)
        self.actions = torch.zeros((num_steps, num_envs) + envs.action_space.shape).to(device)
        self.logprobs = torch.zeros((num_steps, num_envs)).to(device)
        self.rewards = torch.zeros((num_steps, num_envs)).to(device)
        self.dones = torch.zeros((num_steps, num_envs)).to(device)
        self.values = torch.zeros((num_steps, num_envs)).to(device)

# Update Function

In [13]:
def update_agent(agent, buffer, optimizer, next_obs, next_done, env, batch_size, update_epochs, minibatch_size, clip_coef, vf_coef, ent_coef, max_grad_norm, target_kl, clip_vloss, norm_adv, gae_lambda, gae, gamma, device, num_steps):

    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        if gae:
            advantages = torch.zeros_like(buffer.rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(num_steps)):
                if t == num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - buffer.dones[t + 1]
                    nextvalues = buffer.values[t + 1]
                delta = buffer.rewards[t] + gamma * nextvalues * nextnonterminal - buffer.values[t]
                advantages[t] = lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + buffer.values
        else:
            returns = torch.zeros_like(buffer.rewards).to(device)
            for t in reversed(range(num_steps)):
                if t == num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - buffer.dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = buffer.rewards[t] + gamma * nextnonterminal * next_return
            advantages = returns - buffer.values

    # flatten the batch
    b_obs = buffer.obs.reshape((-1,) + env.observation_space.shape)
    b_logprobs = buffer.logprobs.reshape(-1)
    b_actions = buffer.actions.reshape((-1,) + env.action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = buffer.values.reshape(-1)

    # Optimizing the policy and value network
    b_inds = np.arange(batch_size)
    clipfracs = []
    for epoch in range(update_epochs):
        np.random.shuffle(b_inds)
        for start in range(0, batch_size, minibatch_size):
            end = start + minibatch_size
            mb_inds = b_inds[start:end]

            _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions[mb_inds])
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = logratio.exp()

            with torch.no_grad():
                # calculate approx_kl http://joschu.net/blog/kl-approx.html
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1) - logratio).mean()
                clipfracs += [((ratio - 1.0).abs() > clip_coef).float().mean().item()]

            mb_advantages = b_advantages[mb_inds]
            if norm_adv:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Policy loss
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Value loss
            newvalue = newvalue.view(-1)
            if clip_vloss:
                v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                v_clipped = b_values[mb_inds] + torch.clamp(
                    newvalue - b_values[mb_inds],
                    -clip_coef,
                    clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

            entropy_loss = entropy.mean()
            loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()

        if target_kl is not None:
            if approx_kl > target_kl:
                break

    y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

# Training

## Helper Function

In [14]:
def initialize_agent(env, device, learning_rate):
    agent = Agent(env).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)
    return agent, optimizer

def get_action_and_update_buffer(agent, obs, buffer, step):
    with torch.no_grad():
        action, logprob, _, value = agent.get_action_and_value(obs)
        buffer.values[step] = value.flatten()
    buffer.actions[step] = action
    buffer.logprobs[step] = logprob
    return action

def reset_environment(env, device):
    next_obs, infos = env.reset()
    next_obs = {k: torch.Tensor(v).unsqueeze(0).to(device) for k, v in next_obs.items()}
    return next_obs, infos

## Hyperparameters

In [15]:
exp_name = "Sender box shape"

learning_rate = 1e-5
seed = 1
# total_timesteps = 20000000
total_timesteps = 1000000
torch_deterministic = True
cuda = False
mps = False
track = False
wandb_project_name = "ppo-implementation-details"
wandb_entity = None
capture_video = False

# Algorithm-specific arguments
num_envs = 1
num_steps = 2048
anneal_lr = True
gae = True
gamma = 0.99
gae_lambda = 0.95
num_minibatches = 128
update_epochs = 10
norm_adv = True
clip_coef = 0.2
clip_vloss = True
ent_coef = 0.0
vf_coef = 0.5
max_grad_norm = 0.5
target_kl = None
store_freq = 20

# Calculate derived variables
batch_size = int(num_envs * num_steps)
minibatch_size = int(batch_size // num_minibatches)

In [16]:
torch.set_default_dtype(torch.float32)

run_name = f"{exp_name}__{seed}__{int(time.time())}"

writer = SummaryWriter(f"runs/{run_name}")

writer.add_text("environment/level_number", str(len(xml_files)), 0)
writer.add_text("environment/agents", ', '.join(agents), 0)
writer.add_text("hyperparameters/learning_rate", str(learning_rate), 0)
writer.add_text("hyperparameters/network_size", ', '.join(str(e) for e in [512, 256]), 0)
writer.add_text("hyperparameters/batch", str(minibatch_size), 0)

# TRY NOT TO MODIFY: seeding
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() and cuda else "cpu")

obs, infos = env.reset()

sender, sender_optimizer = initialize_agent(env, device, learning_rate)
receiver, receiver_optimizer = initialize_agent(env, device, learning_rate)

buffer_sender = Buffer(num_steps, env, num_envs, device)
buffer_receiver = Buffer(num_steps, env, num_envs, device)

global_step = 0
start_time = time.time()
next_obs, infos = reset_environment(env, device)

next_done = {"sender": torch.zeros(num_envs).to(device), "receiver": torch.zeros(num_envs).to(device)}

num_updates = total_timesteps // batch_size
train_start = time.time()

epoch_lengths = []
current_length = 0

for update in progressbar(range(1, num_updates + 1), redirect_stdout=True):
    # Annealing the rate if instructed to do so.
    if anneal_lr:
        frac = 1.0 - (update - 1.0) / num_updates
        lrnow = frac * learning_rate
        sender_optimizer.param_groups[0]["lr"] = lrnow
        receiver_optimizer.param_groups[0]["lr"] = lrnow
    
    epoch_rewards = {"sender":0, "receiver":0}
    current_rewards = {"sender":[], "receiver":[]}
    variances = {"sender":[], "receiver":[]}
    epoch_runs = 0
    episode_accuracies = 0
    episode_sendAccuracies = 0
    for step in range(0, num_steps):
        global_step += 1 * num_envs
        current_length += 1
        buffer_sender.obs[step] = next_obs["sender"]
        buffer_receiver.obs[step] = next_obs["receiver"]


        sender_action = get_action_and_update_buffer(sender, next_obs["sender"], buffer_sender, step)
        receiver_action = get_action_and_update_buffer(receiver, next_obs["receiver"], buffer_receiver, step)

        next_obs, reward, terminations, truncations, info = env.step({"sender": sender_action.cpu().numpy()[0], "receiver": receiver_action.cpu().numpy()[0]})
        current_rewards["sender"].append(reward["sender"])
        current_rewards["receiver"].append(reward["receiver"])
        next_obs = {"sender": torch.Tensor(next_obs["sender"]).unsqueeze(0).to(device), "receiver": torch.Tensor(next_obs["receiver"]).unsqueeze(0).to(device)}

        if terminations["sender"] or terminations["receiver"] or truncations["sender"] or truncations["receiver"]:
            next_obs, infos = reset_environment(env, device)
            epoch_rewards["sender"] += sum(current_rewards["sender"])
            epoch_rewards["receiver"] += sum(current_rewards["receiver"])

            epoch_lengths.append(current_length)
            current_length = 0

            dynamic = env.env.env.environment_dynamics[3]

            if len(dynamic.sendAccuracies) > 512:
                episode_sendAccuracies = sum(dynamic.sendAccuracies[-512:]) / 512
                del dynamic.sendAccuracies[:-513]
                writer.add_scalar("charts/sender/accuracies", episode_sendAccuracies, global_step)

            if len(dynamic.accuracies) > 4:
                window = min(15, len(dynamic.accuracies))
                episode_accuracies = sum(dynamic.accuracies[-1 * window:]) / window
                writer.add_scalar("charts/receiver/accuracies", episode_accuracies, global_step)
                if window == 15:
                    del dynamic.accuracies[:-16]

            if len(dynamic.variances) > 4:
                window = min(15, len(dynamic.variances))
                current_variance = sum(dynamic.variances[-1 * window:]) / window
                writer.add_scalar("charts/receiver_variance", current_variance, global_step)
                if window == 15:
                    del dynamic.variances[:-16]

            if len(epoch_lengths) > 3:
                window = min(10, len(epoch_lengths))
                epoch_length = sum(epoch_lengths[-1 * window:]) / window
                writer.add_scalar("charts/episodic_length", epoch_length, global_step)
                if window == 10:
                    del epoch_lengths[:-11]
            epoch_runs += 1
        
        buffer_sender.rewards[step] = torch.tensor(reward["sender"]).to(device).view(-1)
        buffer_receiver.rewards[step] = torch.tensor(reward["receiver"]).to(device).view(-1)
        next_done = {"sender": torch.Tensor([terminations["sender"]]).to(device), "receiver": torch.Tensor([terminations["receiver"]]).to(device)}
    if update % store_freq == 0:
        torch.save(sender, "models/model" + str(start_time) + ".pth")
        torch.save(receiver, "models/model" + str(start_time) + ".pth")

    update_agent(sender, buffer_sender, sender_optimizer, next_obs["sender"], next_done["sender"], env, batch_size, update_epochs, minibatch_size, clip_coef, vf_coef, ent_coef, max_grad_norm, target_kl, clip_vloss, norm_adv, gae_lambda, gae, gamma, device, step)
    update_agent(receiver, buffer_receiver, receiver_optimizer, next_obs["receiver"], next_done["receiver"], env, batch_size, update_epochs, minibatch_size, clip_coef, vf_coef, ent_coef, max_grad_norm, target_kl, clip_vloss, norm_adv, gae_lambda, gae, gamma, device, step)

    writer.add_scalar("charts/learning_rate", sender_optimizer.param_groups[0]["lr"], global_step)
    writer.add_scalar("charts/sender/episodic_return", epoch_rewards["sender"] / epoch_runs, global_step)
    writer.add_scalar("charts/receiver/episodic_return", epoch_rewards["receiver"] / epoch_runs, global_step)
    print("SPS:", int(global_step / (time.time() - start_time)), "Average Reward:", epoch_rewards["sender"] / epoch_runs)
    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

torch.save(sender, "models/model" + str(start_time) + ".pth")
torch.save(receiver, "models/model" + str(start_time) + ".pth")
writer.close()

  0% (0 of 488) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--

SPS: 72 Average Reward: -40.665287049122156


  0% (1 of 488) |                        | Elapsed Time: 0:00:28 ETA:   3:49:55

SPS: 75 Average Reward: -10.580664623667115


  0% (2 of 488) |                        | Elapsed Time: 0:00:54 ETA:   3:31:42

SPS: 76 Average Reward: -10.547878611932672


  0% (3 of 488) |                        | Elapsed Time: 0:01:19 ETA:   3:25:56

SPS: 77 Average Reward: -10.519320285368881


  0% (4 of 488) |                        | Elapsed Time: 0:01:45 ETA:   3:24:44

SPS: 78 Average Reward: -10.484549139505182


  1% (5 of 488) |                        | Elapsed Time: 0:02:10 ETA:   3:25:25

SPS: 78 Average Reward: -10.527719132292589


  1% (6 of 488) |                        | Elapsed Time: 0:02:36 ETA:   3:25:07

SPS: 78 Average Reward: -10.840192587259002


  1% (7 of 488) |                        | Elapsed Time: 0:03:01 ETA:   3:23:44

SPS: 78 Average Reward: -11.132679566093602


  1% (8 of 488) |                        | Elapsed Time: 0:03:28 ETA:   3:29:54

SPS: 79 Average Reward: -10.413369736528935


  1% (9 of 488) |                        | Elapsed Time: 0:03:52 ETA:   3:16:59

SPS: 79 Average Reward: -10.544256355382517


  2% (10 of 488) |                       | Elapsed Time: 0:04:17 ETA:   3:16:50

SPS: 79 Average Reward: -10.049155369891691


  2% (11 of 488) |                       | Elapsed Time: 0:04:42 ETA:   3:18:03

SPS: 80 Average Reward: -10.616598154203235


  2% (12 of 488) |                       | Elapsed Time: 0:05:07 ETA:   3:16:42

SPS: 80 Average Reward: -11.182605729670392


  2% (13 of 488) |                       | Elapsed Time: 0:05:31 ETA:   3:16:00

SPS: 80 Average Reward: -10.864321016857934


  2% (14 of 488) |                       | Elapsed Time: 0:05:56 ETA:   3:16:04

SPS: 80 Average Reward: -10.161678001507505


  3% (15 of 488) |                       | Elapsed Time: 0:06:21 ETA:   3:18:42

SPS: 80 Average Reward: -10.143871053353125


  3% (16 of 488) |                       | Elapsed Time: 0:06:47 ETA:   3:20:11

SPS: 80 Average Reward: -11.026934790238702


  3% (17 of 488) |                       | Elapsed Time: 0:07:12 ETA:   3:14:13

SPS: 80 Average Reward: -10.378304971470843


  3% (18 of 488) |                       | Elapsed Time: 0:07:36 ETA:   3:12:04

SPS: 80 Average Reward: -10.38480541585726


  3% (19 of 488) |                       | Elapsed Time: 0:08:01 ETA:   3:12:10

SPS: 80 Average Reward: -10.774907176167304


  4% (20 of 488) |                       | Elapsed Time: 0:08:25 ETA:   3:12:29

SPS: 81 Average Reward: -11.622380476727825


  4% (21 of 488) |                       | Elapsed Time: 0:08:50 ETA:   3:11:18

SPS: 81 Average Reward: -10.778754142756895


  4% (22 of 488) |#                      | Elapsed Time: 0:09:14 ETA:   3:03:15

SPS: 81 Average Reward: -10.78310094876799


  4% (23 of 488) |#                      | Elapsed Time: 0:09:37 ETA:   3:04:49

SPS: 81 Average Reward: -10.642367144916992


  4% (24 of 488) |#                      | Elapsed Time: 0:10:01 ETA:   3:04:21

SPS: 81 Average Reward: -10.640549431348512


  5% (25 of 488) |#                      | Elapsed Time: 0:10:25 ETA:   3:04:56

SPS: 81 Average Reward: -11.08641190090181


  5% (26 of 488) |#                      | Elapsed Time: 0:10:49 ETA:   3:05:27

SPS: 82 Average Reward: -11.12375478343925


  5% (27 of 488) |#                      | Elapsed Time: 0:11:13 ETA:   3:03:45

SPS: 82 Average Reward: -10.397321237177493


  5% (28 of 488) |#                      | Elapsed Time: 0:11:37 ETA:   3:03:48

SPS: 82 Average Reward: -10.834803521226


  5% (29 of 488) |#                      | Elapsed Time: 0:12:01 ETA:   3:04:04

SPS: 82 Average Reward: -11.328515045734093


  6% (30 of 488) |#                      | Elapsed Time: 0:12:25 ETA:   3:03:35

SPS: 82 Average Reward: -11.920921673472234


  6% (31 of 488) |#                      | Elapsed Time: 0:12:49 ETA:   3:02:36

SPS: 82 Average Reward: -12.079834075400097


  6% (32 of 488) |#                      | Elapsed Time: 0:13:13 ETA:   3:01:55

SPS: 82 Average Reward: -11.104281692315933


  6% (33 of 488) |#                      | Elapsed Time: 0:13:37 ETA:   3:02:29

SPS: 82 Average Reward: -11.052305182893173


  6% (34 of 488) |#                      | Elapsed Time: 0:14:01 ETA:   3:02:43

SPS: 82 Average Reward: -10.495663950497196


  7% (35 of 488) |#                      | Elapsed Time: 0:14:26 ETA:   3:02:21

SPS: 82 Average Reward: -11.621803647539279


  7% (36 of 488) |#                      | Elapsed Time: 0:14:50 ETA:   3:02:22

SPS: 82 Average Reward: -11.659022023914094


  7% (37 of 488) |#                      | Elapsed Time: 0:15:14 ETA:   3:02:36

SPS: 82 Average Reward: -11.715549045964217


  7% (38 of 488) |#                      | Elapsed Time: 0:15:38 ETA:   3:00:40

SPS: 82 Average Reward: -12.665652876693029


  7% (39 of 488) |#                      | Elapsed Time: 0:16:02 ETA:   3:00:14

SPS: 83 Average Reward: -17.983480236849957


  8% (40 of 488) |#                      | Elapsed Time: 0:16:26 ETA:   2:59:52

SPS: 83 Average Reward: -23.033982766996843


  8% (41 of 488) |#                      | Elapsed Time: 0:16:50 ETA:   2:59:31

SPS: 83 Average Reward: -17.200086776740235


  8% (42 of 488) |#                      | Elapsed Time: 0:17:15 ETA:   2:59:13

SPS: 83 Average Reward: -20.151643897237097


  8% (43 of 488) |##                     | Elapsed Time: 0:17:39 ETA:   2:59:27

SPS: 83 Average Reward: -23.394838312411835


  9% (44 of 488) |##                     | Elapsed Time: 0:18:03 ETA:   2:58:33

SPS: 83 Average Reward: -23.341452767400526


  9% (45 of 488) |##                     | Elapsed Time: 0:18:27 ETA:   2:58:53

SPS: 83 Average Reward: -22.724152256052463


  9% (46 of 488) |##                     | Elapsed Time: 0:18:51 ETA:   2:57:49

SPS: 83 Average Reward: -10.876412929743077


  9% (47 of 488) |##                     | Elapsed Time: 0:19:15 ETA:   2:57:54

SPS: 83 Average Reward: -17.945096985268425


  9% (48 of 488) |##                     | Elapsed Time: 0:19:39 ETA:   2:56:01

SPS: 83 Average Reward: -17.984207682628746


 10% (49 of 488) |##                     | Elapsed Time: 0:20:04 ETA:   2:55:55

SPS: 83 Average Reward: -16.85548794049717


 10% (50 of 488) |##                     | Elapsed Time: 0:20:27 ETA:   2:54:58

SPS: 83 Average Reward: -17.42883237658287


 10% (51 of 488) |##                     | Elapsed Time: 0:20:52 ETA:   2:55:46

SPS: 83 Average Reward: -17.36392131459084


 10% (52 of 488) |##                     | Elapsed Time: 0:21:16 ETA:   2:54:36

SPS: 83 Average Reward: -16.78136575197363


 10% (53 of 488) |##                     | Elapsed Time: 0:21:40 ETA:   2:55:15

SPS: 83 Average Reward: -16.20676777122952


 11% (54 of 488) |##                     | Elapsed Time: 0:22:04 ETA:   2:55:15

SPS: 83 Average Reward: -8.709159824491921


 11% (55 of 488) |##                     | Elapsed Time: 0:22:28 ETA:   2:54:00

SPS: 83 Average Reward: -8.461724588181557


 11% (56 of 488) |##                     | Elapsed Time: 0:22:52 ETA:   2:53:54

SPS: 83 Average Reward: -16.49414017797933


 11% (57 of 488) |##                     | Elapsed Time: 0:23:16 ETA:   2:53:25

SPS: 83 Average Reward: -13.044628067688276


 11% (58 of 488) |##                     | Elapsed Time: 0:23:41 ETA:   2:53:40

SPS: 83 Average Reward: -12.379446134446159


 12% (59 of 488) |##                     | Elapsed Time: 0:24:05 ETA:   2:52:36

SPS: 83 Average Reward: -13.343841162621949


 12% (60 of 488) |##                     | Elapsed Time: 0:24:29 ETA:   2:52:25

SPS: 83 Average Reward: -13.127558006966645


 12% (61 of 488) |##                     | Elapsed Time: 0:24:53 ETA:   2:49:48

SPS: 83 Average Reward: -12.495590658053118


 12% (62 of 488) |##                     | Elapsed Time: 0:25:17 ETA:   2:51:56

SPS: 83 Average Reward: -13.05964313603872


 12% (63 of 488) |##                     | Elapsed Time: 0:25:41 ETA:   2:51:59

SPS: 83 Average Reward: -19.95173327135205


 13% (64 of 488) |###                    | Elapsed Time: 0:26:06 ETA:   2:50:41

SPS: 83 Average Reward: -20.29049205016301


 13% (65 of 488) |###                    | Elapsed Time: 0:26:30 ETA:   2:50:12

SPS: 83 Average Reward: -16.20483900743198


 13% (66 of 488) |###                    | Elapsed Time: 0:26:54 ETA:   2:50:05

SPS: 83 Average Reward: -13.775108393750175


 13% (67 of 488) |###                    | Elapsed Time: 0:27:18 ETA:   2:50:01

SPS: 83 Average Reward: -13.860173655282832


 13% (68 of 488) |###                    | Elapsed Time: 0:27:42 ETA:   2:49:49

SPS: 83 Average Reward: -16.54997280877475


 14% (69 of 488) |###                    | Elapsed Time: 0:28:07 ETA:   2:49:18

SPS: 83 Average Reward: -13.320561205232176


 14% (70 of 488) |###                    | Elapsed Time: 0:28:31 ETA:   2:48:29

SPS: 83 Average Reward: -20.799741788043193


 14% (71 of 488) |###                    | Elapsed Time: 0:28:55 ETA:   2:47:33

SPS: 83 Average Reward: -14.64385231022793


 14% (72 of 488) |###                    | Elapsed Time: 0:29:19 ETA:   2:47:29

SPS: 83 Average Reward: -8.25410860921318


 14% (73 of 488) |###                    | Elapsed Time: 0:29:43 ETA:   2:47:39

SPS: 83 Average Reward: -12.49327869923665


 15% (74 of 488) |###                    | Elapsed Time: 0:30:07 ETA:   2:46:56

SPS: 83 Average Reward: -14.079439507041156


 15% (75 of 488) |###                    | Elapsed Time: 0:30:32 ETA:   2:46:35

SPS: 83 Average Reward: -15.986161910200025


 15% (76 of 488) |###                    | Elapsed Time: 0:30:56 ETA:   2:45:54

SPS: 83 Average Reward: -17.472831460118858


 15% (77 of 488) |###                    | Elapsed Time: 0:31:20 ETA:   2:45:29

SPS: 83 Average Reward: -16.45684093852692


 15% (78 of 488) |###                    | Elapsed Time: 0:31:44 ETA:   2:45:27

SPS: 83 Average Reward: -9.277440739945588


 16% (79 of 488) |###                    | Elapsed Time: 0:32:08 ETA:   2:44:54

SPS: 83 Average Reward: -14.221473678422162


 16% (80 of 488) |###                    | Elapsed Time: 0:32:33 ETA:   2:44:14

SPS: 83 Average Reward: -17.82111565904302


 16% (81 of 488) |###                    | Elapsed Time: 0:32:56 ETA:   2:42:21

SPS: 83 Average Reward: -16.292064237754857


 16% (82 of 488) |###                    | Elapsed Time: 0:33:21 ETA:   2:43:40

SPS: 83 Average Reward: -18.55961239799195


 17% (83 of 488) |###                    | Elapsed Time: 0:33:45 ETA:   2:41:47

SPS: 83 Average Reward: -18.164932633855003


 17% (84 of 488) |###                    | Elapsed Time: 0:34:09 ETA:   2:43:06

SPS: 83 Average Reward: -15.080021108536686


 17% (85 of 488) |####                   | Elapsed Time: 0:34:33 ETA:   2:42:18

SPS: 83 Average Reward: -17.958604043564925


 17% (86 of 488) |####                   | Elapsed Time: 0:34:57 ETA:   2:42:10

SPS: 83 Average Reward: -12.906157787395676


 17% (87 of 488) |####                   | Elapsed Time: 0:35:21 ETA:   2:41:46

SPS: 83 Average Reward: -16.128915684760955


 18% (88 of 488) |####                   | Elapsed Time: 0:35:46 ETA:   2:42:27

SPS: 83 Average Reward: -12.158808336671076


 18% (89 of 488) |####                   | Elapsed Time: 0:36:10 ETA:   2:40:43

SPS: 83 Average Reward: -15.443300239234915


 18% (90 of 488) |####                   | Elapsed Time: 0:36:34 ETA:   2:41:07

SPS: 83 Average Reward: -20.184399714987922


 18% (91 of 488) |####                   | Elapsed Time: 0:36:58 ETA:   2:40:22

SPS: 83 Average Reward: -15.74067274789311


 18% (92 of 488) |####                   | Elapsed Time: 0:37:23 ETA:   2:39:05

SPS: 84 Average Reward: -13.984755089061347


 19% (93 of 488) |####                   | Elapsed Time: 0:37:47 ETA:   2:38:59

SPS: 84 Average Reward: -9.662095681590323


 19% (94 of 488) |####                   | Elapsed Time: 0:38:11 ETA:   2:38:44

SPS: 84 Average Reward: -16.489488920234216


 19% (95 of 488) |####                   | Elapsed Time: 0:38:35 ETA:   2:38:17

SPS: 84 Average Reward: -20.776422413585795


 19% (96 of 488) |####                   | Elapsed Time: 0:38:59 ETA:   2:38:06

SPS: 84 Average Reward: -14.601203128280591


 19% (97 of 488) |####                   | Elapsed Time: 0:39:23 ETA:   2:35:13

SPS: 84 Average Reward: -15.940462206268743


 20% (98 of 488) |####                   | Elapsed Time: 0:39:47 ETA:   2:35:58

SPS: 84 Average Reward: -23.786989670960352


 20% (99 of 488) |####                   | Elapsed Time: 0:40:11 ETA:   2:34:52

SPS: 84 Average Reward: -14.507791234358125


 20% (100 of 488) |####                  | Elapsed Time: 0:40:35 ETA:   2:36:50

SPS: 84 Average Reward: -12.84812519714243


 20% (101 of 488) |####                  | Elapsed Time: 0:40:59 ETA:   2:34:33

SPS: 84 Average Reward: -11.534300503859672


 20% (102 of 488) |####                  | Elapsed Time: 0:41:23 ETA:   2:33:19

SPS: 84 Average Reward: -13.145495827650599


 21% (103 of 488) |####                  | Elapsed Time: 0:41:47 ETA:   2:34:56

SPS: 84 Average Reward: -13.128716783419835


 21% (104 of 488) |####                  | Elapsed Time: 0:42:11 ETA:   2:33:51

SPS: 84 Average Reward: -17.104533098791034


 21% (105 of 488) |####                  | Elapsed Time: 0:42:35 ETA:   2:33:28

SPS: 84 Average Reward: -14.258451843020552


 21% (106 of 488) |####                  | Elapsed Time: 0:42:59 ETA:   2:33:45

SPS: 84 Average Reward: -13.553841913095766


 21% (107 of 488) |####                  | Elapsed Time: 0:43:24 ETA:   2:32:55

SPS: 84 Average Reward: -15.492339423992894


 22% (108 of 488) |####                  | Elapsed Time: 0:43:47 ETA:   2:31:39

SPS: 84 Average Reward: -10.746540999848055


 22% (109 of 488) |####                  | Elapsed Time: 0:44:12 ETA:   2:32:10

SPS: 84 Average Reward: -13.707571223032563


 22% (110 of 488) |####                  | Elapsed Time: 0:44:36 ETA:   2:31:28

SPS: 84 Average Reward: -18.685906100132396


 22% (111 of 488) |#####                 | Elapsed Time: 0:45:00 ETA:   2:32:10

SPS: 84 Average Reward: -10.019573554191803


 22% (112 of 488) |#####                 | Elapsed Time: 0:45:24 ETA:   2:31:22

SPS: 84 Average Reward: -13.39317638010231


 23% (113 of 488) |#####                 | Elapsed Time: 0:45:48 ETA:   2:31:14

SPS: 84 Average Reward: -15.52472701194689


 23% (114 of 488) |#####                 | Elapsed Time: 0:46:12 ETA:   2:31:14

SPS: 84 Average Reward: -10.066444645767307


 23% (115 of 488) |#####                 | Elapsed Time: 0:46:37 ETA:   2:30:02

SPS: 84 Average Reward: -12.480510059874367


 23% (116 of 488) |#####                 | Elapsed Time: 0:47:01 ETA:   2:30:41

SPS: 84 Average Reward: -13.231801688400253


 23% (117 of 488) |#####                 | Elapsed Time: 0:47:25 ETA:   2:28:36

SPS: 84 Average Reward: -12.316016227895071


 24% (118 of 488) |#####                 | Elapsed Time: 0:47:49 ETA:   2:29:14

SPS: 84 Average Reward: -17.050428214411628


 24% (119 of 488) |#####                 | Elapsed Time: 0:48:13 ETA:   2:28:28

SPS: 84 Average Reward: -15.209903504399787


 24% (120 of 488) |#####                 | Elapsed Time: 0:48:37 ETA:   2:28:14

SPS: 84 Average Reward: -14.669392492080073


 24% (121 of 488) |#####                 | Elapsed Time: 0:49:02 ETA:   2:27:48

SPS: 84 Average Reward: -15.466115884044157


 25% (122 of 488) |#####                 | Elapsed Time: 0:49:26 ETA:   2:27:37

SPS: 84 Average Reward: -10.920659941910785


 25% (123 of 488) |#####                 | Elapsed Time: 0:49:50 ETA:   2:27:07

SPS: 84 Average Reward: -13.825593779703897


 25% (124 of 488) |#####                 | Elapsed Time: 0:50:14 ETA:   2:26:54

SPS: 84 Average Reward: -16.606517408092266


 25% (125 of 488) |#####                 | Elapsed Time: 0:50:38 ETA:   2:26:22

SPS: 84 Average Reward: -18.462969565573584


 25% (126 of 488) |#####                 | Elapsed Time: 0:51:02 ETA:   2:25:17

SPS: 84 Average Reward: -14.524742376557358


 26% (127 of 488) |#####                 | Elapsed Time: 0:51:27 ETA:   2:25:22

SPS: 84 Average Reward: -12.37760931579033


 26% (128 of 488) |#####                 | Elapsed Time: 0:51:51 ETA:   2:25:12

SPS: 84 Average Reward: -13.090279730967207


 26% (129 of 488) |#####                 | Elapsed Time: 0:52:15 ETA:   2:24:07

SPS: 84 Average Reward: -16.208228543461882


 26% (130 of 488) |#####                 | Elapsed Time: 0:52:39 ETA:   2:23:18

SPS: 84 Average Reward: -11.765903459185118


 26% (131 of 488) |#####                 | Elapsed Time: 0:53:03 ETA:   2:23:49

SPS: 84 Average Reward: -16.962787102386784


 27% (132 of 488) |#####                 | Elapsed Time: 0:53:27 ETA:   2:22:41

SPS: 84 Average Reward: -17.193689021370876


 27% (133 of 488) |#####                 | Elapsed Time: 0:53:51 ETA:   2:23:22

SPS: 84 Average Reward: -12.924610389895236


 27% (134 of 488) |######                | Elapsed Time: 0:54:16 ETA:   2:22:36

SPS: 84 Average Reward: -8.751864369811965


 27% (135 of 488) |######                | Elapsed Time: 0:54:40 ETA:   2:21:18

SPS: 84 Average Reward: -12.26570445788263


 27% (136 of 488) |######                | Elapsed Time: 0:55:04 ETA:   2:21:26

SPS: 84 Average Reward: -11.863633444252308


 28% (137 of 488) |######                | Elapsed Time: 0:55:28 ETA:   2:20:57

SPS: 84 Average Reward: -8.116475860507594


 28% (138 of 488) |######                | Elapsed Time: 0:55:52 ETA:   2:20:24

SPS: 84 Average Reward: -10.952295366033452


 28% (139 of 488) |######                | Elapsed Time: 0:56:16 ETA:   2:20:59

SPS: 84 Average Reward: -11.812892761799048


 28% (140 of 488) |######                | Elapsed Time: 0:56:40 ETA:   2:19:51

SPS: 84 Average Reward: -13.39879604793969


 28% (141 of 488) |######                | Elapsed Time: 0:57:04 ETA:   2:19:53

SPS: 84 Average Reward: -14.5694390192759


 29% (142 of 488) |######                | Elapsed Time: 0:57:28 ETA:   2:18:54

SPS: 84 Average Reward: -11.34671246412303


 29% (143 of 488) |######                | Elapsed Time: 0:57:53 ETA:   2:19:04

SPS: 84 Average Reward: -16.42983218520471


 29% (144 of 488) |######                | Elapsed Time: 0:58:17 ETA:   2:19:37

SPS: 84 Average Reward: -15.596702929632096


 29% (145 of 488) |######                | Elapsed Time: 0:58:41 ETA:   2:17:01

SPS: 84 Average Reward: -10.622820745427497


 29% (146 of 488) |######                | Elapsed Time: 0:59:05 ETA:   2:17:23

SPS: 84 Average Reward: -12.14704367441396


 30% (147 of 488) |######                | Elapsed Time: 0:59:29 ETA:   2:17:23

SPS: 84 Average Reward: -14.23542105331289


 30% (148 of 488) |######                | Elapsed Time: 0:59:53 ETA:   2:16:53

SPS: 84 Average Reward: -9.126349649013134


 30% (149 of 488) |######                | Elapsed Time: 1:00:18 ETA:   2:16:24

SPS: 84 Average Reward: -12.990072541461638


 30% (150 of 488) |######                | Elapsed Time: 1:00:42 ETA:   2:15:54

SPS: 84 Average Reward: -17.56370582670039


 30% (151 of 488) |######                | Elapsed Time: 1:01:06 ETA:   2:15:26

SPS: 84 Average Reward: -9.42673410725404


 31% (152 of 488) |######                | Elapsed Time: 1:01:30 ETA:   2:15:19

SPS: 84 Average Reward: -9.506947651182328


 31% (153 of 488) |######                | Elapsed Time: 1:01:54 ETA:   2:14:54

SPS: 84 Average Reward: -11.083758787605998


 31% (154 of 488) |######                | Elapsed Time: 1:02:18 ETA:   2:13:36

SPS: 84 Average Reward: -11.190564838509731


 31% (155 of 488) |######                | Elapsed Time: 1:02:42 ETA:   2:14:12

SPS: 84 Average Reward: -9.215674900715467


 31% (156 of 488) |#######               | Elapsed Time: 1:03:06 ETA:   2:13:02

SPS: 84 Average Reward: -13.110603744335322


 32% (157 of 488) |#######               | Elapsed Time: 1:03:30 ETA:   2:12:34

SPS: 84 Average Reward: -10.346722515592786


 32% (158 of 488) |#######               | Elapsed Time: 1:03:54 ETA:   2:11:53

SPS: 84 Average Reward: -9.7741974343398


 32% (159 of 488) |#######               | Elapsed Time: 1:04:19 ETA:   2:12:21

SPS: 84 Average Reward: -13.937644948783287


 32% (160 of 488) |#######               | Elapsed Time: 1:04:43 ETA:   2:11:50

SPS: 84 Average Reward: -8.247448734369968


 32% (161 of 488) |#######               | Elapsed Time: 1:05:07 ETA:   2:11:08

SPS: 84 Average Reward: -10.781131058849727


 33% (162 of 488) |#######               | Elapsed Time: 1:05:31 ETA:   2:11:02

SPS: 84 Average Reward: -10.908386393862486


 33% (163 of 488) |#######               | Elapsed Time: 1:05:55 ETA:   2:10:24

SPS: 84 Average Reward: -11.35995841290034


 33% (164 of 488) |#######               | Elapsed Time: 1:06:19 ETA:   2:10:03

SPS: 84 Average Reward: -14.228304770053454


 33% (165 of 488) |#######               | Elapsed Time: 1:06:43 ETA:   2:09:59

SPS: 84 Average Reward: -10.24251021893742


 34% (166 of 488) |#######               | Elapsed Time: 1:07:07 ETA:   2:09:19

SPS: 84 Average Reward: -14.493306515429527


 34% (167 of 488) |#######               | Elapsed Time: 1:07:31 ETA:   2:09:32

SPS: 84 Average Reward: -11.330837015389466


 34% (168 of 488) |#######               | Elapsed Time: 1:07:56 ETA:   2:08:44

SPS: 84 Average Reward: -10.336075082467126


 34% (169 of 488) |#######               | Elapsed Time: 1:08:20 ETA:   2:07:44

SPS: 84 Average Reward: -13.14727524113056


 34% (170 of 488) |#######               | Elapsed Time: 1:08:44 ETA:   2:07:36

SPS: 84 Average Reward: -11.898049105834016


 35% (171 of 488) |#######               | Elapsed Time: 1:09:08 ETA:   2:07:01

SPS: 84 Average Reward: -12.50384515223576


 35% (172 of 488) |#######               | Elapsed Time: 1:09:32 ETA:   2:05:57

SPS: 84 Average Reward: -11.00618708691042


 35% (173 of 488) |#######               | Elapsed Time: 1:09:56 ETA:   2:06:38

SPS: 84 Average Reward: -11.341806398758745


 35% (174 of 488) |#######               | Elapsed Time: 1:10:20 ETA:   2:05:51

SPS: 84 Average Reward: -11.539463646039007


 35% (175 of 488) |#######               | Elapsed Time: 1:10:44 ETA:   2:05:48

SPS: 84 Average Reward: -11.46638346259546


 36% (176 of 488) |#######               | Elapsed Time: 1:11:08 ETA:   2:05:11

SPS: 84 Average Reward: -10.393364485083955


 36% (177 of 488) |#######               | Elapsed Time: 1:11:32 ETA:   2:04:25

SPS: 84 Average Reward: -12.195390203595759


 36% (178 of 488) |########              | Elapsed Time: 1:11:56 ETA:   2:03:20

SPS: 84 Average Reward: -10.062773772832537


 36% (179 of 488) |########              | Elapsed Time: 1:12:20 ETA:   2:02:54

SPS: 84 Average Reward: -9.530728168462698


 36% (180 of 488) |########              | Elapsed Time: 1:12:44 ETA:   2:03:30

SPS: 84 Average Reward: -12.285707253038048


 37% (181 of 488) |########              | Elapsed Time: 1:13:08 ETA:   2:03:14

SPS: 84 Average Reward: -11.955059302996782


 37% (182 of 488) |########              | Elapsed Time: 1:13:32 ETA:   2:03:38

SPS: 84 Average Reward: -13.518092597962598


 37% (183 of 488) |########              | Elapsed Time: 1:13:56 ETA:   2:02:58

SPS: 84 Average Reward: -13.492127078577283


 37% (184 of 488) |########              | Elapsed Time: 1:14:20 ETA:   2:02:25

SPS: 84 Average Reward: -11.714685302489775


 37% (185 of 488) |########              | Elapsed Time: 1:14:45 ETA:   2:01:39

SPS: 84 Average Reward: -11.842166891659856


 38% (186 of 488) |########              | Elapsed Time: 1:15:09 ETA:   2:01:34

SPS: 84 Average Reward: -11.623166322793594


 38% (187 of 488) |########              | Elapsed Time: 1:15:33 ETA:   2:00:43

SPS: 84 Average Reward: -11.610781226773518


 38% (188 of 488) |########              | Elapsed Time: 1:15:57 ETA:   2:00:42

SPS: 84 Average Reward: -12.415831828191848


 38% (189 of 488) |########              | Elapsed Time: 1:16:21 ETA:   2:00:33

SPS: 84 Average Reward: -15.955228284420542


 38% (190 of 488) |########              | Elapsed Time: 1:16:45 ETA:   2:00:14

SPS: 84 Average Reward: -10.808059566881377


 39% (191 of 488) |########              | Elapsed Time: 1:17:09 ETA:   1:59:33

SPS: 84 Average Reward: -11.209156305965296


 39% (192 of 488) |########              | Elapsed Time: 1:17:34 ETA:   1:58:49

SPS: 84 Average Reward: -10.610172056529336


 39% (193 of 488) |########              | Elapsed Time: 1:17:58 ETA:   1:58:57

SPS: 84 Average Reward: -7.077181343402688


 39% (194 of 488) |########              | Elapsed Time: 1:18:22 ETA:   1:57:58

SPS: 84 Average Reward: -10.960333419800014


 39% (195 of 488) |########              | Elapsed Time: 1:18:46 ETA:   1:57:31

SPS: 84 Average Reward: -10.684198844451062


 40% (196 of 488) |########              | Elapsed Time: 1:19:10 ETA:   1:57:15

SPS: 84 Average Reward: -8.591007447882488


 40% (197 of 488) |########              | Elapsed Time: 1:19:34 ETA:   1:56:15

SPS: 84 Average Reward: -15.171328928369675


 40% (198 of 488) |########              | Elapsed Time: 1:19:58 ETA:   1:56:23

SPS: 84 Average Reward: -11.497625338583111


 40% (199 of 488) |########              | Elapsed Time: 1:20:22 ETA:   1:56:34

SPS: 84 Average Reward: -10.029625185951621


 40% (200 of 488) |#########             | Elapsed Time: 1:20:46 ETA:   1:56:04

SPS: 84 Average Reward: -10.318359234432053


 41% (201 of 488) |#########             | Elapsed Time: 1:21:11 ETA:   1:55:36

SPS: 84 Average Reward: -10.331692635399323


 41% (202 of 488) |#########             | Elapsed Time: 1:21:35 ETA:   1:54:46

SPS: 84 Average Reward: -11.670226225585273


 41% (203 of 488) |#########             | Elapsed Time: 1:21:59 ETA:   1:55:12

SPS: 84 Average Reward: -10.851846428660606


 41% (204 of 488) |#########             | Elapsed Time: 1:22:23 ETA:   1:54:29

SPS: 84 Average Reward: -13.72023049454869


 42% (205 of 488) |#########             | Elapsed Time: 1:22:47 ETA:   1:53:42

SPS: 84 Average Reward: -14.191748446513945


 42% (206 of 488) |#########             | Elapsed Time: 1:23:11 ETA:   1:53:04

SPS: 84 Average Reward: -16.048821586252544


 42% (207 of 488) |#########             | Elapsed Time: 1:23:35 ETA:   1:52:38

SPS: 84 Average Reward: -13.240496837439954


 42% (208 of 488) |#########             | Elapsed Time: 1:23:59 ETA:   1:52:17

SPS: 84 Average Reward: -10.973758664772282


 42% (209 of 488) |#########             | Elapsed Time: 1:24:24 ETA:   1:52:14

SPS: 84 Average Reward: -10.160655576263144


 43% (210 of 488) |#########             | Elapsed Time: 1:24:48 ETA:   1:51:28

SPS: 84 Average Reward: -13.015825354784823


 43% (211 of 488) |#########             | Elapsed Time: 1:25:12 ETA:   1:51:38

SPS: 84 Average Reward: -12.747347234976223


 43% (212 of 488) |#########             | Elapsed Time: 1:25:36 ETA:   1:50:49

SPS: 84 Average Reward: -11.234154482582973


 43% (213 of 488) |#########             | Elapsed Time: 1:26:00 ETA:   1:49:54

SPS: 84 Average Reward: -13.000743359022042


 43% (214 of 488) |#########             | Elapsed Time: 1:26:24 ETA:   1:49:59

SPS: 84 Average Reward: -14.41213840931367


 44% (215 of 488) |#########             | Elapsed Time: 1:26:48 ETA:   1:49:25

SPS: 84 Average Reward: -9.885646853139784


 44% (216 of 488) |#########             | Elapsed Time: 1:27:12 ETA:   1:49:24

SPS: 84 Average Reward: -9.355997819897818


 44% (217 of 488) |#########             | Elapsed Time: 1:27:36 ETA:   1:48:41

SPS: 84 Average Reward: -7.26374418702716


 44% (218 of 488) |#########             | Elapsed Time: 1:28:00 ETA:   1:48:10

SPS: 84 Average Reward: -12.261820757602392


 44% (219 of 488) |#########             | Elapsed Time: 1:28:24 ETA:   1:48:18

SPS: 84 Average Reward: -7.127195656612669


 45% (220 of 488) |#########             | Elapsed Time: 1:28:49 ETA:   1:47:42

SPS: 84 Average Reward: -8.373563761596873


 45% (221 of 488) |#########             | Elapsed Time: 1:29:13 ETA:   1:47:14

SPS: 84 Average Reward: -11.14430895315305


 45% (222 of 488) |##########            | Elapsed Time: 1:29:37 ETA:   1:46:13

SPS: 84 Average Reward: -14.81882767668713


 45% (223 of 488) |##########            | Elapsed Time: 1:30:01 ETA:   1:46:27

SPS: 84 Average Reward: -12.787264303079194


 45% (224 of 488) |##########            | Elapsed Time: 1:30:25 ETA:   1:46:00

SPS: 84 Average Reward: -9.432201823504963


 46% (225 of 488) |##########            | Elapsed Time: 1:30:49 ETA:   1:45:57

SPS: 84 Average Reward: -11.557981838473047


 46% (226 of 488) |##########            | Elapsed Time: 1:31:13 ETA:   1:44:07

SPS: 84 Average Reward: -6.729271144636378


 46% (227 of 488) |##########            | Elapsed Time: 1:31:37 ETA:   1:45:06

SPS: 84 Average Reward: -9.392374710211657


 46% (228 of 488) |##########            | Elapsed Time: 1:32:01 ETA:   1:44:08

SPS: 84 Average Reward: -12.11668884268632


 46% (229 of 488) |##########            | Elapsed Time: 1:32:25 ETA:   1:44:25

SPS: 84 Average Reward: -6.400989588445613


 47% (230 of 488) |##########            | Elapsed Time: 1:32:49 ETA:   1:43:26

SPS: 84 Average Reward: -10.063387033835104


 47% (231 of 488) |##########            | Elapsed Time: 1:33:13 ETA:   1:43:15

SPS: 84 Average Reward: -8.959489700180098


 47% (232 of 488) |##########            | Elapsed Time: 1:33:37 ETA:   1:42:59

SPS: 84 Average Reward: -9.785132364427268


 47% (233 of 488) |##########            | Elapsed Time: 1:34:02 ETA:   1:43:04

SPS: 84 Average Reward: -9.425484757097987


 47% (234 of 488) |##########            | Elapsed Time: 1:34:26 ETA:   1:42:14

SPS: 84 Average Reward: -9.702737191509765


 48% (235 of 488) |##########            | Elapsed Time: 1:34:50 ETA:   1:41:24

SPS: 84 Average Reward: -6.11165821779878


 48% (236 of 488) |##########            | Elapsed Time: 1:35:14 ETA:   1:41:00

SPS: 84 Average Reward: -9.832725427785556


 48% (237 of 488) |##########            | Elapsed Time: 1:35:38 ETA:   1:41:03

SPS: 84 Average Reward: -11.24799618516108


 48% (238 of 488) |##########            | Elapsed Time: 1:36:02 ETA:   1:40:31

SPS: 84 Average Reward: -7.3559376497668065


 48% (239 of 488) |##########            | Elapsed Time: 1:36:26 ETA:   1:39:39

SPS: 84 Average Reward: -10.328542749344019


 49% (240 of 488) |##########            | Elapsed Time: 1:36:51 ETA:   1:40:11

SPS: 84 Average Reward: -11.884716767760935


 49% (241 of 488) |##########            | Elapsed Time: 1:37:15 ETA:   1:39:16

SPS: 84 Average Reward: -11.275283290475599


 49% (242 of 488) |##########            | Elapsed Time: 1:37:39 ETA:   1:38:43

SPS: 84 Average Reward: -9.079996736307173


 49% (243 of 488) |##########            | Elapsed Time: 1:38:03 ETA:   1:38:31

SPS: 84 Average Reward: -7.031934391776474


 50% (244 of 488) |###########           | Elapsed Time: 1:38:27 ETA:   1:38:00

SPS: 84 Average Reward: -9.174849317238737


 50% (245 of 488) |###########           | Elapsed Time: 1:38:51 ETA:   1:37:44

SPS: 84 Average Reward: -8.74995998408832


 50% (246 of 488) |###########           | Elapsed Time: 1:39:15 ETA:   1:37:32

SPS: 84 Average Reward: -10.699063600542077


 50% (247 of 488) |###########           | Elapsed Time: 1:39:39 ETA:   1:36:47

SPS: 84 Average Reward: -13.41309199914722


 50% (248 of 488) |###########           | Elapsed Time: 1:40:04 ETA:   1:36:34

SPS: 84 Average Reward: -9.677389211902398


 51% (249 of 488) |###########           | Elapsed Time: 1:40:28 ETA:   1:35:57

SPS: 84 Average Reward: -6.203481366189732


 51% (250 of 488) |###########           | Elapsed Time: 1:40:52 ETA:   1:36:03

SPS: 84 Average Reward: -12.253128325890168


 51% (251 of 488) |###########           | Elapsed Time: 1:41:16 ETA:   1:35:30

SPS: 84 Average Reward: -6.700715843829866


 51% (252 of 488) |###########           | Elapsed Time: 1:41:40 ETA:   1:35:14

SPS: 84 Average Reward: -7.178202810044863


 51% (253 of 488) |###########           | Elapsed Time: 1:42:04 ETA:   1:34:39

SPS: 84 Average Reward: -9.800345056134814


 52% (254 of 488) |###########           | Elapsed Time: 1:42:28 ETA:   1:33:44

SPS: 84 Average Reward: -11.164287424925057


 52% (255 of 488) |###########           | Elapsed Time: 1:42:52 ETA:   1:33:32

SPS: 84 Average Reward: -7.875878348488162


 52% (256 of 488) |###########           | Elapsed Time: 1:43:17 ETA:   1:33:05

SPS: 84 Average Reward: -12.298852466829995


 52% (257 of 488) |###########           | Elapsed Time: 1:43:41 ETA:   1:33:04

SPS: 84 Average Reward: -10.770323811729122


 52% (258 of 488) |###########           | Elapsed Time: 1:44:05 ETA:   1:32:34

SPS: 84 Average Reward: -7.7628638333870255


 53% (259 of 488) |###########           | Elapsed Time: 1:44:29 ETA:   1:32:03

SPS: 84 Average Reward: -11.52681999360061


 53% (260 of 488) |###########           | Elapsed Time: 1:44:53 ETA:   1:31:38

SPS: 84 Average Reward: -7.983973976559175


 53% (261 of 488) |###########           | Elapsed Time: 1:45:17 ETA:   1:31:11

SPS: 84 Average Reward: -10.572701852817003


 53% (262 of 488) |###########           | Elapsed Time: 1:45:41 ETA:   1:31:20

SPS: 84 Average Reward: -8.82728402043222


 53% (263 of 488) |###########           | Elapsed Time: 1:46:06 ETA:   1:30:35

SPS: 84 Average Reward: -9.878470719129714


 54% (264 of 488) |###########           | Elapsed Time: 1:46:30 ETA:   1:30:28

SPS: 84 Average Reward: -5.512791736464486


 54% (265 of 488) |###########           | Elapsed Time: 1:46:54 ETA:   1:29:32

SPS: 84 Average Reward: -7.467418838328844


 54% (266 of 488) |###########           | Elapsed Time: 1:47:18 ETA:   1:29:13

SPS: 84 Average Reward: -12.484748061155395


 54% (267 of 488) |############          | Elapsed Time: 1:47:42 ETA:   1:29:01

SPS: 84 Average Reward: -9.009501522741754


 54% (268 of 488) |############          | Elapsed Time: 1:48:06 ETA:   1:28:32

SPS: 84 Average Reward: -12.329649758607575


 55% (269 of 488) |############          | Elapsed Time: 1:48:31 ETA:   1:28:18

SPS: 84 Average Reward: -11.190500539650245


 55% (270 of 488) |############          | Elapsed Time: 1:48:55 ETA:   1:28:04

SPS: 84 Average Reward: -11.996191536289093


 55% (271 of 488) |############          | Elapsed Time: 1:49:19 ETA:   1:27:01

SPS: 84 Average Reward: -8.180714136548298


 55% (272 of 488) |############          | Elapsed Time: 1:49:43 ETA:   1:26:57

SPS: 84 Average Reward: -8.679818322710116


 55% (273 of 488) |############          | Elapsed Time: 1:50:07 ETA:   1:26:31

SPS: 84 Average Reward: -9.522645658918442


 56% (274 of 488) |############          | Elapsed Time: 1:50:31 ETA:   1:26:04

SPS: 84 Average Reward: -8.083826565140846


 56% (275 of 488) |############          | Elapsed Time: 1:50:55 ETA:   1:25:20

SPS: 84 Average Reward: -10.93627416322532


 56% (276 of 488) |############          | Elapsed Time: 1:51:20 ETA:   1:25:22

SPS: 84 Average Reward: -8.408601927163561


 56% (277 of 488) |############          | Elapsed Time: 1:51:44 ETA:   1:24:18

SPS: 84 Average Reward: -9.407735699749585


 56% (278 of 488) |############          | Elapsed Time: 1:52:08 ETA:   1:24:25

SPS: 84 Average Reward: -12.310481892211964


 57% (279 of 488) |############          | Elapsed Time: 1:52:32 ETA:   1:24:17

SPS: 84 Average Reward: -8.65143920270166


 57% (280 of 488) |############          | Elapsed Time: 1:52:56 ETA:   1:23:36

SPS: 84 Average Reward: -8.582980576595507


 57% (281 of 488) |############          | Elapsed Time: 1:53:20 ETA:   1:23:06

SPS: 84 Average Reward: -8.970192477158783


 57% (282 of 488) |############          | Elapsed Time: 1:53:44 ETA:   1:22:31

SPS: 84 Average Reward: -9.714277575655034


 57% (283 of 488) |############          | Elapsed Time: 1:54:08 ETA:   1:22:46

SPS: 84 Average Reward: -7.999809356115828


 58% (284 of 488) |############          | Elapsed Time: 1:54:33 ETA:   1:22:28

SPS: 84 Average Reward: -9.129524509449007


 58% (285 of 488) |############          | Elapsed Time: 1:54:57 ETA:   1:21:36

SPS: 84 Average Reward: -8.658533789433186


 58% (286 of 488) |############          | Elapsed Time: 1:55:21 ETA:   1:21:18

SPS: 84 Average Reward: -7.985018480510896


 58% (287 of 488) |############          | Elapsed Time: 1:55:45 ETA:   1:21:06

SPS: 84 Average Reward: -11.072259548320508


 59% (288 of 488) |############          | Elapsed Time: 1:56:09 ETA:   1:20:29

SPS: 84 Average Reward: -9.395604461254319


 59% (289 of 488) |#############         | Elapsed Time: 1:56:33 ETA:   1:19:42

SPS: 84 Average Reward: -8.718081414319856


 59% (290 of 488) |#############         | Elapsed Time: 1:56:57 ETA:   1:19:51

SPS: 84 Average Reward: -8.536402348442556


 59% (291 of 488) |#############         | Elapsed Time: 1:57:22 ETA:   1:19:29

SPS: 84 Average Reward: -8.664802982129789


 59% (292 of 488) |#############         | Elapsed Time: 1:57:46 ETA:   1:18:33

SPS: 84 Average Reward: -8.776007084185249


 60% (293 of 488) |#############         | Elapsed Time: 1:58:10 ETA:   1:18:32

SPS: 84 Average Reward: -8.323145631300505


 60% (294 of 488) |#############         | Elapsed Time: 1:58:34 ETA:   1:17:54

SPS: 84 Average Reward: -7.99193003817272


 60% (295 of 488) |#############         | Elapsed Time: 1:58:58 ETA:   1:17:34

SPS: 84 Average Reward: -8.307451745147398


 60% (296 of 488) |#############         | Elapsed Time: 1:59:22 ETA:   1:16:28

SPS: 84 Average Reward: -8.857920772964436


 60% (297 of 488) |#############         | Elapsed Time: 1:59:46 ETA:   1:16:37

SPS: 84 Average Reward: -8.884388567312563


 61% (298 of 488) |#############         | Elapsed Time: 2:00:10 ETA:   1:16:27

SPS: 84 Average Reward: -8.961200155898771


 61% (299 of 488) |#############         | Elapsed Time: 2:00:34 ETA:   1:16:04

SPS: 84 Average Reward: -7.152758983565024


 61% (300 of 488) |#############         | Elapsed Time: 2:00:58 ETA:   1:15:39

SPS: 84 Average Reward: -8.64323405795984


 61% (301 of 488) |#############         | Elapsed Time: 2:01:22 ETA:   1:14:45

SPS: 84 Average Reward: -8.587986750655501


 61% (302 of 488) |#############         | Elapsed Time: 2:01:47 ETA:   1:14:53

SPS: 84 Average Reward: -7.997904281306068


 62% (303 of 488) |#############         | Elapsed Time: 2:02:11 ETA:   1:14:17

SPS: 84 Average Reward: -7.988506441960173


 62% (304 of 488) |#############         | Elapsed Time: 2:02:35 ETA:   1:14:10

SPS: 84 Average Reward: -10.467973856281773


 62% (305 of 488) |#############         | Elapsed Time: 2:02:59 ETA:   1:13:37

SPS: 84 Average Reward: -8.185913342507463


 62% (306 of 488) |#############         | Elapsed Time: 2:03:23 ETA:   1:13:05

SPS: 84 Average Reward: -7.86030746083664


 62% (307 of 488) |#############         | Elapsed Time: 2:03:47 ETA:   1:12:51

SPS: 84 Average Reward: -8.049106027184694


 63% (308 of 488) |#############         | Elapsed Time: 2:04:12 ETA:   1:12:42

SPS: 84 Average Reward: -8.327208254487282


 63% (309 of 488) |#############         | Elapsed Time: 2:04:36 ETA:   1:11:59

SPS: 84 Average Reward: -8.466111431074328


 63% (310 of 488) |#############         | Elapsed Time: 2:05:00 ETA:   1:11:16

SPS: 84 Average Reward: -8.580371826214172


 63% (311 of 488) |##############        | Elapsed Time: 2:05:24 ETA:   1:10:47

SPS: 84 Average Reward: -8.295008045586204


 63% (312 of 488) |##############        | Elapsed Time: 2:05:48 ETA:   1:10:50

SPS: 84 Average Reward: -8.2207222800689


 64% (313 of 488) |##############        | Elapsed Time: 2:06:12 ETA:   1:10:32

SPS: 84 Average Reward: -7.77618793437765


 64% (314 of 488) |##############        | Elapsed Time: 2:06:36 ETA:   1:10:01

SPS: 84 Average Reward: -7.855196186330574


 64% (315 of 488) |##############        | Elapsed Time: 2:07:00 ETA:   1:09:20

SPS: 84 Average Reward: -8.825681531866994


 64% (316 of 488) |##############        | Elapsed Time: 2:07:24 ETA:   1:08:49

SPS: 84 Average Reward: -6.5857541357658835


 64% (317 of 488) |##############        | Elapsed Time: 2:07:48 ETA:   1:08:53

SPS: 84 Average Reward: -8.518769918480647


 65% (318 of 488) |##############        | Elapsed Time: 2:08:13 ETA:   1:08:25

SPS: 84 Average Reward: -8.191551920179501


 65% (319 of 488) |##############        | Elapsed Time: 2:08:37 ETA:   1:08:05

SPS: 84 Average Reward: -10.06396891860645


 65% (320 of 488) |##############        | Elapsed Time: 2:09:01 ETA:   1:07:39

SPS: 84 Average Reward: -7.974280491056484


 65% (321 of 488) |##############        | Elapsed Time: 2:09:25 ETA:   1:07:13

SPS: 84 Average Reward: -8.1201036997299


 65% (322 of 488) |##############        | Elapsed Time: 2:09:49 ETA:   1:06:52

SPS: 84 Average Reward: -8.363475740580384


 66% (323 of 488) |##############        | Elapsed Time: 2:10:13 ETA:   1:06:33

SPS: 84 Average Reward: -8.58269526663411


 66% (324 of 488) |##############        | Elapsed Time: 2:10:38 ETA:   1:05:58

SPS: 84 Average Reward: -10.742646693776232


 66% (325 of 488) |##############        | Elapsed Time: 2:11:02 ETA:   1:05:55

SPS: 84 Average Reward: -7.75381040219684


 66% (326 of 488) |##############        | Elapsed Time: 2:11:26 ETA:   1:05:14

SPS: 84 Average Reward: -6.046342158984002


 67% (327 of 488) |##############        | Elapsed Time: 2:11:50 ETA:   1:04:43

SPS: 84 Average Reward: -10.211848243272238


 67% (328 of 488) |##############        | Elapsed Time: 2:12:14 ETA:   1:04:04

SPS: 84 Average Reward: -9.404852028130229


 67% (329 of 488) |##############        | Elapsed Time: 2:12:38 ETA:   1:03:52

SPS: 84 Average Reward: -8.420333740400224


 67% (330 of 488) |##############        | Elapsed Time: 2:13:02 ETA:   1:03:36

SPS: 84 Average Reward: -6.999519684029864


 67% (331 of 488) |##############        | Elapsed Time: 2:13:27 ETA:   1:03:05

SPS: 84 Average Reward: -9.711053865247717


 68% (332 of 488) |##############        | Elapsed Time: 2:13:51 ETA:   1:02:49

SPS: 84 Average Reward: -8.713834296917918


 68% (333 of 488) |###############       | Elapsed Time: 2:14:15 ETA:   1:02:12

SPS: 84 Average Reward: -7.333814474854503


 68% (334 of 488) |###############       | Elapsed Time: 2:14:39 ETA:   1:01:19

SPS: 84 Average Reward: -9.974129009751241


 68% (335 of 488) |###############       | Elapsed Time: 2:15:03 ETA:   1:01:33

SPS: 84 Average Reward: -7.715149078172984


 68% (336 of 488) |###############       | Elapsed Time: 2:15:27 ETA:   1:01:27

SPS: 84 Average Reward: -8.433876402974182


 69% (337 of 488) |###############       | Elapsed Time: 2:15:51 ETA:   1:00:44

SPS: 84 Average Reward: -8.6621506432026


 69% (338 of 488) |###############       | Elapsed Time: 2:16:15 ETA:   1:00:21

SPS: 84 Average Reward: -7.9786147749284115


 69% (339 of 488) |###############       | Elapsed Time: 2:16:40 ETA:   1:00:05

SPS: 84 Average Reward: -8.052418125907808


 69% (340 of 488) |###############       | Elapsed Time: 2:17:04 ETA:   0:59:42

SPS: 84 Average Reward: -7.604460468690867


 69% (341 of 488) |###############       | Elapsed Time: 2:17:28 ETA:   0:59:09

SPS: 84 Average Reward: -7.643215760007019


 70% (342 of 488) |###############       | Elapsed Time: 2:17:52 ETA:   0:58:10

SPS: 84 Average Reward: -7.985046232611192


 70% (343 of 488) |###############       | Elapsed Time: 2:18:16 ETA:   0:58:28

SPS: 84 Average Reward: -7.862318950482692


 70% (344 of 488) |###############       | Elapsed Time: 2:18:40 ETA:   0:58:08

SPS: 84 Average Reward: -7.488336339310143


 70% (345 of 488) |###############       | Elapsed Time: 2:19:04 ETA:   0:57:43

SPS: 84 Average Reward: -6.3240302529192265


 70% (346 of 488) |###############       | Elapsed Time: 2:19:29 ETA:   0:57:12

SPS: 84 Average Reward: -9.90927600526266


 71% (347 of 488) |###############       | Elapsed Time: 2:19:53 ETA:   0:56:52

SPS: 84 Average Reward: -7.891745553979358


 71% (348 of 488) |###############       | Elapsed Time: 2:20:17 ETA:   0:56:32

SPS: 84 Average Reward: -8.03121257444985


 71% (349 of 488) |###############       | Elapsed Time: 2:20:41 ETA:   0:55:57

SPS: 84 Average Reward: -7.714335747410881


 71% (350 of 488) |###############       | Elapsed Time: 2:21:05 ETA:   0:55:31

SPS: 84 Average Reward: -7.996947512009411


 71% (351 of 488) |###############       | Elapsed Time: 2:21:30 ETA:   0:55:14

SPS: 84 Average Reward: -7.8221842437388105


 72% (352 of 488) |###############       | Elapsed Time: 2:21:54 ETA:   0:54:55

SPS: 84 Average Reward: -7.474950766520529


 72% (353 of 488) |###############       | Elapsed Time: 2:22:18 ETA:   0:54:27

SPS: 84 Average Reward: -7.8745698495011


 72% (354 of 488) |###############       | Elapsed Time: 2:22:42 ETA:   0:54:08

SPS: 84 Average Reward: -7.5299786420785235


 72% (355 of 488) |################      | Elapsed Time: 2:23:06 ETA:   0:53:31

SPS: 84 Average Reward: -8.215450468303551


 72% (356 of 488) |################      | Elapsed Time: 2:23:30 ETA:   0:52:47

SPS: 84 Average Reward: -7.842984129280559


 73% (357 of 488) |################      | Elapsed Time: 2:23:54 ETA:   0:52:44

SPS: 84 Average Reward: -11.5052797795867


 73% (358 of 488) |################      | Elapsed Time: 2:24:19 ETA:   0:52:15

SPS: 84 Average Reward: -7.386373978292941


 73% (359 of 488) |################      | Elapsed Time: 2:24:43 ETA:   0:51:46

SPS: 84 Average Reward: -7.967289093818223


 73% (360 of 488) |################      | Elapsed Time: 2:25:07 ETA:   0:51:32

SPS: 84 Average Reward: -6.02988821341172


 73% (361 of 488) |################      | Elapsed Time: 2:25:31 ETA:   0:51:15

SPS: 84 Average Reward: -9.278808061105671


 74% (362 of 488) |################      | Elapsed Time: 2:25:55 ETA:   0:50:34

SPS: 84 Average Reward: -7.69311211249925


 74% (363 of 488) |################      | Elapsed Time: 2:26:19 ETA:   0:50:16

SPS: 84 Average Reward: -7.2165611226763975


 74% (364 of 488) |################      | Elapsed Time: 2:26:43 ETA:   0:49:55

SPS: 84 Average Reward: -7.287714777834611


 74% (365 of 488) |################      | Elapsed Time: 2:27:08 ETA:   0:49:26

SPS: 84 Average Reward: -7.4072872028542145


 75% (366 of 488) |################      | Elapsed Time: 2:27:32 ETA:   0:49:12

SPS: 84 Average Reward: -7.958445110718769


 75% (367 of 488) |################      | Elapsed Time: 2:27:56 ETA:   0:48:49

SPS: 84 Average Reward: -8.303371234306471


 75% (368 of 488) |################      | Elapsed Time: 2:28:20 ETA:   0:48:25

SPS: 84 Average Reward: -8.776926056805708


 75% (369 of 488) |################      | Elapsed Time: 2:28:44 ETA:   0:47:55

SPS: 84 Average Reward: -7.390986667511166


 75% (370 of 488) |################      | Elapsed Time: 2:29:09 ETA:   0:47:37

SPS: 84 Average Reward: -7.613274413686919


 76% (371 of 488) |################      | Elapsed Time: 2:29:33 ETA:   0:47:10

SPS: 84 Average Reward: -7.965869883874888


 76% (372 of 488) |################      | Elapsed Time: 2:29:57 ETA:   0:46:43

SPS: 84 Average Reward: -7.942780335013098


 76% (373 of 488) |################      | Elapsed Time: 2:30:21 ETA:   0:46:04

SPS: 84 Average Reward: -7.926443218470169


 76% (374 of 488) |################      | Elapsed Time: 2:30:45 ETA:   0:45:50

SPS: 84 Average Reward: -5.467579924036015


 76% (375 of 488) |################      | Elapsed Time: 2:31:09 ETA:   0:45:32

SPS: 84 Average Reward: -8.254388343150042


 77% (376 of 488) |################      | Elapsed Time: 2:31:33 ETA:   0:45:08

SPS: 84 Average Reward: -7.5722842353980635


 77% (377 of 488) |################      | Elapsed Time: 2:31:58 ETA:   0:44:47

SPS: 84 Average Reward: -7.194536868237634


 77% (378 of 488) |#################     | Elapsed Time: 2:32:22 ETA:   0:44:15

SPS: 84 Average Reward: -7.131392263996132


 77% (379 of 488) |#################     | Elapsed Time: 2:32:46 ETA:   0:43:56

SPS: 84 Average Reward: -8.01566814140671


 77% (380 of 488) |#################     | Elapsed Time: 2:33:10 ETA:   0:43:30

SPS: 84 Average Reward: -7.465827827369972


 78% (381 of 488) |#################     | Elapsed Time: 2:33:34 ETA:   0:42:57

SPS: 84 Average Reward: -7.335138902893231


 78% (382 of 488) |#################     | Elapsed Time: 2:33:59 ETA:   0:42:52

SPS: 84 Average Reward: -7.489355858255623


 78% (383 of 488) |#################     | Elapsed Time: 2:34:23 ETA:   0:42:27

SPS: 84 Average Reward: -7.37543816249409


 78% (384 of 488) |#################     | Elapsed Time: 2:34:47 ETA:   0:41:25

SPS: 84 Average Reward: -7.215329540387938


 78% (385 of 488) |#################     | Elapsed Time: 2:35:11 ETA:   0:41:26

SPS: 84 Average Reward: -7.205652683593784


 79% (386 of 488) |#################     | Elapsed Time: 2:35:35 ETA:   0:41:12

SPS: 84 Average Reward: -7.212423452551716


 79% (387 of 488) |#################     | Elapsed Time: 2:35:59 ETA:   0:40:47

SPS: 84 Average Reward: -7.184986369388737


 79% (388 of 488) |#################     | Elapsed Time: 2:36:24 ETA:   0:40:24

SPS: 84 Average Reward: -7.573460564186761


 79% (389 of 488) |#################     | Elapsed Time: 2:36:48 ETA:   0:40:04

SPS: 84 Average Reward: -7.119397508519461


 79% (390 of 488) |#################     | Elapsed Time: 2:37:12 ETA:   0:39:18

SPS: 84 Average Reward: -7.0016687302058624


 80% (391 of 488) |#################     | Elapsed Time: 2:37:36 ETA:   0:39:05

SPS: 84 Average Reward: -7.6381432257847175


 80% (392 of 488) |#################     | Elapsed Time: 2:38:00 ETA:   0:38:43

SPS: 84 Average Reward: -7.6727646271590375


 80% (393 of 488) |#################     | Elapsed Time: 2:38:24 ETA:   0:38:11

SPS: 84 Average Reward: -7.627854508972415


 80% (394 of 488) |#################     | Elapsed Time: 2:38:49 ETA:   0:37:45

SPS: 84 Average Reward: -7.078018686156915


 80% (395 of 488) |#################     | Elapsed Time: 2:39:13 ETA:   0:37:30

SPS: 84 Average Reward: -8.254113168935904


 81% (396 of 488) |#################     | Elapsed Time: 2:39:37 ETA:   0:37:02

SPS: 84 Average Reward: -7.77036979500271


 81% (397 of 488) |#################     | Elapsed Time: 2:40:01 ETA:   0:36:35

SPS: 84 Average Reward: -7.921204463103561


 81% (398 of 488) |#################     | Elapsed Time: 2:40:25 ETA:   0:36:08

SPS: 84 Average Reward: -6.884942156277015


 81% (399 of 488) |#################     | Elapsed Time: 2:40:49 ETA:   0:35:54

SPS: 84 Average Reward: -9.972441052869561


 81% (400 of 488) |##################    | Elapsed Time: 2:41:13 ETA:   0:35:10

SPS: 84 Average Reward: -7.9594016695531895


 82% (401 of 488) |##################    | Elapsed Time: 2:41:37 ETA:   0:34:53

SPS: 84 Average Reward: -7.592538417707359


 82% (402 of 488) |##################    | Elapsed Time: 2:42:01 ETA:   0:34:26

SPS: 84 Average Reward: -7.4221633007920085


 82% (403 of 488) |##################    | Elapsed Time: 2:42:26 ETA:   0:34:16

SPS: 84 Average Reward: -7.391108202804205


 82% (404 of 488) |##################    | Elapsed Time: 2:42:50 ETA:   0:33:51

SPS: 84 Average Reward: -7.706509425570859


 82% (405 of 488) |##################    | Elapsed Time: 2:43:14 ETA:   0:33:32

SPS: 84 Average Reward: -7.742953467731384


 83% (406 of 488) |##################    | Elapsed Time: 2:43:38 ETA:   0:32:46

SPS: 84 Average Reward: -8.066145010290047


 83% (407 of 488) |##################    | Elapsed Time: 2:44:02 ETA:   0:32:36

SPS: 84 Average Reward: -7.330369713825903


 83% (408 of 488) |##################    | Elapsed Time: 2:44:26 ETA:   0:32:14

SPS: 84 Average Reward: -7.569280170410363


 83% (409 of 488) |##################    | Elapsed Time: 2:44:50 ETA:   0:31:50

SPS: 84 Average Reward: -7.5402623769312


 84% (410 of 488) |##################    | Elapsed Time: 2:45:15 ETA:   0:31:22

SPS: 84 Average Reward: -7.519343332369666


 84% (411 of 488) |##################    | Elapsed Time: 2:45:39 ETA:   0:31:18

SPS: 84 Average Reward: -7.6924993002827815


 84% (412 of 488) |##################    | Elapsed Time: 2:46:03 ETA:   0:30:39

SPS: 84 Average Reward: -7.756142672169821


 84% (413 of 488) |##################    | Elapsed Time: 2:46:27 ETA:   0:30:12

SPS: 84 Average Reward: -7.66605724123486


 84% (414 of 488) |##################    | Elapsed Time: 2:46:52 ETA:   0:29:50

SPS: 84 Average Reward: -7.627016204631635


 85% (415 of 488) |##################    | Elapsed Time: 2:47:16 ETA:   0:29:24

SPS: 84 Average Reward: -7.366001572283733


 85% (416 of 488) |##################    | Elapsed Time: 2:47:40 ETA:   0:29:01

SPS: 84 Average Reward: -7.140166945741835


 85% (417 of 488) |##################    | Elapsed Time: 2:48:04 ETA:   0:28:39

SPS: 84 Average Reward: -7.0337455003651055


 85% (418 of 488) |##################    | Elapsed Time: 2:48:28 ETA:   0:28:07

SPS: 84 Average Reward: -7.359903649423514


 85% (419 of 488) |##################    | Elapsed Time: 2:48:53 ETA:   0:27:56

SPS: 84 Average Reward: -7.233129189283076


 86% (420 of 488) |##################    | Elapsed Time: 2:49:17 ETA:   0:27:30

SPS: 84 Average Reward: -7.268509218426989


 86% (421 of 488) |##################    | Elapsed Time: 2:49:41 ETA:   0:26:57

SPS: 84 Average Reward: -7.304563361925914


 86% (422 of 488) |###################   | Elapsed Time: 2:50:05 ETA:   0:26:36

SPS: 84 Average Reward: -7.232371932282554


 86% (423 of 488) |###################   | Elapsed Time: 2:50:29 ETA:   0:26:12

SPS: 84 Average Reward: -7.4913021145548955


 86% (424 of 488) |###################   | Elapsed Time: 2:50:54 ETA:   0:25:51

SPS: 84 Average Reward: -7.760570468007499


 87% (425 of 488) |###################   | Elapsed Time: 2:51:18 ETA:   0:25:25

SPS: 84 Average Reward: -7.520762452576663


 87% (426 of 488) |###################   | Elapsed Time: 2:51:42 ETA:   0:24:51

SPS: 84 Average Reward: -7.034587208907406


 87% (427 of 488) |###################   | Elapsed Time: 2:52:06 ETA:   0:24:37

SPS: 84 Average Reward: -7.385097488649071


 87% (428 of 488) |###################   | Elapsed Time: 2:52:30 ETA:   0:24:13

SPS: 84 Average Reward: -7.7379933079764305


 87% (429 of 488) |###################   | Elapsed Time: 2:52:55 ETA:   0:23:48

SPS: 84 Average Reward: -7.5123879884276885


 88% (430 of 488) |###################   | Elapsed Time: 2:53:19 ETA:   0:23:19

SPS: 84 Average Reward: -7.287143214832654


 88% (431 of 488) |###################   | Elapsed Time: 2:53:43 ETA:   0:22:50

SPS: 84 Average Reward: -6.990014808527707


 88% (432 of 488) |###################   | Elapsed Time: 2:54:07 ETA:   0:22:35

SPS: 84 Average Reward: -7.203766920795453


 88% (433 of 488) |###################   | Elapsed Time: 2:54:31 ETA:   0:22:13

SPS: 84 Average Reward: -7.652830763710302


 88% (434 of 488) |###################   | Elapsed Time: 2:54:55 ETA:   0:21:30

SPS: 84 Average Reward: -7.376325641273862


 89% (435 of 488) |###################   | Elapsed Time: 2:55:19 ETA:   0:21:06

SPS: 84 Average Reward: -7.447663156295719


 89% (436 of 488) |###################   | Elapsed Time: 2:55:43 ETA:   0:20:54

SPS: 84 Average Reward: -7.516577996193217


 89% (437 of 488) |###################   | Elapsed Time: 2:56:07 ETA:   0:20:26

SPS: 84 Average Reward: -8.72327555089749


 89% (438 of 488) |###################   | Elapsed Time: 2:56:31 ETA:   0:20:10

SPS: 84 Average Reward: -8.84363214222683


 89% (439 of 488) |###################   | Elapsed Time: 2:56:56 ETA:   0:19:44

SPS: 84 Average Reward: -7.832310999314979


 90% (440 of 488) |###################   | Elapsed Time: 2:57:20 ETA:   0:19:17

SPS: 84 Average Reward: -7.7390610926791625


 90% (441 of 488) |###################   | Elapsed Time: 2:57:44 ETA:   0:18:58

SPS: 84 Average Reward: -8.022729929961109


 90% (442 of 488) |###################   | Elapsed Time: 2:58:08 ETA:   0:18:32

SPS: 84 Average Reward: -7.759434852029978


 90% (443 of 488) |###################   | Elapsed Time: 2:58:32 ETA:   0:18:11

SPS: 84 Average Reward: -7.739277291279157


 90% (444 of 488) |####################  | Elapsed Time: 2:58:56 ETA:   0:17:41

SPS: 84 Average Reward: -7.158168261565551


 91% (445 of 488) |####################  | Elapsed Time: 2:59:20 ETA:   0:17:13

SPS: 84 Average Reward: -7.228831261690328


 91% (446 of 488) |####################  | Elapsed Time: 2:59:45 ETA:   0:16:53

SPS: 84 Average Reward: -7.545175645559228


 91% (447 of 488) |####################  | Elapsed Time: 3:00:09 ETA:   0:16:30

SPS: 84 Average Reward: -7.311472057567359


 91% (448 of 488) |####################  | Elapsed Time: 3:00:33 ETA:   0:16:06

SPS: 84 Average Reward: -7.3523177584518224


 92% (449 of 488) |####################  | Elapsed Time: 3:00:57 ETA:   0:15:37

SPS: 84 Average Reward: -7.430334612995667


 92% (450 of 488) |####################  | Elapsed Time: 3:01:21 ETA:   0:15:18

SPS: 84 Average Reward: -7.589620305245698


 92% (451 of 488) |####################  | Elapsed Time: 3:01:45 ETA:   0:14:56

SPS: 84 Average Reward: -7.710563067700024


 92% (452 of 488) |####################  | Elapsed Time: 3:02:09 ETA:   0:14:28

SPS: 84 Average Reward: -7.434936930721597


 92% (453 of 488) |####################  | Elapsed Time: 3:02:34 ETA:   0:14:08

SPS: 84 Average Reward: -7.747254881328835


 93% (454 of 488) |####################  | Elapsed Time: 3:02:58 ETA:   0:13:36

SPS: 84 Average Reward: -7.813961082207021


 93% (455 of 488) |####################  | Elapsed Time: 3:03:22 ETA:   0:13:17

SPS: 84 Average Reward: -7.631243552989339


 93% (456 of 488) |####################  | Elapsed Time: 3:03:46 ETA:   0:12:54

SPS: 84 Average Reward: -8.090671538356506


 93% (457 of 488) |####################  | Elapsed Time: 3:04:10 ETA:   0:12:29

SPS: 84 Average Reward: -7.931307327080062


 93% (458 of 488) |####################  | Elapsed Time: 3:04:34 ETA:   0:12:02

SPS: 84 Average Reward: -7.365116554396256


 94% (459 of 488) |####################  | Elapsed Time: 3:04:59 ETA:   0:11:41

SPS: 84 Average Reward: -7.631887383134333


 94% (460 of 488) |####################  | Elapsed Time: 3:05:23 ETA:   0:11:16

SPS: 84 Average Reward: -7.415262374879316


 94% (461 of 488) |####################  | Elapsed Time: 3:05:47 ETA:   0:10:54

SPS: 84 Average Reward: -7.407154466775124


 94% (462 of 488) |####################  | Elapsed Time: 3:06:11 ETA:   0:10:30

SPS: 84 Average Reward: -7.170495479377649


 94% (463 of 488) |####################  | Elapsed Time: 3:06:35 ETA:   0:10:04

SPS: 84 Average Reward: -7.407372391008115


 95% (464 of 488) |####################  | Elapsed Time: 3:07:00 ETA:   0:09:42

SPS: 84 Average Reward: -7.215952142639168


 95% (465 of 488) |####################  | Elapsed Time: 3:07:24 ETA:   0:09:17

SPS: 84 Average Reward: -7.677889403094906


 95% (466 of 488) |##################### | Elapsed Time: 3:07:48 ETA:   0:08:48

SPS: 84 Average Reward: -7.992317197423233


 95% (467 of 488) |##################### | Elapsed Time: 3:08:12 ETA:   0:08:27

SPS: 84 Average Reward: -7.270978164691953


 95% (468 of 488) |##################### | Elapsed Time: 3:08:36 ETA:   0:08:04

SPS: 84 Average Reward: -8.204372137367688


 96% (469 of 488) |##################### | Elapsed Time: 3:09:01 ETA:   0:07:40

SPS: 84 Average Reward: -7.875246793410785


 96% (470 of 488) |##################### | Elapsed Time: 3:09:25 ETA:   0:07:16

SPS: 84 Average Reward: -7.900402365135155


 96% (471 of 488) |##################### | Elapsed Time: 3:09:49 ETA:   0:06:50

SPS: 84 Average Reward: -6.785099093730537


 96% (472 of 488) |##################### | Elapsed Time: 3:10:13 ETA:   0:06:27

SPS: 84 Average Reward: -7.583452442900722


 96% (473 of 488) |##################### | Elapsed Time: 3:10:37 ETA:   0:06:03

SPS: 84 Average Reward: -7.343878827973477


 97% (474 of 488) |##################### | Elapsed Time: 3:11:02 ETA:   0:05:40

SPS: 84 Average Reward: -7.4160552756623686


 97% (475 of 488) |##################### | Elapsed Time: 3:11:26 ETA:   0:05:13

SPS: 84 Average Reward: -7.379297724585037


 97% (476 of 488) |##################### | Elapsed Time: 3:11:50 ETA:   0:04:51

SPS: 84 Average Reward: -7.311806784464812


 97% (477 of 488) |##################### | Elapsed Time: 3:12:14 ETA:   0:04:26

SPS: 84 Average Reward: -7.166779770232922


 97% (478 of 488) |##################### | Elapsed Time: 3:12:38 ETA:   0:04:01

SPS: 84 Average Reward: -7.505983123723759


 98% (479 of 488) |##################### | Elapsed Time: 3:13:03 ETA:   0:03:38

SPS: 84 Average Reward: -7.870739559768521


 98% (480 of 488) |##################### | Elapsed Time: 3:13:27 ETA:   0:03:13

SPS: 84 Average Reward: -7.896097248529829


 98% (481 of 488) |##################### | Elapsed Time: 3:13:51 ETA:   0:02:49

SPS: 84 Average Reward: -8.126792575408992


 98% (482 of 488) |##################### | Elapsed Time: 3:14:15 ETA:   0:02:24

SPS: 84 Average Reward: -7.971511963623685


 98% (483 of 488) |##################### | Elapsed Time: 3:14:40 ETA:   0:02:00

SPS: 84 Average Reward: -7.275755303934152


 99% (484 of 488) |##################### | Elapsed Time: 3:15:04 ETA:   0:01:36

SPS: 84 Average Reward: -7.6502896368834605


 99% (485 of 488) |##################### | Elapsed Time: 3:15:28 ETA:   0:01:12

SPS: 84 Average Reward: -7.242371943419926


 99% (486 of 488) |##################### | Elapsed Time: 3:15:52 ETA:   0:00:48

SPS: 84 Average Reward: -7.136743763021042


 99% (487 of 488) |##################### | Elapsed Time: 3:16:16 ETA:   0:00:24

SPS: 84 Average Reward: -7.675631977704209


100% (488 of 488) |######################| Elapsed Time: 3:16:40 Time:  3:16:40


In [35]:
torch.save(sender, "models/sender" + str(start_time) + ".pth")
torch.save(receiver, "models/receiver" + str(start_time) + ".pth")

# Direct communication test

In [36]:
class Test_Communication:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [0, 0, 0, 0, 0, 0], "high": [1, 1, 1, 1, 1, 1]}
        self.action_space = {"low": [0, 0, 0, 0, 0, 0], "high": [1, 1, 1, 1, 1, 1]}

    def dynamic(self, agent, actions):
        if "utterance" not in self.environment.data_store[agent].keys():
            self.environment.data_store[agent]["utterance"] = None
        if agent == "receiver":
            utterance = [0, 0, 0, 0, 0, 0]
            if "utterance_max" in self.environment.data_store["sender"].keys():
                observation = self.environment.data_store["sender"]["utterance_max"]
            else:
                observation = utterance
        elif agent == "sender":
            utterance = [0, 0, 0, 0, 0, 0]
            color_action = actions[:3]
            shape_action = actions[3:5]
            utterance[np.argmax(color_action)] = 1
            utterance[np.argmax(shape_action) + 3] = 1
            self.environment.data_store[agent]["utterance"] = actions
            self.environment.data_store[agent]["utterance_max"] = utterance
            observation = [0, 0, 0, 0, 0, 0]
        else:
            print("Dafaq is going on here?")
        return 0, observation, False, {}

In [37]:
xml_files = ["levels_shape/" + file for file in os.listdir("levels_shape/")]
agents = ["sender", "receiver"]

config_dict = {"xmlPath":xml_files, 
                   "agents":agents, 
                   "rewardFunctions":[collision_reward, target_reward], 
                   "doneFunctions":[target_done, border_done], 
                   "skipFrames":5,
                   "environmentDynamics":[Image, Reward, Test_Communication, Accuracy],
                   "freeJoint":True,
                   "renderMode":False,
                   "maxSteps":1024,
                   "agentCameras":True}

env = make_env(config_dict)()

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  updates=self.state_updates,


In [17]:
epochs = 30
num_steps = 1024
lengths = []

for epoch in range(epochs):
    next_obs, infos = reset_environment(env, device)

    next_obs = {k: torch.Tensor(v).unsqueeze(0).to(device) for k, v in next_obs.items()}
    next_done = {"sender": torch.zeros(num_envs).to(device), "receiver": torch.zeros(num_envs).to(device)}

    for step in range(0, num_steps):
        sender_action = sender.get_action_and_value(next_obs["sender"])[0]
        receiver_action = receiver.get_action_and_value(next_obs["receiver"])[0]

        next_obs, reward, terminations, truncations, info = env.step({"sender": sender_action.cpu().numpy()[0], "receiver": receiver_action.cpu().numpy()[0]})
        next_obs = {"sender": torch.Tensor(next_obs["sender"]).unsqueeze(0).to(device), "receiver": torch.Tensor(next_obs["receiver"]).unsqueeze(0).to(device)}

        if terminations["sender"] or terminations["receiver"] or truncations["sender"] or truncations["receiver"]:
            next_obs, infos = reset_environment(env, device)
            lengths.append(step)
            break

In [18]:
dynamic = env.env.env.environment_dynamics[3]
print("Accuracy:", sum(dynamic.accuracies) / len(dynamic.accuracies))
print("Variance:", sum(dynamic.variances) / len(dynamic.variances))
print("Send Accuracy:", sum(dynamic.sendAccuracies) / len(dynamic.sendAccuracies))
print("Length:", sum(lengths) / len(lengths))

Accuracy: 0.875
Variance: -0.5
Send Accuracy: 0.7323293172690764
Length: 145.16666666666666
