In [1]:
from MuJoCo_Gym.mujoco_rl import MuJoCoRL
from MuJoCo_Gym.wrappers import GymnasiumWrapper, GymWrapper
from gymnasium.wrappers.frame_stack import FrameStack
from gymnasium.experimental.wrappers import NormalizeObservationV0
from dynamics import *
import argparse
import os
import random
import time
from distutils.util import strtobool
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
from torch.utils.tensorboard import SummaryWriter

from wrappers.record_episode_statistics import RecordEpisodeStatistics
from wrappers.frame_stack import FrameStack
from wrappers.normalizeObservation import NormalizeObservation
from wrappers.normalizeRewards import NormalizeReward

from progressbar import progressbar
import tensorflow as tf

# Check if eager execution is enabled
if not tf.executing_eagerly():
    tf.compat.v1.enable_eager_execution()


# Dynamics

In [2]:
from sklearn.metrics import mean_squared_error
from autoencoder import load_autoencoder_model, get_a_single_image_embedding
import math
import mujoco

## Vision

In [3]:
class Image:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [0 for _ in range(30)], "high": [1 for _ in range(30)]}
        self.action_space = {"low": [], "high": []}
        self.autoencoder = load_autoencoder_model("models/3colors_30.tf")
        self.index = 0

    def dynamic(self, agent, actions):
        self.index = self.index + 1
        image = self.environment.get_camera_data(agent + "_camera")
        result = get_a_single_image_embedding(self.autoencoder, image)
        # cv2.imwrite(f"/Users/cowolff/Documents/GitHub/s.language_experiments/images/{agent}_{self.index}.png", image)
        return 0, result, False, {}

## Communication

In [4]:
class Communication:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [0, 0, 0, 0], "high": [1, 1, 1, 1]}
        self.action_space = {"low": [0, 0, 0, 0], "high": [1, 1, 1, 1]}

    def dynamic(self, agent, actions):
        if "utterance" not in self.environment.data_store[agent].keys():
            self.environment.data_store[agent]["utterance"] = None
        if agent == "receiver":
            utterance = [0, 0, 0, 0]
            if "target_color" in self.environment.data_store.keys():
                utterance[np.argmax(self.environment.data_store["target_color"])] = 1
            observation = utterance
        elif agent == "sender":
            utterance = [0, 0, 0, 0]
            utterance[np.argmax(actions)] = 1
            self.environment.data_store[agent]["utterance"] = actions
            self.environment.data_store[agent]["utterance_max"] = utterance
            observation = [0, 0, 0, 0]
        else:
            print("Dafaq is going on here?")
        return 0, observation, False, {}

## Reward Dynamic Function
The reward function for the receiver gives a positive reward for the agent to move closer to the current target object.<br/>
The sender in turn gets a positive reward if it chooses the correct word encoding for the color of the current target object.

In [5]:
class Reward:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [], "high": []}
        self.action_space = {"low": [], "high": []}
        self.choices = ["choice_1", "choice_2"]

    def dynamic(self, agent, actions):
        if not "target" in self.environment.data_store.keys():
            color = self.environment.get_data("reference_geom")["color"]
            for choice in self.choices:
                if (color == self.environment.get_data(choice + "_geom")["color"]).all():
                    self.environment.data_store["target"] = choice
                    self.environment.data_store["target_color"] = self.environment.get_data(choice + "_geom")["color"]
                    self.environment.data_store["last_distance"] = copy.deepcopy(self.environment.distance("receiver_geom", choice + "_geom"))
        if agent == "receiver":
            target = self.environment.data_store["target"]
            new_distance = self.environment.distance("receiver_geom", target + "_geom")
            reward = (self.environment.data_store["last_distance"] - new_distance) * 10
            self.environment.data_store["last_distance"] = copy.deepcopy(new_distance)
        elif agent == "sender":
            reference = [0, 0, 0, 0]
            color = self.environment.data_store["target_color"]
            reference[np.argmax(color)] = 1
            reward = 0
            if "utterance" in self.environment.data_store[agent].keys():
                reward = -1 * mean_squared_error(reference, self.environment.data_store[agent]["utterance"])
        return reward, [], False, {}

## Accuracy Measurement Dynamic

In [6]:
class Accuracy:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [], "high": []}
        self.action_space = {"low": [], "high": []}
        self.accuracies = []
        self.variances = []
        self.sendAccuracies = []
        self.sendVariances = []
        self.currentSend = []
        self.report_accuracy = {"sender": 0, "receiver": 0}

    def dynamic(self, agent, actions):
        choices = ["choice_1", "choice_2"]
        variance = {"choice_1":1, "choice_2":-1}
        if "target" in self.environment.data_store.keys():
            if "sendVariances" not in self.environment.data_store.keys():
                self.environment.data_store["sendVariances"] = True
                self.currentSend = [0, 0, 0, 0]
            target = self.environment.data_store["target"]
            # if any(self.environment.collision(ankle, target + "_geom") for ankle in ["left_leg_geom_2", "left_ankle_geom_2", "right_leg_geom_2", "right_ankle_geom_2", "back_leg_geom_2", "third_ankle_geom_2", "rightback_leg_geom_2", "fourth_ankle_geom_2"]):
            if self.environment.collision("receiver_geom", target + "_geom"):
                self.accuracies.append(1)
                self.variances.append(variance[target])

                if len(self.variances) > 50:
                    report_variance = 1 - abs(sum(self.variances[-50:]) / 50)
                    self.report_accuracy["receiver"] = sum(self.accuracies[-50:]) / 50
            # elif any(self.environment.collision(ankle, [choice for choice in choices if choice != target][0] + "_geom") for ankle in ["left_leg_geom_2", "left_ankle_geom_2", "right_leg_geom_2", "right_ankle_geom_2", "back_leg_geom_2", "third_ankle_geom_2", "rightback_leg_geom_2", "fourth_ankle_geom_2"]):
            elif self.environment.collision("receiver_geom", [choice for choice in choices if choice != target][0] + "_geom"):
                self.accuracies.append(0)
                self.variances.append(variance[[choice for choice in choices if choice != target][0]])

                if len(self.variances) > 50:
                    report_variance = 1 - abs(sum(self.variances[-50:]) / 50)
                    self.report_accuracy[0] = sum(self.accuracies[-50:]) / 50
            if "utterance_max" in self.environment.data_store[agent].keys():
                reference = [0, 0, 0, 0]
                color = self.environment.data_store["target_color"]
                reference[np.argmax(color)] = 1
                self.currentSend = np.add(self.currentSend, self.environment.data_store[agent]["utterance_max"])

                if self.environment.data_store[agent]["utterance_max"]  == reference:
                    self.sendAccuracies.append(1)
                else:
                    self.sendAccuracies.append(0)
        return 0, [], False, {}

# Reward Functions

In [7]:
def target_reward(mujoco_gym, agent):
    if agent == "receiver":
        choices = ["choice_1", "choice_2"]
        if not "target" in mujoco_gym.data_store.keys():
                color = mujoco_gym.get_data("reference_geom")["color"]
                for choice in choices:
                    if (color == mujoco_gym.get_data(choice + "_geom")["color"]).all():
                        mujoco_gym.data_store["target"] = choice
                        mujoco_gym.data_store["target_color"] = mujoco_gym.get_data(choice + "_geom")["color"]
        
        target = mujoco_gym.data_store["target"]
        # for ankle in ["left_leg_geom_2", "left_ankle_geom_2", "right_leg_geom_2", "right_ankle_geom_2", "back_leg_geom_2", "third_ankle_geom_2", "rightback_leg_geom_2", "fourth_ankle_geom_2"]:
        for ankle in ["receiver_geom"]:
            if mujoco_gym.collision(ankle, target + "_geom"):
                return 1
            elif mujoco_gym.collision(ankle, [choice for choice in choices if choice != target][0] + "_geom"):
                return -1
    return 0

In [8]:
def collision_reward(mujoco_gym, agent):
    for border in ["border1_geom", "border2_geom", "border3_geom", "border4_geom", "border5_geom"]:
        # for ankle in ["left_leg_geom_2", "left_ankle_geom_2", "right_leg_geom_2", "right_ankle_geom_2", "back_leg_geom_2", "third_ankle_geom_2", "rightback_leg_geom_2", "fourth_ankle_geom_2"]:
        for ankle in [agent + "_geom"]:
            if mujoco_gym.collision(border, ankle):
                return -0.1
    return 0

# Environment Setup

In [9]:
def make_env(config_dict):
    def thunk():
        window = 5
        env = MuJoCoRL(config_dict=config_dict)
        # env = GymWrapper(env, "receiver")
        # env = FrameStack(env, 4)
        env = NormalizeObservation(env)
        env = NormalizeReward(env)
        # env = RecordEpisodeStatistics(env)
        return env

    return thunk

In [10]:
xml_files = ["levels/" + file for file in os.listdir("levels/")]
agents = ["sender", "receiver"]

config_dict = {"xmlPath":xml_files, 
                   "agents":agents, 
                   "rewardFunctions":[collision_reward, target_reward], 
                   "doneFunctions":[target_done, border_done], 
                   "skipFrames":5,
                   "environmentDynamics":[Image, Reward, Communication, Accuracy],
                   "freeJoint":True,
                   "renderMode":False,
                   "maxSteps":1024,
                   "agentCameras":True}

env = make_env(config_dict)()

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


# RL Model

In [11]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class Agent(nn.Module):
    def __init__(self, envs):
        super(Agent, self).__init__()
        self.critic = nn.Sequential(
            nn.Flatten(),
            layer_init(nn.Linear(np.array(envs.observation_space.shape).prod(), 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 1), std=1.0),
        )
        self.actor_mean = nn.Sequential(
            nn.Flatten(),
            layer_init(nn.Linear(np.array(envs.observation_space.shape).prod(), 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, np.prod(envs.action_space.shape)), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.action_space.shape)))

    def get_value(self, x):
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)

# Buffer

In [12]:
class Buffer():
    def __init__(self, num_steps, envs, num_envs, device):
        self.obs = torch.zeros((num_steps, num_envs) + envs.observation_space.shape).to(device)
        self.actions = torch.zeros((num_steps, num_envs) + envs.action_space.shape).to(device)
        self.logprobs = torch.zeros((num_steps, num_envs)).to(device)
        self.rewards = torch.zeros((num_steps, num_envs)).to(device)
        self.dones = torch.zeros((num_steps, num_envs)).to(device)
        self.values = torch.zeros((num_steps, num_envs)).to(device)

# Update Function

In [13]:
def update_agent(agent, buffer, optimizer, next_obs, next_done, env, batch_size, update_epochs, minibatch_size, clip_coef, vf_coef, ent_coef, max_grad_norm, target_kl, clip_vloss, norm_adv, gae_lambda, gae, gamma, device, num_steps):

    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        if gae:
            advantages = torch.zeros_like(buffer.rewards).to(device)
            lastgaelam = 0
            for t in reversed(range(num_steps)):
                if t == num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - buffer.dones[t + 1]
                    nextvalues = buffer.values[t + 1]
                delta = buffer.rewards[t] + gamma * nextvalues * nextnonterminal - buffer.values[t]
                advantages[t] = lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
            returns = advantages + buffer.values
        else:
            returns = torch.zeros_like(buffer.rewards).to(device)
            for t in reversed(range(num_steps)):
                if t == num_steps - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - buffer.dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = buffer.rewards[t] + gamma * nextnonterminal * next_return
            advantages = returns - buffer.values

    # flatten the batch
    b_obs = buffer.obs.reshape((-1,) + env.observation_space.shape)
    b_logprobs = buffer.logprobs.reshape(-1)
    b_actions = buffer.actions.reshape((-1,) + env.action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = buffer.values.reshape(-1)

    # Optimizing the policy and value network
    b_inds = np.arange(batch_size)
    clipfracs = []
    for epoch in range(update_epochs):
        np.random.shuffle(b_inds)
        for start in range(0, batch_size, minibatch_size):
            end = start + minibatch_size
            mb_inds = b_inds[start:end]

            _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions[mb_inds])
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = logratio.exp()

            with torch.no_grad():
                # calculate approx_kl http://joschu.net/blog/kl-approx.html
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1) - logratio).mean()
                clipfracs += [((ratio - 1.0).abs() > clip_coef).float().mean().item()]

            mb_advantages = b_advantages[mb_inds]
            if norm_adv:
                mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

            # Policy loss
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Value loss
            newvalue = newvalue.view(-1)
            if clip_vloss:
                v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                v_clipped = b_values[mb_inds] + torch.clamp(
                    newvalue - b_values[mb_inds],
                    -clip_coef,
                    clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()
            else:
                v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

            entropy_loss = entropy.mean()
            loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
            optimizer.step()

        if target_kl is not None:
            if approx_kl > target_kl:
                break

    y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

# Training

## Helper Function

In [14]:
def initialize_agent(env, device, learning_rate):
    agent = Agent(env).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-5)
    return agent, optimizer

def get_action_and_update_buffer(agent, obs, buffer, step):
    with torch.no_grad():
        action, logprob, _, value = agent.get_action_and_value(obs)
        buffer.values[step] = value.flatten()
    buffer.actions[step] = action
    buffer.logprobs[step] = logprob
    return action

def reset_environment(env, device):
    next_obs, infos = env.reset()
    next_obs = {k: torch.Tensor(v).unsqueeze(0).to(device) for k, v in next_obs.items()}
    return next_obs, infos

## Hyperparameters

In [15]:
exp_name = "Sender box"

learning_rate = 1e-5
seed = 1
# total_timesteps = 20000000
total_timesteps = 1000000
torch_deterministic = True
cuda = False
mps = False
track = False
wandb_project_name = "ppo-implementation-details"
wandb_entity = None
capture_video = False

# Algorithm-specific arguments
num_envs = 1
num_steps = 2048
anneal_lr = True
gae = True
gamma = 0.99
gae_lambda = 0.95
num_minibatches = 128
update_epochs = 10
norm_adv = True
clip_coef = 0.2
clip_vloss = True
ent_coef = 0.0
vf_coef = 0.5
max_grad_norm = 0.5
target_kl = None
store_freq = 20

# Calculate derived variables
batch_size = int(num_envs * num_steps)
minibatch_size = int(batch_size // num_minibatches)

In [16]:
torch.set_default_dtype(torch.float32)

run_name = f"{exp_name}__{seed}__{int(time.time())}"

writer = SummaryWriter(f"runs/{run_name}")

writer.add_text("environment/level_number", str(len(xml_files)), 0)
writer.add_text("environment/agents", ', '.join(agents), 0)
writer.add_text("hyperparameters/learning_rate", str(learning_rate), 0)
writer.add_text("hyperparameters/network_size", ', '.join(str(e) for e in [512, 256]), 0)
writer.add_text("hyperparameters/batch", str(minibatch_size), 0)

# TRY NOT TO MODIFY: seeding
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() and cuda else "cpu")

obs, infos = env.reset()

sender, sender_optimizer = initialize_agent(env, device, learning_rate)
receiver, receiver_optimizer = initialize_agent(env, device, learning_rate)

buffer_sender = Buffer(num_steps, env, num_envs, device)
buffer_receiver = Buffer(num_steps, env, num_envs, device)

global_step = 0
start_time = time.time()
next_obs, infos = reset_environment(env, device)

next_done = {"sender": torch.zeros(num_envs).to(device), "receiver": torch.zeros(num_envs).to(device)}

num_updates = total_timesteps // batch_size
train_start = time.time()

epoch_lengths = []
current_length = 0

for update in progressbar(range(1, num_updates + 1), redirect_stdout=True):
    # Annealing the rate if instructed to do so.
    if anneal_lr:
        frac = 1.0 - (update - 1.0) / num_updates
        lrnow = frac * learning_rate
        sender_optimizer.param_groups[0]["lr"] = lrnow
        receiver_optimizer.param_groups[0]["lr"] = lrnow
    
    epoch_rewards = {"sender":0, "receiver":0}
    current_rewards = {"sender":[], "receiver":[]}
    variances = {"sender":[], "receiver":[]}
    epoch_runs = 0
    episode_accuracies = 0
    episode_sendAccuracies = 0
    for step in range(0, num_steps):
        global_step += 1 * num_envs
        current_length += 1
        buffer_sender.obs[step] = next_obs["sender"]
        buffer_receiver.obs[step] = next_obs["receiver"]


        sender_action = get_action_and_update_buffer(sender, next_obs["sender"], buffer_sender, step)
        receiver_action = get_action_and_update_buffer(receiver, next_obs["receiver"], buffer_receiver, step)

        next_obs, reward, terminations, truncations, info = env.step({"sender": sender_action.cpu().numpy()[0], "receiver": receiver_action.cpu().numpy()[0]})
        current_rewards["sender"].append(reward["sender"])
        current_rewards["receiver"].append(reward["receiver"])
        next_obs = {"sender": torch.Tensor(next_obs["sender"]).unsqueeze(0).to(device), "receiver": torch.Tensor(next_obs["receiver"]).unsqueeze(0).to(device)}

        if terminations["sender"] or terminations["receiver"] or truncations["sender"] or truncations["receiver"]:
            next_obs, infos = reset_environment(env, device)
            epoch_rewards["sender"] += sum(current_rewards["sender"])
            epoch_rewards["receiver"] += sum(current_rewards["receiver"])

            epoch_lengths.append(current_length)
            current_length = 0

            dynamic = env.env.env.environment_dynamics[3]

            if len(dynamic.sendAccuracies) > 512:
                episode_sendAccuracies = sum(dynamic.sendAccuracies[-512:]) / 512
                del dynamic.sendAccuracies[:-513]
                writer.add_scalar("charts/sender/accuracies", episode_sendAccuracies, global_step)

            if len(dynamic.accuracies) > 4:
                window = min(15, len(dynamic.accuracies))
                episode_accuracies = sum(dynamic.accuracies[-1 * window:]) / window
                writer.add_scalar("charts/receiver/accuracies", episode_accuracies, global_step)
                if window == 15:
                    del dynamic.accuracies[:-16]

            if len(dynamic.variances) > 4:
                window = min(15, len(dynamic.variances))
                current_variance = sum(dynamic.variances[-1 * window:]) / window
                writer.add_scalar("charts/receiver_variance", current_variance, global_step)
                if window == 15:
                    del dynamic.variances[:-16]

            if len(epoch_lengths) > 3:
                window = min(10, len(epoch_lengths))
                epoch_length = sum(epoch_lengths[-1 * window:]) / window
                writer.add_scalar("charts/episodic_length", epoch_length, global_step)
                if window == 10:
                    del epoch_lengths[:-11]
            epoch_runs += 1
        
        buffer_sender.rewards[step] = torch.tensor(reward["sender"]).to(device).view(-1)
        buffer_receiver.rewards[step] = torch.tensor(reward["receiver"]).to(device).view(-1)
        next_done = {"sender": torch.Tensor([terminations["sender"]]).to(device), "receiver": torch.Tensor([terminations["receiver"]]).to(device)}
    if update % store_freq == 0:
        torch.save(sender, "models/model" + str(start_time) + ".pth")
        torch.save(receiver, "models/model" + str(start_time) + ".pth")

    update_agent(sender, buffer_sender, sender_optimizer, next_obs["sender"], next_done["sender"], env, batch_size, update_epochs, minibatch_size, clip_coef, vf_coef, ent_coef, max_grad_norm, target_kl, clip_vloss, norm_adv, gae_lambda, gae, gamma, device, step)
    update_agent(receiver, buffer_receiver, receiver_optimizer, next_obs["receiver"], next_done["receiver"], env, batch_size, update_epochs, minibatch_size, clip_coef, vf_coef, ent_coef, max_grad_norm, target_kl, clip_vloss, norm_adv, gae_lambda, gae, gamma, device, step)

    writer.add_scalar("charts/learning_rate", sender_optimizer.param_groups[0]["lr"], global_step)
    writer.add_scalar("charts/sender/episodic_return", epoch_rewards["sender"] / epoch_runs, global_step)
    writer.add_scalar("charts/receiver/episodic_return", epoch_rewards["receiver"] / epoch_runs, global_step)
    print("SPS:", int(global_step / (time.time() - start_time)), "Average Reward:", epoch_rewards["sender"] / epoch_runs)
    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

torch.save(sender, "models/model" + str(start_time) + ".pth")
torch.save(receiver, "models/model" + str(start_time) + ".pth")
writer.close()

  0% (0 of 488) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--


SPS: 42 Average Reward: -38.47766596159493


  0% (1 of 488) |                        | Elapsed Time: 0:00:48 ETA:   6:34:19


SPS: 41 Average Reward: -10.822999929687757


  0% (2 of 488) |                        | Elapsed Time: 0:01:39 ETA:   6:42:21


SPS: 41 Average Reward: -10.975155609655372


  0% (3 of 488) |                        | Elapsed Time: 0:02:27 ETA:   6:38:18


SPS: 41 Average Reward: -10.778559488083117


  0% (4 of 488) |                        | Elapsed Time: 0:03:17 ETA:   6:38:05


SPS: 41 Average Reward: -11.306248387899432


  1% (5 of 488) |                        | Elapsed Time: 0:04:08 ETA:   6:39:52


SPS: 41 Average Reward: -11.300576280092617


  1% (6 of 488) |                        | Elapsed Time: 0:04:57 ETA:   6:38:42


SPS: 41 Average Reward: -10.874636331268388


  1% (7 of 488) |                        | Elapsed Time: 0:05:48 ETA:   6:39:39


SPS: 41 Average Reward: -10.09615192401044


  1% (8 of 488) |                        | Elapsed Time: 0:06:38 ETA:   6:38:23


SPS: 41 Average Reward: -10.138541300980553


  1% (9 of 488) |                        | Elapsed Time: 0:07:26 ETA:   6:36:06


SPS: 41 Average Reward: -10.72507597148192


  2% (10 of 488) |                       | Elapsed Time: 0:08:17 ETA:   6:36:11


SPS: 41 Average Reward: -10.383748422734653


  2% (11 of 488) |                       | Elapsed Time: 0:09:03 ETA:   6:32:42


SPS: 41 Average Reward: -10.233312488274022


  2% (12 of 488) |                       | Elapsed Time: 0:09:54 ETA:   6:32:49


SPS: 41 Average Reward: -10.429304817792579


  2% (13 of 488) |                       | Elapsed Time: 0:10:42 ETA:   6:31:28


SPS: 41 Average Reward: -10.169472630555752


  2% (14 of 488) |                       | Elapsed Time: 0:11:32 ETA:   6:30:52


SPS: 41 Average Reward: -10.879085712778528


  3% (15 of 488) |                       | Elapsed Time: 0:12:24 ETA:   6:31:23


SPS: 41 Average Reward: -19.711018911174033


  3% (16 of 488) |                       | Elapsed Time: 0:13:15 ETA:   6:31:01


SPS: 41 Average Reward: -20.696397256741804


  3% (17 of 488) |                       | Elapsed Time: 0:14:03 ETA:   6:29:38


SPS: 41 Average Reward: -10.340911207003952


  3% (18 of 488) |                       | Elapsed Time: 0:14:52 ETA:   6:28:19


SPS: 41 Average Reward: -9.68126322467367


  3% (19 of 488) |                       | Elapsed Time: 0:15:43 ETA:   6:27:59


SPS: 41 Average Reward: -9.29509759183674


  4% (20 of 488) |                       | Elapsed Time: 0:16:33 ETA:   6:27:34


SPS: 41 Average Reward: -10.423011711477555


  4% (21 of 488) |                       | Elapsed Time: 0:17:24 ETA:   6:26:57


SPS: 41 Average Reward: -9.689639473688452


  4% (22 of 488) |#                      | Elapsed Time: 0:18:11 ETA:   6:25:13


SPS: 41 Average Reward: -9.714772024466813


  4% (23 of 488) |#                      | Elapsed Time: 0:18:56 ETA:   6:23:03


SPS: 41 Average Reward: -15.011814439525395


  4% (24 of 488) |#                      | Elapsed Time: 0:19:46 ETA:   6:22:10


SPS: 41 Average Reward: -20.69535178229416


  5% (25 of 488) |#                      | Elapsed Time: 0:20:36 ETA:   6:21:44


SPS: 41 Average Reward: -22.257289057128023


  5% (26 of 488) |#                      | Elapsed Time: 0:21:29 ETA:   6:21:51


SPS: 41 Average Reward: -20.990670265406884


  5% (27 of 488) |#                      | Elapsed Time: 0:22:16 ETA:   6:20:14


SPS: 41 Average Reward: -20.64358516667388


  5% (28 of 488) |#                      | Elapsed Time: 0:23:06 ETA:   6:19:39


SPS: 41 Average Reward: -13.070276772690367


  5% (29 of 488) |#                      | Elapsed Time: 0:23:55 ETA:   6:18:33


SPS: 41 Average Reward: -18.744179279364875


  6% (30 of 488) |#                      | Elapsed Time: 0:24:48 ETA:   6:18:38


SPS: 41 Average Reward: -19.618929078457935


  6% (31 of 488) |#                      | Elapsed Time: 0:25:35 ETA:   6:17:17


SPS: 41 Average Reward: -12.680159471442728


  6% (32 of 488) |#                      | Elapsed Time: 0:26:24 ETA:   6:16:14


SPS: 41 Average Reward: -17.72958151272177


  6% (33 of 488) |#                      | Elapsed Time: 0:27:15 ETA:   6:15:44


SPS: 41 Average Reward: -17.1344834921515


  6% (34 of 488) |#                      | Elapsed Time: 0:28:01 ETA:   6:14:06


SPS: 41 Average Reward: -19.132307860198434


  7% (35 of 488) |#                      | Elapsed Time: 0:28:47 ETA:   6:12:38


SPS: 41 Average Reward: -12.260584528866167


  7% (36 of 488) |#                      | Elapsed Time: 0:29:39 ETA:   6:12:27


SPS: 41 Average Reward: -11.97106088346031


  7% (37 of 488) |#                      | Elapsed Time: 0:30:30 ETA:   6:11:56


SPS: 41 Average Reward: -15.306348270493435


  7% (38 of 488) |#                      | Elapsed Time: 0:31:21 ETA:   6:11:19


SPS: 41 Average Reward: -21.914731975565456


  7% (39 of 488) |#                      | Elapsed Time: 0:32:10 ETA:   6:10:25


SPS: 41 Average Reward: -11.261338776868785


  8% (40 of 488) |#                      | Elapsed Time: 0:33:03 ETA:   6:10:13


SPS: 41 Average Reward: -16.96193202501372


  8% (41 of 488) |#                      | Elapsed Time: 0:33:54 ETA:   6:09:40


SPS: 41 Average Reward: -22.08165824882251


  8% (42 of 488) |#                      | Elapsed Time: 0:34:39 ETA:   6:08:05


SPS: 41 Average Reward: -18.378968251465857


  8% (43 of 488) |##                     | Elapsed Time: 0:35:27 ETA:   6:07:01


SPS: 41 Average Reward: -18.16836248544085


  9% (44 of 488) |##                     | Elapsed Time: 0:36:20 ETA:   6:06:43


SPS: 41 Average Reward: -19.29391895599772


  9% (45 of 488) |##                     | Elapsed Time: 0:37:11 ETA:   6:06:03


SPS: 41 Average Reward: -16.368390717411394


  9% (46 of 488) |##                     | Elapsed Time: 0:38:01 ETA:   6:05:23


SPS: 41 Average Reward: -13.381731204191247


  9% (47 of 488) |##                     | Elapsed Time: 0:38:48 ETA:   6:04:05


SPS: 41 Average Reward: -12.980826060119677


  9% (48 of 488) |##                     | Elapsed Time: 0:39:40 ETA:   6:03:37


SPS: 41 Average Reward: -14.802864446476331


 10% (49 of 488) |##                     | Elapsed Time: 0:40:29 ETA:   6:02:42


SPS: 41 Average Reward: -16.586131287070746


 10% (50 of 488) |##                     | Elapsed Time: 0:41:18 ETA:   6:01:52


SPS: 41 Average Reward: -16.155328299479965


 10% (51 of 488) |##                     | Elapsed Time: 0:42:09 ETA:   6:01:16


SPS: 41 Average Reward: -13.492717061513114


 10% (52 of 488) |##                     | Elapsed Time: 0:42:59 ETA:   6:00:26


SPS: 41 Average Reward: -14.644871945723317


 10% (53 of 488) |##                     | Elapsed Time: 0:43:48 ETA:   5:59:32


SPS: 41 Average Reward: -13.680501010482681


 11% (54 of 488) |##                     | Elapsed Time: 0:44:37 ETA:   5:58:37


SPS: 41 Average Reward: -14.190331961448681


 11% (55 of 488) |##                     | Elapsed Time: 0:45:28 ETA:   5:57:58


SPS: 41 Average Reward: -19.335049062881023


 11% (56 of 488) |##                     | Elapsed Time: 0:46:19 ETA:   5:57:20


SPS: 41 Average Reward: -17.502416599574786


 11% (57 of 488) |##                     | Elapsed Time: 0:47:10 ETA:   5:56:40


SPS: 41 Average Reward: -13.196812772579278


 11% (58 of 488) |##                     | Elapsed Time: 0:48:00 ETA:   5:55:55


SPS: 41 Average Reward: -16.94873449904702


 12% (59 of 488) |##                     | Elapsed Time: 0:48:51 ETA:   5:55:17


SPS: 41 Average Reward: -10.848137399616737


 12% (60 of 488) |##                     | Elapsed Time: 0:49:42 ETA:   5:54:36


SPS: 41 Average Reward: -13.49977565542632


 12% (61 of 488) |##                     | Elapsed Time: 0:50:30 ETA:   5:53:34


SPS: 41 Average Reward: -18.45587506373591


 12% (62 of 488) |##                     | Elapsed Time: 0:51:20 ETA:   5:52:48


SPS: 41 Average Reward: -15.980229063411565


 12% (63 of 488) |##                     | Elapsed Time: 0:52:10 ETA:   5:52:00


SPS: 41 Average Reward: -15.141831426154365


 13% (64 of 488) |###                    | Elapsed Time: 0:53:03 ETA:   5:51:33


SPS: 41 Average Reward: -14.669607032512593


 13% (65 of 488) |###                    | Elapsed Time: 0:53:53 ETA:   5:50:44


SPS: 41 Average Reward: -18.732471821225147


 13% (66 of 488) |###                    | Elapsed Time: 0:54:45 ETA:   5:50:07


SPS: 41 Average Reward: -17.0286240312094


 13% (67 of 488) |###                    | Elapsed Time: 0:55:37 ETA:   5:49:33


SPS: 41 Average Reward: -18.137020479369475


 13% (68 of 488) |###                    | Elapsed Time: 0:56:28 ETA:   5:48:47


SPS: 41 Average Reward: -10.852342549039797


 14% (69 of 488) |###                    | Elapsed Time: 0:57:17 ETA:   5:47:54


SPS: 41 Average Reward: -18.782975863436025


 14% (70 of 488) |###                    | Elapsed Time: 0:58:07 ETA:   5:47:03


SPS: 41 Average Reward: -17.621329169311018


 14% (71 of 488) |###                    | Elapsed Time: 0:58:56 ETA:   5:46:11


SPS: 41 Average Reward: -13.096480884099131


 14% (72 of 488) |###                    | Elapsed Time: 0:59:44 ETA:   5:45:07


SPS: 41 Average Reward: -11.341931748098984


 14% (73 of 488) |###                    | Elapsed Time: 1:00:31 ETA:   5:44:04


SPS: 41 Average Reward: -12.973742565858005


 15% (74 of 488) |###                    | Elapsed Time: 1:01:18 ETA:   5:43:01


SPS: 41 Average Reward: -13.37693528693844


 15% (75 of 488) |###                    | Elapsed Time: 1:02:08 ETA:   5:42:12


SPS: 41 Average Reward: -15.838863202273927


 15% (76 of 488) |###                    | Elapsed Time: 1:02:56 ETA:   5:41:10


SPS: 41 Average Reward: -11.861308462123292


 15% (77 of 488) |###                    | Elapsed Time: 1:03:45 ETA:   5:40:20


SPS: 41 Average Reward: -11.857696006276182


 15% (78 of 488) |###                    | Elapsed Time: 1:04:37 ETA:   5:39:41


SPS: 41 Average Reward: -12.845522722731607


 16% (79 of 488) |###                    | Elapsed Time: 1:05:25 ETA:   5:38:43


SPS: 41 Average Reward: -9.586611900812995


 16% (80 of 488) |###                    | Elapsed Time: 1:06:13 ETA:   5:37:44


SPS: 41 Average Reward: -14.025229681854876


 16% (81 of 488) |###                    | Elapsed Time: 1:07:03 ETA:   5:36:55


SPS: 41 Average Reward: -16.137796671461935


 16% (82 of 488) |###                    | Elapsed Time: 1:07:52 ETA:   5:36:05


SPS: 41 Average Reward: -11.906130983049222


 17% (83 of 488) |###                    | Elapsed Time: 1:08:43 ETA:   5:35:20


SPS: 41 Average Reward: -9.673112541345134


 17% (84 of 488) |###                    | Elapsed Time: 1:09:34 ETA:   5:34:36


SPS: 41 Average Reward: -16.36778227939183


 17% (85 of 488) |####                   | Elapsed Time: 1:10:21 ETA:   5:33:36


SPS: 41 Average Reward: -14.813975498103153


 17% (86 of 488) |####                   | Elapsed Time: 1:11:09 ETA:   5:32:37


SPS: 41 Average Reward: -13.18972878188939


 17% (87 of 488) |####                   | Elapsed Time: 1:11:56 ETA:   5:31:37


SPS: 41 Average Reward: -8.814535850657894


 18% (88 of 488) |####                   | Elapsed Time: 1:12:48 ETA:   5:30:56


SPS: 41 Average Reward: -15.838811604930157


 18% (89 of 488) |####                   | Elapsed Time: 1:13:33 ETA:   5:29:47


SPS: 41 Average Reward: -10.630232433969791


 18% (90 of 488) |####                   | Elapsed Time: 1:14:23 ETA:   5:29:00


SPS: 41 Average Reward: -12.450666595444874


 18% (91 of 488) |####                   | Elapsed Time: 1:15:14 ETA:   5:28:14


SPS: 41 Average Reward: -11.999880183055975


 18% (92 of 488) |####                   | Elapsed Time: 1:16:04 ETA:   5:27:25


SPS: 41 Average Reward: -10.631760878866224


 19% (93 of 488) |####                   | Elapsed Time: 1:16:54 ETA:   5:26:37


SPS: 41 Average Reward: -12.947839262679134


 19% (94 of 488) |####                   | Elapsed Time: 1:17:44 ETA:   5:25:50


SPS: 41 Average Reward: -13.639067250104285


 19% (95 of 488) |####                   | Elapsed Time: 1:18:34 ETA:   5:25:02


SPS: 41 Average Reward: -11.707979062733408


 19% (96 of 488) |####                   | Elapsed Time: 1:19:24 ETA:   5:24:13


SPS: 41 Average Reward: -12.514798385067426


 19% (97 of 488) |####                   | Elapsed Time: 1:20:11 ETA:   5:23:15


SPS: 41 Average Reward: -13.799897566658995


 20% (98 of 488) |####                   | Elapsed Time: 1:21:00 ETA:   5:22:21


SPS: 41 Average Reward: -12.724175160107126


 20% (99 of 488) |####                   | Elapsed Time: 1:21:46 ETA:   5:21:19


SPS: 41 Average Reward: -11.008975483547522


 20% (100 of 488) |####                  | Elapsed Time: 1:22:31 ETA:   5:20:11


SPS: 41 Average Reward: -11.279622535926546


 20% (101 of 488) |####                  | Elapsed Time: 1:23:18 ETA:   5:19:12


SPS: 41 Average Reward: -10.632816132720038


 20% (102 of 488) |####                  | Elapsed Time: 1:24:08 ETA:   5:18:24


SPS: 41 Average Reward: -13.538574190706587


 21% (103 of 488) |####                  | Elapsed Time: 1:24:58 ETA:   5:17:38


SPS: 41 Average Reward: -12.65395675900345


 21% (104 of 488) |####                  | Elapsed Time: 1:25:48 ETA:   5:16:49


SPS: 41 Average Reward: -12.527298570867767


 21% (105 of 488) |####                  | Elapsed Time: 1:26:38 ETA:   5:16:01


SPS: 41 Average Reward: -9.413490955349339


 21% (106 of 488) |####                  | Elapsed Time: 1:27:27 ETA:   5:15:12


SPS: 41 Average Reward: -11.079331283300345


 21% (107 of 488) |####                  | Elapsed Time: 1:28:17 ETA:   5:14:23


SPS: 41 Average Reward: -11.826173269690498


 22% (108 of 488) |####                  | Elapsed Time: 1:29:07 ETA:   5:13:35


SPS: 41 Average Reward: -7.690468068916333


 22% (109 of 488) |####                  | Elapsed Time: 1:29:58 ETA:   5:12:50


SPS: 41 Average Reward: -12.512759018805781


 22% (110 of 488) |####                  | Elapsed Time: 1:30:43 ETA:   5:11:46


SPS: 41 Average Reward: -10.542565951943816


 22% (111 of 488) |#####                 | Elapsed Time: 1:31:33 ETA:   5:10:58


SPS: 41 Average Reward: -9.635506822005656


 22% (112 of 488) |#####                 | Elapsed Time: 1:32:24 ETA:   5:10:12


SPS: 41 Average Reward: -9.061067288832763


 23% (113 of 488) |#####                 | Elapsed Time: 1:33:13 ETA:   5:09:23


SPS: 41 Average Reward: -11.850794718088308


 23% (114 of 488) |#####                 | Elapsed Time: 1:34:03 ETA:   5:08:34


SPS: 41 Average Reward: -10.488145892518157


 23% (115 of 488) |#####                 | Elapsed Time: 1:34:47 ETA:   5:07:28


SPS: 41 Average Reward: -9.502017338012587


 23% (116 of 488) |#####                 | Elapsed Time: 1:35:38 ETA:   5:06:41


SPS: 41 Average Reward: -11.281683544317223


 23% (117 of 488) |#####                 | Elapsed Time: 1:36:27 ETA:   5:05:52


SPS: 41 Average Reward: -11.280950880828051


 24% (118 of 488) |#####                 | Elapsed Time: 1:37:17 ETA:   5:05:03


SPS: 41 Average Reward: -11.540791434039638


 24% (119 of 488) |#####                 | Elapsed Time: 1:38:01 ETA:   5:03:58


SPS: 41 Average Reward: -9.347479197740306


 24% (120 of 488) |#####                 | Elapsed Time: 1:38:52 ETA:   5:03:11


SPS: 41 Average Reward: -9.322590305065763


 24% (121 of 488) |#####                 | Elapsed Time: 1:39:42 ETA:   5:02:24


SPS: 41 Average Reward: -11.301285270503607


 25% (122 of 488) |#####                 | Elapsed Time: 1:40:31 ETA:   5:01:35


SPS: 41 Average Reward: -10.180592999171063


 25% (123 of 488) |#####                 | Elapsed Time: 1:41:21 ETA:   5:00:47


SPS: 41 Average Reward: -10.527001545168584


 25% (124 of 488) |#####                 | Elapsed Time: 1:42:11 ETA:   4:59:59


SPS: 41 Average Reward: -11.368462536445797


 25% (125 of 488) |#####                 | Elapsed Time: 1:42:59 ETA:   4:59:04


SPS: 41 Average Reward: -11.799520851703456


 25% (126 of 488) |#####                 | Elapsed Time: 1:43:49 ETA:   4:58:16


SPS: 41 Average Reward: -10.447703285212825


 26% (127 of 488) |#####                 | Elapsed Time: 1:44:36 ETA:   4:57:22


SPS: 41 Average Reward: -10.4591599755984


 26% (128 of 488) |#####                 | Elapsed Time: 1:45:27 ETA:   4:56:36


SPS: 41 Average Reward: -10.720376434404589


 26% (129 of 488) |#####                 | Elapsed Time: 1:46:17 ETA:   4:55:47


SPS: 41 Average Reward: -9.24442219874233


 26% (130 of 488) |#####                 | Elapsed Time: 1:47:06 ETA:   4:54:58


SPS: 41 Average Reward: -9.515182693173038


 26% (131 of 488) |#####                 | Elapsed Time: 1:47:56 ETA:   4:54:09


SPS: 41 Average Reward: -10.011028864839142


 27% (132 of 488) |#####                 | Elapsed Time: 1:48:46 ETA:   4:53:21


SPS: 41 Average Reward: -11.462101206643334


 27% (133 of 488) |#####                 | Elapsed Time: 1:49:37 ETA:   4:52:35


SPS: 41 Average Reward: -10.187564782099537


 27% (134 of 488) |######                | Elapsed Time: 1:50:21 ETA:   4:51:33


SPS: 41 Average Reward: -11.005235970027378


 27% (135 of 488) |######                | Elapsed Time: 1:51:11 ETA:   4:50:45


SPS: 41 Average Reward: -9.551919909033202


 27% (136 of 488) |######                | Elapsed Time: 1:51:59 ETA:   4:49:50


SPS: 41 Average Reward: -9.143990415489583


 28% (137 of 488) |######                | Elapsed Time: 1:52:46 ETA:   4:48:55


SPS: 41 Average Reward: -9.510924761384906


 28% (138 of 488) |######                | Elapsed Time: 1:53:36 ETA:   4:48:07


SPS: 41 Average Reward: -10.65275860461588


 28% (139 of 488) |######                | Elapsed Time: 1:54:26 ETA:   4:47:19


SPS: 41 Average Reward: -9.507081065032251


 28% (140 of 488) |######                | Elapsed Time: 1:55:15 ETA:   4:46:30


SPS: 41 Average Reward: -9.763008278132725


 28% (141 of 488) |######                | Elapsed Time: 1:56:00 ETA:   4:45:28


SPS: 41 Average Reward: -9.922688182322485


 29% (142 of 488) |######                | Elapsed Time: 1:56:49 ETA:   4:44:40


SPS: 41 Average Reward: -10.151712587234632


 29% (143 of 488) |######                | Elapsed Time: 1:57:34 ETA:   4:43:39


SPS: 41 Average Reward: -10.865839459045885


 29% (144 of 488) |######                | Elapsed Time: 1:58:19 ETA:   4:42:40


SPS: 41 Average Reward: -9.888656911636694


 29% (145 of 488) |######                | Elapsed Time: 1:59:09 ETA:   4:41:52


SPS: 41 Average Reward: -10.673080439529


 29% (146 of 488) |######                | Elapsed Time: 1:59:55 ETA:   4:40:54


SPS: 41 Average Reward: -8.942489372831252


 30% (147 of 488) |######                | Elapsed Time: 2:00:44 ETA:   4:40:05


SPS: 41 Average Reward: -10.91404340771485


 30% (148 of 488) |######                | Elapsed Time: 2:01:32 ETA:   4:39:12


SPS: 41 Average Reward: -10.123517289299452


 30% (149 of 488) |######                | Elapsed Time: 2:02:19 ETA:   4:38:18


SPS: 41 Average Reward: -8.591213469241872


 30% (150 of 488) |######                | Elapsed Time: 2:03:09 ETA:   4:37:30


SPS: 41 Average Reward: -9.88584556363184


 30% (151 of 488) |######                | Elapsed Time: 2:03:59 ETA:   4:36:42


SPS: 41 Average Reward: -10.365125463628177


 31% (152 of 488) |######                | Elapsed Time: 2:04:49 ETA:   4:35:54


SPS: 41 Average Reward: -9.473872196859183


 31% (153 of 488) |######                | Elapsed Time: 2:05:36 ETA:   4:35:00


SPS: 41 Average Reward: -9.22805605777778


 31% (154 of 488) |######                | Elapsed Time: 2:06:26 ETA:   4:34:13


SPS: 41 Average Reward: -9.027634715147897


 31% (155 of 488) |######                | Elapsed Time: 2:07:13 ETA:   4:33:19


SPS: 41 Average Reward: -8.701012201836559


 31% (156 of 488) |#######               | Elapsed Time: 2:07:58 ETA:   4:32:21


SPS: 41 Average Reward: -11.327694768410801


 32% (157 of 488) |#######               | Elapsed Time: 2:08:48 ETA:   4:31:33


SPS: 41 Average Reward: -9.516268027066795


 32% (158 of 488) |#######               | Elapsed Time: 2:09:38 ETA:   4:30:46


SPS: 41 Average Reward: -9.936054044620024


 32% (159 of 488) |#######               | Elapsed Time: 2:10:23 ETA:   4:29:47


SPS: 41 Average Reward: -9.78435057193204


 32% (160 of 488) |#######               | Elapsed Time: 2:11:07 ETA:   4:28:49


SPS: 41 Average Reward: -8.263014738390705


 32% (161 of 488) |#######               | Elapsed Time: 2:11:57 ETA:   4:28:00


SPS: 41 Average Reward: -10.915232429194099


 33% (162 of 488) |#######               | Elapsed Time: 2:12:47 ETA:   4:27:13


SPS: 41 Average Reward: -9.71785538575058


 33% (163 of 488) |#######               | Elapsed Time: 2:13:34 ETA:   4:26:20


SPS: 41 Average Reward: -9.66713661104295


 33% (164 of 488) |#######               | Elapsed Time: 2:14:24 ETA:   4:25:32


SPS: 41 Average Reward: -10.251288897640116


 33% (165 of 488) |#######               | Elapsed Time: 2:15:11 ETA:   4:24:38


SPS: 41 Average Reward: -8.847127001222342


 34% (166 of 488) |#######               | Elapsed Time: 2:16:01 ETA:   4:23:50


SPS: 41 Average Reward: -8.777364177095794


 34% (167 of 488) |#######               | Elapsed Time: 2:16:50 ETA:   4:23:02


SPS: 41 Average Reward: -9.156545478176138


 34% (168 of 488) |#######               | Elapsed Time: 2:17:40 ETA:   4:22:14


SPS: 41 Average Reward: -8.559801298231756


 34% (169 of 488) |#######               | Elapsed Time: 2:18:27 ETA:   4:21:21


SPS: 41 Average Reward: -8.466608804937774


 34% (170 of 488) |#######               | Elapsed Time: 2:19:17 ETA:   4:20:33


SPS: 41 Average Reward: -8.975098337146427


 35% (171 of 488) |#######               | Elapsed Time: 2:20:05 ETA:   4:19:41


SPS: 41 Average Reward: -7.977577893136984


 35% (172 of 488) |#######               | Elapsed Time: 2:20:50 ETA:   4:18:44


SPS: 41 Average Reward: -9.666192612073774


 35% (173 of 488) |#######               | Elapsed Time: 2:21:40 ETA:   4:17:57


SPS: 41 Average Reward: -8.387758579371221


 35% (174 of 488) |#######               | Elapsed Time: 2:22:27 ETA:   4:17:04


SPS: 41 Average Reward: -8.639448964304732


 35% (175 of 488) |#######               | Elapsed Time: 2:23:17 ETA:   4:16:16


SPS: 41 Average Reward: -8.20355702820422


 36% (176 of 488) |#######               | Elapsed Time: 2:24:02 ETA:   4:15:20


SPS: 41 Average Reward: -8.052347960043399


 36% (177 of 488) |#######               | Elapsed Time: 2:24:46 ETA:   4:14:23


SPS: 41 Average Reward: -8.791518328684553


 36% (178 of 488) |########              | Elapsed Time: 2:25:34 ETA:   4:13:31


SPS: 41 Average Reward: -8.194706432625448


 36% (179 of 488) |########              | Elapsed Time: 2:26:18 ETA:   4:12:34


SPS: 41 Average Reward: -8.52599157924804


 36% (180 of 488) |########              | Elapsed Time: 2:27:08 ETA:   4:11:45


SPS: 41 Average Reward: -9.097075605374458


 37% (181 of 488) |########              | Elapsed Time: 2:27:57 ETA:   4:10:57


SPS: 41 Average Reward: -9.16489052133262


 37% (182 of 488) |########              | Elapsed Time: 2:28:47 ETA:   4:10:09


SPS: 41 Average Reward: -8.497320292319936


 37% (183 of 488) |########              | Elapsed Time: 2:29:34 ETA:   4:09:17


SPS: 41 Average Reward: -12.11305408533657


 37% (184 of 488) |########              | Elapsed Time: 2:30:21 ETA:   4:08:25


SPS: 41 Average Reward: -8.796411223557412


 37% (185 of 488) |########              | Elapsed Time: 2:31:08 ETA:   4:07:33


SPS: 41 Average Reward: -7.67609992027248


 38% (186 of 488) |########              | Elapsed Time: 2:31:55 ETA:   4:06:40


SPS: 41 Average Reward: -7.7172986124466245


 38% (187 of 488) |########              | Elapsed Time: 2:32:44 ETA:   4:05:52


SPS: 41 Average Reward: -7.082733677569036


 38% (188 of 488) |########              | Elapsed Time: 2:33:31 ETA:   4:04:59


SPS: 41 Average Reward: -8.240932251599498


 38% (189 of 488) |########              | Elapsed Time: 2:34:18 ETA:   4:04:06


SPS: 41 Average Reward: -8.404997684561096


 38% (190 of 488) |########              | Elapsed Time: 2:35:07 ETA:   4:03:18


SPS: 41 Average Reward: -8.343030650718218


 39% (191 of 488) |########              | Elapsed Time: 2:35:52 ETA:   4:02:23


SPS: 41 Average Reward: -8.034969091868817


 39% (192 of 488) |########              | Elapsed Time: 2:36:40 ETA:   4:01:31


SPS: 41 Average Reward: -7.282396316584908


 39% (193 of 488) |########              | Elapsed Time: 2:37:27 ETA:   4:00:40


SPS: 41 Average Reward: -8.076866775141681


 39% (194 of 488) |########              | Elapsed Time: 2:38:16 ETA:   3:59:52


SPS: 41 Average Reward: -7.910422260477782


 39% (195 of 488) |########              | Elapsed Time: 2:39:06 ETA:   3:59:03


SPS: 41 Average Reward: -8.78829450272477


 40% (196 of 488) |########              | Elapsed Time: 2:39:55 ETA:   3:58:15


SPS: 41 Average Reward: -7.227340321889306


 40% (197 of 488) |########              | Elapsed Time: 2:40:42 ETA:   3:57:23


SPS: 41 Average Reward: -8.001512546233025


 40% (198 of 488) |########              | Elapsed Time: 2:41:31 ETA:   3:56:35


SPS: 41 Average Reward: -7.266830985540311


 40% (199 of 488) |########              | Elapsed Time: 2:42:21 ETA:   3:55:47


SPS: 41 Average Reward: -7.4249844775975475


 40% (200 of 488) |#########             | Elapsed Time: 2:43:10 ETA:   3:54:58


SPS: 41 Average Reward: -7.895415445457227


 41% (201 of 488) |#########             | Elapsed Time: 2:43:55 ETA:   3:54:03


SPS: 41 Average Reward: -8.064309419864442


 41% (202 of 488) |#########             | Elapsed Time: 2:44:41 ETA:   3:53:10


SPS: 41 Average Reward: -7.854314308781347


 41% (203 of 488) |#########             | Elapsed Time: 2:45:28 ETA:   3:52:18


SPS: 41 Average Reward: -6.697563330741938


 41% (204 of 488) |#########             | Elapsed Time: 2:46:12 ETA:   3:51:23


SPS: 41 Average Reward: -7.466648307960256


 42% (205 of 488) |#########             | Elapsed Time: 2:47:02 ETA:   3:50:35


SPS: 41 Average Reward: -7.849972214445718


 42% (206 of 488) |#########             | Elapsed Time: 2:47:51 ETA:   3:49:46


SPS: 41 Average Reward: -7.483331487425673


 42% (207 of 488) |#########             | Elapsed Time: 2:48:36 ETA:   3:48:52


SPS: 41 Average Reward: -7.800808398371167


 42% (208 of 488) |#########             | Elapsed Time: 2:49:23 ETA:   3:48:01


SPS: 41 Average Reward: -7.191398160913697


 42% (209 of 488) |#########             | Elapsed Time: 2:50:07 ETA:   3:47:06


SPS: 41 Average Reward: -7.923081728198315


 43% (210 of 488) |#########             | Elapsed Time: 2:50:55 ETA:   3:46:16


SPS: 41 Average Reward: -6.972259160046869


 43% (211 of 488) |#########             | Elapsed Time: 2:51:44 ETA:   3:45:27


SPS: 41 Average Reward: -7.939574757827016


 43% (212 of 488) |#########             | Elapsed Time: 2:52:31 ETA:   3:44:36


SPS: 41 Average Reward: -7.219549337769528


 43% (213 of 488) |#########             | Elapsed Time: 2:53:20 ETA:   3:43:48


SPS: 41 Average Reward: -8.216095884420028


 43% (214 of 488) |#########             | Elapsed Time: 2:54:05 ETA:   3:42:54


SPS: 41 Average Reward: -7.690298656936586


 44% (215 of 488) |#########             | Elapsed Time: 2:54:54 ETA:   3:42:05


SPS: 41 Average Reward: -7.449765178696438


 44% (216 of 488) |#########             | Elapsed Time: 2:55:43 ETA:   3:41:17


SPS: 41 Average Reward: -7.548099434265051


 44% (217 of 488) |#########             | Elapsed Time: 2:56:30 ETA:   3:40:26


SPS: 41 Average Reward: -7.132183036660379


 44% (218 of 488) |#########             | Elapsed Time: 2:57:20 ETA:   3:39:39


SPS: 41 Average Reward: -7.803725642844289


 44% (219 of 488) |#########             | Elapsed Time: 2:58:07 ETA:   3:38:48


SPS: 41 Average Reward: -7.202371303642617


 45% (220 of 488) |#########             | Elapsed Time: 2:58:54 ETA:   3:37:56


SPS: 41 Average Reward: -8.136642348314627


 45% (221 of 488) |#########             | Elapsed Time: 2:59:44 ETA:   3:37:09


SPS: 41 Average Reward: -7.3091828174130695


 45% (222 of 488) |##########            | Elapsed Time: 3:00:31 ETA:   3:36:18


SPS: 41 Average Reward: -6.590671181500003


 45% (223 of 488) |##########            | Elapsed Time: 3:01:16 ETA:   3:35:25


SPS: 42 Average Reward: -7.17154226352634


 45% (224 of 488) |##########            | Elapsed Time: 3:02:01 ETA:   3:34:31


SPS: 42 Average Reward: -6.624928671701255


 46% (225 of 488) |##########            | Elapsed Time: 3:02:48 ETA:   3:33:40


SPS: 42 Average Reward: -6.534570122732732


 46% (226 of 488) |##########            | Elapsed Time: 3:03:35 ETA:   3:32:50


SPS: 42 Average Reward: -7.323895891414303


 46% (227 of 488) |##########            | Elapsed Time: 3:04:22 ETA:   3:31:59


SPS: 42 Average Reward: -5.957376915047441


 46% (228 of 488) |##########            | Elapsed Time: 3:05:10 ETA:   3:31:10


SPS: 42 Average Reward: -7.804524994542031


 46% (229 of 488) |##########            | Elapsed Time: 3:05:56 ETA:   3:30:17


SPS: 42 Average Reward: -8.024015366485422


 47% (230 of 488) |##########            | Elapsed Time: 3:06:43 ETA:   3:29:27


SPS: 42 Average Reward: -7.338202835106866


 47% (231 of 488) |##########            | Elapsed Time: 3:07:31 ETA:   3:28:38


SPS: 42 Average Reward: -7.482224451156555


 47% (232 of 488) |##########            | Elapsed Time: 3:08:18 ETA:   3:27:47


SPS: 42 Average Reward: -7.452064952529777


 47% (233 of 488) |##########            | Elapsed Time: 3:09:03 ETA:   3:26:54


SPS: 42 Average Reward: -7.394044138291114


 47% (234 of 488) |##########            | Elapsed Time: 3:09:52 ETA:   3:26:06


SPS: 42 Average Reward: -7.582270251928049


 48% (235 of 488) |##########            | Elapsed Time: 3:10:40 ETA:   3:25:16


SPS: 42 Average Reward: -6.978135240871091


 48% (236 of 488) |##########            | Elapsed Time: 3:11:25 ETA:   3:24:23


SPS: 42 Average Reward: -7.4859362057399705


 48% (237 of 488) |##########            | Elapsed Time: 3:12:14 ETA:   3:23:36


SPS: 42 Average Reward: -6.337285881327263


 48% (238 of 488) |##########            | Elapsed Time: 3:13:04 ETA:   3:22:48


SPS: 42 Average Reward: -6.233329393167488


 48% (239 of 488) |##########            | Elapsed Time: 3:13:52 ETA:   3:21:58


SPS: 42 Average Reward: -6.54571504464605


 49% (240 of 488) |##########            | Elapsed Time: 3:14:38 ETA:   3:21:07


SPS: 42 Average Reward: -7.678985871804806


 49% (241 of 488) |##########            | Elapsed Time: 3:15:28 ETA:   3:20:20


SPS: 42 Average Reward: -6.9947512454174


 49% (242 of 488) |##########            | Elapsed Time: 3:16:18 ETA:   3:19:33


SPS: 42 Average Reward: -6.725172928830563


 49% (243 of 488) |##########            | Elapsed Time: 3:17:05 ETA:   3:18:43


SPS: 42 Average Reward: -7.275753434410753


 50% (244 of 488) |###########           | Elapsed Time: 3:17:56 ETA:   3:17:56


SPS: 42 Average Reward: -6.415195943902104


 50% (245 of 488) |###########           | Elapsed Time: 3:18:46 ETA:   3:17:08


SPS: 42 Average Reward: -6.703384120784225


 50% (246 of 488) |###########           | Elapsed Time: 3:19:35 ETA:   3:16:20


SPS: 42 Average Reward: -7.059713783887045


 50% (247 of 488) |###########           | Elapsed Time: 3:20:25 ETA:   3:15:33


SPS: 42 Average Reward: -6.849775758146163


 50% (248 of 488) |###########           | Elapsed Time: 3:21:12 ETA:   3:14:43


SPS: 42 Average Reward: -6.278253470487768


 51% (249 of 488) |###########           | Elapsed Time: 3:22:02 ETA:   3:13:55


SPS: 42 Average Reward: -7.0005377924851055


 51% (250 of 488) |###########           | Elapsed Time: 3:22:52 ETA:   3:13:08


SPS: 42 Average Reward: -6.491561703535011


 51% (251 of 488) |###########           | Elapsed Time: 3:23:42 ETA:   3:12:20


SPS: 42 Average Reward: -6.63590751398729


 51% (252 of 488) |###########           | Elapsed Time: 3:24:32 ETA:   3:11:32


SPS: 42 Average Reward: -6.970793796198525


 51% (253 of 488) |###########           | Elapsed Time: 3:25:17 ETA:   3:10:40


SPS: 42 Average Reward: -6.536515393491999


 52% (254 of 488) |###########           | Elapsed Time: 3:26:06 ETA:   3:09:53


SPS: 42 Average Reward: -6.504721022872117


 52% (255 of 488) |###########           | Elapsed Time: 3:26:51 ETA:   3:09:00


SPS: 42 Average Reward: -6.830552295302839


 52% (256 of 488) |###########           | Elapsed Time: 3:27:41 ETA:   3:08:13


SPS: 42 Average Reward: -6.270531525892995


 52% (257 of 488) |###########           | Elapsed Time: 3:28:28 ETA:   3:07:23


SPS: 42 Average Reward: -6.942584050011517


 52% (258 of 488) |###########           | Elapsed Time: 3:29:13 ETA:   3:06:31


SPS: 42 Average Reward: -6.455482777728214


 53% (259 of 488) |###########           | Elapsed Time: 3:30:03 ETA:   3:05:43


SPS: 42 Average Reward: -5.985829955909048


 53% (260 of 488) |###########           | Elapsed Time: 3:30:53 ETA:   3:04:55


SPS: 42 Average Reward: -6.572224989238245


 53% (261 of 488) |###########           | Elapsed Time: 3:31:40 ETA:   3:04:05


SPS: 42 Average Reward: -6.271684661520954


 53% (262 of 488) |###########           | Elapsed Time: 3:32:30 ETA:   3:03:18


SPS: 42 Average Reward: -6.072205400942901


 53% (263 of 488) |###########           | Elapsed Time: 3:33:14 ETA:   3:02:25


SPS: 42 Average Reward: -6.765166285342417


 54% (264 of 488) |###########           | Elapsed Time: 3:34:04 ETA:   3:01:38


SPS: 42 Average Reward: -6.631667428208597


 54% (265 of 488) |###########           | Elapsed Time: 3:34:54 ETA:   3:00:50


SPS: 42 Average Reward: -6.8053665165204364


 54% (266 of 488) |###########           | Elapsed Time: 3:35:39 ETA:   2:59:59


SPS: 42 Average Reward: -6.5241296480387705


 54% (267 of 488) |############          | Elapsed Time: 3:36:29 ETA:   2:59:11


SPS: 42 Average Reward: -6.316272313013865


 54% (268 of 488) |############          | Elapsed Time: 3:37:19 ETA:   2:58:23


SPS: 42 Average Reward: -6.475771938334827


 55% (269 of 488) |############          | Elapsed Time: 3:38:09 ETA:   2:57:36


SPS: 42 Average Reward: -6.3021177673407


 55% (270 of 488) |############          | Elapsed Time: 3:38:57 ETA:   2:56:46


SPS: 42 Average Reward: -6.242355148544275


 55% (271 of 488) |############          | Elapsed Time: 3:39:44 ETA:   2:55:57


SPS: 42 Average Reward: -6.393448163612755


 55% (272 of 488) |############          | Elapsed Time: 3:40:35 ETA:   2:55:10


SPS: 42 Average Reward: -5.742435133304953


 55% (273 of 488) |############          | Elapsed Time: 3:41:20 ETA:   2:54:18


SPS: 42 Average Reward: -6.301359359347264


 56% (274 of 488) |############          | Elapsed Time: 3:42:08 ETA:   2:53:29


SPS: 42 Average Reward: -5.983998572577086


 56% (275 of 488) |############          | Elapsed Time: 3:42:58 ETA:   2:52:42


SPS: 42 Average Reward: -6.260986216014237


 56% (276 of 488) |############          | Elapsed Time: 3:43:48 ETA:   2:51:54


SPS: 42 Average Reward: -6.22055763674415


 56% (277 of 488) |############          | Elapsed Time: 3:44:33 ETA:   2:51:03


SPS: 42 Average Reward: -5.960137833786253


 56% (278 of 488) |############          | Elapsed Time: 3:45:23 ETA:   2:50:15


SPS: 42 Average Reward: -6.563507023858909


 57% (279 of 488) |############          | Elapsed Time: 3:46:13 ETA:   2:49:27


SPS: 42 Average Reward: -5.96358846913748


 57% (280 of 488) |############          | Elapsed Time: 3:47:00 ETA:   2:48:38


SPS: 42 Average Reward: -5.8528156421458055


 57% (281 of 488) |############          | Elapsed Time: 3:47:50 ETA:   2:47:50


SPS: 42 Average Reward: -5.539045408554347


 57% (282 of 488) |############          | Elapsed Time: 3:48:40 ETA:   2:47:02


SPS: 42 Average Reward: -5.8753568969988414


 57% (283 of 488) |############          | Elapsed Time: 3:49:29 ETA:   2:46:14


SPS: 42 Average Reward: -5.3526219107495345


 58% (284 of 488) |############          | Elapsed Time: 3:50:19 ETA:   2:45:26


SPS: 42 Average Reward: -5.418290000931005


 58% (285 of 488) |############          | Elapsed Time: 3:51:04 ETA:   2:44:35


SPS: 42 Average Reward: -6.232071778357748


 58% (286 of 488) |############          | Elapsed Time: 3:51:49 ETA:   2:43:43


SPS: 42 Average Reward: -5.92593813743182


 58% (287 of 488) |############          | Elapsed Time: 3:52:38 ETA:   2:42:56


SPS: 42 Average Reward: -5.681165773746255


 59% (288 of 488) |############          | Elapsed Time: 3:53:25 ETA:   2:42:06


SPS: 42 Average Reward: -5.898288349671782


 59% (289 of 488) |#############         | Elapsed Time: 3:54:15 ETA:   2:41:18


SPS: 42 Average Reward: -5.757997053342698


 59% (290 of 488) |#############         | Elapsed Time: 3:55:02 ETA:   2:40:28


SPS: 42 Average Reward: -6.16999013582749


 59% (291 of 488) |#############         | Elapsed Time: 3:55:52 ETA:   2:39:40


SPS: 42 Average Reward: -6.286324441991866


 59% (292 of 488) |#############         | Elapsed Time: 3:56:42 ETA:   2:38:52


SPS: 42 Average Reward: -6.310248054180957


 60% (293 of 488) |#############         | Elapsed Time: 3:57:32 ETA:   2:38:05


SPS: 42 Average Reward: -6.295390984919476


 60% (294 of 488) |#############         | Elapsed Time: 3:58:22 ETA:   2:37:17


SPS: 42 Average Reward: -6.191225668237927


 60% (295 of 488) |#############         | Elapsed Time: 3:59:11 ETA:   2:36:29


SPS: 42 Average Reward: -5.6212817842557445


 60% (296 of 488) |#############         | Elapsed Time: 3:59:56 ETA:   2:35:38


SPS: 42 Average Reward: -5.7895477338803225


 60% (297 of 488) |#############         | Elapsed Time: 4:00:40 ETA:   2:34:46


SPS: 42 Average Reward: -6.129004067814778


 61% (298 of 488) |#############         | Elapsed Time: 4:01:30 ETA:   2:33:58


SPS: 42 Average Reward: -6.472790004146298


 61% (299 of 488) |#############         | Elapsed Time: 4:02:17 ETA:   2:33:09


SPS: 42 Average Reward: -5.640307967244811


 61% (300 of 488) |#############         | Elapsed Time: 4:03:07 ETA:   2:32:21


SPS: 42 Average Reward: -5.274692622298911


 61% (301 of 488) |#############         | Elapsed Time: 4:03:57 ETA:   2:31:33


SPS: 42 Average Reward: -6.752807783952916


 61% (302 of 488) |#############         | Elapsed Time: 4:04:47 ETA:   2:30:45


SPS: 42 Average Reward: -6.334846554479192


 62% (303 of 488) |#############         | Elapsed Time: 4:05:37 ETA:   2:29:58


SPS: 42 Average Reward: -5.798779244587705


 62% (304 of 488) |#############         | Elapsed Time: 4:06:27 ETA:   2:29:10


SPS: 42 Average Reward: -6.348124568806241


 62% (305 of 488) |#############         | Elapsed Time: 4:07:14 ETA:   2:28:20


SPS: 42 Average Reward: -6.231340735256678


 62% (306 of 488) |#############         | Elapsed Time: 4:08:03 ETA:   2:27:32


SPS: 42 Average Reward: -5.653576313065274


 62% (307 of 488) |#############         | Elapsed Time: 4:08:53 ETA:   2:26:44


SPS: 42 Average Reward: -6.192420597781907


 63% (308 of 488) |#############         | Elapsed Time: 4:09:40 ETA:   2:25:54


SPS: 42 Average Reward: -5.921629077661504


 63% (309 of 488) |#############         | Elapsed Time: 4:10:27 ETA:   2:25:05


SPS: 42 Average Reward: -5.507954675834561


 63% (310 of 488) |#############         | Elapsed Time: 4:11:18 ETA:   2:24:17


SPS: 42 Average Reward: -5.330469123082407


 63% (311 of 488) |##############        | Elapsed Time: 4:12:02 ETA:   2:23:26


SPS: 42 Average Reward: -5.820235762383624


 63% (312 of 488) |##############        | Elapsed Time: 4:12:50 ETA:   2:22:37


SPS: 42 Average Reward: -6.017704473404718


 64% (313 of 488) |##############        | Elapsed Time: 4:13:37 ETA:   2:21:48


SPS: 42 Average Reward: -5.361890291417982


 64% (314 of 488) |##############        | Elapsed Time: 4:14:27 ETA:   2:21:00


SPS: 42 Average Reward: -5.608234881494712


 64% (315 of 488) |##############        | Elapsed Time: 4:15:12 ETA:   2:20:09


SPS: 42 Average Reward: -6.072237580267532


 64% (316 of 488) |##############        | Elapsed Time: 4:16:02 ETA:   2:19:21


SPS: 42 Average Reward: -5.504016175983991


 64% (317 of 488) |##############        | Elapsed Time: 4:16:51 ETA:   2:18:33


SPS: 42 Average Reward: -5.766448092548762


 65% (318 of 488) |##############        | Elapsed Time: 4:17:41 ETA:   2:17:45


SPS: 42 Average Reward: -5.739747504571835


 65% (319 of 488) |##############        | Elapsed Time: 4:18:30 ETA:   2:16:57


SPS: 42 Average Reward: -5.331941409730574


 65% (320 of 488) |##############        | Elapsed Time: 4:19:20 ETA:   2:16:09


SPS: 42 Average Reward: -5.220878106235344


 65% (321 of 488) |##############        | Elapsed Time: 4:20:10 ETA:   2:15:21


SPS: 42 Average Reward: -5.626205813361734


 65% (322 of 488) |##############        | Elapsed Time: 4:21:00 ETA:   2:14:33


SPS: 42 Average Reward: -5.074919743385571


 66% (323 of 488) |##############        | Elapsed Time: 4:21:47 ETA:   2:13:43


SPS: 42 Average Reward: -5.682400926753099


 66% (324 of 488) |##############        | Elapsed Time: 4:22:36 ETA:   2:12:55


SPS: 42 Average Reward: -5.365893552821449


 66% (325 of 488) |##############        | Elapsed Time: 4:23:26 ETA:   2:12:07


SPS: 42 Average Reward: -5.299193511013867


 66% (326 of 488) |##############        | Elapsed Time: 4:24:16 ETA:   2:11:19


SPS: 42 Average Reward: -6.017731881882626


 67% (327 of 488) |##############        | Elapsed Time: 4:25:01 ETA:   2:10:29


SPS: 42 Average Reward: -5.4324873513215275


 67% (328 of 488) |##############        | Elapsed Time: 4:25:51 ETA:   2:09:41


SPS: 42 Average Reward: -5.496944525862344


 67% (329 of 488) |##############        | Elapsed Time: 4:26:36 ETA:   2:08:50


SPS: 42 Average Reward: -5.441973946139614


 67% (330 of 488) |##############        | Elapsed Time: 4:27:21 ETA:   2:08:00


SPS: 42 Average Reward: -5.6219478089243085


 67% (331 of 488) |##############        | Elapsed Time: 4:28:11 ETA:   2:07:12


SPS: 42 Average Reward: -5.687917099066662


 68% (332 of 488) |##############        | Elapsed Time: 4:29:01 ETA:   2:06:24


SPS: 42 Average Reward: -5.681185425896551


 68% (333 of 488) |###############       | Elapsed Time: 4:29:50 ETA:   2:05:36


SPS: 42 Average Reward: -6.490680690967541


 68% (334 of 488) |###############       | Elapsed Time: 4:30:40 ETA:   2:04:48


SPS: 42 Average Reward: -5.949806421882052


 68% (335 of 488) |###############       | Elapsed Time: 4:31:30 ETA:   2:04:00


SPS: 42 Average Reward: -5.52711592242588


 68% (336 of 488) |###############       | Elapsed Time: 4:32:19 ETA:   2:03:11


SPS: 42 Average Reward: -5.3037144557107245


 69% (337 of 488) |###############       | Elapsed Time: 4:33:07 ETA:   2:02:22


SPS: 42 Average Reward: -5.31001807834999


 69% (338 of 488) |###############       | Elapsed Time: 4:33:56 ETA:   2:01:34


SPS: 42 Average Reward: -5.8465251853191065


 69% (339 of 488) |###############       | Elapsed Time: 4:34:46 ETA:   2:00:46


SPS: 42 Average Reward: -5.636078049932511


 69% (340 of 488) |###############       | Elapsed Time: 4:35:36 ETA:   1:59:58


SPS: 42 Average Reward: -5.143242838803219


 69% (341 of 488) |###############       | Elapsed Time: 4:36:23 ETA:   1:59:09


SPS: 42 Average Reward: -6.781195021399214


 70% (342 of 488) |###############       | Elapsed Time: 4:37:13 ETA:   1:58:20


SPS: 42 Average Reward: -5.82197341883676


 70% (343 of 488) |###############       | Elapsed Time: 4:38:03 ETA:   1:57:32


SPS: 42 Average Reward: -5.469966860700862


 70% (344 of 488) |###############       | Elapsed Time: 4:38:53 ETA:   1:56:44


SPS: 42 Average Reward: -5.357990622489951


 70% (345 of 488) |###############       | Elapsed Time: 4:39:40 ETA:   1:55:55


SPS: 42 Average Reward: -5.115384996283477


 70% (346 of 488) |###############       | Elapsed Time: 4:40:30 ETA:   1:55:07


SPS: 42 Average Reward: -5.546398290955057


 71% (347 of 488) |###############       | Elapsed Time: 4:41:20 ETA:   1:54:19


SPS: 42 Average Reward: -5.468433402485004


 71% (348 of 488) |###############       | Elapsed Time: 4:42:09 ETA:   1:53:30


SPS: 42 Average Reward: -5.512374674881179


 71% (349 of 488) |###############       | Elapsed Time: 4:42:59 ETA:   1:52:42


SPS: 42 Average Reward: -5.837491036708834


 71% (350 of 488) |###############       | Elapsed Time: 4:43:47 ETA:   1:51:53


SPS: 42 Average Reward: -5.125330629611203


 71% (351 of 488) |###############       | Elapsed Time: 4:44:35 ETA:   1:51:04


SPS: 42 Average Reward: -5.288250818524228


 72% (352 of 488) |###############       | Elapsed Time: 4:45:20 ETA:   1:50:14


SPS: 42 Average Reward: -5.305926487443682


 72% (353 of 488) |###############       | Elapsed Time: 4:46:10 ETA:   1:49:26


SPS: 42 Average Reward: -5.958896275716132


 72% (354 of 488) |###############       | Elapsed Time: 4:46:57 ETA:   1:48:37


SPS: 42 Average Reward: -5.239859660468574


 72% (355 of 488) |################      | Elapsed Time: 4:47:46 ETA:   1:47:49


SPS: 42 Average Reward: -5.234057490234495


 72% (356 of 488) |################      | Elapsed Time: 4:48:31 ETA:   1:46:58


SPS: 42 Average Reward: -5.57121998265378


 73% (357 of 488) |################      | Elapsed Time: 4:49:19 ETA:   1:46:09


SPS: 42 Average Reward: -5.249067983224272


 73% (358 of 488) |################      | Elapsed Time: 4:50:09 ETA:   1:45:21


SPS: 42 Average Reward: -5.6212326754156186


 73% (359 of 488) |################      | Elapsed Time: 4:50:56 ETA:   1:44:32


SPS: 42 Average Reward: -5.503769874622855


 73% (360 of 488) |################      | Elapsed Time: 4:51:47 ETA:   1:43:44


SPS: 42 Average Reward: -5.106482268259412


 73% (361 of 488) |################      | Elapsed Time: 4:52:36 ETA:   1:42:56


SPS: 42 Average Reward: -5.674760385228471


 74% (362 of 488) |################      | Elapsed Time: 4:53:26 ETA:   1:42:08


SPS: 42 Average Reward: -5.5448415987815975


 74% (363 of 488) |################      | Elapsed Time: 4:54:14 ETA:   1:41:19


SPS: 42 Average Reward: -5.341432738532074


 74% (364 of 488) |################      | Elapsed Time: 4:55:04 ETA:   1:40:31


SPS: 42 Average Reward: -5.390959080053301


 74% (365 of 488) |################      | Elapsed Time: 4:55:51 ETA:   1:39:41


SPS: 42 Average Reward: -5.050600274829502


 75% (366 of 488) |################      | Elapsed Time: 4:56:41 ETA:   1:38:53


SPS: 42 Average Reward: -5.328537259426519


 75% (367 of 488) |################      | Elapsed Time: 4:57:28 ETA:   1:38:04


SPS: 42 Average Reward: -5.1352837187804


 75% (368 of 488) |################      | Elapsed Time: 4:58:15 ETA:   1:37:15


SPS: 42 Average Reward: -4.930901686092526


 75% (369 of 488) |################      | Elapsed Time: 4:59:03 ETA:   1:36:26


SPS: 42 Average Reward: -5.1470593056246345


 75% (370 of 488) |################      | Elapsed Time: 4:59:48 ETA:   1:35:36


SPS: 42 Average Reward: -5.513544477168117


 76% (371 of 488) |################      | Elapsed Time: 5:00:36 ETA:   1:34:47


SPS: 42 Average Reward: -4.978203120979509


 76% (372 of 488) |################      | Elapsed Time: 5:01:21 ETA:   1:33:58


SPS: 42 Average Reward: -4.844408424199323


 76% (373 of 488) |################      | Elapsed Time: 5:02:06 ETA:   1:33:08


SPS: 42 Average Reward: -5.159908098801418


 76% (374 of 488) |################      | Elapsed Time: 5:02:51 ETA:   1:32:18


SPS: 42 Average Reward: -5.527151147096064


 76% (375 of 488) |################      | Elapsed Time: 5:03:41 ETA:   1:31:30


SPS: 42 Average Reward: -4.621428333251826


 77% (376 of 488) |################      | Elapsed Time: 5:04:30 ETA:   1:30:42


SPS: 42 Average Reward: -4.869732155264215


 77% (377 of 488) |################      | Elapsed Time: 5:05:20 ETA:   1:29:54


SPS: 42 Average Reward: -5.277696370589755


 77% (378 of 488) |#################     | Elapsed Time: 5:06:10 ETA:   1:29:06


SPS: 42 Average Reward: -5.198019558899278


 77% (379 of 488) |#################     | Elapsed Time: 5:06:55 ETA:   1:28:16


SPS: 42 Average Reward: -5.655613461293451


 77% (380 of 488) |#################     | Elapsed Time: 5:07:45 ETA:   1:27:28


SPS: 42 Average Reward: -5.179638610473023


 78% (381 of 488) |#################     | Elapsed Time: 5:08:35 ETA:   1:26:39


SPS: 42 Average Reward: -4.997383190435819


 78% (382 of 488) |#################     | Elapsed Time: 5:09:25 ETA:   1:25:51


SPS: 42 Average Reward: -4.9279451006134085


 78% (383 of 488) |#################     | Elapsed Time: 5:10:12 ETA:   1:25:02


SPS: 42 Average Reward: -5.133673670662187


 78% (384 of 488) |#################     | Elapsed Time: 5:10:57 ETA:   1:24:13


SPS: 42 Average Reward: -5.148686915430503


 78% (385 of 488) |#################     | Elapsed Time: 5:11:45 ETA:   1:23:24


SPS: 42 Average Reward: -5.079146539559483


 79% (386 of 488) |#################     | Elapsed Time: 5:12:35 ETA:   1:22:36


SPS: 42 Average Reward: -5.0538640185693735


 79% (387 of 488) |#################     | Elapsed Time: 5:13:20 ETA:   1:21:46


SPS: 42 Average Reward: -5.605736271881028


 79% (388 of 488) |#################     | Elapsed Time: 5:14:07 ETA:   1:20:57


SPS: 42 Average Reward: -5.133289710926131


 79% (389 of 488) |#################     | Elapsed Time: 5:14:57 ETA:   1:20:09


SPS: 42 Average Reward: -5.084035750737093


 79% (390 of 488) |#################     | Elapsed Time: 5:15:47 ETA:   1:19:21


SPS: 42 Average Reward: -4.908120140462484


 80% (391 of 488) |#################     | Elapsed Time: 5:16:33 ETA:   1:18:31


SPS: 42 Average Reward: -5.8380734467889965


 80% (392 of 488) |#################     | Elapsed Time: 5:17:23 ETA:   1:17:43


SPS: 42 Average Reward: -4.96502149166807


 80% (393 of 488) |#################     | Elapsed Time: 5:18:10 ETA:   1:16:54


SPS: 42 Average Reward: -5.231029712612915


 80% (394 of 488) |#################     | Elapsed Time: 5:19:00 ETA:   1:16:06


SPS: 42 Average Reward: -4.749092623108689


 80% (395 of 488) |#################     | Elapsed Time: 5:19:48 ETA:   1:15:17


SPS: 42 Average Reward: -5.507149774127686


 81% (396 of 488) |#################     | Elapsed Time: 5:20:38 ETA:   1:14:29


SPS: 42 Average Reward: -5.119163238629995


 81% (397 of 488) |#################     | Elapsed Time: 5:21:28 ETA:   1:13:41


SPS: 42 Average Reward: -5.209475523873881


 81% (398 of 488) |#################     | Elapsed Time: 5:22:18 ETA:   1:12:53


SPS: 42 Average Reward: -5.091496849826442


 81% (399 of 488) |#################     | Elapsed Time: 5:23:08 ETA:   1:12:04


SPS: 42 Average Reward: -5.471278352670584


 81% (400 of 488) |##################    | Elapsed Time: 5:23:58 ETA:   1:11:16


SPS: 42 Average Reward: -5.017664938876617


 82% (401 of 488) |##################    | Elapsed Time: 5:24:48 ETA:   1:10:28


SPS: 42 Average Reward: -4.711126828968085


 82% (402 of 488) |##################    | Elapsed Time: 5:25:38 ETA:   1:09:39


SPS: 42 Average Reward: -5.475343217311335


 82% (403 of 488) |##################    | Elapsed Time: 5:26:28 ETA:   1:08:51


SPS: 42 Average Reward: -4.978696466605981


 82% (404 of 488) |##################    | Elapsed Time: 5:27:12 ETA:   1:08:02


SPS: 42 Average Reward: -4.606759858138189


 82% (405 of 488) |##################    | Elapsed Time: 5:28:02 ETA:   1:07:13


SPS: 42 Average Reward: -4.985869462366152


 83% (406 of 488) |##################    | Elapsed Time: 5:28:48 ETA:   1:06:24


SPS: 42 Average Reward: -5.351427192343709


 83% (407 of 488) |##################    | Elapsed Time: 5:29:37 ETA:   1:05:36


SPS: 42 Average Reward: -5.171487571840376


 83% (408 of 488) |##################    | Elapsed Time: 5:30:27 ETA:   1:04:47


SPS: 42 Average Reward: -5.027873374357174


 83% (409 of 488) |##################    | Elapsed Time: 5:31:17 ETA:   1:03:59


SPS: 42 Average Reward: -5.108405610204419


 84% (410 of 488) |##################    | Elapsed Time: 5:32:05 ETA:   1:03:10


SPS: 42 Average Reward: -4.633851482370578


 84% (411 of 488) |##################    | Elapsed Time: 5:32:55 ETA:   1:02:22


SPS: 42 Average Reward: -5.052888012841333


 84% (412 of 488) |##################    | Elapsed Time: 5:33:42 ETA:   1:01:33


KeyboardInterrupt: 

In [None]:
torch.save(sender, "models/sender" + str(start_time) + ".pth")
torch.save(receiver, "models/receiver" + str(start_time) + ".pth")

# Direct communication test

In [None]:
class Test_Communication:
    def __init__(self, environment):
        self.environment = environment
        self.observation_space = {"low": [0, 0, 0, 0], "high": [1, 1, 1, 1]}
        self.action_space = {"low": [0, 0, 0, 0], "high": [1, 1, 1, 1]}

    def dynamic(self, agent, actions):
        if "utterance" not in self.environment.data_store[agent].keys():
            self.environment.data_store[agent]["utterance"] = None
        if agent == "receiver":
            utterance = [0, 0, 0, 0]
            if "utterance_max" in self.environment.data_store["sender"].keys():
                observation = self.environment.data_store["sender"]["utterance_max"]
            else:
                observation = utterance
        elif agent == "sender":
            utterance = [0, 0, 0, 0]
            utterance[np.argmax(actions)] = 1
            self.environment.data_store[agent]["utterance"] = actions
            self.environment.data_store[agent]["utterance_max"] = utterance
            observation = [0, 0, 0, 0]
        else:
            print("Dafaq is going on here?")
        return 0, observation, False, {}

In [None]:
xml_files = ["levels/" + file for file in os.listdir("levels/")]
agents = ["sender", "receiver"]

config_dict = {"xmlPath":xml_files, 
                   "agents":agents, 
                   "rewardFunctions":[collision_reward, target_reward], 
                   "doneFunctions":[target_done, border_done], 
                   "skipFrames":5,
                   "environmentDynamics":[Image, Reward, Test_Communication, Accuracy],
                   "freeJoint":True,
                   "renderMode":False,
                   "maxSteps":1024,
                   "agentCameras":True}

env = make_env(config_dict)()

In [None]:
epochs = 30
num_steps = 1024
lengths = []

for epoch in range(epochs):
    next_obs, infos = reset_environment(env, device)

    next_obs = {k: torch.Tensor(v).unsqueeze(0).to(device) for k, v in next_obs.items()}
    next_done = {"sender": torch.zeros(num_envs).to(device), "receiver": torch.zeros(num_envs).to(device)}

    for step in range(0, num_steps):
        sender_action = sender.get_action_and_value(next_obs["sender"])[0]
        receiver_action = receiver.get_action_and_value(next_obs["receiver"])[0]

        next_obs, reward, terminations, truncations, info = env.step({"sender": sender_action.cpu().numpy()[0], "receiver": receiver_action.cpu().numpy()[0]})
        next_obs = {"sender": torch.Tensor(next_obs["sender"]).unsqueeze(0).to(device), "receiver": torch.Tensor(next_obs["receiver"]).unsqueeze(0).to(device)}

        if terminations["sender"] or terminations["receiver"] or truncations["sender"] or truncations["receiver"]:
            next_obs, infos = reset_environment(env, device)
            lengths.append(step)
            break

In [None]:
dynamic = env.env.env.environment_dynamics[3]
print("Accuracy:", sum(dynamic.accuracies) / len(dynamic.accuracies))
print("Variance:", sum(dynamic.variances) / len(dynamic.variances))
print("Send Accuracy:", sum(dynamic.sendAccuracies) / len(dynamic.sendAccuracies))
print("Length:", sum(lengths) / len(lengths))