In [1]:
try:
    from easypip import easyimport
except:
    !pip install easypip
    from easypip import easyimport

import functools
import time

easyimport("importlib_metadata==4.13.0")
OmegaConf = easyimport("omegaconf").OmegaConf
bbrl = easyimport("bbrl")
import gym

import os
import copy
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm

import gym

from bbrl.agents.agent import Agent
from bbrl import get_arguments, get_class, instantiate_class

# The workspace is the main class in BBRL, this is where all data is collected and stored
from bbrl.workspace import Workspace

# Agents(agent1,agent2,agent3,...) executes the different agents the one after the other
# TemporalAgent(agent) executes an agent over multiple timesteps in the workspace, 
# or until a given condition is reached
from bbrl.agents import Agents, RemoteAgent, TemporalAgent

# AutoResetGymAgent is an agent able to execute a batch of gym environments
# with auto-resetting. These agents produce multiple variables in the workspace: 
# ’env/env_obs’, ’env/reward’, ’env/timestep’, ’env/done’, ’env/initial_state’, ’env/cumulated_reward’, 
# ... When called at timestep t=0, then the environments are automatically reset. 
# At timestep t>0, these agents will read the ’action’ variable in the workspace at time t − 1
from bbrl.agents.gymb import AutoResetGymAgent, NoAutoResetGymAgent
# Not present in the A2C version...
from bbrl.utils.logger import TFLogger

from torch.distributions import Distribution
from torch.distributions.normal import Normal
from torch.distributions.independent import Independent

from bbrl.visu.visu_policies import plot_policy
from bbrl.visu.visu_critics import plot_critic

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
def build_backbone(sizes, activation):
    layers = []
    for j in range(len(sizes) - 2):
        layers += [nn.Linear(sizes[j], sizes[j + 1]), activation]
    return layers


def build_mlp(sizes, activation, output_activation=nn.Identity()):
    layers = []
    for j in range(len(sizes) - 1):
        act = activation if j < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j + 1]), act]
    return nn.Sequential(*layers)

class DiscreteActor(Agent):
    def __init__(self, state_dim, hidden_size, n_actions):
        super().__init__()
        self.model = build_mlp([state_dim] + list(hidden_size) + [n_actions], activation=nn.ReLU())
        self.model = self.model.to(device)

    
    def forward(self, t, stochastic, replay=False, **kwargs):
        """ 
        Compute the action given either a time step (looking into the workspace)
        or an observation (in kwargs)
        """
        if "observation" in kwargs:
            observation = kwargs["observation"]
        else:
            observation = self.get(("env/env_obs", t))
        scores = self.model(observation)
        probs = torch.softmax(scores, dim=-1)

        if stochastic:
            action = torch.distributions.Categorical(probs).sample()
        else:
          action = probs.argmax(1)

        entropy = torch.distributions.Categorical(probs).entropy()
        logprobs = probs[torch.arange(probs.size()[0]), action].log()

        if not replay:
            self.set(("action", t), action)
        self.set(("action_logprobs", t), logprobs)
        self.set(("entropy", t), entropy)

    
    def predict_action(self, obs, stochastic):
        obs = obs.to(device)
        scores = self.model(obs)

        if stochastic:
            probs = torch.softmax(scores, dim=-1)
            action = torch.distributions.Categorical(probs).sample()
        else:
            action = scores.argmax(0)
        return action
    
class ContinuousAgent(Agent):
    def dist(self, obs: torch.Tensor) -> Distribution:
        """Returns the distributions for the given observations"""
        assert False, "to implement in subclass"
            
    def forward(self, t, stochastic, **kwargs):
        obs = self.get(("env/env_obs", t))
        dist = self.dist(obs)

        action = dist.sample() if stochastic else dist.mean

        logp_pi = dist.log_prob(action)
        
        self.set(("entropy", t), dist.entropy())

        self.set(("action", t), action)
        self.set(("action_logprobs", t), logp_pi)


    def predict_action(self, obs, stochastic):
        """Predict just one action (without using the workspace)"""
        obs = obs.to(device)
        dist = self.dist(obs)
        action = dist.sample() if stochastic else dist.mean
        return action
    
class ConstantVarianceContinuousActor(ContinuousAgent):
    def __init__(self, state_dim, hidden_layers, action_dim, **kwargs):
        super().__init__()
        layers = [state_dim] + list(hidden_layers) + [action_dim]
        self.model = build_mlp(layers, activation=nn.ReLU())
        self.model = self.model.to(device)
        self.std_param = 2

    def dist(self, obs: torch.Tensor):
        mean = self.model(obs)    
        return Independent(Normal(mean, self.std_param), 1)  # std must be positive
    
class TunableVarianceContinuousActor(ContinuousAgent):
    def __init__(self, state_dim, hidden_layers, action_dim):
        super().__init__()
        layers = [state_dim] + list(hidden_layers) + [action_dim]
        self.model = build_mlp(layers, activation=nn.ReLU())
        self.model = self.model.to(device)

        # The standard deviation associated with each dimension
        self.std_param = nn.parameter.Parameter(torch.randn(action_dim, 1))
        
        # We use the softplus function to compute the variance for the normal
        # The base version computes exp(1+log(x)) component-wise
        # https://pytorch.org/docs/stable/generated/torch.nn.Softplus.html
        self.soft_plus = torch.nn.Softplus()

    def dist(self, obs: torch.Tensor):
        mean = self.model(obs)
        return Independent(Normal(mean, self.soft_plus(self.std_param)), 1)

class StateDependentVarianceContinuousActor(ContinuousAgent):
    def __init__(self, state_dim, hidden_layers, action_dim):
        super().__init__()

        # Buils the "backbone" neural network
        backbone_dim = [state_dim] + list(hidden_layers)
        self.layers = build_backbone(backbone_dim, activation=nn.ReLU())
        self.backbone = nn.Sequential(*self.layers)
        self.backbone = self.backbone.to(device)
        
        self.mean = nn.Sequential(self.backbone, nn.Linear(backbone_dim[-1], action_dim)).to(device)
        self.std = nn.Sequential(self.backbone, nn.Linear(backbone_dim[-1], action_dim)).to(device)
        self.mean = self.mean.to(device)
        self.std = self.std.to(device)

    
    def dist(self, obs: torch.Tensor) -> Distribution:        
        mean = self.mean(obs)
        std = self.std(obs)
        return Independent(Normal(mean, nn.functional.softplus(std)), 1)

def make_env(env_name):
    return gym.make(env_name)


def get_env_agents(cfg):
    train_env_agent = AutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.n_envs,
        cfg.algorithm.seed,
    )
    eval_env_agent = NoAutoResetGymAgent(
        get_class(cfg.gym_env),
        get_arguments(cfg.gym_env),
        cfg.algorithm.nb_evals,
        cfg.algorithm.seed,
    )
    return train_env_agent, eval_env_agent


class VAgent(Agent):
    def __init__(self, state_dim, hidden_layers):
        super().__init__()
        self.is_q_function = False
        self.model = build_mlp(
            [state_dim] + list(hidden_layers) + [1], activation=nn.ReLU()
        )
        self.model = self.model.to(device)

    def forward(self, t, **kwargs):
        observation = self.get(("env/env_obs", t))
        critic = self.model(observation).squeeze(-1)
        self.set(("v_value", t), critic)
        
# Create the A2C Agent
def make_agents(cfg, train_env_agent, eval_env_agent):
    obs_size, act_size = train_env_agent.get_obs_and_actions_sizes()
    if train_env_agent.is_continuous_action():
        action_agent = globals()[cfg.algorithm.action_agent](
            obs_size, cfg.algorithm.architecture.actor_hidden_size, act_size
        )
    else:
        action_agent = DiscreteActor(obs_size, cfg.algorithm.architecture.actor_hidden_size, act_size)

    tr_agent = TemporalAgent(Agents(train_env_agent, action_agent))
    ev_agent = TemporalAgent(Agents(eval_env_agent, action_agent))

    critic_agent = TemporalAgent(VAgent(obs_size, cfg.algorithm.architecture.critic_hidden_size))
    return tr_agent, ev_agent, critic_agent

class Logger():

  def __init__(self, cfg):
    self.logger = instantiate_class(cfg.logger)

  def add_log(self, log_string, loss, epoch):
    self.logger.add_scalar(log_string, loss.item(), epoch)

  # Log losses
  def log_losses(self, epoch, critic_loss, entropy_loss, a2c_loss):
    self.add_log("critic_loss", critic_loss, epoch)
    self.add_log("entropy_loss", entropy_loss, epoch)
    self.add_log("a2c_loss", a2c_loss, epoch)


# Configure the optimizer over the a2c agent
def setup_optimizer(cfg, action_agent, critic_agent):
    optimizer_args = get_arguments(cfg.optimizer)
    parameters = nn.Sequential(action_agent, critic_agent).parameters()
    optimizer = get_class(cfg.optimizer)(parameters, **optimizer_args)
    return optimizer

def execute_agent(cfg, epoch, workspace, agent):
    if epoch > 0:
        workspace.zero_grad()
        workspace.copy_n_last_steps(1)
        agent(
            workspace, t=1, n_steps=cfg.algorithm.n_steps - 1, stochastic=True
        )
    else:
        agent(workspace, t=0, n_steps=cfg.algorithm.n_steps, stochastic=True)
        
from bbrl.utils.functionalb import gae

def compute_critic_loss(cfg, reward, must_bootstrap, v_value):
    # Compute temporal difference
    # target = reward[:-1] + cfg.algorithm.discount_factor * v_value[1:].detach() * must_bootstrap.int()
    target = gae(v_value, reward, must_bootstrap, cfg.algorithm.discount_factor, cfg.algorithm.gae_coef)
    td = target - v_value[:-1]
    td_error = td**2
    critic_loss = td_error.mean()
    return critic_loss, td

def compute_actor_loss(action_logp, td):
    a2c_loss = action_logp[:-1] * td.detach()
    return a2c_loss.mean()

In [3]:
def run_a2c(cfg):
    logger = Logger(cfg)
    best_reward = float('-inf')

    # 2) Create the environment agent
    train_env_agent, eval_env_agent = get_env_agents(cfg)
    
    tr_agent, eval_agent, critic_agent = make_agents(cfg, train_env_agent, eval_env_agent)

    # 5) Configure the workspace to the right dimension
    # Note that no parameter is needed to create the workspace.
    # In the training loop, calling the agent() and critic_agent()
    # will take the workspace as parameter
    train_workspace = Workspace()  # Used for training
    
    # send to device:
    train_env_agent = train_env_agent.to(device)
    eval_env_agent = eval_env_agent.to(device)
    tr_agent = tr_agent.to(device)
    eval_agent = eval_agent.to(device)
    critic_agent = critic_agent.to(device)
    train_workspace = train_workspace.to(device)

    # 6) Configure the optimizer over the a2c agent
    optimizer = setup_optimizer(cfg, tr_agent, critic_agent)
    nb_steps = 0
    tmp_steps = 0

    # 7) Training loop
    for epoch in (pbar := tqdm(range(cfg.algorithm.max_epochs))):
        # Execute the agent in the workspace
        if epoch > 0:
            train_workspace.zero_grad()
            train_workspace.copy_n_last_steps(1)
            tr_agent(
                train_workspace, t=1, n_steps=cfg.algorithm.n_steps - 1, stochastic=True
            )
        else:
            tr_agent(
                train_workspace, t=0, n_steps=cfg.algorithm.n_steps, stochastic=True
            )

        # Compute the critic value over the whole workspace
        critic_agent(train_workspace, n_steps=cfg.algorithm.n_steps)

        transition_workspace = train_workspace.get_transitions()

        v_value, done, truncated, reward, action, action_logp = transition_workspace[
            "v_value",
            "env/done",
            "env/truncated",
            "env/reward",
            "action",
            "action_logprobs",
        ]
        nb_steps += action[0].shape[0]
        # Determines whether values of the critic should be propagated
        # True if the episode reached a time limit or if the task was not done
        # See https://colab.research.google.com/drive/1erLbRKvdkdDy0Zn1X_JhC01s1QAt4BBj
        must_bootstrap = torch.logical_or(~done[1], truncated[1])

        # Compute critic loss
        critic_loss, td = compute_critic_loss(cfg, reward, must_bootstrap, v_value)
        a2c_loss = compute_actor_loss(action_logp, td)

        # Compute entropy loss
        entropy_loss = torch.mean(train_workspace["entropy"])

        # Store the losses for tensorboard display
        logger.log_losses(nb_steps, critic_loss, entropy_loss, a2c_loss)

        # Compute the total loss
        loss = (
            -cfg.algorithm.entropy_coef * entropy_loss
            + cfg.algorithm.critic_coef * critic_loss
            - cfg.algorithm.a2c_coef * a2c_loss
        )

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(tr_agent.parameters(), cfg.algorithm.max_grad_norm)
        optimizer.step()

        if nb_steps - tmp_steps > cfg.algorithm.eval_interval:
            tmp_steps = nb_steps
            eval_workspace = Workspace()  # Used for evaluation
            eval_workspace = eval_workspace.to(device)
            eval_agent(eval_workspace, t=0, stop_variable="env/done", stochastic=False)
            rewards = eval_workspace["env/cumulated_reward"][-1]
            mean = rewards.mean()
            logger.add_log("reward", mean, nb_steps)
            pbar.set_description(f"epoch: {epoch}, best_reward: {best_reward}, reward: {mean}")
            if cfg.save_best and mean > best_reward:
                best_reward = mean
                directory = "./a2c_policies/"
                if not os.path.exists(directory):
                    os.makedirs(directory)
                filename = directory + "a2c_" + str(mean.item()) + ".agt"
                policy = eval_agent.agent.agents[1]
                policy.save_model(filename)
                critic = critic_agent.agent
                if cfg.plot_policy:
                    plot_policy(
                        policy,
                        eval_env_agent,
                        "./a2c_advanced_plots/",
                        cfg.gym_env.env_name,
                        best_reward,
                        stochastic=False,
                    )
                    """
                    plot_critic(
                        critic,
                        eval_env_agent,
                        "./a2c_advanced_plots/",
                        cfg.gym_env.env_name,
                        best_reward,
                    )"""
                    
    return mean.item()


In [4]:
import my_gym

params={
  "save_best": True,
  "plot_policy": True,

  "logger":{
    "classname": "bbrl.utils.logger.TFLogger",
    "log_dir": "./tblogs-tp6-advanced/constant-var-actor/" + str(time.time()),
    "cache_size": 10000,
    "every_n_seconds": 10,
    "verbose": False,    
    },

  "algorithm":{
    "seed": 4,
    "n_envs": 8,
    "nb_evals":10,
    "n_steps": 16,
    "eval_interval": 1000,
    "max_epochs": 1000,
    "discount_factor": 0.95,
    "entropy_coef": 0.001,
    "critic_coef": 1.0,
    "a2c_coef": 0.1,
    "gae_coef": 0.8,
    "max_grad_norm": 0.5,
    
    # You can change the chosen action agent here
    #"action_agent": "ConstantVarianceContinuousActor",
    #"action_agent": "StateDependentVarianceContinuousActor",
    "action_agent": "TunableVarianceContinuousActor",  
    "architecture":{
      "actor_hidden_size": [25, 25],
      "critic_hidden_size": [24, 36],
    },
  },
  "gym_env":{
    "classname": "__main__.make_env",
    "env_name": "CartPoleContinuous-v1",
    },
  "optimizer":{
    "classname": "torch.optim.Adam",
    "lr": 0.01,
  }
}


# For Colab - otherwise, it is easier and better to launch tensorboard from
# the terminal
if get_ipython().__class__.__module__ == "google.colab._shell":
    %load_ext tensorboard
    %tensorboard --logdir ./tmp
else:
    import sys
    import os
    import os.path as osp
    print(f'''Launch tensorboard from the shell:\n{osp.dirname(sys.executable)}/tensorboard --logdir="{os.getcwd()}/tblogs-tp6-advanced"''')

Matplotlib backend: module://matplotlib_inline.backend_inline
Launch tensorboard from the shell:
/home/manuel/deepdac/bin/tensorboard --logdir="/home/manuel/RLD/TP6/tblogs-tp6-advanced"


In [None]:
print(device)
config=OmegaConf.create(params)
torch.manual_seed(config.algorithm.seed)
run_a2c(config)


In [7]:
import optuna

  
def setup_params(sample_params, base_params, env):
    params = {
        "save_best": True,
        "plot_policy": True,
        "logger":{
            "classname": "bbrl.utils.logger.TFLogger",
            "log_dir": "./tblogs-tp6-advanced/"+env+"/"+sample_params["actor"]+"/" + str(time.time()),
            "cache_size": 10000,
            "every_n_seconds": 10,
            "verbose": False,    
            },
        "algorithm":{
            "seed": 4,
            "n_envs": 8,
            "nb_evals":10,
            "n_steps": sample_params['n_steps'],
            "eval_interval": 1000,
            "max_epochs": 2000,
            "discount_factor": sample_params['discount_factor'],
            "entropy_coef": sample_params['entropy_coef'],
            "critic_coef": sample_params['critic_coef'],
            "a2c_coef": sample_params['a2c_coef'],
            "gae_coef": sample_params['gae_coef'],
            "max_grad_norm": sample_params['max_grad_norm'],
            "action_agent": sample_params["actor"],  
            "architecture":{
                "actor_hidden_size": sample_params['actor_hidden_size'],
                "critic_hidden_size": sample_params['critic_hidden_size'],
            },
        },
        "gym_env":{
            "classname": "__main__.make_env",
            "env_name": env,
            },
        "optimizer":{
            "classname": "torch.optim.Adam",
            "lr": sample_params['learning_rate'],
        }
    }
    return params
    

def sample_a2c_params(trial: optuna.Trial):
    """
    Sampler for A2C hyperparams.

    :param trial:
    :return:
    """
    normalize: True
  n_envs: 8
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  ent_coef: 0.0
  max_grad_norm: 0.5
  n_steps: 8
  gae_lambda: 0.9
  vf_coef: 0.4
  policy_kwargs: "dict(log_std_init=-2, ortho_init=False)"
    discount_factor = trial.suggest_categorical("discount_factor", [0.9, 0.95, 0.98])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    gae_coef = trial.suggest_categorical("gae_coef", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    #n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-7, 1)
    entropy_coef = trial.suggest_loguniform("entropy_coef", 0.00000001, 1)
    critic_coef = trial.suggest_loguniform("critic_coef", 0.00000001, 1)
    a2c_coef = trial.suggest_loguniform("a2c_coef", 0.00000001, 1)
    
    net_arch = trial.suggest_categorical("net_arch", [64, 256])
    actor = trial.suggest_categorical("actor", ["TunableVarianceContinuousActor", "ConstantVarianceContinuousActor", "StateDependentVarianceContinuousActor"])
    return {
        "learning_rate": learning_rate,
        "n_steps": 16,
        "discount_factor": discount_factor,
        "entropy_coef": entropy_coef,
        "critic_coef": critic_coef,
        "a2c_coef": a2c_coef,
        "gae_coef": gae_coef,
        "max_grad_norm": max_grad_norm,
        "actor": actor,
        "actor_hidden_size": [net_arch, net_arch],
        "critic_hidden_size": [net_arch, net_arch],
        
    }

In [8]:
import torch

import optuna

# 1. Define an objective function to be maximized.
def objective(trial):

    # 2. Suggest values of the hyperparameters using a trial object.
    sampled_hyperparams = sample_a2c_params(trial)
    trial_params = setup_params(sampled_hyperparams, params, "Pendulum-v1")
    config=OmegaConf.create(trial_params)
    torch.manual_seed(config.algorithm.seed)
    print(sampled_hyperparams)
    reward = run_a2c(config)
    return reward

# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2022-12-01 14:17:23,404][0m A new study created in memory with name: no-name-2e8573a6-2af8-42b4-881b-d856701bdb37[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
  entropy_coef = trial.suggest_loguniform("entropy_coef", 0.00000001, 0.1)
  critic_coef = trial.suggest_loguniform("critic_coef", 0.00000001, 0.1)
  a2c_coef = trial.suggest_loguniform("a2c_coef", 0.00000001, 0.1)


{'learning_rate': 0.00791960856009742, 'n_steps': 16, 'discount_factor': 0.999, 'entropy_coef': 8.410300998010625e-05, 'critic_coef': 0.00035755850932532807, 'a2c_coef': 0.00011479799330093288, 'gae_coef': 0.92, 'max_grad_norm': 0.9, 'actor': 'StateDependentVarianceContinuousActor', 'actor_hidden_size': [64, 64], 'critic_hidden_size': [64, 64]}


epoch: 1997, best_reward: -994.9615478515625, reward: -1492.1990966796875: 100%|██████████| 2000/2000 [05:18<00:00,  6.28it/s]
[32m[I 2022-12-01 14:22:41,902][0m Trial 0 finished with value: -1492.1990966796875 and parameters: {'discount_factor': 0.999, 'max_grad_norm': 0.9, 'gae_coef': 0.92, 'learning_rate': 0.00791960856009742, 'entropy_coef': 8.410300998010625e-05, 'critic_coef': 0.00035755850932532807, 'a2c_coef': 0.00011479799330093288, 'net_arch': 64, 'actor': 'StateDependentVarianceContinuousActor'}. Best is trial 0 with value: -1492.1990966796875.[0m


{'learning_rate': 0.5008165177244058, 'n_steps': 16, 'discount_factor': 0.95, 'entropy_coef': 4.52149880233622e-08, 'critic_coef': 1.3350914092531017e-05, 'a2c_coef': 0.022038986256550926, 'gae_coef': 0.99, 'max_grad_norm': 0.8, 'actor': 'ConstantVarianceContinuousActor', 'actor_hidden_size': [64, 64], 'critic_hidden_size': [64, 64]}


epoch: 1997, best_reward: -1129.4071044921875, reward: -1567.734619140625: 100%|██████████| 2000/2000 [04:54<00:00,  6.79it/s] 
[32m[I 2022-12-01 14:27:36,570][0m Trial 1 finished with value: -1567.734619140625 and parameters: {'discount_factor': 0.95, 'max_grad_norm': 0.8, 'gae_coef': 0.99, 'learning_rate': 0.5008165177244058, 'entropy_coef': 4.52149880233622e-08, 'critic_coef': 1.3350914092531017e-05, 'a2c_coef': 0.022038986256550926, 'net_arch': 64, 'actor': 'ConstantVarianceContinuousActor'}. Best is trial 0 with value: -1492.1990966796875.[0m


{'learning_rate': 0.010879478454815817, 'n_steps': 16, 'discount_factor': 0.9999, 'entropy_coef': 0.013617771055471581, 'critic_coef': 0.0018278715310616662, 'a2c_coef': 0.0012355182187374942, 'gae_coef': 0.99, 'max_grad_norm': 0.3, 'actor': 'TunableVarianceContinuousActor', 'actor_hidden_size': [256, 256], 'critic_hidden_size': [256, 256]}


epoch: 1997, best_reward: -874.0810546875, reward: -1401.4061279296875: 100%|██████████| 2000/2000 [05:29<00:00,  6.07it/s]    
[32m[I 2022-12-01 14:33:05,915][0m Trial 2 finished with value: -1401.4061279296875 and parameters: {'discount_factor': 0.9999, 'max_grad_norm': 0.3, 'gae_coef': 0.99, 'learning_rate': 0.010879478454815817, 'entropy_coef': 0.013617771055471581, 'critic_coef': 0.0018278715310616662, 'a2c_coef': 0.0012355182187374942, 'net_arch': 256, 'actor': 'TunableVarianceContinuousActor'}. Best is trial 2 with value: -1401.4061279296875.[0m


{'learning_rate': 0.001050139425019364, 'n_steps': 16, 'discount_factor': 0.99, 'entropy_coef': 2.1158350291600146e-05, 'critic_coef': 3.4123895268260562e-06, 'a2c_coef': 2.1277866724887434e-05, 'gae_coef': 0.92, 'max_grad_norm': 0.3, 'actor': 'TunableVarianceContinuousActor', 'actor_hidden_size': [256, 256], 'critic_hidden_size': [256, 256]}


epoch: 1997, best_reward: -843.44970703125, reward: -1279.4151611328125: 100%|██████████| 2000/2000 [05:18<00:00,  6.27it/s]   
[32m[I 2022-12-01 14:38:24,814][0m Trial 3 finished with value: -1279.4151611328125 and parameters: {'discount_factor': 0.99, 'max_grad_norm': 0.3, 'gae_coef': 0.92, 'learning_rate': 0.001050139425019364, 'entropy_coef': 2.1158350291600146e-05, 'critic_coef': 3.4123895268260562e-06, 'a2c_coef': 2.1277866724887434e-05, 'net_arch': 256, 'actor': 'TunableVarianceContinuousActor'}. Best is trial 3 with value: -1279.4151611328125.[0m


{'learning_rate': 0.0009066540159199412, 'n_steps': 16, 'discount_factor': 0.995, 'entropy_coef': 0.011026416819215262, 'critic_coef': 0.0018178440671131248, 'a2c_coef': 1.2598164464476649e-06, 'gae_coef': 0.92, 'max_grad_norm': 0.8, 'actor': 'StateDependentVarianceContinuousActor', 'actor_hidden_size': [64, 64], 'critic_hidden_size': [64, 64]}


epoch: 1997, best_reward: -1166.8460693359375, reward: -1601.8599853515625: 100%|██████████| 2000/2000 [05:00<00:00,  6.66it/s]
[32m[I 2022-12-01 14:43:25,149][0m Trial 4 finished with value: -1601.8599853515625 and parameters: {'discount_factor': 0.995, 'max_grad_norm': 0.8, 'gae_coef': 0.92, 'learning_rate': 0.0009066540159199412, 'entropy_coef': 0.011026416819215262, 'critic_coef': 0.0018178440671131248, 'a2c_coef': 1.2598164464476649e-06, 'net_arch': 64, 'actor': 'StateDependentVarianceContinuousActor'}. Best is trial 3 with value: -1279.4151611328125.[0m


{'learning_rate': 0.033357951095507196, 'n_steps': 16, 'discount_factor': 0.9999, 'entropy_coef': 5.0768982499848466e-05, 'critic_coef': 2.2705416215806046e-05, 'a2c_coef': 1.2897644599515345e-08, 'gae_coef': 0.95, 'max_grad_norm': 1, 'actor': 'StateDependentVarianceContinuousActor', 'actor_hidden_size': [64, 64], 'critic_hidden_size': [64, 64]}


epoch: 1997, best_reward: -1257.1170654296875, reward: -1481.1353759765625: 100%|██████████| 2000/2000 [04:14<00:00,  7.87it/s]
[32m[I 2022-12-01 14:47:39,359][0m Trial 5 finished with value: -1481.1353759765625 and parameters: {'discount_factor': 0.9999, 'max_grad_norm': 1, 'gae_coef': 0.95, 'learning_rate': 0.033357951095507196, 'entropy_coef': 5.0768982499848466e-05, 'critic_coef': 2.2705416215806046e-05, 'a2c_coef': 1.2897644599515345e-08, 'net_arch': 64, 'actor': 'StateDependentVarianceContinuousActor'}. Best is trial 3 with value: -1279.4151611328125.[0m


{'learning_rate': 0.007172166197672235, 'n_steps': 16, 'discount_factor': 0.9999, 'entropy_coef': 3.739600752564219e-07, 'critic_coef': 1.9224578879716248e-06, 'a2c_coef': 7.200578995322678e-07, 'gae_coef': 0.99, 'max_grad_norm': 0.3, 'actor': 'TunableVarianceContinuousActor', 'actor_hidden_size': [256, 256], 'critic_hidden_size': [256, 256]}


epoch: 1997, best_reward: -1257.1170654296875, reward: -1481.1353759765625: 100%|██████████| 2000/2000 [05:26<00:00,  6.12it/s]
[32m[I 2022-12-01 14:53:06,383][0m Trial 6 finished with value: -1481.1353759765625 and parameters: {'discount_factor': 0.9999, 'max_grad_norm': 0.3, 'gae_coef': 0.99, 'learning_rate': 0.007172166197672235, 'entropy_coef': 3.739600752564219e-07, 'critic_coef': 1.9224578879716248e-06, 'a2c_coef': 7.200578995322678e-07, 'net_arch': 256, 'actor': 'TunableVarianceContinuousActor'}. Best is trial 3 with value: -1279.4151611328125.[0m


{'learning_rate': 0.0001466272889532353, 'n_steps': 16, 'discount_factor': 0.999, 'entropy_coef': 1.2397766137156187e-06, 'critic_coef': 4.889890462677407e-07, 'a2c_coef': 0.0007196057691934382, 'gae_coef': 0.98, 'max_grad_norm': 0.3, 'actor': 'StateDependentVarianceContinuousActor', 'actor_hidden_size': [64, 64], 'critic_hidden_size': [64, 64]}


epoch: 1997, best_reward: -1116.211181640625, reward: -1640.2789306640625: 100%|██████████| 2000/2000 [06:14<00:00,  5.34it/s]
[32m[I 2022-12-01 14:59:21,106][0m Trial 7 finished with value: -1640.2789306640625 and parameters: {'discount_factor': 0.999, 'max_grad_norm': 0.3, 'gae_coef': 0.98, 'learning_rate': 0.0001466272889532353, 'entropy_coef': 1.2397766137156187e-06, 'critic_coef': 4.889890462677407e-07, 'a2c_coef': 0.0007196057691934382, 'net_arch': 64, 'actor': 'StateDependentVarianceContinuousActor'}. Best is trial 3 with value: -1279.4151611328125.[0m


{'learning_rate': 0.05304988597148725, 'n_steps': 16, 'discount_factor': 0.9, 'entropy_coef': 1.064294660371012e-07, 'critic_coef': 0.005509667806274398, 'a2c_coef': 4.168109175052286e-07, 'gae_coef': 0.92, 'max_grad_norm': 0.3, 'actor': 'ConstantVarianceContinuousActor', 'actor_hidden_size': [64, 64], 'critic_hidden_size': [64, 64]}


epoch: 1997, best_reward: -861.0369262695312, reward: -1433.6409912109375: 100%|██████████| 2000/2000 [05:40<00:00,  5.87it/s]
[32m[I 2022-12-01 15:05:01,715][0m Trial 8 finished with value: -1433.6409912109375 and parameters: {'discount_factor': 0.9, 'max_grad_norm': 0.3, 'gae_coef': 0.92, 'learning_rate': 0.05304988597148725, 'entropy_coef': 1.064294660371012e-07, 'critic_coef': 0.005509667806274398, 'a2c_coef': 4.168109175052286e-07, 'net_arch': 64, 'actor': 'ConstantVarianceContinuousActor'}. Best is trial 3 with value: -1279.4151611328125.[0m


{'learning_rate': 0.03814224227269237, 'n_steps': 16, 'discount_factor': 0.99, 'entropy_coef': 0.0006591931989783329, 'critic_coef': 0.011255869815500367, 'a2c_coef': 4.568113471843917e-07, 'gae_coef': 0.95, 'max_grad_norm': 0.9, 'actor': 'TunableVarianceContinuousActor', 'actor_hidden_size': [256, 256], 'critic_hidden_size': [256, 256]}


epoch: 1997, best_reward: -915.5299682617188, reward: -1291.9088134765625: 100%|██████████| 2000/2000 [05:40<00:00,  5.88it/s]
[32m[I 2022-12-01 15:10:41,845][0m Trial 9 finished with value: -1291.9088134765625 and parameters: {'discount_factor': 0.99, 'max_grad_norm': 0.9, 'gae_coef': 0.95, 'learning_rate': 0.03814224227269237, 'entropy_coef': 0.0006591931989783329, 'critic_coef': 0.011255869815500367, 'a2c_coef': 4.568113471843917e-07, 'net_arch': 256, 'actor': 'TunableVarianceContinuousActor'}. Best is trial 3 with value: -1279.4151611328125.[0m


{'learning_rate': 2.191825231064838e-05, 'n_steps': 16, 'discount_factor': 0.99, 'entropy_coef': 2.977249462045399e-06, 'critic_coef': 6.07404519245161e-08, 'a2c_coef': 2.6959789919387764e-05, 'gae_coef': 1.0, 'max_grad_norm': 0.7, 'actor': 'TunableVarianceContinuousActor', 'actor_hidden_size': [256, 256], 'critic_hidden_size': [256, 256]}


epoch: 1754, best_reward: -1051.1533203125, reward: -1818.632080078125:  88%|████████▊ | 1763/2000 [04:43<00:23,  9.99it/s]  