In [1]:
import os
from typing import Optional, Tuple

import gym
import numpy as np
import torch
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils.net.common import Net

from pettingzoo.classic import tictactoe_v3

In [2]:
# Demo: let two random policies play one rendered game of tic-tac-toe.
# Each stage gets its own name so the factory lambda below closes over a
# variable that is never rebound; the final `env`, `policies`, `collector`
# and `result` bindings are the same objects as before.
raw_env = tictactoe_v3.env(render_mode="human")
wrapped_env = PettingZooEnv(raw_env)
policies = MultiAgentPolicyManager([RandomPolicy(), RandomPolicy()], wrapped_env)
# The lambda returns `wrapped_env` regardless of when DummyVectorEnv calls it,
# because `wrapped_env` (unlike the old shared `env` name) is not reassigned.
env = DummyVectorEnv([lambda: wrapped_env])
collector = Collector(policies, env)
# NOTE(review): render=0.1 is presumably the pause between rendered frames — confirm.
result = collector.collect(n_episode=1, render=0.1)

     |     |     
  -  |  -  |  X  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
     |     |     
  -  |  -  |  X  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
     |     |     
  -  |  -  |  X  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  O  |  -  |  -  
     |     |     
     |     |     
  -  |  -  |  X  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  O  |  -  |  -  
     |     |     
     |     |     
  -  |  X  |  X  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  O  |  -  |  -  
     |     |     
     |     |     
  -  |  X  |  X  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  O  |  -  |  -  
     |     |     
     |     |     
  O  |  X 

In [3]:
def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    """Build the two-player policy manager used for training.

    Args:
        agent_learn: Policy to train. When ``None``, a ``DQNPolicy`` backed by
            a four-layer (128 units each) ``Net`` is created.
        agent_opponent: Policy to play against. When ``None``, a
            ``RandomPolicy`` is used.
        optim: Optimizer for the default learner. When ``None`` and a default
            learner is created, ``Adam`` with ``lr=1e-4`` is used. It is not
            used when ``agent_learn`` is supplied.

    Returns:
        A tuple ``(policy, optim, agents)`` where ``policy`` is a
        ``MultiAgentPolicyManager`` over ``[agent_opponent, agent_learn]``
        (the learner is the second entry), ``optim`` is the optimizer
        (still ``None`` if ``agent_learn`` was supplied without one), and
        ``agents`` is ``env.agents`` from a freshly created environment.
    """
    env = _get_env()

    # Unwrap the "observation" entry exactly once. The check is duck-typed on
    # the `.spaces` mapping so that it recognises a Dict space from either
    # `gym` or `gymnasium`; the previous code unwrapped on an isinstance check
    # and then indexed ["observation"] a second time, which fails as soon as
    # the first unwrap actually happens.
    observation_space = env.observation_space
    sub_spaces = getattr(observation_space, "spaces", None)
    if isinstance(sub_spaces, dict) and "observation" in sub_spaces:
        observation_space = sub_spaces["observation"]

    if agent_learn is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # model
        net = Net(
            # `.shape` is falsy for spaces without a shape; fall back to `.n`.
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 128, 128, 128],
            device=device,
        ).to(device)
        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
        )

    if agent_opponent is None:
        agent_opponent = RandomPolicy()

    # Order matters: callers address the learner as agents[1].
    agents = [agent_opponent, agent_learn]
    policy = MultiAgentPolicyManager(agents, env)
    return policy, optim, env.agents

In [4]:
def _get_env():
    """Return a new tic-tac-toe environment wrapped in ``PettingZooEnv``.

    Exposed as a zero-argument factory so it can be passed as a callable
    to ``DummyVectorEnv``.
    """
    tictactoe_env = tictactoe_v3.env()
    return PettingZooEnv(tictactoe_env)

In [None]:
# ======== Step 1: Environment setup =========
train_envs = DummyVectorEnv([_get_env for _ in range(10)])
test_envs = DummyVectorEnv([_get_env for _ in range(10)])

# seed
seed = 1
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# ======== Step 2: Agent setup =========
policy, optim, agents = _get_agents()

# ======== Step 3: Collector setup =========
train_collector = Collector(
    policy,
    train_envs,
    VectorReplayBuffer(200000, len(train_envs)),
    exploration_noise=True,
)
test_collector = Collector(policy, test_envs, exploration_noise=True)
# Pre-fill the replay buffer before training starts.
# NOTE(review): 640 was written as "batch size (64) * number of training envs
# (10)", but the trainer below is called with batch_size=2 — confirm which
# value is intended.
train_collector.collect(n_step=64 * 10)

# ======== Step 4: Callback functions setup =========
# Checkpoints live under log/ttt/dqn. The previous "rps" directory name was
# carried over from a different example and would mix this run's weights with
# that experiment's.
model_save_dir = os.path.join("log", "ttt", "dqn")


def save_best_fn(policy):
    """Save the learner's (agents[1]) weights to ``model_save_dir``/policy.pth."""
    os.makedirs(model_save_dir, exist_ok=True)
    model_save_path = os.path.join(model_save_dir, "policy.pth")
    torch.save(policy.policies[agents[1]].state_dict(), model_save_path)


def stop_fn(mean_rewards):
    """Return True once the mean test reward reaches 0.8."""
    return mean_rewards >= 0.8


def train_fn(epoch, env_step):
    """Set the learner's exploration epsilon to 0.1 for training."""
    policy.policies[agents[1]].set_eps(0.1)


def test_fn(epoch, env_step):
    """Set the learner's exploration epsilon to 0.05 for testing."""
    policy.policies[agents[1]].set_eps(0.05)


def reward_metric(rews):
    """Select column 1 of the reward array (same index as the learner, agents[1])."""
    return rews[:, 1]


# ======== Step 5: Run the trainer =========
result = offpolicy_trainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=500,
    step_per_epoch=1000,
    step_per_collect=1,
    episode_per_test=5000,
    batch_size=2,
    train_fn=train_fn,
    test_fn=test_fn,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn,
    update_per_step=0.1,
    test_in_train=False,
    reward_metric=reward_metric,
)

print(f"\n==========Result==========\n{result}")
print("\n(the trained policy can be accessed via policy.policies[agents[1]])")

Epoch #1: 1001it [00:01, 784.81it/s, env_step=1000, len=5, n/ep=1, n/st=10, player_2/loss=0.182, rew=-1.00]             


Epoch #1: test_reward: -0.171000 ± 0.925721, best_reward: -0.171000 ± 0.925721 in #1


Epoch #2: 1001it [00:01, 769.68it/s, env_step=2000, len=9, n/ep=1, n/st=10, player_2/loss=0.229, rew=0.00]              


Epoch #2: test_reward: -0.155000 ± 0.936042, best_reward: -0.155000 ± 0.936042 in #2


Epoch #3: 1001it [00:01, 790.12it/s, env_step=3000, len=9, n/ep=1, n/st=10, player_2/loss=0.329, rew=0.00]              


Epoch #3: test_reward: -0.025200 ± 0.978450, best_reward: -0.025200 ± 0.978450 in #3


Epoch #4: 1001it [00:01, 797.34it/s, env_step=4000, len=8, n/ep=1, n/st=10, player_2/loss=0.270, rew=1.00]              


Epoch #4: test_reward: 0.140800 ± 0.977433, best_reward: 0.140800 ± 0.977433 in #4


Epoch #5: 1001it [00:01, 792.52it/s, env_step=5000, len=8, n/ep=0, n/st=10, player_2/loss=0.271, rew=1.00]              


Epoch #5: test_reward: -0.016600 ± 0.975871, best_reward: 0.140800 ± 0.977433 in #4


Epoch #6: 1001it [00:01, 796.81it/s, env_step=6000, len=6, n/ep=0, n/st=10, player_2/loss=0.345, rew=-1.00]             


Epoch #6: test_reward: -0.159200 ± 0.937793, best_reward: 0.140800 ± 0.977433 in #4


Epoch #7: 1001it [00:01, 791.25it/s, env_step=7000, len=5, n/ep=1, n/st=10, player_2/loss=0.325, rew=-1.00]             


Epoch #7: test_reward: -0.160400 ± 0.973587, best_reward: 0.140800 ± 0.977433 in #4


Epoch #8: 1001it [00:01, 883.68it/s, env_step=8000, len=8, n/ep=2, n/st=10, rew=-1.00]                                  


Epoch #8: test_reward: -0.220000 ± 0.941701, best_reward: 0.140800 ± 0.977433 in #4


Epoch #9: 1001it [00:01, 807.80it/s, env_step=9000, len=8, n/ep=2, n/st=10, rew=0.50]                                   


Epoch #9: test_reward: -0.049800 ± 0.989606, best_reward: 0.140800 ± 0.977433 in #4


Epoch #10: 1001it [00:01, 828.66it/s, env_step=10000, len=8, n/ep=3, n/st=10, player_2/loss=0.321, rew=0.33]            


Epoch #10: test_reward: 0.021400 ± 0.977621, best_reward: 0.140800 ± 0.977433 in #4


Epoch #11: 1001it [00:01, 788.28it/s, env_step=11000, len=8, n/ep=3, n/st=10, player_2/loss=0.285, rew=1.00]            


Epoch #11: test_reward: -0.024200 ± 0.984487, best_reward: 0.140800 ± 0.977433 in #4


Epoch #12: 1001it [00:01, 814.01it/s, env_step=12000, len=7, n/ep=1, n/st=10, player_2/loss=0.340, rew=-1.00]           


Epoch #12: test_reward: -0.240200 ± 0.952945, best_reward: 0.140800 ± 0.977433 in #4


Epoch #13: 1001it [00:01, 818.95it/s, env_step=13000, len=7, n/ep=2, n/st=10, player_2/loss=0.263, rew=0.00]            


Epoch #13: test_reward: -0.046600 ± 0.957720, best_reward: 0.140800 ± 0.977433 in #4


Epoch #14: 1001it [00:01, 794.57it/s, env_step=14000, len=8, n/ep=1, n/st=10, player_2/loss=0.221, rew=1.00]            


Epoch #14: test_reward: -0.078800 ± 0.958640, best_reward: 0.140800 ± 0.977433 in #4


Epoch #15: 1001it [00:01, 843.42it/s, env_step=15000, len=5, n/ep=3, n/st=10, player_2/loss=0.214, rew=-1.00]           


Epoch #15: test_reward: 0.095200 ± 0.961528, best_reward: 0.140800 ± 0.977433 in #4


Epoch #16: 1001it [00:01, 844.20it/s, env_step=16000, len=8, n/ep=1, n/st=10, rew=1.00]                                 


Epoch #16: test_reward: 0.148800 ± 0.962215, best_reward: 0.148800 ± 0.962215 in #16


Epoch #17: 1001it [00:01, 810.53it/s, env_step=17000, len=6, n/ep=2, n/st=10, player_2/loss=0.247, rew=-1.00]           


Epoch #17: test_reward: 0.256400 ± 0.928148, best_reward: 0.256400 ± 0.928148 in #17


Epoch #18: 1001it [00:01, 839.90it/s, env_step=18000, len=7, n/ep=2, n/st=10, player_2/loss=0.250, rew=0.00]            


Epoch #18: test_reward: 0.076400 ± 0.957999, best_reward: 0.256400 ± 0.928148 in #17


Epoch #19: 1001it [00:01, 850.62it/s, env_step=19000, len=8, n/ep=0, n/st=10, player_2/loss=0.326, rew=0.00]            


Epoch #19: test_reward: 0.021400 ± 0.969403, best_reward: 0.256400 ± 0.928148 in #17


Epoch #20: 1001it [00:01, 799.04it/s, env_step=20000, len=8, n/ep=1, n/st=10, player_2/loss=0.340, rew=1.00]            


Epoch #20: test_reward: 0.179800 ± 0.951983, best_reward: 0.256400 ± 0.928148 in #17


Epoch #21: 1001it [00:01, 811.86it/s, env_step=21000, len=8, n/ep=0, n/st=10, player_2/loss=0.344, rew=1.00]            


Epoch #21: test_reward: 0.187800 ± 0.960068, best_reward: 0.256400 ± 0.928148 in #17


Epoch #22: 1001it [00:01, 807.85it/s, env_step=22000, len=6, n/ep=2, n/st=10, player_2/loss=0.326, rew=1.00]            


Epoch #22: test_reward: 0.171000 ± 0.979265, best_reward: 0.256400 ± 0.928148 in #17


Epoch #23: 1001it [00:01, 793.41it/s, env_step=23000, len=9, n/ep=1, n/st=10, rew=0.00]                                 


Epoch #23: test_reward: 0.053400 ± 0.964027, best_reward: 0.256400 ± 0.928148 in #17


Epoch #24: 1001it [00:01, 795.72it/s, env_step=24000, len=6, n/ep=2, n/st=10, rew=0.00]                                 


Epoch #24: test_reward: -0.030600 ± 0.970806, best_reward: 0.256400 ± 0.928148 in #17


Epoch #25: 1001it [00:01, 816.18it/s, env_step=25000, len=6, n/ep=0, n/st=10, player_2/loss=0.310, rew=1.00]            


Epoch #25: test_reward: 0.101800 ± 0.958247, best_reward: 0.256400 ± 0.928148 in #17


Epoch #26: 1001it [00:01, 822.81it/s, env_step=26000, len=5, n/ep=1, n/st=10, rew=-1.00]                                


Epoch #26: test_reward: 0.139800 ± 0.941199, best_reward: 0.256400 ± 0.928148 in #17


Epoch #27: 1001it [00:01, 811.55it/s, env_step=27000, len=7, n/ep=0, n/st=10, rew=-1.00]                                


Epoch #27: test_reward: 0.037200 ± 0.960737, best_reward: 0.256400 ± 0.928148 in #17


Epoch #28: 1001it [00:01, 862.28it/s, env_step=28000, len=7, n/ep=3, n/st=10, rew=-0.33]                                


Epoch #28: test_reward: 0.199000 ± 0.939893, best_reward: 0.256400 ± 0.928148 in #17


Epoch #29: 1001it [00:01, 775.54it/s, env_step=29000, len=9, n/ep=2, n/st=10, player_2/loss=0.311, rew=-1.00]           


Epoch #29: test_reward: 0.273000 ± 0.928908, best_reward: 0.273000 ± 0.928908 in #29


Epoch #30: 1001it [00:01, 825.59it/s, env_step=30000, len=7, n/ep=3, n/st=10, rew=0.67]                                 


Epoch #30: test_reward: 0.226600 ± 0.956061, best_reward: 0.273000 ± 0.928908 in #29


Epoch #31: 1001it [00:01, 821.82it/s, env_step=31000, len=6, n/ep=4, n/st=10, player_2/loss=0.291, rew=0.00]            


Epoch #31: test_reward: 0.179200 ± 0.969684, best_reward: 0.273000 ± 0.928908 in #29


Epoch #32: 1001it [00:01, 799.01it/s, env_step=32000, len=6, n/ep=1, n/st=10, player_2/loss=0.232, rew=1.00]            


Epoch #32: test_reward: 0.321000 ± 0.921064, best_reward: 0.321000 ± 0.921064 in #32


Epoch #33: 1001it [00:01, 817.22it/s, env_step=33000, len=7, n/ep=1, n/st=10, player_2/loss=0.282, rew=-1.00]           


Epoch #33: test_reward: 0.027000 ± 0.989076, best_reward: 0.321000 ± 0.921064 in #32


Epoch #34: 1001it [00:01, 801.59it/s, env_step=34000, len=7, n/ep=0, n/st=10, rew=1.00]                                 


Epoch #34: test_reward: 0.214400 ± 0.960642, best_reward: 0.321000 ± 0.921064 in #32


Epoch #35: 1001it [00:01, 838.63it/s, env_step=35000, len=7, n/ep=0, n/st=10, player_2/loss=0.321, rew=0.33]            


Epoch #35: test_reward: 0.301600 ± 0.937570, best_reward: 0.321000 ± 0.921064 in #32


Epoch #36: 1001it [00:01, 887.32it/s, env_step=36000, len=7, n/ep=1, n/st=10, player_2/loss=0.324, rew=-1.00]           


Epoch #36: test_reward: 0.392400 ± 0.903561, best_reward: 0.392400 ± 0.903561 in #36


Epoch #37: 1001it [00:01, 745.35it/s, env_step=37000, len=6, n/ep=1, n/st=10, rew=1.00]                                 


Epoch #37: test_reward: 0.204800 ± 0.967914, best_reward: 0.392400 ± 0.903561 in #36


Epoch #38: 1001it [00:01, 770.38it/s, env_step=38000, len=9, n/ep=1, n/st=10, player_2/loss=0.248, rew=-1.00]           


Epoch #38: test_reward: 0.333600 ± 0.932261, best_reward: 0.392400 ± 0.903561 in #36


Epoch #39: 1001it [00:01, 800.24it/s, env_step=39000, len=8, n/ep=1, n/st=10, player_2/loss=0.226, rew=1.00]            


Epoch #39: test_reward: 0.324400 ± 0.923669, best_reward: 0.392400 ± 0.903561 in #36


Epoch #40: 1001it [00:01, 848.60it/s, env_step=40000, len=7, n/ep=2, n/st=10, rew=1.00]                                 


Epoch #40: test_reward: 0.364000 ± 0.906148, best_reward: 0.392400 ± 0.903561 in #36


Epoch #41: 1001it [00:01, 780.87it/s, env_step=41000, len=6, n/ep=4, n/st=10, rew=0.50]                                 


Epoch #41: test_reward: 0.399600 ± 0.905936, best_reward: 0.399600 ± 0.905936 in #41


Epoch #42: 1001it [00:01, 791.07it/s, env_step=42000, len=5, n/ep=2, n/st=10, rew=0.00]                                 


Epoch #42: test_reward: 0.257800 ± 0.958404, best_reward: 0.399600 ± 0.905936 in #41


Epoch #43: 1001it [00:01, 797.76it/s, env_step=43000, len=8, n/ep=1, n/st=10, player_2/loss=0.317, rew=1.00]            


Epoch #43: test_reward: 0.363400 ± 0.919641, best_reward: 0.399600 ± 0.905936 in #41


Epoch #44: 1001it [00:01, 798.88it/s, env_step=44000, len=6, n/ep=2, n/st=10, player_2/loss=0.262, rew=0.00]            


Epoch #44: test_reward: 0.358000 ± 0.919258, best_reward: 0.399600 ± 0.905936 in #41


Epoch #45: 1001it [00:01, 789.30it/s, env_step=45000, len=6, n/ep=2, n/st=10, player_2/loss=0.247, rew=-1.00]           


Epoch #45: test_reward: 0.362200 ± 0.913789, best_reward: 0.399600 ± 0.905936 in #41


Epoch #46: 1001it [00:01, 826.35it/s, env_step=46000, len=5, n/ep=2, n/st=10, player_2/loss=0.264, rew=0.00]            


Epoch #46: test_reward: 0.313200 ± 0.937820, best_reward: 0.399600 ± 0.905936 in #41


Epoch #47: 1001it [00:01, 826.01it/s, env_step=47000, len=6, n/ep=0, n/st=10, rew=1.00]                                 


Epoch #47: test_reward: 0.494000 ± 0.842831, best_reward: 0.494000 ± 0.842831 in #47


Epoch #48: 1001it [00:01, 822.17it/s, env_step=48000, len=6, n/ep=1, n/st=10, rew=1.00]                                 


Epoch #48: test_reward: 0.491800 ± 0.840912, best_reward: 0.494000 ± 0.842831 in #47


Epoch #49: 1001it [00:01, 837.68it/s, env_step=49000, len=7, n/ep=0, n/st=10, player_2/loss=0.207, rew=0.00]            


Epoch #49: test_reward: 0.456400 ± 0.870229, best_reward: 0.494000 ± 0.842831 in #47


Epoch #50: 1001it [00:01, 825.46it/s, env_step=50000, len=9, n/ep=1, n/st=10, player_2/loss=0.244, rew=-1.00]           


Epoch #50: test_reward: 0.428000 ± 0.879554, best_reward: 0.494000 ± 0.842831 in #47


Epoch #51: 1001it [00:01, 826.06it/s, env_step=51000, len=6, n/ep=1, n/st=10, player_2/loss=0.246, rew=1.00]            


Epoch #51: test_reward: 0.313200 ± 0.934615, best_reward: 0.494000 ± 0.842831 in #47


Epoch #52: 1001it [00:01, 822.93it/s, env_step=52000, len=7, n/ep=1, n/st=10, player_2/loss=0.220, rew=-1.00]           


Epoch #52: test_reward: 0.384400 ± 0.899242, best_reward: 0.494000 ± 0.842831 in #47


Epoch #53: 1001it [00:01, 849.66it/s, env_step=53000, len=6, n/ep=0, n/st=10, player_2/loss=0.259, rew=1.00]            


Epoch #53: test_reward: 0.309600 ± 0.928088, best_reward: 0.494000 ± 0.842831 in #47


Epoch #54: 1001it [00:01, 802.43it/s, env_step=54000, len=6, n/ep=6, n/st=10, player_2/loss=0.180, rew=0.00]            


Epoch #54: test_reward: 0.304600 ± 0.928557, best_reward: 0.494000 ± 0.842831 in #47


Epoch #55: 1001it [00:01, 808.03it/s, env_step=55000, len=7, n/ep=1, n/st=10, player_2/loss=0.274, rew=-1.00]           


Epoch #55: test_reward: 0.288000 ± 0.924909, best_reward: 0.494000 ± 0.842831 in #47


Epoch #56: 1001it [00:01, 877.41it/s, env_step=56000, len=8, n/ep=1, n/st=10, rew=1.00]                                 


Epoch #56: test_reward: 0.389800 ± 0.882641, best_reward: 0.494000 ± 0.842831 in #47


Epoch #57: 1001it [00:01, 799.95it/s, env_step=57000, len=7, n/ep=3, n/st=10, rew=0.00]                                 


Epoch #57: test_reward: 0.403400 ± 0.889420, best_reward: 0.494000 ± 0.842831 in #47


Epoch #58: 1001it [00:01, 806.06it/s, env_step=58000, len=9, n/ep=1, n/st=10, player_2/loss=0.238, rew=-1.00]           


Epoch #58: test_reward: 0.223600 ± 0.943188, best_reward: 0.494000 ± 0.842831 in #47


Epoch #59: 1001it [00:01, 811.31it/s, env_step=59000, len=7, n/ep=2, n/st=10, player_2/loss=0.228, rew=1.00]            


Epoch #59: test_reward: 0.224000 ± 0.942456, best_reward: 0.494000 ± 0.842831 in #47


Epoch #60: 1001it [00:01, 861.10it/s, env_step=60000, len=6, n/ep=2, n/st=10, player_2/loss=0.301, rew=1.00]            


Epoch #60: test_reward: 0.271400 ± 0.922465, best_reward: 0.494000 ± 0.842831 in #47


Epoch #61: 1001it [00:01, 847.37it/s, env_step=61000, len=7, n/ep=1, n/st=10, player_2/loss=0.231, rew=-1.00]           


Epoch #61: test_reward: 0.267400 ± 0.933969, best_reward: 0.494000 ± 0.842831 in #47


Epoch #62: 1001it [00:01, 848.72it/s, env_step=62000, len=6, n/ep=2, n/st=10, player_2/loss=0.246, rew=0.00]            


Epoch #62: test_reward: 0.230000 ± 0.946942, best_reward: 0.494000 ± 0.842831 in #47


Epoch #63: 1001it [00:01, 832.31it/s, env_step=63000, len=8, n/ep=1, n/st=10, player_2/loss=0.251, rew=1.00]            


Epoch #63: test_reward: 0.204600 ± 0.950336, best_reward: 0.494000 ± 0.842831 in #47


Epoch #64: 1001it [00:01, 820.37it/s, env_step=64000, len=9, n/ep=1, n/st=10, player_2/loss=0.215, rew=0.00]            


Epoch #64: test_reward: 0.282600 ± 0.929267, best_reward: 0.494000 ± 0.842831 in #47


Epoch #65: 1001it [00:01, 816.35it/s, env_step=65000, len=7, n/ep=2, n/st=10, player_2/loss=0.243, rew=1.00]            


Epoch #65: test_reward: 0.237800 ± 0.939814, best_reward: 0.494000 ± 0.842831 in #47


Epoch #66: 1001it [00:01, 818.07it/s, env_step=66000, len=8, n/ep=1, n/st=10, rew=1.00]                                 


Epoch #66: test_reward: 0.248800 ± 0.942178, best_reward: 0.494000 ± 0.842831 in #47


Epoch #67: 1001it [00:01, 845.78it/s, env_step=67000, len=8, n/ep=1, n/st=10, player_2/loss=0.264, rew=1.00]            


Epoch #67: test_reward: 0.211200 ± 0.950681, best_reward: 0.494000 ± 0.842831 in #47


Epoch #68: 1001it [00:01, 834.69it/s, env_step=68000, len=6, n/ep=1, n/st=10, rew=1.00]                                 


Epoch #68: test_reward: 0.292800 ± 0.923617, best_reward: 0.494000 ± 0.842831 in #47


Epoch #69: 1001it [00:01, 844.01it/s, env_step=69000, len=6, n/ep=0, n/st=10, player_2/loss=0.209, rew=1.00]            


Epoch #69: test_reward: 0.362400 ± 0.911628, best_reward: 0.494000 ± 0.842831 in #47


Epoch #70: 1001it [00:01, 848.45it/s, env_step=70000, len=8, n/ep=1, n/st=10, player_2/loss=0.290, rew=1.00]            


Epoch #70: test_reward: 0.276800 ± 0.934121, best_reward: 0.494000 ± 0.842831 in #47


Epoch #71: 1001it [00:01, 835.70it/s, env_step=71000, len=7, n/ep=1, n/st=10, rew=-1.00]                                


Epoch #71: test_reward: 0.279800 ± 0.942079, best_reward: 0.494000 ± 0.842831 in #47


Epoch #72: 1001it [00:01, 779.34it/s, env_step=72000, len=6, n/ep=2, n/st=10, player_2/loss=0.219, rew=1.00]            


Epoch #72: test_reward: 0.216600 ± 0.953354, best_reward: 0.494000 ± 0.842831 in #47


Epoch #73: 1001it [00:01, 837.66it/s, env_step=73000, len=7, n/ep=4, n/st=10, rew=0.50]                                 


Epoch #73: test_reward: 0.166000 ± 0.964180, best_reward: 0.494000 ± 0.842831 in #47


Epoch #74: 1001it [00:01, 792.39it/s, env_step=74000, len=6, n/ep=1, n/st=10, player_2/loss=0.268, rew=1.00]            


Epoch #74: test_reward: 0.287600 ± 0.936208, best_reward: 0.494000 ± 0.842831 in #47


Epoch #75: 1001it [00:01, 779.36it/s, env_step=75000, len=7, n/ep=4, n/st=10, player_2/loss=0.221, rew=0.00]            


Epoch #75: test_reward: 0.175200 ± 0.959846, best_reward: 0.494000 ± 0.842831 in #47


Epoch #76: 1001it [00:01, 795.44it/s, env_step=76000, len=6, n/ep=1, n/st=10, player_2/loss=0.250, rew=1.00]            


Epoch #76: test_reward: 0.259800 ± 0.948422, best_reward: 0.494000 ± 0.842831 in #47


Epoch #77: 1001it [00:01, 826.93it/s, env_step=77000, len=8, n/ep=1, n/st=10, player_2/loss=0.252, rew=1.00]            


Epoch #77: test_reward: 0.234200 ± 0.956530, best_reward: 0.494000 ± 0.842831 in #47


Epoch #78: 1001it [00:01, 751.58it/s, env_step=78000, len=7, n/ep=2, n/st=10, player_2/loss=0.274, rew=0.00]            


Epoch #78: test_reward: 0.162800 ± 0.975652, best_reward: 0.494000 ± 0.842831 in #47


Epoch #79: 1001it [00:01, 783.66it/s, env_step=79000, len=6, n/ep=2, n/st=10, player_2/loss=0.248, rew=1.00]            


Epoch #79: test_reward: 0.313000 ± 0.937993, best_reward: 0.494000 ± 0.842831 in #47


Epoch #80: 1001it [00:01, 768.35it/s, env_step=80000, len=6, n/ep=2, n/st=10, player_2/loss=0.300, rew=-1.00]           


Epoch #80: test_reward: 0.133200 ± 0.964706, best_reward: 0.494000 ± 0.842831 in #47


Epoch #81: 1001it [00:01, 781.42it/s, env_step=81000, len=7, n/ep=2, n/st=10, player_2/loss=0.264, rew=-1.00]           


Epoch #81: test_reward: 0.240800 ± 0.953947, best_reward: 0.494000 ± 0.842831 in #47


Epoch #82: 1001it [00:01, 823.29it/s, env_step=82000, len=7, n/ep=5, n/st=10, player_2/loss=0.210, rew=0.20]            


Epoch #82: test_reward: 0.277800 ± 0.948592, best_reward: 0.494000 ± 0.842831 in #47


Epoch #83: 1001it [00:01, 782.55it/s, env_step=83000, len=8, n/ep=2, n/st=10, player_2/loss=0.232, rew=-1.00]           


Epoch #83: test_reward: 0.333000 ± 0.928715, best_reward: 0.494000 ± 0.842831 in #47


Epoch #84: 1001it [00:01, 819.54it/s, env_step=84000, len=7, n/ep=2, n/st=10, rew=-0.50]                                


Epoch #84: test_reward: 0.276600 ± 0.940262, best_reward: 0.494000 ± 0.842831 in #47


Epoch #85: 1001it [00:01, 797.94it/s, env_step=85000, len=6, n/ep=3, n/st=10, player_2/loss=0.231, rew=0.33]            


Epoch #85: test_reward: 0.360600 ± 0.919874, best_reward: 0.494000 ± 0.842831 in #47


Epoch #86: 1001it [00:01, 817.23it/s, env_step=86000, len=8, n/ep=1, n/st=10, player_2/loss=0.128, rew=1.00]            


Epoch #86: test_reward: 0.400400 ± 0.902264, best_reward: 0.494000 ± 0.842831 in #47


Epoch #87: 1001it [00:01, 818.69it/s, env_step=87000, len=6, n/ep=4, n/st=10, player_2/loss=0.198, rew=1.00]            


Epoch #87: test_reward: 0.307000 ± 0.936563, best_reward: 0.494000 ± 0.842831 in #47


Epoch #88: 1001it [00:01, 806.11it/s, env_step=88000, len=5, n/ep=2, n/st=10, player_2/loss=0.276, rew=0.00]            


Epoch #88: test_reward: 0.312000 ± 0.930514, best_reward: 0.494000 ± 0.842831 in #47


Epoch #89: 1001it [00:01, 825.45it/s, env_step=89000, len=6, n/ep=1, n/st=10, player_2/loss=0.272, rew=1.00]            


Epoch #89: test_reward: 0.277800 ± 0.939269, best_reward: 0.494000 ± 0.842831 in #47


Epoch #90: 1001it [00:01, 808.34it/s, env_step=90000, len=6, n/ep=1, n/st=10, player_2/loss=0.207, rew=1.00]            


Epoch #90: test_reward: 0.260800 ± 0.939353, best_reward: 0.494000 ± 0.842831 in #47


Epoch #91: 1001it [00:01, 807.74it/s, env_step=91000, len=6, n/ep=0, n/st=10, player_2/loss=0.254, rew=1.00]            


Epoch #91: test_reward: 0.239600 ± 0.937972, best_reward: 0.494000 ± 0.842831 in #47


Epoch #92: 1001it [00:01, 814.48it/s, env_step=92000, len=7, n/ep=3, n/st=10, rew=0.33]                                 


Epoch #92: test_reward: 0.267200 ± 0.931131, best_reward: 0.494000 ± 0.842831 in #47


Epoch #93: 1001it [00:01, 850.05it/s, env_step=93000, len=9, n/ep=1, n/st=10, rew=-1.00]                                


Epoch #93: test_reward: 0.298800 ± 0.941232, best_reward: 0.494000 ± 0.842831 in #47


Epoch #94: 1001it [00:01, 784.73it/s, env_step=94000, len=5, n/ep=3, n/st=10, player_2/loss=0.253, rew=0.33]            


Epoch #94: test_reward: 0.284200 ± 0.931574, best_reward: 0.494000 ± 0.842831 in #47


Epoch #95: 1001it [00:01, 795.06it/s, env_step=95000, len=6, n/ep=1, n/st=10, player_2/loss=0.200, rew=1.00]            


Epoch #95: test_reward: 0.268400 ± 0.942529, best_reward: 0.494000 ± 0.842831 in #47


Epoch #96: 1001it [00:01, 809.83it/s, env_step=96000, len=6, n/ep=2, n/st=10, player_2/loss=0.223, rew=0.00]            


Epoch #96: test_reward: 0.294200 ± 0.922413, best_reward: 0.494000 ± 0.842831 in #47


Epoch #97: 1001it [00:01, 790.51it/s, env_step=97000, len=7, n/ep=2, n/st=10, rew=0.50]                                 


Epoch #97: test_reward: 0.307000 ± 0.924960, best_reward: 0.494000 ± 0.842831 in #47


Epoch #98: 1001it [00:01, 801.07it/s, env_step=98000, len=7, n/ep=0, n/st=10, player_2/loss=0.217, rew=0.00]            


Epoch #98: test_reward: 0.169600 ± 0.952699, best_reward: 0.494000 ± 0.842831 in #47


Epoch #99: 1001it [00:01, 857.23it/s, env_step=99000, len=6, n/ep=1, n/st=10, player_2/loss=0.252, rew=1.00]            


Epoch #99: test_reward: 0.214800 ± 0.951977, best_reward: 0.494000 ± 0.842831 in #47


Epoch #100: 1001it [00:01, 829.62it/s, env_step=100000, len=9, n/ep=1, n/st=10, player_2/loss=0.245, rew=0.00]          


Epoch #100: test_reward: 0.223000 ± 0.953557, best_reward: 0.494000 ± 0.842831 in #47


Epoch #101: 1001it [00:01, 818.44it/s, env_step=101000, len=6, n/ep=1, n/st=10, player_2/loss=0.203, rew=1.00]          


Epoch #101: test_reward: 0.245400 ± 0.945505, best_reward: 0.494000 ± 0.842831 in #47


Epoch #102: 1001it [00:01, 854.93it/s, env_step=102000, len=6, n/ep=5, n/st=10, rew=0.20]                               


Epoch #102: test_reward: 0.253600 ± 0.934926, best_reward: 0.494000 ± 0.842831 in #47


Epoch #103: 1001it [00:01, 833.13it/s, env_step=103000, len=8, n/ep=2, n/st=10, player_2/loss=0.240, rew=0.50]          


Epoch #103: test_reward: 0.103600 ± 0.972660, best_reward: 0.494000 ± 0.842831 in #47


Epoch #104: 1001it [00:01, 824.31it/s, env_step=104000, len=7, n/ep=3, n/st=10, player_2/loss=0.175, rew=-0.33]         


Epoch #104: test_reward: 0.242200 ± 0.949073, best_reward: 0.494000 ± 0.842831 in #47


Epoch #105: 1001it [00:01, 788.27it/s, env_step=105000, len=8, n/ep=1, n/st=10, rew=1.00]                               


Epoch #105: test_reward: 0.180200 ± 0.966089, best_reward: 0.494000 ± 0.842831 in #47


Epoch #106: 1001it [00:01, 825.57it/s, env_step=106000, len=6, n/ep=1, n/st=10, player_2/loss=0.298, rew=1.00]          


Epoch #106: test_reward: 0.253200 ± 0.945880, best_reward: 0.494000 ± 0.842831 in #47


Epoch #107: 1001it [00:01, 826.90it/s, env_step=107000, len=6, n/ep=1, n/st=10, player_2/loss=0.242, rew=1.00]          


Epoch #107: test_reward: 0.174800 ± 0.963039, best_reward: 0.494000 ± 0.842831 in #47


Epoch #108: 1001it [00:01, 832.93it/s, env_step=108000, len=6, n/ep=3, n/st=10, rew=1.00]                               


Epoch #108: test_reward: 0.143800 ± 0.963079, best_reward: 0.494000 ± 0.842831 in #47


Epoch #109: 1001it [00:01, 891.66it/s, env_step=109000, len=9, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #109: test_reward: 0.184600 ± 0.962353, best_reward: 0.494000 ± 0.842831 in #47


Epoch #110: 1001it [00:01, 809.42it/s, env_step=110000, len=7, n/ep=3, n/st=10, player_2/loss=0.231, rew=-0.33]         


Epoch #110: test_reward: 0.302000 ± 0.935947, best_reward: 0.494000 ± 0.842831 in #47


Epoch #111: 1001it [00:01, 861.96it/s, env_step=111000, len=9, n/ep=1, n/st=10, player_2/loss=0.228, rew=-1.00]         


Epoch #111: test_reward: 0.277600 ± 0.934954, best_reward: 0.494000 ± 0.842831 in #47


Epoch #112: 1001it [00:01, 785.29it/s, env_step=112000, len=7, n/ep=2, n/st=10, player_2/loss=0.246, rew=0.00]          


Epoch #112: test_reward: 0.260800 ± 0.939140, best_reward: 0.494000 ± 0.842831 in #47


Epoch #113: 1001it [00:01, 826.97it/s, env_step=113000, len=6, n/ep=2, n/st=10, rew=0.00]                               


Epoch #113: test_reward: 0.311000 ± 0.929451, best_reward: 0.494000 ± 0.842831 in #47


Epoch #114: 1001it [00:01, 796.58it/s, env_step=114000, len=8, n/ep=2, n/st=10, player_2/loss=0.204, rew=1.00]          


Epoch #114: test_reward: 0.237200 ± 0.948966, best_reward: 0.494000 ± 0.842831 in #47


Epoch #115: 1001it [00:01, 811.76it/s, env_step=115000, len=5, n/ep=0, n/st=10, rew=0.00]                               


Epoch #115: test_reward: 0.282600 ± 0.934846, best_reward: 0.494000 ± 0.842831 in #47


Epoch #116: 1001it [00:01, 708.68it/s, env_step=116000, len=6, n/ep=5, n/st=10, rew=1.00]                               


Epoch #116: test_reward: 0.278800 ± 0.947349, best_reward: 0.494000 ± 0.842831 in #47


Epoch #117: 1001it [00:01, 810.90it/s, env_step=117000, len=6, n/ep=1, n/st=10, player_2/loss=0.293, rew=1.00]          


Epoch #117: test_reward: 0.264600 ± 0.952569, best_reward: 0.494000 ± 0.842831 in #47


Epoch #118: 1001it [00:01, 788.46it/s, env_step=118000, len=6, n/ep=1, n/st=10, player_2/loss=0.239, rew=1.00]          


Epoch #118: test_reward: 0.344200 ± 0.914618, best_reward: 0.494000 ± 0.842831 in #47


Epoch #119: 1001it [00:01, 822.02it/s, env_step=119000, len=7, n/ep=2, n/st=10, player_2/loss=0.246, rew=0.50]          


Epoch #119: test_reward: 0.351200 ± 0.914690, best_reward: 0.494000 ± 0.842831 in #47


Epoch #120: 1001it [00:01, 810.69it/s, env_step=120000, len=6, n/ep=1, n/st=10, player_2/loss=0.182, rew=1.00]          


Epoch #120: test_reward: 0.328800 ± 0.924927, best_reward: 0.494000 ± 0.842831 in #47


Epoch #121: 1001it [00:01, 826.34it/s, env_step=121000, len=6, n/ep=4, n/st=10, player_2/loss=0.241, rew=0.50]          


Epoch #121: test_reward: 0.289200 ± 0.942742, best_reward: 0.494000 ± 0.842831 in #47


Epoch #122: 1001it [00:01, 846.02it/s, env_step=122000, len=5, n/ep=3, n/st=10, rew=-0.33]                              


Epoch #122: test_reward: 0.314400 ± 0.930351, best_reward: 0.494000 ± 0.842831 in #47


Epoch #123: 1001it [00:01, 802.82it/s, env_step=123000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #123: test_reward: 0.317000 ± 0.941760, best_reward: 0.494000 ± 0.842831 in #47


Epoch #124: 1001it [00:01, 842.46it/s, env_step=124000, len=6, n/ep=1, n/st=10, player_2/loss=0.231, rew=1.00]          


Epoch #124: test_reward: 0.328000 ± 0.938944, best_reward: 0.494000 ± 0.842831 in #47


Epoch #125: 1001it [00:01, 786.54it/s, env_step=125000, len=7, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #125: test_reward: 0.263400 ± 0.951221, best_reward: 0.494000 ± 0.842831 in #47


Epoch #126: 1001it [00:01, 789.03it/s, env_step=126000, len=7, n/ep=2, n/st=10, player_2/loss=0.318, rew=1.00]          


Epoch #126: test_reward: 0.326200 ± 0.937760, best_reward: 0.494000 ± 0.842831 in #47


Epoch #127: 1001it [00:01, 883.66it/s, env_step=127000, len=8, n/ep=3, n/st=10, player_2/loss=0.343, rew=1.00]          


Epoch #127: test_reward: 0.351600 ± 0.925623, best_reward: 0.494000 ± 0.842831 in #47


Epoch #128: 1001it [00:01, 820.74it/s, env_step=128000, len=6, n/ep=3, n/st=10, rew=1.00]                               


Epoch #128: test_reward: 0.309400 ± 0.938548, best_reward: 0.494000 ± 0.842831 in #47


Epoch #129: 1001it [00:01, 793.80it/s, env_step=129000, len=6, n/ep=1, n/st=10, player_2/loss=0.244, rew=1.00]          


Epoch #129: test_reward: 0.300200 ± 0.935136, best_reward: 0.494000 ± 0.842831 in #47


Epoch #130: 1001it [00:01, 798.25it/s, env_step=130000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #130: test_reward: 0.350800 ± 0.913531, best_reward: 0.494000 ± 0.842831 in #47


Epoch #131: 1001it [00:01, 839.34it/s, env_step=131000, len=6, n/ep=1, n/st=10, player_2/loss=0.245, rew=1.00]          


Epoch #131: test_reward: 0.369200 ± 0.912190, best_reward: 0.494000 ± 0.842831 in #47


Epoch #132: 1001it [00:01, 851.29it/s, env_step=132000, len=6, n/ep=2, n/st=10, player_2/loss=0.207, rew=0.00]          


Epoch #132: test_reward: 0.343800 ± 0.924122, best_reward: 0.494000 ± 0.842831 in #47


Epoch #133: 1001it [00:01, 792.33it/s, env_step=133000, len=8, n/ep=0, n/st=10, player_2/loss=0.210, rew=1.00]          


Epoch #133: test_reward: 0.240800 ± 0.946581, best_reward: 0.494000 ± 0.842831 in #47


Epoch #134: 1001it [00:01, 813.97it/s, env_step=134000, len=7, n/ep=2, n/st=10, rew=0.00]                               


Epoch #134: test_reward: 0.254600 ± 0.950252, best_reward: 0.494000 ± 0.842831 in #47


Epoch #135: 1001it [00:01, 822.71it/s, env_step=135000, len=6, n/ep=0, n/st=10, rew=-1.00]                              


Epoch #135: test_reward: 0.276600 ± 0.949996, best_reward: 0.494000 ± 0.842831 in #47


Epoch #136: 1001it [00:01, 826.23it/s, env_step=136000, len=6, n/ep=1, n/st=10, player_2/loss=0.273, rew=1.00]          


Epoch #136: test_reward: 0.295400 ± 0.943048, best_reward: 0.494000 ± 0.842831 in #47


Epoch #137: 1001it [00:01, 881.86it/s, env_step=137000, len=6, n/ep=0, n/st=10, player_2/loss=0.184, rew=1.00]          


Epoch #137: test_reward: 0.294800 ± 0.946305, best_reward: 0.494000 ± 0.842831 in #47


Epoch #138: 1001it [00:01, 812.97it/s, env_step=138000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #138: test_reward: 0.346400 ± 0.927581, best_reward: 0.494000 ± 0.842831 in #47


Epoch #139: 1001it [00:01, 813.40it/s, env_step=139000, len=6, n/ep=1, n/st=10, player_2/loss=0.299, rew=1.00]          


Epoch #139: test_reward: 0.317800 ± 0.934667, best_reward: 0.494000 ± 0.842831 in #47


Epoch #140: 1001it [00:01, 840.22it/s, env_step=140000, len=6, n/ep=1, n/st=10, player_2/loss=0.251, rew=1.00]          


Epoch #140: test_reward: 0.299800 ± 0.942719, best_reward: 0.494000 ± 0.842831 in #47


Epoch #141: 1001it [00:01, 803.90it/s, env_step=141000, len=9, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #141: test_reward: 0.314000 ± 0.931989, best_reward: 0.494000 ± 0.842831 in #47


Epoch #142: 1001it [00:01, 824.60it/s, env_step=142000, len=6, n/ep=0, n/st=10, player_2/loss=0.285, rew=0.00]          


Epoch #142: test_reward: 0.283000 ± 0.942821, best_reward: 0.494000 ± 0.842831 in #47


Epoch #143: 1001it [00:01, 784.68it/s, env_step=143000, len=6, n/ep=1, n/st=10, player_2/loss=0.161, rew=1.00]          


Epoch #143: test_reward: 0.326200 ± 0.941379, best_reward: 0.494000 ± 0.842831 in #47


Epoch #144: 1001it [00:01, 859.59it/s, env_step=144000, len=7, n/ep=2, n/st=10, player_2/loss=0.209, rew=0.00]          


Epoch #144: test_reward: 0.364600 ± 0.921123, best_reward: 0.494000 ± 0.842831 in #47


Epoch #145: 1001it [00:01, 817.84it/s, env_step=145000, len=6, n/ep=1, n/st=10, player_2/loss=0.258, rew=1.00]          


Epoch #145: test_reward: 0.383400 ± 0.912800, best_reward: 0.494000 ± 0.842831 in #47


Epoch #146: 1001it [00:01, 777.11it/s, env_step=146000, len=7, n/ep=1, n/st=10, player_2/loss=0.185, rew=-1.00]         


Epoch #146: test_reward: 0.413600 ± 0.897182, best_reward: 0.494000 ± 0.842831 in #47


Epoch #147: 1001it [00:01, 779.87it/s, env_step=147000, len=6, n/ep=0, n/st=10, rew=0.00]                               


Epoch #147: test_reward: 0.379600 ± 0.910551, best_reward: 0.494000 ± 0.842831 in #47


Epoch #148: 1001it [00:01, 840.42it/s, env_step=148000, len=6, n/ep=2, n/st=10, rew=1.00]                               


Epoch #148: test_reward: 0.349200 ± 0.925667, best_reward: 0.494000 ± 0.842831 in #47


Epoch #149: 1001it [00:01, 808.00it/s, env_step=149000, len=6, n/ep=4, n/st=10, player_2/loss=0.222, rew=0.00]          


Epoch #149: test_reward: 0.391600 ± 0.913810, best_reward: 0.494000 ± 0.842831 in #47


Epoch #150: 1001it [00:01, 838.06it/s, env_step=150000, len=7, n/ep=3, n/st=10, rew=-0.33]                              


Epoch #150: test_reward: 0.346000 ± 0.928592, best_reward: 0.494000 ± 0.842831 in #47


Epoch #151: 1001it [00:01, 851.73it/s, env_step=151000, len=7, n/ep=3, n/st=10, player_2/loss=0.190, rew=0.33]          


Epoch #151: test_reward: 0.313600 ± 0.939391, best_reward: 0.494000 ± 0.842831 in #47


Epoch #152: 1001it [00:01, 810.83it/s, env_step=152000, len=7, n/ep=2, n/st=10, player_2/loss=0.208, rew=0.50]          


Epoch #152: test_reward: 0.345800 ± 0.930066, best_reward: 0.494000 ± 0.842831 in #47


Epoch #153: 1001it [00:01, 824.38it/s, env_step=153000, len=7, n/ep=3, n/st=10, player_2/loss=0.223, rew=0.33]          


Epoch #153: test_reward: 0.307200 ± 0.939376, best_reward: 0.494000 ± 0.842831 in #47


Epoch #154: 1001it [00:01, 847.06it/s, env_step=154000, len=7, n/ep=2, n/st=10, player_2/loss=0.216, rew=0.50]          


Epoch #154: test_reward: 0.334000 ± 0.932761, best_reward: 0.494000 ± 0.842831 in #47


Epoch #155: 1001it [00:01, 804.80it/s, env_step=155000, len=7, n/ep=4, n/st=10, player_2/loss=0.154, rew=0.00]          


Epoch #155: test_reward: 0.397800 ± 0.904409, best_reward: 0.494000 ± 0.842831 in #47


Epoch #156: 1001it [00:01, 809.61it/s, env_step=156000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #156: test_reward: 0.393800 ± 0.906599, best_reward: 0.494000 ± 0.842831 in #47


Epoch #157: 1001it [00:01, 792.65it/s, env_step=157000, len=9, n/ep=1, n/st=10, player_2/loss=0.192, rew=-1.00]         


Epoch #157: test_reward: 0.380000 ± 0.912140, best_reward: 0.494000 ± 0.842831 in #47


Epoch #158: 1001it [00:01, 810.05it/s, env_step=158000, len=6, n/ep=1, n/st=10, player_2/loss=0.185, rew=1.00]          


Epoch #158: test_reward: 0.311600 ± 0.935150, best_reward: 0.494000 ± 0.842831 in #47


Epoch #159: 1001it [00:01, 787.98it/s, env_step=159000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #159: test_reward: 0.293400 ± 0.943036, best_reward: 0.494000 ± 0.842831 in #47


Epoch #160: 1001it [00:01, 833.42it/s, env_step=160000, len=6, n/ep=1, n/st=10, player_2/loss=0.230, rew=1.00]          


Epoch #160: test_reward: 0.318200 ± 0.929811, best_reward: 0.494000 ± 0.842831 in #47


Epoch #161: 1001it [00:01, 840.66it/s, env_step=161000, len=6, n/ep=3, n/st=10, player_2/loss=0.246, rew=1.00]          


Epoch #161: test_reward: 0.313200 ± 0.934187, best_reward: 0.494000 ± 0.842831 in #47


Epoch #162: 1001it [00:01, 831.00it/s, env_step=162000, len=8, n/ep=4, n/st=10, rew=0.00]                               


Epoch #162: test_reward: 0.280000 ± 0.947206, best_reward: 0.494000 ± 0.842831 in #47


Epoch #163: 1001it [00:01, 855.37it/s, env_step=163000, len=7, n/ep=0, n/st=10, player_2/loss=0.187, rew=-1.00]         


Epoch #163: test_reward: 0.243600 ± 0.963462, best_reward: 0.494000 ± 0.842831 in #47


Epoch #164: 1001it [00:01, 850.25it/s, env_step=164000, len=7, n/ep=2, n/st=10, player_2/loss=0.238, rew=0.00]          


Epoch #164: test_reward: 0.358200 ± 0.924712, best_reward: 0.494000 ± 0.842831 in #47


Epoch #165: 1001it [00:01, 813.55it/s, env_step=165000, len=9, n/ep=1, n/st=10, player_2/loss=0.270, rew=-1.00]         


Epoch #165: test_reward: 0.382000 ± 0.910866, best_reward: 0.494000 ± 0.842831 in #47


Epoch #166: 1001it [00:01, 807.84it/s, env_step=166000, len=7, n/ep=0, n/st=10, player_2/loss=0.206, rew=0.33]          


Epoch #166: test_reward: 0.355600 ± 0.918231, best_reward: 0.494000 ± 0.842831 in #47


Epoch #167: 1001it [00:01, 803.87it/s, env_step=167000, len=7, n/ep=3, n/st=10, rew=0.33]                               


Epoch #167: test_reward: 0.357000 ± 0.921711, best_reward: 0.494000 ± 0.842831 in #47


Epoch #168: 1001it [00:01, 819.96it/s, env_step=168000, len=6, n/ep=4, n/st=10, rew=0.50]                               


Epoch #168: test_reward: 0.340200 ± 0.926749, best_reward: 0.494000 ± 0.842831 in #47


Epoch #169: 1001it [00:01, 785.16it/s, env_step=169000, len=7, n/ep=1, n/st=10, player_2/loss=0.186, rew=-1.00]         


Epoch #169: test_reward: 0.336600 ± 0.928278, best_reward: 0.494000 ± 0.842831 in #47


Epoch #170: 1001it [00:01, 858.15it/s, env_step=170000, len=8, n/ep=1, n/st=10, player_2/loss=0.216, rew=1.00]          


Epoch #170: test_reward: 0.384000 ± 0.909145, best_reward: 0.494000 ± 0.842831 in #47


Epoch #171: 1001it [00:01, 787.03it/s, env_step=171000, len=7, n/ep=2, n/st=10, player_2/loss=0.252, rew=0.50]          


Epoch #171: test_reward: 0.405200 ± 0.892980, best_reward: 0.494000 ± 0.842831 in #47


Epoch #172: 1001it [00:01, 825.65it/s, env_step=172000, len=6, n/ep=1, n/st=10, player_2/loss=0.295, rew=1.00]          


Epoch #172: test_reward: 0.386000 ± 0.896774, best_reward: 0.494000 ± 0.842831 in #47


Epoch #173: 1001it [00:01, 767.81it/s, env_step=173000, len=6, n/ep=4, n/st=10, player_2/loss=0.231, rew=0.50]          


Epoch #173: test_reward: 0.439400 ± 0.875858, best_reward: 0.494000 ± 0.842831 in #47


Epoch #174: 1001it [00:01, 800.42it/s, env_step=174000, len=7, n/ep=2, n/st=10, rew=1.00]                               


Epoch #174: test_reward: 0.410000 ± 0.894595, best_reward: 0.494000 ± 0.842831 in #47


Epoch #175: 1001it [00:01, 807.65it/s, env_step=175000, len=6, n/ep=3, n/st=10, player_2/loss=0.265, rew=1.00]          


Epoch #175: test_reward: 0.409000 ± 0.893375, best_reward: 0.494000 ± 0.842831 in #47


Epoch #176: 1001it [00:01, 824.29it/s, env_step=176000, len=6, n/ep=0, n/st=10, player_2/loss=0.197, rew=1.00]          


Epoch #176: test_reward: 0.425200 ± 0.880911, best_reward: 0.494000 ± 0.842831 in #47


Epoch #177: 1001it [00:01, 814.64it/s, env_step=177000, len=6, n/ep=3, n/st=10, player_2/loss=0.219, rew=0.33]          


Epoch #177: test_reward: 0.424000 ± 0.887369, best_reward: 0.494000 ± 0.842831 in #47


Epoch #178: 1001it [00:01, 774.68it/s, env_step=178000, len=7, n/ep=3, n/st=10, player_2/loss=0.206, rew=-0.67]         


Epoch #178: test_reward: 0.415600 ± 0.882993, best_reward: 0.494000 ± 0.842831 in #47


Epoch #179: 1001it [00:01, 817.93it/s, env_step=179000, len=7, n/ep=2, n/st=10, player_2/loss=0.218, rew=1.00]          


Epoch #179: test_reward: 0.377800 ± 0.905244, best_reward: 0.494000 ± 0.842831 in #47


Epoch #180: 1001it [00:01, 823.81it/s, env_step=180000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #180: test_reward: 0.350800 ± 0.917464, best_reward: 0.494000 ± 0.842831 in #47


Epoch #181: 1001it [00:01, 829.29it/s, env_step=181000, len=7, n/ep=1, n/st=10, player_2/loss=0.160, rew=-1.00]         


Epoch #181: test_reward: 0.383000 ± 0.904163, best_reward: 0.494000 ± 0.842831 in #47


Epoch #182: 1001it [00:01, 839.87it/s, env_step=182000, len=5, n/ep=2, n/st=10, player_2/loss=0.222, rew=0.00]          


Epoch #182: test_reward: 0.388200 ± 0.904157, best_reward: 0.494000 ± 0.842831 in #47


Epoch #183: 1001it [00:01, 789.98it/s, env_step=183000, len=6, n/ep=2, n/st=10, player_2/loss=0.221, rew=0.00]          


Epoch #183: test_reward: 0.362800 ± 0.915410, best_reward: 0.494000 ± 0.842831 in #47


Epoch #184: 1001it [00:01, 814.86it/s, env_step=184000, len=5, n/ep=3, n/st=10, player_2/loss=0.253, rew=0.33]          


Epoch #184: test_reward: 0.359400 ± 0.913144, best_reward: 0.494000 ± 0.842831 in #47


Epoch #185: 1001it [00:01, 832.11it/s, env_step=185000, len=8, n/ep=1, n/st=10, player_2/loss=0.193, rew=1.00]          


Epoch #185: test_reward: 0.325400 ± 0.930545, best_reward: 0.494000 ± 0.842831 in #47


Epoch #186: 1001it [00:01, 770.72it/s, env_step=186000, len=7, n/ep=1, n/st=10, player_2/loss=0.191, rew=-1.00]         


Epoch #186: test_reward: 0.341800 ± 0.923782, best_reward: 0.494000 ± 0.842831 in #47


Epoch #187: 1001it [00:01, 779.51it/s, env_step=187000, len=6, n/ep=3, n/st=10, player_2/loss=0.259, rew=0.33]          


Epoch #187: test_reward: 0.325800 ± 0.934053, best_reward: 0.494000 ± 0.842831 in #47


Epoch #188: 1001it [00:01, 809.19it/s, env_step=188000, len=8, n/ep=1, n/st=10, player_2/loss=0.289, rew=1.00]          


Epoch #188: test_reward: 0.355800 ± 0.925638, best_reward: 0.494000 ± 0.842831 in #47


Epoch #189: 1001it [00:01, 784.80it/s, env_step=189000, len=6, n/ep=1, n/st=10, player_2/loss=0.226, rew=1.00]          


Epoch #189: test_reward: 0.422000 ± 0.896391, best_reward: 0.494000 ± 0.842831 in #47


Epoch #190: 1001it [00:01, 814.25it/s, env_step=190000, len=6, n/ep=1, n/st=10, player_2/loss=0.234, rew=1.00]          


Epoch #190: test_reward: 0.359400 ± 0.920343, best_reward: 0.494000 ± 0.842831 in #47


Epoch #191: 1001it [00:01, 815.20it/s, env_step=191000, len=7, n/ep=1, n/st=10, player_2/loss=0.250, rew=-1.00]         


Epoch #191: test_reward: 0.324800 ± 0.933437, best_reward: 0.494000 ± 0.842831 in #47


Epoch #192: 1001it [00:01, 809.66it/s, env_step=192000, len=6, n/ep=2, n/st=10, player_2/loss=0.159, rew=1.00]          


Epoch #192: test_reward: 0.337200 ± 0.922982, best_reward: 0.494000 ± 0.842831 in #47


Epoch #193: 1001it [00:01, 793.69it/s, env_step=193000, len=8, n/ep=2, n/st=10, player_2/loss=0.158, rew=0.00]          


Epoch #193: test_reward: 0.349800 ± 0.920348, best_reward: 0.494000 ± 0.842831 in #47


Epoch #194: 1001it [00:01, 811.11it/s, env_step=194000, len=6, n/ep=3, n/st=10, rew=1.00]                               


Epoch #194: test_reward: 0.383200 ± 0.902861, best_reward: 0.494000 ± 0.842831 in #47


Epoch #195: 1001it [00:01, 813.89it/s, env_step=195000, len=6, n/ep=1, n/st=10, player_2/loss=0.147, rew=1.00]          


Epoch #195: test_reward: 0.355800 ± 0.914553, best_reward: 0.494000 ± 0.842831 in #47


Epoch #196: 1001it [00:01, 853.10it/s, env_step=196000, len=7, n/ep=2, n/st=10, player_2/loss=0.202, rew=1.00]          


Epoch #196: test_reward: 0.391200 ± 0.899646, best_reward: 0.494000 ± 0.842831 in #47


Epoch #197: 1001it [00:01, 779.55it/s, env_step=197000, len=8, n/ep=1, n/st=10, rew=1.00]                               


Epoch #197: test_reward: 0.372000 ± 0.907974, best_reward: 0.494000 ± 0.842831 in #47


Epoch #198: 1001it [00:01, 808.95it/s, env_step=198000, len=6, n/ep=2, n/st=10, player_2/loss=0.248, rew=0.00]          


Epoch #198: test_reward: 0.321000 ± 0.938061, best_reward: 0.494000 ± 0.842831 in #47


Epoch #199: 1001it [00:01, 838.28it/s, env_step=199000, len=6, n/ep=3, n/st=10, player_2/loss=0.237, rew=1.00]          


Epoch #199: test_reward: 0.337400 ± 0.932074, best_reward: 0.494000 ± 0.842831 in #47


Epoch #200: 1001it [00:01, 831.17it/s, env_step=200000, len=6, n/ep=0, n/st=10, player_2/loss=0.240, rew=-0.50]         


Epoch #200: test_reward: 0.373600 ± 0.913030, best_reward: 0.494000 ± 0.842831 in #47


Epoch #201: 1001it [00:01, 813.70it/s, env_step=201000, len=6, n/ep=2, n/st=10, player_2/loss=0.268, rew=-1.00]         


Epoch #201: test_reward: 0.355600 ± 0.923227, best_reward: 0.494000 ± 0.842831 in #47


Epoch #202: 1001it [00:01, 780.05it/s, env_step=202000, len=6, n/ep=1, n/st=10, player_2/loss=0.231, rew=1.00]          


Epoch #202: test_reward: 0.308800 ± 0.940767, best_reward: 0.494000 ± 0.842831 in #47


Epoch #203: 1001it [00:01, 793.46it/s, env_step=203000, len=7, n/ep=3, n/st=10, player_2/loss=0.196, rew=0.33]          


Epoch #203: test_reward: 0.325000 ± 0.932617, best_reward: 0.494000 ± 0.842831 in #47


Epoch #204: 1001it [00:01, 809.32it/s, env_step=204000, len=6, n/ep=2, n/st=10, player_2/loss=0.230, rew=1.00]          


Epoch #204: test_reward: 0.380400 ± 0.914601, best_reward: 0.494000 ± 0.842831 in #47


Epoch #205: 1001it [00:01, 792.21it/s, env_step=205000, len=6, n/ep=3, n/st=10, rew=1.00]                               


Epoch #205: test_reward: 0.392400 ± 0.907977, best_reward: 0.494000 ± 0.842831 in #47


Epoch #206: 1001it [00:01, 819.92it/s, env_step=206000, len=6, n/ep=0, n/st=10, player_2/loss=0.238, rew=0.00]          


Epoch #206: test_reward: 0.395400 ± 0.909208, best_reward: 0.494000 ± 0.842831 in #47


Epoch #207: 1001it [00:01, 796.69it/s, env_step=207000, len=6, n/ep=0, n/st=10, rew=1.00]                               


Epoch #207: test_reward: 0.377600 ± 0.917507, best_reward: 0.494000 ± 0.842831 in #47


Epoch #208: 1001it [00:01, 807.29it/s, env_step=208000, len=6, n/ep=0, n/st=10, player_2/loss=0.177, rew=1.00]          


Epoch #208: test_reward: 0.377200 ± 0.917453, best_reward: 0.494000 ± 0.842831 in #47


Epoch #209: 1001it [00:01, 835.01it/s, env_step=209000, len=6, n/ep=2, n/st=10, player_2/loss=0.193, rew=0.00]          


Epoch #209: test_reward: 0.393600 ± 0.908999, best_reward: 0.494000 ± 0.842831 in #47


Epoch #210: 1001it [00:01, 786.71it/s, env_step=210000, len=8, n/ep=1, n/st=10, player_2/loss=0.191, rew=1.00]          


Epoch #210: test_reward: 0.344400 ± 0.922924, best_reward: 0.494000 ± 0.842831 in #47


Epoch #211: 1001it [00:01, 831.36it/s, env_step=211000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #211: test_reward: 0.345600 ± 0.924857, best_reward: 0.494000 ± 0.842831 in #47


Epoch #212: 1001it [00:01, 811.32it/s, env_step=212000, len=6, n/ep=0, n/st=10, rew=1.00]                               


Epoch #212: test_reward: 0.307000 ± 0.941675, best_reward: 0.494000 ± 0.842831 in #47


Epoch #213: 1001it [00:01, 807.53it/s, env_step=213000, len=6, n/ep=1, n/st=10, player_2/loss=0.280, rew=1.00]          


Epoch #213: test_reward: 0.305600 ± 0.940962, best_reward: 0.494000 ± 0.842831 in #47


Epoch #214: 1001it [00:01, 821.98it/s, env_step=214000, len=6, n/ep=1, n/st=10, player_2/loss=0.236, rew=1.00]          


Epoch #214: test_reward: 0.294400 ± 0.946641, best_reward: 0.494000 ± 0.842831 in #47


Epoch #215: 1001it [00:01, 800.70it/s, env_step=215000, len=6, n/ep=1, n/st=10, player_2/loss=0.227, rew=1.00]          


Epoch #215: test_reward: 0.270800 ± 0.952191, best_reward: 0.494000 ± 0.842831 in #47


Epoch #216: 1001it [00:01, 789.37it/s, env_step=216000, len=7, n/ep=2, n/st=10, player_2/loss=0.200, rew=1.00]          


Epoch #216: test_reward: 0.353800 ± 0.924027, best_reward: 0.494000 ± 0.842831 in #47


Epoch #217: 1001it [00:01, 815.13it/s, env_step=217000, len=7, n/ep=3, n/st=10, player_2/loss=0.226, rew=0.33]          


Epoch #217: test_reward: 0.333800 ± 0.937218, best_reward: 0.494000 ± 0.842831 in #47


Epoch #218: 1001it [00:01, 840.40it/s, env_step=218000, len=7, n/ep=2, n/st=10, player_2/loss=0.227, rew=0.00]          


Epoch #218: test_reward: 0.354000 ± 0.929238, best_reward: 0.494000 ± 0.842831 in #47


Epoch #219: 1001it [00:01, 807.02it/s, env_step=219000, len=6, n/ep=2, n/st=10, player_2/loss=0.192, rew=-1.00]         


Epoch #219: test_reward: 0.398800 ± 0.912446, best_reward: 0.494000 ± 0.842831 in #47


Epoch #220: 1001it [00:01, 828.88it/s, env_step=220000, len=9, n/ep=1, n/st=10, player_2/loss=0.196, rew=-1.00]         


Epoch #220: test_reward: 0.400800 ± 0.908273, best_reward: 0.494000 ± 0.842831 in #47


Epoch #221: 1001it [00:01, 824.92it/s, env_step=221000, len=7, n/ep=2, n/st=10, rew=1.00]                               


Epoch #221: test_reward: 0.345600 ± 0.932181, best_reward: 0.494000 ± 0.842831 in #47


Epoch #222: 1001it [00:01, 784.49it/s, env_step=222000, len=6, n/ep=2, n/st=10, rew=1.00]                               


Epoch #222: test_reward: 0.349200 ± 0.928256, best_reward: 0.494000 ± 0.842831 in #47


Epoch #223: 1001it [00:01, 854.35it/s, env_step=223000, len=6, n/ep=2, n/st=10, rew=1.00]                               


Epoch #223: test_reward: 0.395000 ± 0.909382, best_reward: 0.494000 ± 0.842831 in #47


Epoch #224: 1001it [00:01, 775.57it/s, env_step=224000, len=7, n/ep=2, n/st=10, player_2/loss=0.206, rew=1.00]          


Epoch #224: test_reward: 0.346800 ± 0.927863, best_reward: 0.494000 ± 0.842831 in #47


Epoch #225: 1001it [00:01, 760.39it/s, env_step=225000, len=6, n/ep=1, n/st=10, player_2/loss=0.226, rew=1.00]          


Epoch #225: test_reward: 0.344200 ± 0.932591, best_reward: 0.494000 ± 0.842831 in #47


Epoch #226: 1001it [00:01, 785.93it/s, env_step=226000, len=6, n/ep=1, n/st=10, player_2/loss=0.240, rew=1.00]          


Epoch #226: test_reward: 0.354000 ± 0.927946, best_reward: 0.494000 ± 0.842831 in #47


Epoch #227: 1001it [00:01, 820.31it/s, env_step=227000, len=7, n/ep=2, n/st=10, player_2/loss=0.194, rew=-1.00]         


Epoch #227: test_reward: 0.312000 ± 0.942473, best_reward: 0.494000 ± 0.842831 in #47


Epoch #228: 1001it [00:01, 836.73it/s, env_step=228000, len=7, n/ep=3, n/st=10, rew=0.33]                               


Epoch #228: test_reward: 0.306600 ± 0.944138, best_reward: 0.494000 ± 0.842831 in #47


Epoch #229: 1001it [00:01, 830.82it/s, env_step=229000, len=7, n/ep=2, n/st=10, rew=1.00]                               


Epoch #229: test_reward: 0.337000 ± 0.935858, best_reward: 0.494000 ± 0.842831 in #47


Epoch #230: 1001it [00:01, 832.22it/s, env_step=230000, len=7, n/ep=0, n/st=10, player_2/loss=0.289, rew=-1.00]         


Epoch #230: test_reward: 0.350000 ± 0.925365, best_reward: 0.494000 ± 0.842831 in #47


Epoch #231: 1001it [00:01, 828.51it/s, env_step=231000, len=6, n/ep=2, n/st=10, player_2/loss=0.291, rew=1.00]          


Epoch #231: test_reward: 0.339600 ± 0.929877, best_reward: 0.494000 ± 0.842831 in #47


Epoch #232: 1001it [00:01, 863.25it/s, env_step=232000, len=7, n/ep=3, n/st=10, player_2/loss=0.184, rew=0.33]          


Epoch #232: test_reward: 0.378000 ± 0.914940, best_reward: 0.494000 ± 0.842831 in #47


Epoch #233: 1001it [00:01, 796.20it/s, env_step=233000, len=5, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #233: test_reward: 0.357600 ± 0.923538, best_reward: 0.494000 ± 0.842831 in #47


Epoch #234: 1001it [00:01, 810.86it/s, env_step=234000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #234: test_reward: 0.381800 ± 0.917185, best_reward: 0.494000 ± 0.842831 in #47


Epoch #235: 1001it [00:01, 810.38it/s, env_step=235000, len=6, n/ep=1, n/st=10, player_2/loss=0.223, rew=1.00]          


Epoch #235: test_reward: 0.321000 ± 0.943588, best_reward: 0.494000 ± 0.842831 in #47


Epoch #236: 1001it [00:01, 807.56it/s, env_step=236000, len=6, n/ep=1, n/st=10, player_2/loss=0.144, rew=1.00]          


Epoch #236: test_reward: 0.279600 ± 0.953847, best_reward: 0.494000 ± 0.842831 in #47


Epoch #237: 1001it [00:01, 814.96it/s, env_step=237000, len=6, n/ep=0, n/st=10, player_2/loss=0.123, rew=1.00]          


Epoch #237: test_reward: 0.363000 ± 0.925436, best_reward: 0.494000 ± 0.842831 in #47


Epoch #238: 1001it [00:01, 788.14it/s, env_step=238000, len=9, n/ep=2, n/st=10, player_2/loss=0.253, rew=-1.00]         


Epoch #238: test_reward: 0.323600 ± 0.943230, best_reward: 0.494000 ± 0.842831 in #47


Epoch #239: 1001it [00:01, 785.69it/s, env_step=239000, len=7, n/ep=2, n/st=10, player_2/loss=0.251, rew=0.00]          


Epoch #239: test_reward: 0.316600 ± 0.942319, best_reward: 0.494000 ± 0.842831 in #47


Epoch #240: 1001it [00:01, 778.49it/s, env_step=240000, len=6, n/ep=2, n/st=10, player_2/loss=0.229, rew=0.00]          


Epoch #240: test_reward: 0.314000 ± 0.947736, best_reward: 0.494000 ± 0.842831 in #47


Epoch #241: 1001it [00:01, 852.90it/s, env_step=241000, len=8, n/ep=1, n/st=10, player_2/loss=0.133, rew=1.00]          


Epoch #241: test_reward: 0.297800 ± 0.947162, best_reward: 0.494000 ± 0.842831 in #47


Epoch #242: 1001it [00:01, 838.60it/s, env_step=242000, len=6, n/ep=0, n/st=10, player_2/loss=0.222, rew=1.00]          


Epoch #242: test_reward: 0.344200 ± 0.935161, best_reward: 0.494000 ± 0.842831 in #47


Epoch #243: 1001it [00:01, 850.91it/s, env_step=243000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #243: test_reward: 0.357800 ± 0.931332, best_reward: 0.494000 ± 0.842831 in #47


Epoch #244: 1001it [00:01, 826.12it/s, env_step=244000, len=6, n/ep=1, n/st=10, player_2/loss=0.205, rew=1.00]          


Epoch #244: test_reward: 0.361000 ± 0.927297, best_reward: 0.494000 ± 0.842831 in #47


Epoch #245: 1001it [00:01, 796.02it/s, env_step=245000, len=6, n/ep=2, n/st=10, rew=0.00]                               


Epoch #245: test_reward: 0.323600 ± 0.936847, best_reward: 0.494000 ± 0.842831 in #47


Epoch #246: 1001it [00:01, 816.94it/s, env_step=246000, len=7, n/ep=0, n/st=10, player_2/loss=0.253, rew=-1.00]         


Epoch #246: test_reward: 0.334000 ± 0.932761, best_reward: 0.494000 ± 0.842831 in #47


Epoch #247: 1001it [00:01, 802.31it/s, env_step=247000, len=7, n/ep=1, n/st=10, player_2/loss=0.234, rew=-1.00]         


Epoch #247: test_reward: 0.341800 ± 0.927886, best_reward: 0.494000 ± 0.842831 in #47


Epoch #248: 1001it [00:01, 829.31it/s, env_step=248000, len=5, n/ep=3, n/st=10, rew=0.33]                               


Epoch #248: test_reward: 0.406400 ± 0.899132, best_reward: 0.494000 ± 0.842831 in #47


Epoch #249: 1001it [00:01, 817.85it/s, env_step=249000, len=7, n/ep=0, n/st=10, player_2/loss=0.161, rew=0.33]          


Epoch #249: test_reward: 0.395000 ± 0.906739, best_reward: 0.494000 ± 0.842831 in #47


Epoch #250: 1001it [00:01, 843.39it/s, env_step=250000, len=6, n/ep=3, n/st=10, player_2/loss=0.195, rew=0.33]          


Epoch #250: test_reward: 0.361400 ± 0.921081, best_reward: 0.494000 ± 0.842831 in #47


Epoch #251: 1001it [00:01, 799.61it/s, env_step=251000, len=6, n/ep=2, n/st=10, player_2/loss=0.186, rew=1.00]          


Epoch #251: test_reward: 0.328800 ± 0.933322, best_reward: 0.494000 ± 0.842831 in #47


Epoch #252: 1001it [00:01, 825.07it/s, env_step=252000, len=5, n/ep=2, n/st=10, rew=0.00]                               


Epoch #252: test_reward: 0.280000 ± 0.951630, best_reward: 0.494000 ± 0.842831 in #47


Epoch #253: 1001it [00:01, 799.47it/s, env_step=253000, len=6, n/ep=2, n/st=10, player_2/loss=0.261, rew=1.00]          


Epoch #253: test_reward: 0.322400 ± 0.934055, best_reward: 0.494000 ± 0.842831 in #47


Epoch #254: 1001it [00:01, 797.40it/s, env_step=254000, len=5, n/ep=3, n/st=10, player_2/loss=0.206, rew=0.33]          


Epoch #254: test_reward: 0.354200 ± 0.924739, best_reward: 0.494000 ± 0.842831 in #47


Epoch #255: 1001it [00:01, 826.29it/s, env_step=255000, len=8, n/ep=1, n/st=10, player_2/loss=0.165, rew=1.00]          


Epoch #255: test_reward: 0.343400 ± 0.928158, best_reward: 0.494000 ± 0.842831 in #47


Epoch #256: 1001it [00:01, 808.02it/s, env_step=256000, len=6, n/ep=2, n/st=10, player_2/loss=0.156, rew=1.00]          


Epoch #256: test_reward: 0.325800 ± 0.932553, best_reward: 0.494000 ± 0.842831 in #47


Epoch #257: 1001it [00:01, 852.00it/s, env_step=257000, len=7, n/ep=1, n/st=10, player_2/loss=0.275, rew=-1.00]         


Epoch #257: test_reward: 0.300000 ± 0.945093, best_reward: 0.494000 ± 0.842831 in #47


Epoch #258: 1001it [00:01, 823.09it/s, env_step=258000, len=6, n/ep=1, n/st=10, player_2/loss=0.233, rew=1.00]          


Epoch #258: test_reward: 0.333800 ± 0.934012, best_reward: 0.494000 ± 0.842831 in #47


Epoch #259: 1001it [00:01, 828.87it/s, env_step=259000, len=6, n/ep=1, n/st=10, player_2/loss=0.204, rew=1.00]          


Epoch #259: test_reward: 0.351000 ± 0.926606, best_reward: 0.494000 ± 0.842831 in #47


Epoch #260: 1001it [00:01, 804.62it/s, env_step=260000, len=6, n/ep=1, n/st=10, player_2/loss=0.232, rew=1.00]          


Epoch #260: test_reward: 0.356600 ± 0.926195, best_reward: 0.494000 ± 0.842831 in #47


Epoch #261: 1001it [00:01, 845.97it/s, env_step=261000, len=6, n/ep=1, n/st=10, player_2/loss=0.202, rew=1.00]          


Epoch #261: test_reward: 0.305800 ± 0.944186, best_reward: 0.494000 ± 0.842831 in #47


Epoch #262: 1001it [00:01, 808.17it/s, env_step=262000, len=5, n/ep=0, n/st=10, player_2/loss=0.175, rew=0.00]          


Epoch #262: test_reward: 0.348200 ± 0.927231, best_reward: 0.494000 ± 0.842831 in #47


Epoch #263: 1001it [00:01, 819.44it/s, env_step=263000, len=7, n/ep=2, n/st=10, player_2/loss=0.213, rew=1.00]          


Epoch #263: test_reward: 0.379200 ± 0.910498, best_reward: 0.494000 ± 0.842831 in #47


Epoch #264: 1001it [00:01, 812.31it/s, env_step=264000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #264: test_reward: 0.387200 ± 0.913497, best_reward: 0.494000 ± 0.842831 in #47


Epoch #265: 1001it [00:01, 871.67it/s, env_step=265000, len=7, n/ep=2, n/st=10, player_2/loss=0.251, rew=1.00]          


Epoch #265: test_reward: 0.366400 ± 0.921819, best_reward: 0.494000 ± 0.842831 in #47


Epoch #266: 1001it [00:01, 823.35it/s, env_step=266000, len=7, n/ep=0, n/st=10, player_2/loss=0.257, rew=-1.00]         


Epoch #266: test_reward: 0.360600 ± 0.928314, best_reward: 0.494000 ± 0.842831 in #47


Epoch #267: 1001it [00:01, 808.25it/s, env_step=267000, len=9, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #267: test_reward: 0.411200 ± 0.897393, best_reward: 0.494000 ± 0.842831 in #47


Epoch #268: 1001it [00:01, 789.18it/s, env_step=268000, len=6, n/ep=3, n/st=10, player_2/loss=0.252, rew=-0.33]         


Epoch #268: test_reward: 0.377400 ± 0.914204, best_reward: 0.494000 ± 0.842831 in #47


Epoch #269: 1001it [00:01, 782.26it/s, env_step=269000, len=5, n/ep=3, n/st=10, player_2/loss=0.223, rew=0.33]          


Epoch #269: test_reward: 0.368800 ± 0.922381, best_reward: 0.494000 ± 0.842831 in #47


Epoch #270: 1001it [00:01, 796.43it/s, env_step=270000, len=6, n/ep=3, n/st=10, rew=0.33]                               


Epoch #270: test_reward: 0.423000 ± 0.887959, best_reward: 0.494000 ± 0.842831 in #47


Epoch #271: 1001it [00:01, 833.41it/s, env_step=271000, len=7, n/ep=0, n/st=10, player_2/loss=0.131, rew=0.33]          


Epoch #271: test_reward: 0.425000 ± 0.893742, best_reward: 0.494000 ± 0.842831 in #47


Epoch #272: 1001it [00:01, 816.15it/s, env_step=272000, len=7, n/ep=1, n/st=10, player_2/loss=0.166, rew=-1.00]         


Epoch #272: test_reward: 0.382400 ± 0.913329, best_reward: 0.494000 ± 0.842831 in #47


Epoch #273: 1001it [00:01, 804.51it/s, env_step=273000, len=6, n/ep=1, n/st=10, player_2/loss=0.184, rew=1.00]          


Epoch #273: test_reward: 0.358800 ± 0.924155, best_reward: 0.494000 ± 0.842831 in #47


Epoch #274: 1001it [00:01, 800.55it/s, env_step=274000, len=6, n/ep=0, n/st=10, rew=1.00]                               


Epoch #274: test_reward: 0.391400 ± 0.910498, best_reward: 0.494000 ± 0.842831 in #47


Epoch #275: 1001it [00:01, 839.10it/s, env_step=275000, len=8, n/ep=1, n/st=10, player_2/loss=0.203, rew=1.00]          


Epoch #275: test_reward: 0.359000 ± 0.918542, best_reward: 0.494000 ± 0.842831 in #47


Epoch #276: 1001it [00:01, 830.61it/s, env_step=276000, len=8, n/ep=2, n/st=10, rew=1.00]                               


Epoch #276: test_reward: 0.364400 ± 0.918048, best_reward: 0.494000 ± 0.842831 in #47


Epoch #277: 1001it [00:01, 808.31it/s, env_step=277000, len=7, n/ep=1, n/st=10, player_2/loss=0.144, rew=-1.00]         


Epoch #277: test_reward: 0.342400 ± 0.931645, best_reward: 0.494000 ± 0.842831 in #47


Epoch #278: 1001it [00:01, 835.53it/s, env_step=278000, len=8, n/ep=0, n/st=10, player_2/loss=0.122, rew=0.33]          


Epoch #278: test_reward: 0.382200 ± 0.918979, best_reward: 0.494000 ± 0.842831 in #47


Epoch #279: 1001it [00:01, 815.60it/s, env_step=279000, len=6, n/ep=0, n/st=10, player_2/loss=0.158, rew=1.00]          


Epoch #279: test_reward: 0.361400 ± 0.926709, best_reward: 0.494000 ± 0.842831 in #47


Epoch #280: 1001it [00:01, 775.32it/s, env_step=280000, len=5, n/ep=0, n/st=10, player_2/loss=0.197, rew=0.33]          


Epoch #280: test_reward: 0.344200 ± 0.929799, best_reward: 0.494000 ± 0.842831 in #47


Epoch #281: 1001it [00:01, 823.42it/s, env_step=281000, len=7, n/ep=2, n/st=10, rew=-1.00]                              


Epoch #281: test_reward: 0.346400 ± 0.929089, best_reward: 0.494000 ± 0.842831 in #47


Epoch #282: 1001it [00:01, 779.74it/s, env_step=282000, len=6, n/ep=2, n/st=10, rew=1.00]                               


Epoch #282: test_reward: 0.389400 ± 0.910257, best_reward: 0.494000 ± 0.842831 in #47


Epoch #283: 1001it [00:01, 867.71it/s, env_step=283000, len=7, n/ep=2, n/st=10, player_2/loss=0.169, rew=0.00]          


Epoch #283: test_reward: 0.451600 ± 0.884340, best_reward: 0.494000 ± 0.842831 in #47


Epoch #284: 1001it [00:01, 830.02it/s, env_step=284000, len=6, n/ep=1, n/st=10, player_2/loss=0.171, rew=1.00]          


Epoch #284: test_reward: 0.439000 ± 0.887851, best_reward: 0.494000 ± 0.842831 in #47


Epoch #285: 1001it [00:01, 828.67it/s, env_step=285000, len=7, n/ep=0, n/st=10, player_2/loss=0.195, rew=1.00]          


Epoch #285: test_reward: 0.443800 ± 0.884331, best_reward: 0.494000 ± 0.842831 in #47


Epoch #286: 1001it [00:01, 800.03it/s, env_step=286000, len=8, n/ep=1, n/st=10, player_2/loss=0.165, rew=1.00]          


Epoch #286: test_reward: 0.389800 ± 0.909426, best_reward: 0.494000 ± 0.842831 in #47


Epoch #287: 1001it [00:01, 818.99it/s, env_step=287000, len=8, n/ep=4, n/st=10, player_2/loss=0.146, rew=0.00]          


Epoch #287: test_reward: 0.437000 ± 0.889512, best_reward: 0.494000 ± 0.842831 in #47


Epoch #288: 1001it [00:01, 797.93it/s, env_step=288000, len=6, n/ep=2, n/st=10, player_2/loss=0.180, rew=1.00]          


Epoch #288: test_reward: 0.406400 ± 0.902019, best_reward: 0.494000 ± 0.842831 in #47


Epoch #289: 1001it [00:01, 803.87it/s, env_step=289000, len=6, n/ep=0, n/st=10, player_2/loss=0.144, rew=1.00]          


Epoch #289: test_reward: 0.392400 ± 0.908638, best_reward: 0.494000 ± 0.842831 in #47


Epoch #290: 1001it [00:01, 805.87it/s, env_step=290000, len=6, n/ep=2, n/st=10, player_2/loss=0.166, rew=0.00]          


Epoch #290: test_reward: 0.317200 ± 0.941374, best_reward: 0.494000 ± 0.842831 in #47


Epoch #291: 1001it [00:01, 809.37it/s, env_step=291000, len=6, n/ep=1, n/st=10, player_2/loss=0.219, rew=1.00]          


Epoch #291: test_reward: 0.307000 ± 0.943796, best_reward: 0.494000 ± 0.842831 in #47


Epoch #292: 1001it [00:01, 813.21it/s, env_step=292000, len=6, n/ep=0, n/st=10, player_2/loss=0.231, rew=-0.33]         


Epoch #292: test_reward: 0.420000 ± 0.895545, best_reward: 0.494000 ± 0.842831 in #47


Epoch #293: 1001it [00:01, 833.27it/s, env_step=293000, len=6, n/ep=1, n/st=10, player_2/loss=0.163, rew=1.00]          


Epoch #293: test_reward: 0.379800 ± 0.915397, best_reward: 0.494000 ± 0.842831 in #47


Epoch #294: 1001it [00:01, 803.23it/s, env_step=294000, len=8, n/ep=1, n/st=10, player_2/loss=0.157, rew=1.00]          


Epoch #294: test_reward: 0.364000 ± 0.924502, best_reward: 0.494000 ± 0.842831 in #47


Epoch #295: 1001it [00:01, 825.19it/s, env_step=295000, len=8, n/ep=1, n/st=10, rew=1.00]                               


Epoch #295: test_reward: 0.380400 ± 0.914601, best_reward: 0.494000 ± 0.842831 in #47


Epoch #296: 1001it [00:01, 806.81it/s, env_step=296000, len=8, n/ep=1, n/st=10, player_2/loss=0.188, rew=1.00]          


Epoch #296: test_reward: 0.421400 ± 0.901012, best_reward: 0.494000 ± 0.842831 in #47


Epoch #297: 1001it [00:01, 813.60it/s, env_step=297000, len=6, n/ep=2, n/st=10, player_2/loss=0.181, rew=0.00]          


Epoch #297: test_reward: 0.396200 ± 0.906656, best_reward: 0.494000 ± 0.842831 in #47


Epoch #298: 1001it [00:01, 779.48it/s, env_step=298000, len=5, n/ep=3, n/st=10, rew=0.33]                               


Epoch #298: test_reward: 0.402000 ± 0.898441, best_reward: 0.494000 ± 0.842831 in #47


Epoch #299: 1001it [00:01, 803.48it/s, env_step=299000, len=6, n/ep=1, n/st=10, player_2/loss=0.229, rew=1.00]          


Epoch #299: test_reward: 0.425400 ± 0.882290, best_reward: 0.494000 ± 0.842831 in #47


Epoch #300: 1001it [00:01, 792.53it/s, env_step=300000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #300: test_reward: 0.360000 ± 0.904655, best_reward: 0.494000 ± 0.842831 in #47


Epoch #301: 1001it [00:01, 780.64it/s, env_step=301000, len=7, n/ep=0, n/st=10, player_2/loss=0.172, rew=1.00]          


Epoch #301: test_reward: 0.334400 ± 0.919009, best_reward: 0.494000 ± 0.842831 in #47


Epoch #302: 1001it [00:01, 801.16it/s, env_step=302000, len=6, n/ep=3, n/st=10, rew=0.33]                               


Epoch #302: test_reward: 0.354000 ± 0.907681, best_reward: 0.494000 ± 0.842831 in #47


Epoch #303: 1001it [00:01, 816.22it/s, env_step=303000, len=7, n/ep=2, n/st=10, player_2/loss=0.159, rew=-1.00]         


Epoch #303: test_reward: 0.397400 ± 0.898150, best_reward: 0.494000 ± 0.842831 in #47


Epoch #304: 1001it [00:01, 822.29it/s, env_step=304000, len=6, n/ep=2, n/st=10, player_2/loss=0.204, rew=1.00]          


Epoch #304: test_reward: 0.392200 ± 0.904422, best_reward: 0.494000 ± 0.842831 in #47


Epoch #305: 1001it [00:01, 852.89it/s, env_step=305000, len=7, n/ep=0, n/st=10, rew=-1.00]                              


Epoch #305: test_reward: 0.454400 ± 0.860884, best_reward: 0.494000 ± 0.842831 in #47


Epoch #306: 1001it [00:01, 815.60it/s, env_step=306000, len=5, n/ep=2, n/st=10, rew=0.00]                               


Epoch #306: test_reward: 0.416800 ± 0.886046, best_reward: 0.494000 ± 0.842831 in #47


Epoch #307: 1001it [00:01, 822.96it/s, env_step=307000, len=7, n/ep=1, n/st=10, player_2/loss=0.205, rew=-1.00]         


Epoch #307: test_reward: 0.454000 ± 0.865496, best_reward: 0.494000 ± 0.842831 in #47


Epoch #308: 1001it [00:01, 820.14it/s, env_step=308000, len=6, n/ep=1, n/st=10, player_2/loss=0.231, rew=1.00]          


Epoch #308: test_reward: 0.462000 ± 0.860556, best_reward: 0.494000 ± 0.842831 in #47


Epoch #309: 1001it [00:01, 781.41it/s, env_step=309000, len=8, n/ep=1, n/st=10, player_2/loss=0.233, rew=1.00]          


Epoch #309: test_reward: 0.387600 ± 0.901868, best_reward: 0.494000 ± 0.842831 in #47


Epoch #310: 1001it [00:01, 798.09it/s, env_step=310000, len=7, n/ep=2, n/st=10, player_2/loss=0.211, rew=-0.50]         


Epoch #310: test_reward: 0.354600 ± 0.922637, best_reward: 0.494000 ± 0.842831 in #47


Epoch #311: 1001it [00:01, 834.18it/s, env_step=311000, len=6, n/ep=0, n/st=10, player_2/loss=0.249, rew=1.00]          


Epoch #311: test_reward: 0.341400 ± 0.929971, best_reward: 0.494000 ± 0.842831 in #47


Epoch #312: 1001it [00:01, 843.49it/s, env_step=312000, len=6, n/ep=2, n/st=10, player_2/loss=0.206, rew=1.00]          


Epoch #312: test_reward: 0.393400 ± 0.901020, best_reward: 0.494000 ± 0.842831 in #47


Epoch #313: 1001it [00:01, 839.33it/s, env_step=313000, len=6, n/ep=2, n/st=10, player_2/loss=0.150, rew=1.00]          


Epoch #313: test_reward: 0.384600 ± 0.902155, best_reward: 0.494000 ± 0.842831 in #47


Epoch #314: 1001it [00:01, 780.44it/s, env_step=314000, len=7, n/ep=1, n/st=10, player_2/loss=0.138, rew=-1.00]         


Epoch #314: test_reward: 0.388400 ± 0.904182, best_reward: 0.494000 ± 0.842831 in #47


Epoch #315: 1001it [00:01, 828.58it/s, env_step=315000, len=8, n/ep=0, n/st=10, rew=0.00]                               


Epoch #315: test_reward: 0.434000 ± 0.886817, best_reward: 0.494000 ± 0.842831 in #47


Epoch #316: 1001it [00:01, 828.49it/s, env_step=316000, len=6, n/ep=1, n/st=10, player_2/loss=0.210, rew=1.00]          


Epoch #316: test_reward: 0.426200 ± 0.875302, best_reward: 0.494000 ± 0.842831 in #47


Epoch #317: 1001it [00:01, 806.86it/s, env_step=317000, len=6, n/ep=3, n/st=10, player_2/loss=0.211, rew=1.00]          


Epoch #317: test_reward: 0.360200 ± 0.913923, best_reward: 0.494000 ± 0.842831 in #47


Epoch #318: 1001it [00:01, 833.98it/s, env_step=318000, len=5, n/ep=0, n/st=10, player_2/loss=0.204, rew=0.00]          


Epoch #318: test_reward: 0.405400 ± 0.883544, best_reward: 0.494000 ± 0.842831 in #47


Epoch #319: 1001it [00:01, 819.41it/s, env_step=319000, len=6, n/ep=0, n/st=10, player_2/loss=0.172, rew=1.00]          


Epoch #319: test_reward: 0.441200 ± 0.866108, best_reward: 0.494000 ± 0.842831 in #47


Epoch #320: 1001it [00:01, 809.56it/s, env_step=320000, len=6, n/ep=3, n/st=10, player_2/loss=0.209, rew=-0.33]         


Epoch #320: test_reward: 0.444400 ± 0.861225, best_reward: 0.494000 ± 0.842831 in #47


Epoch #321: 1001it [00:01, 802.19it/s, env_step=321000, len=9, n/ep=1, n/st=10, player_2/loss=0.183, rew=0.00]          


Epoch #321: test_reward: 0.374000 ± 0.879388, best_reward: 0.494000 ± 0.842831 in #47


Epoch #322: 1001it [00:01, 838.90it/s, env_step=322000, len=7, n/ep=4, n/st=10, rew=0.50]                               


Epoch #322: test_reward: 0.345000 ± 0.907290, best_reward: 0.494000 ± 0.842831 in #47


Epoch #323: 1001it [00:01, 772.38it/s, env_step=323000, len=8, n/ep=1, n/st=10, player_2/loss=0.208, rew=1.00]          


Epoch #323: test_reward: 0.378000 ± 0.903281, best_reward: 0.494000 ± 0.842831 in #47


Epoch #324: 1001it [00:01, 844.10it/s, env_step=324000, len=8, n/ep=1, n/st=10, rew=1.00]                               


Epoch #324: test_reward: 0.420000 ± 0.874071, best_reward: 0.494000 ± 0.842831 in #47


Epoch #325: 1001it [00:01, 847.82it/s, env_step=325000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #325: test_reward: 0.435600 ± 0.882866, best_reward: 0.494000 ± 0.842831 in #47


Epoch #326: 1001it [00:01, 792.99it/s, env_step=326000, len=6, n/ep=3, n/st=10, player_2/loss=0.183, rew=0.33]          


Epoch #326: test_reward: 0.396000 ± 0.895089, best_reward: 0.494000 ± 0.842831 in #47


Epoch #327: 1001it [00:01, 782.14it/s, env_step=327000, len=5, n/ep=1, n/st=10, player_2/loss=0.188, rew=-1.00]         


Epoch #327: test_reward: 0.404000 ± 0.889935, best_reward: 0.494000 ± 0.842831 in #47


Epoch #328: 1001it [00:01, 821.23it/s, env_step=328000, len=7, n/ep=2, n/st=10, player_2/loss=0.165, rew=0.00]          


Epoch #328: test_reward: 0.410200 ± 0.890806, best_reward: 0.494000 ± 0.842831 in #47


Epoch #329: 1001it [00:01, 794.97it/s, env_step=329000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #329: test_reward: 0.365000 ± 0.900764, best_reward: 0.494000 ± 0.842831 in #47


Epoch #330: 1001it [00:01, 786.02it/s, env_step=330000, len=5, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #330: test_reward: 0.372600 ± 0.900538, best_reward: 0.494000 ± 0.842831 in #47


Epoch #331: 1001it [00:01, 822.34it/s, env_step=331000, len=5, n/ep=1, n/st=10, player_2/loss=0.110, rew=-1.00]         


Epoch #331: test_reward: 0.431800 ± 0.871406, best_reward: 0.494000 ± 0.842831 in #47


Epoch #332: 1001it [00:01, 825.89it/s, env_step=332000, len=7, n/ep=3, n/st=10, player_2/loss=0.167, rew=0.67]          


Epoch #332: test_reward: 0.384800 ± 0.896398, best_reward: 0.494000 ± 0.842831 in #47


Epoch #333: 1001it [00:01, 791.68it/s, env_step=333000, len=6, n/ep=1, n/st=10, player_2/loss=0.239, rew=1.00]          


Epoch #333: test_reward: 0.360600 ± 0.900981, best_reward: 0.494000 ± 0.842831 in #47


Epoch #334: 1001it [00:01, 794.20it/s, env_step=334000, len=7, n/ep=2, n/st=10, player_2/loss=0.253, rew=1.00]          


Epoch #334: test_reward: 0.406800 ± 0.872991, best_reward: 0.494000 ± 0.842831 in #47


Epoch #335: 1001it [00:01, 833.34it/s, env_step=335000, len=6, n/ep=1, n/st=10, player_2/loss=0.157, rew=1.00]          


Epoch #335: test_reward: 0.381400 ± 0.898629, best_reward: 0.494000 ± 0.842831 in #47


Epoch #336: 1001it [00:01, 809.95it/s, env_step=336000, len=6, n/ep=2, n/st=10, player_2/loss=0.141, rew=1.00]          


Epoch #336: test_reward: 0.368000 ± 0.910481, best_reward: 0.494000 ± 0.842831 in #47


Epoch #337: 1001it [00:01, 835.08it/s, env_step=337000, len=6, n/ep=1, n/st=10, player_2/loss=0.152, rew=1.00]          


Epoch #337: test_reward: 0.412000 ± 0.896134, best_reward: 0.494000 ± 0.842831 in #47


Epoch #338: 1001it [00:01, 825.59it/s, env_step=338000, len=5, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #338: test_reward: 0.448600 ± 0.872788, best_reward: 0.494000 ± 0.842831 in #47


Epoch #339: 1001it [00:01, 810.45it/s, env_step=339000, len=7, n/ep=1, n/st=10, player_2/loss=0.170, rew=-1.00]         


Epoch #339: test_reward: 0.466000 ± 0.865358, best_reward: 0.494000 ± 0.842831 in #47


Epoch #340: 1001it [00:01, 818.50it/s, env_step=340000, len=6, n/ep=1, n/st=10, player_2/loss=0.181, rew=1.00]          


Epoch #340: test_reward: 0.418600 ± 0.892062, best_reward: 0.494000 ± 0.842831 in #47


Epoch #341: 1001it [00:01, 850.13it/s, env_step=341000, len=6, n/ep=1, n/st=10, player_2/loss=0.177, rew=1.00]          


Epoch #341: test_reward: 0.436200 ± 0.884042, best_reward: 0.494000 ± 0.842831 in #47


Epoch #342: 1001it [00:01, 788.94it/s, env_step=342000, len=6, n/ep=2, n/st=10, rew=1.00]                               


Epoch #342: test_reward: 0.416400 ± 0.885557, best_reward: 0.494000 ± 0.842831 in #47


Epoch #343: 1001it [00:01, 854.16it/s, env_step=343000, len=7, n/ep=3, n/st=10, player_2/loss=0.231, rew=0.33]          


Epoch #343: test_reward: 0.409800 ± 0.893456, best_reward: 0.494000 ± 0.842831 in #47


Epoch #344: 1001it [00:01, 825.92it/s, env_step=344000, len=6, n/ep=5, n/st=10, player_2/loss=0.176, rew=0.20]          


Epoch #344: test_reward: 0.436200 ± 0.879960, best_reward: 0.494000 ± 0.842831 in #47


Epoch #345: 1001it [00:01, 790.74it/s, env_step=345000, len=7, n/ep=0, n/st=10, player_2/loss=0.189, rew=1.00]          


Epoch #345: test_reward: 0.453400 ± 0.863614, best_reward: 0.494000 ± 0.842831 in #47


Epoch #346: 1001it [00:01, 795.44it/s, env_step=346000, len=9, n/ep=1, n/st=10, player_2/loss=0.148, rew=-1.00]         


Epoch #346: test_reward: 0.415400 ± 0.887042, best_reward: 0.494000 ± 0.842831 in #47


Epoch #347: 1001it [00:01, 792.08it/s, env_step=347000, len=6, n/ep=2, n/st=10, player_2/loss=0.147, rew=1.00]          


Epoch #347: test_reward: 0.382000 ± 0.907566, best_reward: 0.494000 ± 0.842831 in #47


Epoch #348: 1001it [00:01, 818.08it/s, env_step=348000, len=6, n/ep=0, n/st=10, player_2/loss=0.156, rew=1.00]          


Epoch #348: test_reward: 0.412200 ± 0.902602, best_reward: 0.494000 ± 0.842831 in #47


Epoch #349: 1001it [00:01, 836.07it/s, env_step=349000, len=5, n/ep=0, n/st=10, player_2/loss=0.287, rew=0.33]          


Epoch #349: test_reward: 0.436600 ± 0.889933, best_reward: 0.494000 ± 0.842831 in #47


Epoch #350: 1001it [00:01, 772.73it/s, env_step=350000, len=6, n/ep=0, n/st=10, player_2/loss=0.205, rew=1.00]          


Epoch #350: test_reward: 0.406800 ± 0.894491, best_reward: 0.494000 ± 0.842831 in #47


Epoch #351: 1001it [00:01, 820.94it/s, env_step=351000, len=7, n/ep=1, n/st=10, player_2/loss=0.176, rew=-1.00]         


Epoch #351: test_reward: 0.389600 ± 0.911818, best_reward: 0.494000 ± 0.842831 in #47


Epoch #352: 1001it [00:01, 824.10it/s, env_step=352000, len=6, n/ep=0, n/st=10, player_2/loss=0.161, rew=0.50]          


Epoch #352: test_reward: 0.394400 ± 0.909752, best_reward: 0.494000 ± 0.842831 in #47


Epoch #353: 1001it [00:01, 812.99it/s, env_step=353000, len=6, n/ep=1, n/st=10, player_2/loss=0.191, rew=1.00]          


Epoch #353: test_reward: 0.404800 ± 0.904509, best_reward: 0.494000 ± 0.842831 in #47


Epoch #354: 1001it [00:01, 792.31it/s, env_step=354000, len=6, n/ep=2, n/st=10, rew=-1.00]                              


Epoch #354: test_reward: 0.382400 ± 0.910039, best_reward: 0.494000 ± 0.842831 in #47


Epoch #355: 1001it [00:01, 813.30it/s, env_step=355000, len=6, n/ep=1, n/st=10, player_2/loss=0.139, rew=1.00]          


Epoch #355: test_reward: 0.374600 ± 0.913167, best_reward: 0.494000 ± 0.842831 in #47


Epoch #356: 1001it [00:01, 796.25it/s, env_step=356000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #356: test_reward: 0.376800 ± 0.916090, best_reward: 0.494000 ± 0.842831 in #47


Epoch #357: 1001it [00:01, 810.03it/s, env_step=357000, len=6, n/ep=3, n/st=10, player_2/loss=0.215, rew=1.00]          


Epoch #357: test_reward: 0.393400 ± 0.905006, best_reward: 0.494000 ± 0.842831 in #47


Epoch #358: 1001it [00:01, 814.60it/s, env_step=358000, len=6, n/ep=1, n/st=10, player_2/loss=0.199, rew=1.00]          


Epoch #358: test_reward: 0.397400 ± 0.902593, best_reward: 0.494000 ± 0.842831 in #47


Epoch #359: 1001it [00:01, 844.35it/s, env_step=359000, len=6, n/ep=2, n/st=10, rew=-1.00]                              


Epoch #359: test_reward: 0.329200 ± 0.924136, best_reward: 0.494000 ± 0.842831 in #47


Epoch #360: 1001it [00:01, 874.52it/s, env_step=360000, len=6, n/ep=1, n/st=10, player_2/loss=0.156, rew=1.00]          


Epoch #360: test_reward: 0.403000 ± 0.902104, best_reward: 0.494000 ± 0.842831 in #47


Epoch #361: 1001it [00:01, 740.76it/s, env_step=361000, len=9, n/ep=1, n/st=10, player_2/loss=0.206, rew=0.00]          


Epoch #361: test_reward: 0.401400 ± 0.901265, best_reward: 0.494000 ± 0.842831 in #47


Epoch #362: 1001it [00:01, 843.71it/s, env_step=362000, len=7, n/ep=0, n/st=10, rew=-1.00]                              


Epoch #362: test_reward: 0.398800 ± 0.899199, best_reward: 0.494000 ± 0.842831 in #47


Epoch #363: 1001it [00:01, 842.94it/s, env_step=363000, len=5, n/ep=3, n/st=10, player_2/loss=0.223, rew=0.33]          


Epoch #363: test_reward: 0.413800 ± 0.892059, best_reward: 0.494000 ± 0.842831 in #47


Epoch #364: 1001it [00:01, 796.45it/s, env_step=364000, len=6, n/ep=3, n/st=10, rew=1.00]                               


Epoch #364: test_reward: 0.368200 ± 0.910510, best_reward: 0.494000 ± 0.842831 in #47


Epoch #365: 1001it [00:01, 807.48it/s, env_step=365000, len=9, n/ep=1, n/st=10, player_2/loss=0.182, rew=-1.00]         


Epoch #365: test_reward: 0.387800 ± 0.906317, best_reward: 0.494000 ± 0.842831 in #47


Epoch #366: 1001it [00:01, 815.00it/s, env_step=366000, len=6, n/ep=2, n/st=10, player_2/loss=0.133, rew=1.00]          


Epoch #366: test_reward: 0.372600 ± 0.910477, best_reward: 0.494000 ± 0.842831 in #47


Epoch #367: 1001it [00:01, 783.34it/s, env_step=367000, len=6, n/ep=0, n/st=10, player_2/loss=0.086, rew=0.50]          


Epoch #367: test_reward: 0.383200 ± 0.907721, best_reward: 0.494000 ± 0.842831 in #47


Epoch #368: 1001it [00:01, 802.59it/s, env_step=368000, len=7, n/ep=4, n/st=10, player_2/loss=0.125, rew=0.50]          


Epoch #368: test_reward: 0.383400 ± 0.908187, best_reward: 0.494000 ± 0.842831 in #47


Epoch #369: 1001it [00:01, 789.28it/s, env_step=369000, len=7, n/ep=2, n/st=10, player_2/loss=0.175, rew=1.00]          


Epoch #369: test_reward: 0.421400 ± 0.891191, best_reward: 0.494000 ± 0.842831 in #47


Epoch #370: 1001it [00:01, 838.28it/s, env_step=370000, len=6, n/ep=2, n/st=10, player_2/loss=0.195, rew=1.00]          


Epoch #370: test_reward: 0.399200 ± 0.901687, best_reward: 0.494000 ± 0.842831 in #47


Epoch #371: 1001it [00:01, 853.43it/s, env_step=371000, len=8, n/ep=1, n/st=10, player_2/loss=0.223, rew=1.00]          


Epoch #371: test_reward: 0.405800 ± 0.897511, best_reward: 0.494000 ± 0.842831 in #47


Epoch #372: 1001it [00:01, 794.02it/s, env_step=372000, len=8, n/ep=0, n/st=10, player_2/loss=0.155, rew=-1.00]         


Epoch #372: test_reward: 0.390600 ± 0.896232, best_reward: 0.494000 ± 0.842831 in #47


Epoch #373: 1001it [00:01, 824.31it/s, env_step=373000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #373: test_reward: 0.385200 ± 0.905108, best_reward: 0.494000 ± 0.842831 in #47


Epoch #374: 1001it [00:01, 790.68it/s, env_step=374000, len=6, n/ep=0, n/st=10, player_2/loss=0.224, rew=0.20]          


Epoch #374: test_reward: 0.432400 ± 0.875117, best_reward: 0.494000 ± 0.842831 in #47


Epoch #375: 1001it [00:01, 829.48it/s, env_step=375000, len=5, n/ep=2, n/st=10, rew=0.00]                               


Epoch #375: test_reward: 0.460000 ± 0.862322, best_reward: 0.494000 ± 0.842831 in #47


Epoch #376: 1001it [00:01, 828.17it/s, env_step=376000, len=6, n/ep=2, n/st=10, player_2/loss=0.229, rew=1.00]          


Epoch #376: test_reward: 0.411400 ± 0.888904, best_reward: 0.494000 ± 0.842831 in #47


Epoch #377: 1001it [00:01, 811.32it/s, env_step=377000, len=8, n/ep=1, n/st=10, player_2/loss=0.151, rew=1.00]          


Epoch #377: test_reward: 0.329200 ± 0.920667, best_reward: 0.494000 ± 0.842831 in #47


Epoch #378: 1001it [00:01, 826.20it/s, env_step=378000, len=6, n/ep=2, n/st=10, player_2/loss=0.168, rew=0.00]          


Epoch #378: test_reward: 0.338600 ± 0.925608, best_reward: 0.494000 ± 0.842831 in #47


Epoch #379: 1001it [00:01, 829.17it/s, env_step=379000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #379: test_reward: 0.360400 ± 0.919626, best_reward: 0.494000 ± 0.842831 in #47


Epoch #380: 1001it [00:01, 806.11it/s, env_step=380000, len=6, n/ep=3, n/st=10, player_2/loss=0.207, rew=-0.33]         


Epoch #380: test_reward: 0.405400 ± 0.895908, best_reward: 0.494000 ± 0.842831 in #47


Epoch #381: 1001it [00:01, 826.60it/s, env_step=381000, len=6, n/ep=1, n/st=10, player_2/loss=0.185, rew=1.00]          


Epoch #381: test_reward: 0.436400 ± 0.878837, best_reward: 0.494000 ± 0.842831 in #47


Epoch #382: 1001it [00:01, 803.18it/s, env_step=382000, len=6, n/ep=1, n/st=10, player_2/loss=0.216, rew=1.00]          


Epoch #382: test_reward: 0.459400 ± 0.870374, best_reward: 0.494000 ± 0.842831 in #47


Epoch #383: 1001it [00:01, 852.08it/s, env_step=383000, len=6, n/ep=4, n/st=10, player_2/loss=0.254, rew=0.00]          


Epoch #383: test_reward: 0.403600 ± 0.901724, best_reward: 0.494000 ± 0.842831 in #47


Epoch #384: 1001it [00:01, 810.74it/s, env_step=384000, len=6, n/ep=3, n/st=10, rew=1.00]                               


Epoch #384: test_reward: 0.418200 ± 0.896944, best_reward: 0.494000 ± 0.842831 in #47


Epoch #385: 1001it [00:01, 820.56it/s, env_step=385000, len=6, n/ep=0, n/st=10, player_2/loss=0.238, rew=1.00]          


Epoch #385: test_reward: 0.442200 ± 0.882417, best_reward: 0.494000 ± 0.842831 in #47


Epoch #386: 1001it [00:01, 791.19it/s, env_step=386000, len=6, n/ep=1, n/st=10, player_2/loss=0.201, rew=1.00]          


Epoch #386: test_reward: 0.452000 ± 0.871835, best_reward: 0.494000 ± 0.842831 in #47


Epoch #387: 1001it [00:01, 844.70it/s, env_step=387000, len=6, n/ep=3, n/st=10, player_2/loss=0.239, rew=-0.33]         


Epoch #387: test_reward: 0.422000 ± 0.889447, best_reward: 0.494000 ± 0.842831 in #47


Epoch #388: 1001it [00:01, 822.06it/s, env_step=388000, len=6, n/ep=0, n/st=10, player_2/loss=0.249, rew=1.00]          


Epoch #388: test_reward: 0.384600 ± 0.914266, best_reward: 0.494000 ± 0.842831 in #47


Epoch #389: 1001it [00:01, 830.35it/s, env_step=389000, len=6, n/ep=1, n/st=10, player_2/loss=0.152, rew=1.00]          


Epoch #389: test_reward: 0.424400 ± 0.890778, best_reward: 0.494000 ± 0.842831 in #47


Epoch #390: 1001it [00:01, 866.62it/s, env_step=390000, len=7, n/ep=0, n/st=10, player_2/loss=0.200, rew=-1.00]         


Epoch #390: test_reward: 0.348000 ± 0.921573, best_reward: 0.494000 ± 0.842831 in #47


Epoch #391: 1001it [00:01, 789.88it/s, env_step=391000, len=6, n/ep=1, n/st=10, player_2/loss=0.238, rew=1.00]          


Epoch #391: test_reward: 0.421600 ± 0.893898, best_reward: 0.494000 ± 0.842831 in #47


Epoch #392: 1001it [00:01, 800.85it/s, env_step=392000, len=6, n/ep=2, n/st=10, player_2/loss=0.218, rew=1.00]          


Epoch #392: test_reward: 0.443200 ± 0.880440, best_reward: 0.494000 ± 0.842831 in #47


Epoch #393: 1001it [00:01, 843.12it/s, env_step=393000, len=6, n/ep=0, n/st=10, player_2/loss=0.170, rew=1.00]          


Epoch #393: test_reward: 0.433800 ± 0.888154, best_reward: 0.494000 ± 0.842831 in #47


Epoch #394: 1001it [00:01, 799.53it/s, env_step=394000, len=7, n/ep=1, n/st=10, player_2/loss=0.158, rew=-1.00]         


Epoch #394: test_reward: 0.393200 ± 0.907851, best_reward: 0.494000 ± 0.842831 in #47


Epoch #395: 1001it [00:01, 842.27it/s, env_step=395000, len=7, n/ep=0, n/st=10, rew=0.33]                               


Epoch #395: test_reward: 0.387200 ± 0.908227, best_reward: 0.494000 ± 0.842831 in #47


Epoch #396: 1001it [00:01, 771.52it/s, env_step=396000, len=6, n/ep=0, n/st=10, rew=0.00]                               


Epoch #396: test_reward: 0.452200 ± 0.883468, best_reward: 0.494000 ± 0.842831 in #47


Epoch #397: 1001it [00:01, 846.58it/s, env_step=397000, len=7, n/ep=1, n/st=10, player_2/loss=0.171, rew=-1.00]         


Epoch #397: test_reward: 0.423000 ± 0.892676, best_reward: 0.494000 ± 0.842831 in #47


Epoch #398: 1001it [00:01, 839.28it/s, env_step=398000, len=6, n/ep=1, n/st=10, player_2/loss=0.200, rew=1.00]          


Epoch #398: test_reward: 0.431800 ± 0.890252, best_reward: 0.494000 ± 0.842831 in #47


Epoch #399: 1001it [00:01, 790.98it/s, env_step=399000, len=5, n/ep=2, n/st=10, player_2/loss=0.175, rew=0.00]          


Epoch #399: test_reward: 0.447600 ± 0.881620, best_reward: 0.494000 ± 0.842831 in #47


Epoch #400: 1001it [00:01, 794.85it/s, env_step=400000, len=8, n/ep=2, n/st=10, player_2/loss=0.149, rew=1.00]          


Epoch #400: test_reward: 0.430200 ± 0.889454, best_reward: 0.494000 ± 0.842831 in #47


Epoch #401: 1001it [00:01, 813.20it/s, env_step=401000, len=7, n/ep=2, n/st=10, rew=1.00]                               


Epoch #401: test_reward: 0.450000 ± 0.885607, best_reward: 0.494000 ± 0.842831 in #47


Epoch #402: 1001it [00:01, 731.92it/s, env_step=402000, len=7, n/ep=1, n/st=10, player_2/loss=0.192, rew=-1.00]         


Epoch #402: test_reward: 0.429200 ± 0.892293, best_reward: 0.494000 ± 0.842831 in #47


Epoch #403: 1001it [00:01, 802.09it/s, env_step=403000, len=7, n/ep=2, n/st=10, player_2/loss=0.209, rew=0.00]          


Epoch #403: test_reward: 0.442800 ± 0.879732, best_reward: 0.494000 ± 0.842831 in #47


Epoch #404: 1001it [00:01, 819.57it/s, env_step=404000, len=6, n/ep=0, n/st=10, rew=1.00]                               


Epoch #404: test_reward: 0.431400 ± 0.884361, best_reward: 0.494000 ± 0.842831 in #47


Epoch #405: 1001it [00:01, 801.46it/s, env_step=405000, len=6, n/ep=0, n/st=10, rew=1.00]                               


Epoch #405: test_reward: 0.362200 ± 0.923369, best_reward: 0.494000 ± 0.842831 in #47


Epoch #406: 1001it [00:01, 849.06it/s, env_step=406000, len=7, n/ep=1, n/st=10, player_2/loss=0.146, rew=-1.00]         


Epoch #406: test_reward: 0.413000 ± 0.905114, best_reward: 0.494000 ± 0.842831 in #47


Epoch #407: 1001it [00:01, 775.26it/s, env_step=407000, len=7, n/ep=2, n/st=10, player_2/loss=0.172, rew=1.00]          


Epoch #407: test_reward: 0.424000 ± 0.892986, best_reward: 0.494000 ± 0.842831 in #47


Epoch #408: 1001it [00:01, 798.18it/s, env_step=408000, len=6, n/ep=3, n/st=10, player_2/loss=0.201, rew=1.00]          


Epoch #408: test_reward: 0.440600 ± 0.884348, best_reward: 0.494000 ± 0.842831 in #47


Epoch #409: 1001it [00:01, 810.66it/s, env_step=409000, len=6, n/ep=0, n/st=10, player_2/loss=0.193, rew=0.33]          


Epoch #409: test_reward: 0.487600 ± 0.852670, best_reward: 0.494000 ± 0.842831 in #47


Epoch #410: 1001it [00:01, 822.99it/s, env_step=410000, len=6, n/ep=1, n/st=10, player_2/loss=0.150, rew=1.00]          


Epoch #410: test_reward: 0.421600 ± 0.891209, best_reward: 0.494000 ± 0.842831 in #47


Epoch #411: 1001it [00:01, 804.55it/s, env_step=411000, len=6, n/ep=1, n/st=10, player_2/loss=0.147, rew=1.00]          


Epoch #411: test_reward: 0.405400 ± 0.898137, best_reward: 0.494000 ± 0.842831 in #47


Epoch #412: 1001it [00:01, 802.63it/s, env_step=412000, len=8, n/ep=2, n/st=10, player_2/loss=0.153, rew=0.00]          


Epoch #412: test_reward: 0.406200 ± 0.901999, best_reward: 0.494000 ± 0.842831 in #47


Epoch #413: 1001it [00:01, 824.17it/s, env_step=413000, len=6, n/ep=3, n/st=10, player_2/loss=0.148, rew=0.33]          


Epoch #413: test_reward: 0.434000 ± 0.892437, best_reward: 0.494000 ± 0.842831 in #47


Epoch #414: 1001it [00:01, 770.26it/s, env_step=414000, len=6, n/ep=2, n/st=10, player_2/loss=0.182, rew=1.00]          


Epoch #414: test_reward: 0.422000 ± 0.899731, best_reward: 0.494000 ± 0.842831 in #47


Epoch #415: 1001it [00:01, 831.00it/s, env_step=415000, len=6, n/ep=1, n/st=10, player_2/loss=0.199, rew=1.00]          


Epoch #415: test_reward: 0.389600 ± 0.911599, best_reward: 0.494000 ± 0.842831 in #47


Epoch #416: 1001it [00:01, 813.73it/s, env_step=416000, len=7, n/ep=2, n/st=10, player_2/loss=0.218, rew=0.00]          


Epoch #416: test_reward: 0.471000 ± 0.871527, best_reward: 0.494000 ± 0.842831 in #47


Epoch #417: 1001it [00:01, 824.76it/s, env_step=417000, len=6, n/ep=3, n/st=10, rew=1.00]                               


Epoch #417: test_reward: 0.464800 ± 0.874049, best_reward: 0.494000 ± 0.842831 in #47


Epoch #418: 1001it [00:01, 789.16it/s, env_step=418000, len=5, n/ep=2, n/st=10, player_2/loss=0.203, rew=0.00]          


Epoch #418: test_reward: 0.425600 ± 0.888631, best_reward: 0.494000 ± 0.842831 in #47


Epoch #419: 1001it [00:01, 777.01it/s, env_step=419000, len=6, n/ep=3, n/st=10, player_2/loss=0.198, rew=1.00]          


Epoch #419: test_reward: 0.484400 ± 0.861949, best_reward: 0.494000 ± 0.842831 in #47


Epoch #420: 1001it [00:01, 814.12it/s, env_step=420000, len=8, n/ep=0, n/st=10, rew=-0.67]                              


Epoch #420: test_reward: 0.465000 ± 0.874514, best_reward: 0.494000 ± 0.842831 in #47


Epoch #421: 1001it [00:01, 813.42it/s, env_step=421000, len=7, n/ep=0, n/st=10, player_2/loss=0.176, rew=1.00]          


Epoch #421: test_reward: 0.472600 ± 0.868590, best_reward: 0.494000 ± 0.842831 in #47


Epoch #422: 1001it [00:01, 791.22it/s, env_step=422000, len=8, n/ep=1, n/st=10, player_2/loss=0.226, rew=1.00]          


Epoch #422: test_reward: 0.410200 ± 0.898630, best_reward: 0.494000 ± 0.842831 in #47


Epoch #423: 1001it [00:01, 804.19it/s, env_step=423000, len=7, n/ep=2, n/st=10, player_2/loss=0.208, rew=0.00]          


Epoch #423: test_reward: 0.493800 ± 0.851094, best_reward: 0.494000 ± 0.842831 in #47


Epoch #424: 1001it [00:01, 772.51it/s, env_step=424000, len=6, n/ep=0, n/st=10, rew=1.00]                               


Epoch #424: test_reward: 0.443600 ± 0.885223, best_reward: 0.494000 ± 0.842831 in #47


Epoch #425: 1001it [00:01, 848.54it/s, env_step=425000, len=7, n/ep=1, n/st=10, player_2/loss=0.205, rew=-1.00]         


Epoch #425: test_reward: 0.420400 ± 0.890204, best_reward: 0.494000 ± 0.842831 in #47


Epoch #426: 1001it [00:01, 821.34it/s, env_step=426000, len=6, n/ep=2, n/st=10, rew=-1.00]                              


Epoch #426: test_reward: 0.462400 ± 0.876006, best_reward: 0.494000 ± 0.842831 in #47


Epoch #427: 1001it [00:01, 804.98it/s, env_step=427000, len=6, n/ep=4, n/st=10, player_2/loss=0.267, rew=1.00]          


Epoch #427: test_reward: 0.493600 ± 0.857181, best_reward: 0.494000 ± 0.842831 in #47


Epoch #428: 1001it [00:01, 791.79it/s, env_step=428000, len=7, n/ep=1, n/st=10, player_2/loss=0.146, rew=-1.00]         


Epoch #428: test_reward: 0.454600 ± 0.881101, best_reward: 0.494000 ± 0.842831 in #47


Epoch #429: 1001it [00:01, 807.66it/s, env_step=429000, len=6, n/ep=3, n/st=10, player_2/loss=0.152, rew=-0.33]         


Epoch #429: test_reward: 0.465200 ± 0.877490, best_reward: 0.494000 ± 0.842831 in #47


Epoch #430: 1001it [00:01, 831.51it/s, env_step=430000, len=6, n/ep=3, n/st=10, player_2/loss=0.241, rew=1.00]          


Epoch #430: test_reward: 0.457000 ± 0.877127, best_reward: 0.494000 ± 0.842831 in #47


Epoch #431: 1001it [00:01, 828.59it/s, env_step=431000, len=6, n/ep=1, n/st=10, player_2/loss=0.198, rew=1.00]          


Epoch #431: test_reward: 0.440600 ± 0.891107, best_reward: 0.494000 ± 0.842831 in #47


Epoch #432: 1001it [00:01, 780.78it/s, env_step=432000, len=6, n/ep=3, n/st=10, player_2/loss=0.187, rew=1.00]          


Epoch #432: test_reward: 0.468000 ± 0.869584, best_reward: 0.494000 ± 0.842831 in #47


Epoch #433: 1001it [00:01, 882.81it/s, env_step=433000, len=6, n/ep=1, n/st=10, player_2/loss=0.232, rew=1.00]          


Epoch #433: test_reward: 0.438400 ± 0.884876, best_reward: 0.494000 ± 0.842831 in #47


Epoch #434: 1001it [00:01, 801.46it/s, env_step=434000, len=5, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #434: test_reward: 0.387800 ± 0.913352, best_reward: 0.494000 ± 0.842831 in #47


Epoch #435: 1001it [00:01, 789.10it/s, env_step=435000, len=5, n/ep=3, n/st=10, player_2/loss=0.222, rew=0.33]          


Epoch #435: test_reward: 0.364600 ± 0.923291, best_reward: 0.494000 ± 0.842831 in #47


Epoch #436: 1001it [00:01, 791.02it/s, env_step=436000, len=6, n/ep=2, n/st=10, player_2/loss=0.198, rew=1.00]          


Epoch #436: test_reward: 0.403600 ± 0.902611, best_reward: 0.494000 ± 0.842831 in #47


Epoch #437: 1001it [00:01, 813.92it/s, env_step=437000, len=6, n/ep=2, n/st=10, rew=1.00]                               


Epoch #437: test_reward: 0.468800 ± 0.871910, best_reward: 0.494000 ± 0.842831 in #47


Epoch #438: 1001it [00:01, 766.30it/s, env_step=438000, len=7, n/ep=0, n/st=10, player_2/loss=0.142, rew=0.60]          


Epoch #438: test_reward: 0.418600 ± 0.902981, best_reward: 0.494000 ± 0.842831 in #47


Epoch #439: 1001it [00:01, 823.51it/s, env_step=439000, len=7, n/ep=2, n/st=10, player_2/loss=0.119, rew=0.00]          


Epoch #439: test_reward: 0.403000 ± 0.909610, best_reward: 0.494000 ± 0.842831 in #47


Epoch #440: 1001it [00:01, 812.83it/s, env_step=440000, len=6, n/ep=1, n/st=10, player_2/loss=0.232, rew=1.00]          


Epoch #440: test_reward: 0.448000 ± 0.887297, best_reward: 0.494000 ± 0.842831 in #47


Epoch #441: 1001it [00:01, 795.14it/s, env_step=441000, len=6, n/ep=1, n/st=10, rew=1.00]                               


Epoch #441: test_reward: 0.456800 ± 0.887544, best_reward: 0.494000 ± 0.842831 in #47


Epoch #442: 1001it [00:01, 814.22it/s, env_step=442000, len=7, n/ep=3, n/st=10, player_2/loss=0.220, rew=0.33]          


Epoch #442: test_reward: 0.445000 ± 0.891389, best_reward: 0.494000 ± 0.842831 in #47


Epoch #443: 1001it [00:01, 762.06it/s, env_step=443000, len=6, n/ep=2, n/st=10, player_2/loss=0.210, rew=0.00]          


Epoch #443: test_reward: 0.392400 ± 0.917182, best_reward: 0.494000 ± 0.842831 in #47


Epoch #444: 1001it [00:01, 777.84it/s, env_step=444000, len=6, n/ep=1, n/st=10, player_2/loss=0.157, rew=1.00]          


Epoch #444: test_reward: 0.457400 ± 0.881014, best_reward: 0.494000 ± 0.842831 in #47


Epoch #445: 1001it [00:01, 779.85it/s, env_step=445000, len=7, n/ep=2, n/st=10, player_2/loss=0.142, rew=1.00]          


Epoch #445: test_reward: 0.395200 ± 0.910064, best_reward: 0.494000 ± 0.842831 in #47


Epoch #446: 1001it [00:01, 829.12it/s, env_step=446000, len=6, n/ep=1, n/st=10, player_2/loss=0.126, rew=1.00]          


Epoch #446: test_reward: 0.393200 ± 0.908292, best_reward: 0.494000 ± 0.842831 in #47


Epoch #447: 1001it [00:01, 758.21it/s, env_step=447000, len=6, n/ep=2, n/st=10, rew=0.00]                               


Epoch #447: test_reward: 0.361000 ± 0.920586, best_reward: 0.494000 ± 0.842831 in #47


Epoch #448: 1001it [00:01, 806.84it/s, env_step=448000, len=6, n/ep=4, n/st=10, player_2/loss=0.234, rew=1.00]          


Epoch #448: test_reward: 0.359000 ± 0.920499, best_reward: 0.494000 ± 0.842831 in #47


Epoch #449: 1001it [00:01, 811.20it/s, env_step=449000, len=7, n/ep=1, n/st=10, rew=-1.00]                              


Epoch #449: test_reward: 0.422000 ± 0.892589, best_reward: 0.494000 ± 0.842831 in #47


Epoch #450: 1001it [00:01, 790.61it/s, env_step=450000, len=6, n/ep=2, n/st=10, rew=0.00]                               


Epoch #450: test_reward: 0.386200 ± 0.905014, best_reward: 0.494000 ± 0.842831 in #47


Epoch #451: 1001it [00:01, 821.34it/s, env_step=451000, len=6, n/ep=0, n/st=10, player_2/loss=0.160, rew=0.00]          
