# Training a DQN agent for Tic-Tac-Toe and playing against an LLM

Based on [RL against random policy opponent with PettingZoo](https://tianshou.org/en/stable/01_tutorials/04_tictactoe.html).

In [None]:
# Load the autoreload extension; mode 2 re-imports all modules before each cell executes.
%load_ext autoreload
%autoreload 2
# Load the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard

# Install dependencies

In [None]:
!pip install gymnasium==0.29.1 pygame==2.3.0 pettingzoo==1.24.3 tianshou==0.5.1 transformers==4.39.1 accelerate==0.28.0

Collecting gymnasium==0.29.1
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pygame==2.3.0
  Downloading pygame-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pettingzoo==1.24.3
  Downloading pettingzoo-1.24.3-py3-none-any.whl (847 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m847.8/847.8 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tianshou==0.5.1
  Downloading tianshou-0.5.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.1/163.1 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.39.1
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━

# Import needed dependencies

In [None]:
from typing import Any, Dict
import gymnasium as gym
from gymnasium.spaces import Discrete, Space
import torch
from torch import nn
import numpy as np
import re
from pettingzoo.classic import tictactoe_v3
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Batch, Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import (
    BasePolicy,
    DQNPolicy,
    MultiAgentPolicyManager,
    RandomPolicy
)
from tianshou.trainer import OffpolicyTrainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Setup environment

In [None]:
def get_env(render_mode=None):
  """Build a fresh Tic-Tac-Toe environment wrapped for Tianshou.

  Args:
    render_mode: Forwarded unchanged to ``tictactoe_v3.env``; ``None`` by default.

  Returns:
    A ``PettingZooEnv`` wrapping the newly created PettingZoo environment.
  """
  raw_env = tictactoe_v3.env(render_mode=render_mode)
  return PettingZooEnv(raw_env)

# Build one reference environment and derive the network input/output sizes from its spaces.
env = get_env()

# When the observation space is a Dict, only its 'observation' entry is used.
if isinstance(env.observation_space, gym.spaces.Dict):
  observation_space = env.observation_space['observation']
else:
  observation_space = env.observation_space

# Prefer the space's shape; fall back to `n` when the shape is empty.
state_shape = observation_space.shape
if not state_shape:
  state_shape = observation_space.n

action_shape = env.action_space.shape
if not action_shape:
  action_shape = env.action_space.n

# Setup policies for training DQNPolicy

One learning policy (`DQNPolicy`) is trained against a fixed opponent (`RandomPolicy`).

In [None]:
# Widths of the MLP's hidden layers: four layers of 128 units each.
hidden_sizes = [128] * 4
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Number of steps to look ahead (n-step return).
estimation_step = 3
# Target network update frequency (0 disables the target network).
target_update_freq = 320
# Learning rate of the Adam optimizer.
lr = 1e-4

In [None]:
def get_dqn_policy():
  """Create a new, untrained ``DQNPolicy`` backed by its own MLP and Adam optimizer.

  Reads the module-level settings ``state_shape``, ``action_shape``, ``hidden_sizes``,
  ``device``, ``lr``, ``estimation_step``, ``target_update_freq`` and ``env``.

  Returns:
    A ``DQNPolicy`` whose model is a freshly initialised ``Net``.
  """
  # The MLP is the function approximator used by the agent, not the agent itself.
  q_network = Net(state_shape, action_shape, hidden_sizes=hidden_sizes, device=device)
  q_network = q_network.to(device)

  # Adam optimizer over the network's parameters.
  optimizer = torch.optim.Adam(q_network.parameters(), lr=lr)

  # The learning agent.
  dqn_settings = dict(
    model=q_network,
    optim=optimizer,
    action_space=env.action_space,
    estimation_step=estimation_step,
    target_update_freq=target_update_freq,
  )
  return DQNPolicy(**dqn_settings)

# Train the agent with DQN

Training uses Tianshou's `OffpolicyTrainer`.

In [None]:
# Directory where the TensorBoard logs and the best policy checkpoints are written.
path = '/content/tic_tac_toe/dqn'

def train_policy(policy, agent_id):
  """Train one agent of a multi-agent policy with Tianshou's ``OffpolicyTrainer``.

  Side effects:
    * TensorBoard events are written to ``<path>/<agent_id>`` so that each trained
      agent shows up as its own run instead of sharing one run directory.
    * Whenever the trainer reports a new best policy, the state dict of
      ``policy.policies[agent_id]`` is saved to ``<path>/policy-<agent_id>.pth``.

  Args:
    policy: Multi-agent policy manager; ``policy.policies[agent_id]`` must be the
      learning policy and must provide ``set_eps`` and ``state_dict``.
    agent_id: Key of the learning agent in ``policy.policies``. ``'player_2'`` selects
      reward column 1 as the monitored metric; any other value selects column 0.

  Returns:
    The value returned by ``OffpolicyTrainer(...).run()`` (the training summary).
  """
  # number of training environments
  training_num = 100
  # number of testing environments
  test_num = 100
  # size of the VectorReplayBuffer
  buffer_size = 20000
  # the batch size of sample data, which is going to feed in the policy network.
  batch_size = 64
  # the maximum number of epochs for training. Training may finish earlier if stop_fn returns True.
  epoch = 100
  # the number of transitions collected per epoch.
  step_per_epoch = 1000
  # the number of transitions the collector collects before each round of network updates,
  # i.e. the trainer collects "step_per_collect" transitions and then updates the policy
  # network, repeatedly within each epoch.
  step_per_collect = 10
  # stop training once the mean test reward reaches this threshold
  win_rate = 0.99
  # the eps for epsilon-greedy exploration during testing and training
  eps_test = 0.05
  eps_train = 0.1
  # the number of times the policy network is updated per collected transition, e.g. with
  # update_per_step = 0.3 and step_per_collect = 256 the policy is updated
  # round(256 * 0.3 = 76.8) = 77 times after 256 transitions are collected.
  update_per_step = 0.1
  # whether the policy's exploration noise is applied to the actions chosen by the collectors
  # ("policy.exploration_noise(act, batch)" is then called automatically).
  exploration_noise = True

  # Hooks for the OffpolicyTrainer.
  def save_best_fn(best_policy):
    # Persist only the learning agent's weights.
    torch.save(best_policy.policies[agent_id].state_dict(), f'{path}/policy-{agent_id}.pth')

  def stop_fn(mean_rewards):
    # Stop as soon as the monitored reward reaches the target win rate.
    return mean_rewards >= win_rate

  def train_fn(epoch, env_step):
    # Called at the beginning of training in each epoch: use the training epsilon.
    policy.policies[agent_id].set_eps(eps_train)

  def test_fn(epoch, env_step):
    # Called before testing: use the testing epsilon.
    policy.policies[agent_id].set_eps(eps_test)

  def reward_metric(rews):
    # Multi-agent RL needs a single scalar per episode to monitor training; pick the
    # reward column that belongs to the learning agent.
    if agent_id == 'player_2':
      return rews[:, 1]
    return rews[:, 0]

  # Dummy vectorized environments: same interface as a true vectorized environment, but the
  # rollout runs in a plain for-loop (no parallelism). Good enough for demonstration purposes.
  train_envs = DummyVectorEnv([get_env for _ in range(training_num)])
  test_envs = DummyVectorEnv([get_env for _ in range(test_num)])

  # One TensorBoard run directory per agent. SummaryWriter creates the directory (and
  # therefore `path`, which save_best_fn writes into) if it does not exist yet.
  writer = SummaryWriter(f'{path}/{agent_id}')

  try:
    # VectorReplayBuffer holds one ReplayBuffer per training environment, storing transitions
    # from the different environments while keeping their order in time.
    replay_buffer = VectorReplayBuffer(buffer_size, len(train_envs))

    # Collectors let the policy interact with the environments for a given number of
    # steps or episodes.
    train_collector = Collector(policy, train_envs, replay_buffer, exploration_noise=exploration_noise)
    test_collector = Collector(policy, test_envs, exploration_noise=exploration_noise)

    # Pre-fill the replay buffer before the first network update.
    train_collector.collect(n_step=batch_size * training_num)

    # Logs statistics during training/testing/updating.
    logger = TensorboardLogger(writer)

    # Off-policy trainer: samples mini-batches from the buffer and passes them to update.
    return OffpolicyTrainer(
      policy,
      train_collector,
      test_collector,
      epoch,
      step_per_epoch,
      step_per_collect,
      test_num,  # number of episodes per test
      batch_size,
      train_fn=train_fn,
      test_fn=test_fn,
      stop_fn=stop_fn,
      save_best_fn=save_best_fn,
      update_per_step=update_per_step,
      logger=logger,
      test_in_train=False,  # whether to test in the training phase.
      reward_metric=reward_metric
    ).run()
  finally:
    # Release the event file handle and the environments even if training fails.
    writer.close()
    train_envs.close()
    test_envs.close()

In [None]:
# Two independent learners (one per seat) and a single random opponent used by both setups.
agent_learn_player1, agent_learn_player2 = get_dqn_policy(), get_dqn_policy()
agent_random = RandomPolicy()

# The position in the list is the seat: index 0 plays first, index 1 plays second.
agents_1 = [agent_learn_player1, agent_random]
agents_2 = [agent_random, agent_learn_player2]

# Multi-agent policy managers for Multi-Agent Reinforcement Learning
# (https://tianshou.org/en/stable/01_tutorials/07_cheatsheet.html#marl-example)
policy_1 = MultiAgentPolicyManager(agents_1, env)
policy_2 = MultiAgentPolicyManager(agents_2, env)

# Train the first-seat learner, then the second-seat learner, each against the random opponent.
train_policy(policy_1, 'player_1')
train_policy(policy_2, 'player_2')

Epoch #1: 1001it [00:03, 332.76it/s, env_step=1000, len=7, n/ep=15, n/st=100, player_1/loss=0.296, rew=0.27]                          


Epoch #1: test_reward: 0.850000 ± 0.433013, best_reward: 0.850000 ± 0.433013 in #1


Epoch #2: 1001it [00:00, 1029.49it/s, env_step=2000, len=7, n/ep=12, n/st=100, player_1/loss=0.233, rew=-0.25]                          


Epoch #2: test_reward: 0.410000 ± 0.837795, best_reward: 0.850000 ± 0.433013 in #1


Epoch #3: 1001it [00:00, 1038.55it/s, env_step=3000, len=8, n/ep=12, n/st=100, player_1/loss=0.217, rew=0.50]                          


Epoch #3: test_reward: 0.340000 ± 0.827285, best_reward: 0.850000 ± 0.433013 in #1


Epoch #4: 1001it [00:00, 1013.63it/s, env_step=4000, len=6, n/ep=14, n/st=100, player_1/loss=0.197, rew=0.64]                          


Epoch #4: test_reward: 0.410000 ± 0.884251, best_reward: 0.850000 ± 0.433013 in #1


Epoch #5: 1001it [00:00, 1009.60it/s, env_step=5000, len=6, n/ep=14, n/st=100, player_1/loss=0.190, rew=0.50]                          


Epoch #5: test_reward: 0.390000 ± 0.904378, best_reward: 0.850000 ± 0.433013 in #1


Epoch #6: 1001it [00:00, 1009.32it/s, env_step=6000, len=6, n/ep=21, n/st=100, player_1/loss=0.174, rew=0.43]                          


Epoch #6: test_reward: 0.400000 ± 0.894427, best_reward: 0.850000 ± 0.433013 in #1


Epoch #7: 1001it [00:00, 1020.67it/s, env_step=7000, len=7, n/ep=12, n/st=100, player_1/loss=0.189, rew=0.17]                          


Epoch #7: test_reward: 0.500000 ± 0.818535, best_reward: 0.850000 ± 0.433013 in #1


Epoch #8: 1001it [00:00, 1006.48it/s, env_step=8000, len=6, n/ep=17, n/st=100, player_1/loss=0.176, rew=0.41]                          


Epoch #8: test_reward: 0.610000 ± 0.746927, best_reward: 0.850000 ± 0.433013 in #1


Epoch #9: 1001it [00:01, 995.04it/s, env_step=9000, len=6, n/ep=24, n/st=100, player_1/loss=0.167, rew=0.79]                           


Epoch #9: test_reward: 0.690000 ± 0.673721, best_reward: 0.850000 ± 0.433013 in #1


Epoch #10: 1001it [00:00, 1017.37it/s, env_step=10000, len=5, n/ep=9, n/st=100, player_1/loss=0.164, rew=0.78]                          


Epoch #10: test_reward: 0.760000 ± 0.634350, best_reward: 0.850000 ± 0.433013 in #1


Epoch #11: 1001it [00:00, 1024.29it/s, env_step=11000, len=6, n/ep=13, n/st=100, player_1/loss=0.177, rew=0.54]                          


Epoch #11: test_reward: 0.700000 ± 0.608276, best_reward: 0.850000 ± 0.433013 in #1


Epoch #12: 1001it [00:01, 999.14it/s, env_step=12000, len=6, n/ep=17, n/st=100, player_1/loss=0.177, rew=0.53]                           


Epoch #12: test_reward: 0.850000 ± 0.497494, best_reward: 0.850000 ± 0.433013 in #1


Epoch #13: 1001it [00:00, 1002.37it/s, env_step=13000, len=6, n/ep=17, n/st=100, player_1/loss=0.137, rew=0.71]                          


Epoch #13: test_reward: 0.740000 ± 0.642184, best_reward: 0.850000 ± 0.433013 in #1


Epoch #14: 1001it [00:00, 1014.86it/s, env_step=14000, len=6, n/ep=15, n/st=100, player_1/loss=0.152, rew=0.67]                          


Epoch #14: test_reward: 0.730000 ± 0.645833, best_reward: 0.850000 ± 0.433013 in #1


Epoch #15: 1001it [00:00, 1014.18it/s, env_step=15000, len=6, n/ep=9, n/st=100, player_1/loss=0.157, rew=0.78]                          


Epoch #15: test_reward: 0.470000 ± 0.830120, best_reward: 0.850000 ± 0.433013 in #1


Epoch #16: 1001it [00:00, 1019.89it/s, env_step=16000, len=6, n/ep=16, n/st=100, player_1/loss=0.140, rew=0.69]                          


Epoch #16: test_reward: 0.790000 ± 0.588133, best_reward: 0.850000 ± 0.433013 in #1


Epoch #17: 1001it [00:00, 1021.77it/s, env_step=17000, len=6, n/ep=19, n/st=100, player_1/loss=0.152, rew=0.89]                          


Epoch #17: test_reward: 0.810000 ± 0.542125, best_reward: 0.850000 ± 0.433013 in #1


Epoch #18: 1001it [00:00, 1001.41it/s, env_step=18000, len=6, n/ep=22, n/st=100, player_1/loss=0.149, rew=0.73]                          


Epoch #18: test_reward: 0.500000 ± 0.768115, best_reward: 0.850000 ± 0.433013 in #1


Epoch #19: 1001it [00:00, 1021.83it/s, env_step=19000, len=6, n/ep=11, n/st=100, player_1/loss=0.145, rew=0.73]                          


Epoch #19: test_reward: 0.730000 ± 0.661135, best_reward: 0.850000 ± 0.433013 in #1


Epoch #20: 1001it [00:00, 1034.68it/s, env_step=20000, len=7, n/ep=11, n/st=100, player_1/loss=0.129, rew=0.73]                          


Epoch #20: test_reward: 0.840000 ± 0.440908, best_reward: 0.850000 ± 0.433013 in #1


Epoch #21: 1001it [00:00, 1010.77it/s, env_step=21000, len=5, n/ep=18, n/st=100, player_1/loss=0.127, rew=0.78]                          


Epoch #21: test_reward: 0.870000 ± 0.461628, best_reward: 0.870000 ± 0.461628 in #21


Epoch #22: 1001it [00:00, 1008.24it/s, env_step=22000, len=6, n/ep=16, n/st=100, player_1/loss=0.124, rew=0.81]                          


Epoch #22: test_reward: 0.800000 ± 0.547723, best_reward: 0.870000 ± 0.461628 in #21


Epoch #23: 1001it [00:00, 1011.47it/s, env_step=23000, len=6, n/ep=13, n/st=100, player_1/loss=0.118, rew=0.38]                          


Epoch #23: test_reward: 0.860000 ± 0.469468, best_reward: 0.870000 ± 0.461628 in #21


Epoch #24: 1001it [00:01, 997.72it/s, env_step=24000, len=6, n/ep=17, n/st=100, player_1/loss=0.124, rew=0.76]                           


Epoch #24: test_reward: 0.750000 ± 0.622495, best_reward: 0.870000 ± 0.461628 in #21


Epoch #25: 1001it [00:00, 1011.29it/s, env_step=25000, len=6, n/ep=9, n/st=100, player_1/loss=0.112, rew=0.33]                          


Epoch #25: test_reward: 0.630000 ± 0.743707, best_reward: 0.870000 ± 0.461628 in #21


Epoch #26: 1001it [00:01, 996.61it/s, env_step=26000, len=6, n/ep=20, n/st=100, player_1/loss=0.112, rew=0.55]                           


Epoch #26: test_reward: 0.780000 ± 0.609590, best_reward: 0.870000 ± 0.461628 in #21


Epoch #27: 1001it [00:00, 1034.08it/s, env_step=27000, len=7, n/ep=10, n/st=100, player_1/loss=0.106, rew=0.60]                          


Epoch #27: test_reward: 0.760000 ± 0.618385, best_reward: 0.870000 ± 0.461628 in #21


Epoch #28: 1001it [00:00, 1008.64it/s, env_step=28000, len=6, n/ep=14, n/st=100, player_1/loss=0.114, rew=0.71]                          


Epoch #28: test_reward: 0.800000 ± 0.565685, best_reward: 0.870000 ± 0.461628 in #21


Epoch #29: 1001it [00:00, 1025.25it/s, env_step=29000, len=6, n/ep=15, n/st=100, player_1/loss=0.092, rew=0.60]                          


Epoch #29: test_reward: 0.840000 ± 0.523832, best_reward: 0.870000 ± 0.461628 in #21


Epoch #30: 1001it [00:00, 1002.87it/s, env_step=30000, len=6, n/ep=22, n/st=100, player_1/loss=0.104, rew=0.77]                          


Epoch #30: test_reward: 0.790000 ± 0.588133, best_reward: 0.870000 ± 0.461628 in #21


Epoch #31: 1001it [00:00, 1013.54it/s, env_step=31000, len=5, n/ep=15, n/st=100, player_1/loss=0.096, rew=0.73]                          


Epoch #31: test_reward: 0.910000 ± 0.376696, best_reward: 0.910000 ± 0.376696 in #31


Epoch #32: 1001it [00:00, 1012.39it/s, env_step=32000, len=6, n/ep=24, n/st=100, player_1/loss=0.084, rew=0.79]                          


Epoch #32: test_reward: 0.820000 ± 0.572364, best_reward: 0.910000 ± 0.376696 in #31


Epoch #33: 1001it [00:00, 1007.25it/s, env_step=33000, len=5, n/ep=15, n/st=100, player_1/loss=0.100, rew=0.87]                          


Epoch #33: test_reward: 0.870000 ± 0.482804, best_reward: 0.910000 ± 0.376696 in #31


Epoch #34: 1001it [00:00, 1008.10it/s, env_step=34000, len=6, n/ep=15, n/st=100, player_1/loss=0.089, rew=0.73]                          


Epoch #34: test_reward: 0.900000 ± 0.435890, best_reward: 0.910000 ± 0.376696 in #31


Epoch #35: 1001it [00:00, 1006.52it/s, env_step=35000, len=5, n/ep=18, n/st=100, player_1/loss=0.102, rew=0.83]                          


Epoch #35: test_reward: 0.810000 ± 0.577841, best_reward: 0.910000 ± 0.376696 in #31


Epoch #36: 1001it [00:01, 995.98it/s, env_step=36000, len=5, n/ep=18, n/st=100, player_1/loss=0.101, rew=0.89]                           


Epoch #36: test_reward: 0.810000 ± 0.577841, best_reward: 0.910000 ± 0.376696 in #31


Epoch #37: 1001it [00:00, 1002.01it/s, env_step=37000, len=6, n/ep=7, n/st=100, player_1/loss=0.097, rew=1.00]                          


Epoch #37: test_reward: 0.910000 ± 0.402368, best_reward: 0.910000 ± 0.376696 in #31


Epoch #38: 1001it [00:01, 1000.45it/s, env_step=38000, len=5, n/ep=14, n/st=100, player_1/loss=0.101, rew=0.71]                          


Epoch #38: test_reward: 0.780000 ± 0.609590, best_reward: 0.910000 ± 0.376696 in #31


Epoch #39: 1001it [00:01, 970.01it/s, env_step=39000, len=5, n/ep=12, n/st=100, player_1/loss=0.103, rew=1.00]                           


Epoch #39: test_reward: 0.830000 ± 0.510979, best_reward: 0.910000 ± 0.376696 in #31


Epoch #40: 1001it [00:00, 1013.93it/s, env_step=40000, len=6, n/ep=14, n/st=100, player_1/loss=0.099, rew=1.00]                          


Epoch #40: test_reward: 0.820000 ± 0.572364, best_reward: 0.910000 ± 0.376696 in #31


Epoch #41: 1001it [00:00, 1002.80it/s, env_step=41000, len=5, n/ep=13, n/st=100, player_1/loss=0.091, rew=0.62]                          


Epoch #41: test_reward: 0.920000 ± 0.391918, best_reward: 0.920000 ± 0.391918 in #41


Epoch #42: 1001it [00:00, 1006.78it/s, env_step=42000, len=6, n/ep=13, n/st=100, player_1/loss=0.097, rew=0.69]                          


Epoch #42: test_reward: 0.550000 ± 0.804674, best_reward: 0.920000 ± 0.391918 in #41


Epoch #43: 1001it [00:00, 1021.77it/s, env_step=43000, len=7, n/ep=14, n/st=100, player_1/loss=0.097, rew=-0.07]                          


Epoch #43: test_reward: 0.720000 ± 0.649307, best_reward: 0.920000 ± 0.391918 in #41


Epoch #44: 1001it [00:00, 1006.41it/s, env_step=44000, len=5, n/ep=10, n/st=100, player_1/loss=0.108, rew=0.80]                          


Epoch #44: test_reward: 0.480000 ± 0.830422, best_reward: 0.920000 ± 0.391918 in #41


Epoch #45: 1001it [00:01, 997.95it/s, env_step=45000, len=7, n/ep=18, n/st=100, player_1/loss=0.104, rew=0.28]                           


Epoch #45: test_reward: 0.490000 ± 0.830602, best_reward: 0.920000 ± 0.391918 in #41


Epoch #46: 1001it [00:00, 1044.55it/s, env_step=46000, len=7, n/ep=12, n/st=100, player_1/loss=0.110, rew=0.58]                          


Epoch #46: test_reward: 0.750000 ± 0.638357, best_reward: 0.920000 ± 0.391918 in #41


Epoch #47: 1001it [00:00, 1001.20it/s, env_step=47000, len=6, n/ep=18, n/st=100, player_1/loss=0.108, rew=0.83]                          


Epoch #47: test_reward: 0.440000 ± 0.863944, best_reward: 0.920000 ± 0.391918 in #41


Epoch #48: 1001it [00:00, 1031.12it/s, env_step=48000, len=5, n/ep=12, n/st=100, player_1/loss=0.100, rew=0.83]                          


Epoch #48: test_reward: 0.740000 ± 0.672607, best_reward: 0.920000 ± 0.391918 in #41


Epoch #49: 1001it [00:00, 1010.09it/s, env_step=49000, len=5, n/ep=22, n/st=100, player_1/loss=0.106, rew=0.68]                          


Epoch #49: test_reward: 0.630000 ± 0.743707, best_reward: 0.920000 ± 0.391918 in #41


Epoch #50: 1001it [00:00, 1018.18it/s, env_step=50000, len=6, n/ep=15, n/st=100, player_1/loss=0.107, rew=0.73]                          


Epoch #50: test_reward: 0.570000 ± 0.777882, best_reward: 0.920000 ± 0.391918 in #41


Epoch #51: 1001it [00:00, 1018.46it/s, env_step=51000, len=7, n/ep=18, n/st=100, player_1/loss=0.116, rew=0.28]                          


Epoch #51: test_reward: 0.550000 ± 0.753326, best_reward: 0.920000 ± 0.391918 in #41


Epoch #52: 1001it [00:00, 1038.51it/s, env_step=52000, len=7, n/ep=13, n/st=100, player_1/loss=0.134, rew=0.38]                          


Epoch #52: test_reward: 0.450000 ± 0.852936, best_reward: 0.920000 ± 0.391918 in #41


Epoch #53: 1001it [00:00, 1010.79it/s, env_step=53000, len=6, n/ep=15, n/st=100, player_1/loss=0.126, rew=0.53]                          


Epoch #53: test_reward: 0.740000 ± 0.626418, best_reward: 0.920000 ± 0.391918 in #41


Epoch #54: 1001it [00:00, 1021.50it/s, env_step=54000, len=7, n/ep=14, n/st=100, player_1/loss=0.121, rew=0.43]                          


Epoch #54: test_reward: 0.720000 ± 0.664530, best_reward: 0.920000 ± 0.391918 in #41


Epoch #55: 1001it [00:00, 1027.05it/s, env_step=55000, len=6, n/ep=11, n/st=100, player_1/loss=0.118, rew=0.27]                          


Epoch #55: test_reward: 0.760000 ± 0.618385, best_reward: 0.920000 ± 0.391918 in #41


Epoch #56: 1001it [00:01, 985.71it/s, env_step=56000, len=7, n/ep=11, n/st=100, player_1/loss=0.125, rew=0.45]                           


Epoch #56: test_reward: 0.500000 ± 0.806226, best_reward: 0.920000 ± 0.391918 in #41


Epoch #57: 1001it [00:00, 1014.00it/s, env_step=57000, len=6, n/ep=21, n/st=100, player_1/loss=0.127, rew=0.71]                          


Epoch #57: test_reward: 0.740000 ± 0.657571, best_reward: 0.920000 ± 0.391918 in #41


Epoch #58: 1001it [00:00, 1019.61it/s, env_step=58000, len=5, n/ep=24, n/st=100, player_1/loss=0.124, rew=0.67]                          


Epoch #58: test_reward: 0.820000 ± 0.536284, best_reward: 0.920000 ± 0.391918 in #41


Epoch #59: 1001it [00:00, 1004.87it/s, env_step=59000, len=6, n/ep=19, n/st=100, player_1/loss=0.148, rew=0.68]                          


Epoch #59: test_reward: 0.840000 ± 0.504381, best_reward: 0.920000 ± 0.391918 in #41


Epoch #60: 1001it [00:00, 1006.35it/s, env_step=60000, len=6, n/ep=12, n/st=100, player_1/loss=0.120, rew=0.75]                          


Epoch #60: test_reward: 0.710000 ± 0.682569, best_reward: 0.920000 ± 0.391918 in #41


Epoch #61: 1001it [00:00, 1007.95it/s, env_step=61000, len=5, n/ep=21, n/st=100, player_1/loss=0.119, rew=0.71]                          


Epoch #61: test_reward: 0.720000 ± 0.693974, best_reward: 0.920000 ± 0.391918 in #41


Epoch #62: 1001it [00:00, 1004.56it/s, env_step=62000, len=5, n/ep=21, n/st=100, player_1/loss=0.128, rew=0.90]                          


Epoch #62: test_reward: 0.720000 ± 0.649307, best_reward: 0.920000 ± 0.391918 in #41


Epoch #63: 1001it [00:00, 1010.66it/s, env_step=63000, len=6, n/ep=21, n/st=100, player_1/loss=0.127, rew=0.67]                          


Epoch #63: test_reward: 0.600000 ± 0.787401, best_reward: 0.920000 ± 0.391918 in #41


Epoch #64: 1001it [00:01, 997.64it/s, env_step=64000, len=6, n/ep=11, n/st=100, player_1/loss=0.130, rew=0.82]                           


Epoch #64: test_reward: 0.780000 ± 0.592959, best_reward: 0.920000 ± 0.391918 in #41


Epoch #65: 1001it [00:00, 1017.03it/s, env_step=65000, len=6, n/ep=11, n/st=100, player_1/loss=0.131, rew=0.45]                          


Epoch #65: test_reward: 0.720000 ± 0.679412, best_reward: 0.920000 ± 0.391918 in #41


Epoch #66: 1001it [00:00, 1014.68it/s, env_step=66000, len=7, n/ep=12, n/st=100, player_1/loss=0.129, rew=0.58]                          


Epoch #66: test_reward: 0.560000 ± 0.791454, best_reward: 0.920000 ± 0.391918 in #41


Epoch #67: 1001it [00:00, 1008.30it/s, env_step=67000, len=5, n/ep=11, n/st=100, player_1/loss=0.124, rew=1.00]                          


Epoch #67: test_reward: 0.730000 ± 0.676092, best_reward: 0.920000 ± 0.391918 in #41


Epoch #68: 1001it [00:00, 1012.61it/s, env_step=68000, len=6, n/ep=15, n/st=100, player_1/loss=0.123, rew=0.20]                          


Epoch #68: test_reward: 0.690000 ± 0.688404, best_reward: 0.920000 ± 0.391918 in #41


Epoch #69: 1001it [00:00, 1002.51it/s, env_step=69000, len=6, n/ep=16, n/st=100, player_1/loss=0.129, rew=0.75]                          


Epoch #69: test_reward: 0.770000 ± 0.614085, best_reward: 0.920000 ± 0.391918 in #41


Epoch #70: 1001it [00:00, 1016.47it/s, env_step=70000, len=6, n/ep=15, n/st=100, player_1/loss=0.133, rew=0.60]                          


Epoch #70: test_reward: 0.560000 ± 0.778717, best_reward: 0.920000 ± 0.391918 in #41


Epoch #71: 1001it [00:00, 1007.05it/s, env_step=71000, len=7, n/ep=10, n/st=100, player_1/loss=0.125, rew=0.30]                          


Epoch #71: test_reward: 0.740000 ± 0.657571, best_reward: 0.920000 ± 0.391918 in #41


Epoch #72: 1001it [00:01, 993.33it/s, env_step=72000, len=6, n/ep=16, n/st=100, player_1/loss=0.105, rew=0.50]                           


Epoch #72: test_reward: 0.810000 ± 0.560268, best_reward: 0.920000 ± 0.391918 in #41


Epoch #73: 1001it [00:00, 1001.13it/s, env_step=73000, len=6, n/ep=20, n/st=100, player_1/loss=0.115, rew=0.65]                          


Epoch #73: test_reward: 0.790000 ± 0.604897, best_reward: 0.920000 ± 0.391918 in #41


Epoch #74: 1001it [00:00, 1024.96it/s, env_step=74000, len=6, n/ep=12, n/st=100, player_1/loss=0.126, rew=0.75]                          


Epoch #74: test_reward: 0.520000 ± 0.830422, best_reward: 0.920000 ± 0.391918 in #41


Epoch #75: 1001it [00:00, 1022.18it/s, env_step=75000, len=6, n/ep=12, n/st=100, player_1/loss=0.117, rew=0.33]                          


Epoch #75: test_reward: 0.520000 ± 0.805978, best_reward: 0.920000 ± 0.391918 in #41


Epoch #76: 1001it [00:00, 1026.11it/s, env_step=76000, len=6, n/ep=18, n/st=100, player_1/loss=0.124, rew=0.94]                          


Epoch #76: test_reward: 0.590000 ± 0.736139, best_reward: 0.920000 ± 0.391918 in #41


Epoch #77: 1001it [00:00, 1024.14it/s, env_step=77000, len=6, n/ep=15, n/st=100, player_1/loss=0.124, rew=0.87]                          


Epoch #77: test_reward: 0.450000 ± 0.841130, best_reward: 0.920000 ± 0.391918 in #41


Epoch #78: 1001it [00:00, 1018.98it/s, env_step=78000, len=6, n/ep=13, n/st=100, player_1/loss=0.122, rew=0.85]                          


Epoch #78: test_reward: 0.550000 ± 0.766485, best_reward: 0.920000 ± 0.391918 in #41


Epoch #79: 1001it [00:00, 1024.01it/s, env_step=79000, len=5, n/ep=15, n/st=100, player_1/loss=0.111, rew=1.00]                          


Epoch #79: test_reward: 0.490000 ± 0.806164, best_reward: 0.920000 ± 0.391918 in #41


Epoch #80: 1001it [00:00, 1016.83it/s, env_step=80000, len=7, n/ep=14, n/st=100, player_1/loss=0.134, rew=0.64]                          


Epoch #80: test_reward: 0.750000 ± 0.653835, best_reward: 0.920000 ± 0.391918 in #41


Epoch #81: 1001it [00:00, 1016.87it/s, env_step=81000, len=6, n/ep=14, n/st=100, player_1/loss=0.126, rew=0.14]                          


Epoch #81: test_reward: 0.940000 ± 0.310483, best_reward: 0.940000 ± 0.310483 in #81


Epoch #82: 1001it [00:00, 1024.30it/s, env_step=82000, len=6, n/ep=16, n/st=100, player_1/loss=0.119, rew=0.12]                          


Epoch #82: test_reward: 0.830000 ± 0.548726, best_reward: 0.940000 ± 0.310483 in #81


Epoch #83: 1001it [00:00, 1001.92it/s, env_step=83000, len=6, n/ep=21, n/st=100, player_1/loss=0.133, rew=0.90]                          


Epoch #83: test_reward: 0.770000 ± 0.597578, best_reward: 0.940000 ± 0.310483 in #81


Epoch #84: 1001it [00:00, 1001.44it/s, env_step=84000, len=6, n/ep=17, n/st=100, player_1/loss=0.138, rew=0.47]                          


Epoch #84: test_reward: 0.510000 ± 0.830602, best_reward: 0.940000 ± 0.310483 in #81


Epoch #85: 1001it [00:00, 1013.86it/s, env_step=85000, len=7, n/ep=8, n/st=100, player_1/loss=0.132, rew=0.62]                          


Epoch #85: test_reward: 0.570000 ± 0.710704, best_reward: 0.940000 ± 0.310483 in #81


Epoch #86: 1001it [00:00, 1034.14it/s, env_step=86000, len=6, n/ep=13, n/st=100, player_1/loss=0.129, rew=0.85]                          


Epoch #86: test_reward: 0.510000 ± 0.754917, best_reward: 0.940000 ± 0.310483 in #81


Epoch #87: 1001it [00:00, 1012.18it/s, env_step=87000, len=7, n/ep=11, n/st=100, player_1/loss=0.135, rew=0.91]                          


Epoch #87: test_reward: 0.470000 ± 0.853874, best_reward: 0.940000 ± 0.310483 in #81


Epoch #88: 1001it [00:00, 1012.03it/s, env_step=88000, len=8, n/ep=6, n/st=100, player_1/loss=0.131, rew=0.50]                          


Epoch #88: test_reward: 0.730000 ± 0.614085, best_reward: 0.940000 ± 0.310483 in #81


Epoch #89: 1001it [00:00, 1019.82it/s, env_step=89000, len=7, n/ep=20, n/st=100, player_1/loss=0.143, rew=0.65]                          


Epoch #89: test_reward: 0.670000 ± 0.679043, best_reward: 0.940000 ± 0.310483 in #81


Epoch #90: 1001it [00:00, 1006.07it/s, env_step=90000, len=7, n/ep=16, n/st=100, player_1/loss=0.120, rew=0.44]                          


Epoch #90: test_reward: 0.770000 ± 0.614085, best_reward: 0.940000 ± 0.310483 in #81


Epoch #91: 1001it [00:00, 1005.66it/s, env_step=91000, len=6, n/ep=10, n/st=100, player_1/loss=0.123, rew=1.00]                          


Epoch #91: test_reward: 0.760000 ± 0.634350, best_reward: 0.940000 ± 0.310483 in #81


Epoch #92: 1001it [00:00, 1010.33it/s, env_step=92000, len=6, n/ep=13, n/st=100, player_1/loss=0.138, rew=0.54]                          


Epoch #92: test_reward: 0.780000 ± 0.558211, best_reward: 0.940000 ± 0.310483 in #81


Epoch #93: 1001it [00:01, 1000.62it/s, env_step=93000, len=6, n/ep=14, n/st=100, player_1/loss=0.134, rew=0.86]                          


Epoch #93: test_reward: 0.880000 ± 0.430813, best_reward: 0.940000 ± 0.310483 in #81


Epoch #94: 1001it [00:00, 1011.78it/s, env_step=94000, len=5, n/ep=22, n/st=100, player_1/loss=0.132, rew=0.77]                          


Epoch #94: test_reward: 0.800000 ± 0.583095, best_reward: 0.940000 ± 0.310483 in #81


Epoch #95: 1001it [00:00, 1004.60it/s, env_step=95000, len=6, n/ep=18, n/st=100, player_1/loss=0.134, rew=0.94]                          


Epoch #95: test_reward: 0.870000 ± 0.461628, best_reward: 0.940000 ± 0.310483 in #81


Epoch #96: 1001it [00:00, 1024.22it/s, env_step=96000, len=6, n/ep=13, n/st=100, player_1/loss=0.138, rew=0.62]                          


Epoch #96: test_reward: 0.770000 ± 0.614085, best_reward: 0.940000 ± 0.310483 in #81


Epoch #97: 1001it [00:00, 1013.98it/s, env_step=97000, len=7, n/ep=18, n/st=100, player_1/loss=0.120, rew=0.50]                          


Epoch #97: test_reward: 0.700000 ± 0.700000, best_reward: 0.940000 ± 0.310483 in #81


Epoch #98: 1001it [00:01, 985.95it/s, env_step=98000, len=6, n/ep=10, n/st=100, player_1/loss=0.117, rew=1.00]                           


Epoch #98: test_reward: 0.830000 ± 0.548726, best_reward: 0.940000 ± 0.310483 in #81


Epoch #99: 1001it [00:00, 1004.89it/s, env_step=99000, len=6, n/ep=17, n/st=100, player_1/loss=0.126, rew=0.53]                          


Epoch #99: test_reward: 0.840000 ± 0.542586, best_reward: 0.940000 ± 0.310483 in #81


Epoch #100: 1001it [00:00, 1009.97it/s, env_step=100000, len=6, n/ep=15, n/st=100, player_1/loss=0.127, rew=1.00]                          


Epoch #100: test_reward: 0.830000 ± 0.548726, best_reward: 0.940000 ± 0.310483 in #81


Epoch #1: 1001it [00:00, 1025.84it/s, env_step=1000, len=7, n/ep=13, n/st=100, player_2/loss=0.292, rew=-0.15]                          


Epoch #1: test_reward: -0.140000 ± 0.959375, best_reward: -0.140000 ± 0.872009 in #0


Epoch #2: 1001it [00:00, 1034.34it/s, env_step=2000, len=7, n/ep=10, n/st=100, player_2/loss=0.297, rew=-0.20]                          


Epoch #2: test_reward: -0.200000 ± 0.927362, best_reward: -0.140000 ± 0.872009 in #0


Epoch #3: 1001it [00:00, 1051.98it/s, env_step=3000, len=8, n/ep=10, n/st=100, player_2/loss=0.299, rew=-0.60]                          


Epoch #3: test_reward: -0.210000 ± 0.886510, best_reward: -0.140000 ± 0.872009 in #0


Epoch #4: 1001it [00:00, 1027.17it/s, env_step=4000, len=7, n/ep=11, n/st=100, player_2/loss=0.271, rew=-0.18]                          


Epoch #4: test_reward: -0.200000 ± 0.927362, best_reward: -0.140000 ± 0.872009 in #0


Epoch #5: 1001it [00:00, 1022.61it/s, env_step=5000, len=7, n/ep=23, n/st=100, player_2/loss=0.273, rew=-0.13]                          


Epoch #5: test_reward: 0.070000 ± 0.982395, best_reward: 0.070000 ± 0.982395 in #5


Epoch #6: 1001it [00:00, 1032.38it/s, env_step=6000, len=6, n/ep=20, n/st=100, player_2/loss=0.257, rew=-0.30]                          


Epoch #6: test_reward: 0.060000 ± 0.988130, best_reward: 0.070000 ± 0.982395 in #5


Epoch #7: 1001it [00:00, 1037.73it/s, env_step=7000, len=7, n/ep=9, n/st=100, player_2/loss=0.272, rew=-0.44]                          


Epoch #7: test_reward: 0.150000 ± 0.983616, best_reward: 0.150000 ± 0.983616 in #7


Epoch #8: 1001it [00:00, 1014.09it/s, env_step=8000, len=7, n/ep=17, n/st=100, player_2/loss=0.267, rew=0.06]                          


Epoch #8: test_reward: 0.320000 ± 0.893085, best_reward: 0.320000 ± 0.893085 in #8


Epoch #9: 1001it [00:00, 1034.40it/s, env_step=9000, len=6, n/ep=16, n/st=100, player_2/loss=0.262, rew=0.50]                          


Epoch #9: test_reward: 0.510000 ± 0.854342, best_reward: 0.510000 ± 0.854342 in #9


Epoch #10: 1001it [00:00, 1031.03it/s, env_step=10000, len=7, n/ep=16, n/st=100, player_2/loss=0.269, rew=0.81]                          


Epoch #10: test_reward: 0.340000 ± 0.907965, best_reward: 0.510000 ± 0.854342 in #9


Epoch #11: 1001it [00:00, 1019.13it/s, env_step=11000, len=7, n/ep=11, n/st=100, player_2/loss=0.272, rew=0.09]                          


Epoch #11: test_reward: 0.300000 ± 0.943398, best_reward: 0.510000 ± 0.854342 in #9


Epoch #12: 1001it [00:00, 1018.86it/s, env_step=12000, len=6, n/ep=20, n/st=100, player_2/loss=0.262, rew=0.45]                          


Epoch #12: test_reward: 0.470000 ± 0.853874, best_reward: 0.510000 ± 0.854342 in #9


Epoch #13: 1001it [00:00, 1004.71it/s, env_step=13000, len=6, n/ep=9, n/st=100, player_2/loss=0.258, rew=0.67]                          


Epoch #13: test_reward: 0.400000 ± 0.883176, best_reward: 0.510000 ± 0.854342 in #9


Epoch #14: 1001it [00:00, 1024.93it/s, env_step=14000, len=6, n/ep=16, n/st=100, player_2/loss=0.257, rew=0.50]                          


Epoch #14: test_reward: 0.340000 ± 0.907965, best_reward: 0.510000 ± 0.854342 in #9


Epoch #15: 1001it [00:00, 1025.45it/s, env_step=15000, len=6, n/ep=12, n/st=100, player_2/loss=0.274, rew=0.17]                          


Epoch #15: test_reward: 0.360000 ± 0.900222, best_reward: 0.510000 ± 0.854342 in #9


Epoch #16: 1001it [00:00, 1023.30it/s, env_step=16000, len=6, n/ep=12, n/st=100, player_2/loss=0.270, rew=0.42]                          


Epoch #16: test_reward: 0.360000 ± 0.900222, best_reward: 0.510000 ± 0.854342 in #9


Epoch #17: 1001it [00:00, 1022.91it/s, env_step=17000, len=6, n/ep=16, n/st=100, player_2/loss=0.260, rew=0.44]                          


Epoch #17: test_reward: 0.460000 ± 0.841665, best_reward: 0.510000 ± 0.854342 in #9


Epoch #18: 1001it [00:00, 1038.63it/s, env_step=18000, len=6, n/ep=15, n/st=100, player_2/loss=0.273, rew=0.07]                          


Epoch #18: test_reward: 0.500000 ± 0.842615, best_reward: 0.510000 ± 0.854342 in #9


Epoch #19: 1001it [00:00, 1013.01it/s, env_step=19000, len=6, n/ep=12, n/st=100, player_2/loss=0.277, rew=0.00]                          


Epoch #19: test_reward: 0.300000 ± 0.932738, best_reward: 0.510000 ± 0.854342 in #9


Epoch #20: 1001it [00:00, 1036.95it/s, env_step=20000, len=6, n/ep=19, n/st=100, player_2/loss=0.276, rew=0.95]                          


Epoch #20: test_reward: 0.450000 ± 0.864581, best_reward: 0.510000 ± 0.854342 in #9


Epoch #21: 1001it [00:00, 1019.38it/s, env_step=21000, len=7, n/ep=12, n/st=100, player_2/loss=0.252, rew=-0.33]                          


Epoch #21: test_reward: 0.350000 ± 0.931397, best_reward: 0.510000 ± 0.854342 in #9


Epoch #22: 1001it [00:00, 1009.00it/s, env_step=22000, len=6, n/ep=14, n/st=100, player_2/loss=0.249, rew=-0.29]                          


Epoch #22: test_reward: 0.450000 ± 0.887412, best_reward: 0.510000 ± 0.854342 in #9


Epoch #23: 1001it [00:01, 1000.14it/s, env_step=23000, len=6, n/ep=19, n/st=100, player_2/loss=0.256, rew=0.16]                          


Epoch #23: test_reward: 0.310000 ± 0.945463, best_reward: 0.510000 ± 0.854342 in #9


Epoch #24: 1001it [00:00, 1025.23it/s, env_step=24000, len=5, n/ep=12, n/st=100, player_2/loss=0.231, rew=0.17]                          


Epoch #24: test_reward: 0.510000 ± 0.854342, best_reward: 0.510000 ± 0.854342 in #9


Epoch #25: 1001it [00:00, 1022.39it/s, env_step=25000, len=6, n/ep=19, n/st=100, player_2/loss=0.233, rew=0.63]                          


Epoch #25: test_reward: 0.350000 ± 0.931397, best_reward: 0.510000 ± 0.854342 in #9


Epoch #26: 1001it [00:00, 1023.50it/s, env_step=26000, len=6, n/ep=21, n/st=100, player_2/loss=0.230, rew=0.29]                          


Epoch #26: test_reward: 0.340000 ± 0.940425, best_reward: 0.510000 ± 0.854342 in #9


Epoch #27: 1001it [00:00, 1025.50it/s, env_step=27000, len=6, n/ep=16, n/st=100, player_2/loss=0.234, rew=0.12]                          


Epoch #27: test_reward: 0.560000 ± 0.828493, best_reward: 0.560000 ± 0.828493 in #27


Epoch #28: 1001it [00:00, 1025.31it/s, env_step=28000, len=6, n/ep=14, n/st=100, player_2/loss=0.227, rew=0.29]                          


Epoch #28: test_reward: 0.410000 ± 0.895489, best_reward: 0.560000 ± 0.828493 in #27


Epoch #29: 1001it [00:00, 1009.13it/s, env_step=29000, len=6, n/ep=13, n/st=100, player_2/loss=0.222, rew=-0.23]                          


Epoch #29: test_reward: 0.410000 ± 0.906587, best_reward: 0.560000 ± 0.828493 in #27


Epoch #30: 1001it [00:00, 1010.13it/s, env_step=30000, len=7, n/ep=17, n/st=100, player_2/loss=0.224, rew=0.29]                          


Epoch #30: test_reward: 0.390000 ± 0.915369, best_reward: 0.560000 ± 0.828493 in #27


Epoch #31: 1001it [00:00, 1006.72it/s, env_step=31000, len=7, n/ep=9, n/st=100, player_2/loss=0.224, rew=-0.22]                          


Epoch #31: test_reward: 0.300000 ± 0.943398, best_reward: 0.560000 ± 0.828493 in #27


Epoch #32: 1001it [00:00, 1014.56it/s, env_step=32000, len=6, n/ep=16, n/st=100, player_2/loss=0.221, rew=0.00]                          


Epoch #32: test_reward: 0.320000 ± 0.936803, best_reward: 0.560000 ± 0.828493 in #27


Epoch #33: 1001it [00:00, 1024.36it/s, env_step=33000, len=6, n/ep=11, n/st=100, player_2/loss=0.238, rew=0.73]                          


Epoch #33: test_reward: 0.480000 ± 0.854166, best_reward: 0.560000 ± 0.828493 in #27


Epoch #34: 1001it [00:00, 1018.58it/s, env_step=34000, len=7, n/ep=19, n/st=100, player_2/loss=0.223, rew=0.21]                          


Epoch #34: test_reward: 0.360000 ± 0.932952, best_reward: 0.560000 ± 0.828493 in #27


Epoch #35: 1001it [00:00, 1023.64it/s, env_step=35000, len=6, n/ep=15, n/st=100, player_2/loss=0.223, rew=0.13]                          


Epoch #35: test_reward: 0.400000 ± 0.916515, best_reward: 0.560000 ± 0.828493 in #27


Epoch #36: 1001it [00:00, 1021.87it/s, env_step=36000, len=6, n/ep=20, n/st=100, player_2/loss=0.213, rew=0.40]                          


Epoch #36: test_reward: 0.370000 ± 0.912743, best_reward: 0.560000 ± 0.828493 in #27


Epoch #37: 1001it [00:00, 1016.50it/s, env_step=37000, len=6, n/ep=19, n/st=100, player_2/loss=0.227, rew=0.37]                          


Epoch #37: test_reward: 0.340000 ± 0.929731, best_reward: 0.560000 ± 0.828493 in #27


Epoch #38: 1001it [00:00, 1016.03it/s, env_step=38000, len=6, n/ep=17, n/st=100, player_2/loss=0.231, rew=0.41]                          


Epoch #38: test_reward: 0.280000 ± 0.928224, best_reward: 0.560000 ± 0.828493 in #27


Epoch #39: 1001it [00:01, 994.49it/s, env_step=39000, len=6, n/ep=16, n/st=100, player_2/loss=0.235, rew=0.38]                           


Epoch #39: test_reward: 0.310000 ± 0.945463, best_reward: 0.560000 ± 0.828493 in #27


Epoch #40: 1001it [00:00, 1009.02it/s, env_step=40000, len=6, n/ep=15, n/st=100, player_2/loss=0.230, rew=0.33]                          


Epoch #40: test_reward: 0.470000 ± 0.876983, best_reward: 0.560000 ± 0.828493 in #27


Epoch #41: 1001it [00:00, 1005.37it/s, env_step=41000, len=6, n/ep=21, n/st=100, player_2/loss=0.241, rew=0.19]                          


Epoch #41: test_reward: 0.380000 ± 0.924986, best_reward: 0.560000 ± 0.828493 in #27


Epoch #42: 1001it [00:00, 1024.10it/s, env_step=42000, len=6, n/ep=12, n/st=100, player_2/loss=0.239, rew=0.50]                          


Epoch #42: test_reward: 0.120000 ± 0.982649, best_reward: 0.560000 ± 0.828493 in #27


Epoch #43: 1001it [00:00, 1007.30it/s, env_step=43000, len=6, n/ep=11, n/st=100, player_2/loss=0.228, rew=0.64]                          


Epoch #43: test_reward: 0.260000 ± 0.965609, best_reward: 0.560000 ± 0.828493 in #27


Epoch #44: 1001it [00:00, 1017.31it/s, env_step=44000, len=6, n/ep=17, n/st=100, player_2/loss=0.235, rew=0.35]                          


Epoch #44: test_reward: 0.380000 ± 0.914112, best_reward: 0.560000 ± 0.828493 in #27


Epoch #45: 1001it [00:00, 1007.27it/s, env_step=45000, len=6, n/ep=17, n/st=100, player_2/loss=0.237, rew=0.53]                          


Epoch #45: test_reward: 0.380000 ± 0.924986, best_reward: 0.560000 ± 0.828493 in #27


Epoch #46: 1001it [00:00, 1003.55it/s, env_step=46000, len=6, n/ep=16, n/st=100, player_2/loss=0.261, rew=0.88]                          


Epoch #46: test_reward: 0.500000 ± 0.866025, best_reward: 0.560000 ± 0.828493 in #27


Epoch #47: 1001it [00:01, 994.13it/s, env_step=47000, len=6, n/ep=22, n/st=100, player_2/loss=0.246, rew=0.18]                           


Epoch #47: test_reward: 0.540000 ± 0.841665, best_reward: 0.560000 ± 0.828493 in #27


Epoch #48: 1001it [00:00, 1012.96it/s, env_step=48000, len=6, n/ep=14, n/st=100, player_2/loss=0.243, rew=0.29]                          


Epoch #48: test_reward: 0.430000 ± 0.886059, best_reward: 0.560000 ± 0.828493 in #27


Epoch #49: 1001it [00:00, 1028.22it/s, env_step=49000, len=7, n/ep=13, n/st=100, player_2/loss=0.246, rew=0.31]                          


Epoch #49: test_reward: 0.400000 ± 0.894427, best_reward: 0.560000 ± 0.828493 in #27


Epoch #50: 1001it [00:00, 1015.10it/s, env_step=50000, len=6, n/ep=21, n/st=100, player_2/loss=0.263, rew=0.29]                          


Epoch #50: test_reward: 0.600000 ± 0.800000, best_reward: 0.600000 ± 0.800000 in #50


Epoch #51: 1001it [00:00, 1009.62it/s, env_step=51000, len=6, n/ep=18, n/st=100, player_2/loss=0.239, rew=0.33]                          


Epoch #51: test_reward: 0.370000 ± 0.901721, best_reward: 0.600000 ± 0.800000 in #50


Epoch #52: 1001it [00:00, 1023.03it/s, env_step=52000, len=6, n/ep=8, n/st=100, player_2/loss=0.248, rew=0.75]                          


Epoch #52: test_reward: 0.300000 ± 0.953939, best_reward: 0.600000 ± 0.800000 in #50


Epoch #53: 1001it [00:00, 1016.09it/s, env_step=53000, len=6, n/ep=21, n/st=100, player_2/loss=0.244, rew=0.62]                          


Epoch #53: test_reward: 0.410000 ± 0.895489, best_reward: 0.600000 ± 0.800000 in #50


Epoch #54: 1001it [00:00, 1025.47it/s, env_step=54000, len=7, n/ep=14, n/st=100, player_2/loss=0.248, rew=0.07]                          


Epoch #54: test_reward: 0.200000 ± 0.938083, best_reward: 0.600000 ± 0.800000 in #50


Epoch #55: 1001it [00:00, 1003.74it/s, env_step=55000, len=7, n/ep=17, n/st=100, player_2/loss=0.261, rew=0.06]                          


Epoch #55: test_reward: -0.130000 ± 0.945040, best_reward: 0.600000 ± 0.800000 in #50


Epoch #56: 1001it [00:00, 1001.22it/s, env_step=56000, len=7, n/ep=15, n/st=100, player_2/loss=0.264, rew=-0.07]                          


Epoch #56: test_reward: 0.200000 ± 0.969536, best_reward: 0.600000 ± 0.800000 in #50


Epoch #57: 1001it [00:00, 1014.72it/s, env_step=57000, len=7, n/ep=15, n/st=100, player_2/loss=0.232, rew=0.33]                          


Epoch #57: test_reward: 0.070000 ± 0.992522, best_reward: 0.600000 ± 0.800000 in #50


Epoch #58: 1001it [00:00, 1022.93it/s, env_step=58000, len=6, n/ep=15, n/st=100, player_2/loss=0.253, rew=0.27]                          


Epoch #58: test_reward: 0.140000 ± 0.969742, best_reward: 0.600000 ± 0.800000 in #50


Epoch #59: 1001it [00:00, 1025.90it/s, env_step=59000, len=6, n/ep=16, n/st=100, player_2/loss=0.246, rew=0.50]                          


Epoch #59: test_reward: 0.370000 ± 0.923634, best_reward: 0.600000 ± 0.800000 in #50


Epoch #60: 1001it [00:00, 1014.57it/s, env_step=60000, len=6, n/ep=12, n/st=100, player_2/loss=0.282, rew=0.33]                          


Epoch #60: test_reward: 0.060000 ± 0.977957, best_reward: 0.600000 ± 0.800000 in #50


Epoch #61: 1001it [00:00, 1007.29it/s, env_step=61000, len=7, n/ep=15, n/st=100, player_2/loss=0.257, rew=0.00]                          


Epoch #61: test_reward: -0.050000 ± 0.983616, best_reward: 0.600000 ± 0.800000 in #50


Epoch #62: 1001it [00:00, 1031.24it/s, env_step=62000, len=7, n/ep=13, n/st=100, player_2/loss=0.261, rew=0.38]                          


Epoch #62: test_reward: -0.080000 ± 0.976524, best_reward: 0.600000 ± 0.800000 in #50


Epoch #63: 1001it [00:00, 1030.68it/s, env_step=63000, len=6, n/ep=12, n/st=100, player_2/loss=0.266, rew=-0.17]                          


Epoch #63: test_reward: -0.080000 ± 0.976524, best_reward: 0.600000 ± 0.800000 in #50


Epoch #64: 1001it [00:00, 1017.66it/s, env_step=64000, len=7, n/ep=10, n/st=100, player_2/loss=0.274, rew=-0.40]                          


Epoch #64: test_reward: -0.100000 ± 0.964365, best_reward: 0.600000 ± 0.800000 in #50


Epoch #65: 1001it [00:00, 1006.42it/s, env_step=65000, len=6, n/ep=17, n/st=100, player_2/loss=0.258, rew=-0.06]                          


Epoch #65: test_reward: 0.280000 ± 0.938936, best_reward: 0.600000 ± 0.800000 in #50


Epoch #66: 1001it [00:00, 1021.95it/s, env_step=66000, len=6, n/ep=20, n/st=100, player_2/loss=0.282, rew=0.15]                          


Epoch #66: test_reward: 0.110000 ± 0.968452, best_reward: 0.600000 ± 0.800000 in #50


Epoch #67: 1001it [00:00, 1013.27it/s, env_step=67000, len=7, n/ep=18, n/st=100, player_2/loss=0.271, rew=-0.11]                          


Epoch #67: test_reward: 0.140000 ± 0.959375, best_reward: 0.600000 ± 0.800000 in #50


Epoch #68: 1001it [00:00, 1018.60it/s, env_step=68000, len=7, n/ep=12, n/st=100, player_2/loss=0.277, rew=0.17]                          


Epoch #68: test_reward: 0.280000 ± 0.938936, best_reward: 0.600000 ± 0.800000 in #50


Epoch #69: 1001it [00:00, 1013.22it/s, env_step=69000, len=6, n/ep=16, n/st=100, player_2/loss=0.291, rew=0.12]                          


Epoch #69: test_reward: 0.070000 ± 0.982395, best_reward: 0.600000 ± 0.800000 in #50


Epoch #70: 1001it [00:00, 1013.12it/s, env_step=70000, len=6, n/ep=7, n/st=100, player_2/loss=0.284, rew=0.43]                          


Epoch #70: test_reward: 0.020000 ± 0.979592, best_reward: 0.600000 ± 0.800000 in #50


Epoch #71: 1001it [00:00, 1008.28it/s, env_step=71000, len=6, n/ep=18, n/st=100, player_2/loss=0.285, rew=0.33]                          


Epoch #71: test_reward: 0.380000 ± 0.891964, best_reward: 0.600000 ± 0.800000 in #50


Epoch #72: 1001it [00:00, 1002.77it/s, env_step=72000, len=6, n/ep=12, n/st=100, player_2/loss=0.299, rew=0.58]                          


Epoch #72: test_reward: 0.180000 ± 0.973447, best_reward: 0.600000 ± 0.800000 in #50


Epoch #73: 1001it [00:01, 992.27it/s, env_step=73000, len=6, n/ep=11, n/st=100, player_2/loss=0.265, rew=0.09]                           


Epoch #73: test_reward: 0.350000 ± 0.909670, best_reward: 0.600000 ± 0.800000 in #50


Epoch #74: 1001it [00:00, 1016.04it/s, env_step=74000, len=6, n/ep=15, n/st=100, player_2/loss=0.277, rew=0.00]                          


Epoch #74: test_reward: 0.310000 ± 0.924067, best_reward: 0.600000 ± 0.800000 in #50


Epoch #75: 1001it [00:00, 1012.70it/s, env_step=75000, len=6, n/ep=13, n/st=100, player_2/loss=0.283, rew=-0.08]                          


Epoch #75: test_reward: 0.360000 ± 0.911263, best_reward: 0.600000 ± 0.800000 in #50


Epoch #76: 1001it [00:00, 1015.04it/s, env_step=76000, len=6, n/ep=23, n/st=100, player_2/loss=0.289, rew=0.13]                          


Epoch #76: test_reward: 0.460000 ± 0.876584, best_reward: 0.600000 ± 0.800000 in #50


Epoch #77: 1001it [00:00, 1027.25it/s, env_step=77000, len=6, n/ep=14, n/st=100, player_2/loss=0.299, rew=0.71]                          


Epoch #77: test_reward: 0.250000 ± 0.952628, best_reward: 0.600000 ± 0.800000 in #50


Epoch #78: 1001it [00:00, 1013.94it/s, env_step=78000, len=6, n/ep=21, n/st=100, player_2/loss=0.287, rew=0.29]                          


Epoch #78: test_reward: 0.230000 ± 0.947154, best_reward: 0.600000 ± 0.800000 in #50


Epoch #79: 1001it [00:00, 1018.85it/s, env_step=79000, len=7, n/ep=12, n/st=100, player_2/loss=0.292, rew=0.50]                          


Epoch #79: test_reward: 0.430000 ± 0.886059, best_reward: 0.600000 ± 0.800000 in #50


Epoch #80: 1001it [00:00, 1014.27it/s, env_step=80000, len=7, n/ep=10, n/st=100, player_2/loss=0.278, rew=0.40]                          


Epoch #80: test_reward: 0.290000 ± 0.908790, best_reward: 0.600000 ± 0.800000 in #50


Epoch #81: 1001it [00:01, 996.81it/s, env_step=81000, len=6, n/ep=16, n/st=100, player_2/loss=0.285, rew=0.19]                           


Epoch #81: test_reward: 0.270000 ± 0.925797, best_reward: 0.600000 ± 0.800000 in #50


Epoch #82: 1001it [00:01, 989.99it/s, env_step=82000, len=6, n/ep=20, n/st=100, player_2/loss=0.295, rew=0.45]                           


Epoch #82: test_reward: 0.210000 ± 0.951788, best_reward: 0.600000 ± 0.800000 in #50


Epoch #83: 1001it [00:00, 1031.30it/s, env_step=83000, len=6, n/ep=17, n/st=100, player_2/loss=0.269, rew=0.12]                          


Epoch #83: test_reward: 0.460000 ± 0.887919, best_reward: 0.600000 ± 0.800000 in #50


Epoch #84: 1001it [00:00, 1017.45it/s, env_step=84000, len=6, n/ep=8, n/st=100, player_2/loss=0.274, rew=0.75]                          


Epoch #84: test_reward: 0.510000 ± 0.830602, best_reward: 0.600000 ± 0.800000 in #50


Epoch #85: 1001it [00:00, 1015.17it/s, env_step=85000, len=6, n/ep=20, n/st=100, player_2/loss=0.267, rew=0.00]                          


Epoch #85: test_reward: 0.400000 ± 0.894427, best_reward: 0.600000 ± 0.800000 in #50


Epoch #86: 1001it [00:00, 1030.08it/s, env_step=86000, len=6, n/ep=14, n/st=100, player_2/loss=0.271, rew=0.71]                          


Epoch #86: test_reward: 0.180000 ± 0.963120, best_reward: 0.600000 ± 0.800000 in #50


Epoch #87: 1001it [00:00, 1023.74it/s, env_step=87000, len=6, n/ep=13, n/st=100, player_2/loss=0.268, rew=0.23]                          


Epoch #87: test_reward: 0.070000 ± 0.982395, best_reward: 0.600000 ± 0.800000 in #50


Epoch #88: 1001it [00:00, 1029.27it/s, env_step=88000, len=6, n/ep=18, n/st=100, player_2/loss=0.277, rew=0.22]                          


Epoch #88: test_reward: 0.350000 ± 0.920598, best_reward: 0.600000 ± 0.800000 in #50


Epoch #89: 1001it [00:00, 1018.22it/s, env_step=89000, len=6, n/ep=14, n/st=100, player_2/loss=0.270, rew=0.43]                          


Epoch #89: test_reward: 0.480000 ± 0.865794, best_reward: 0.600000 ± 0.800000 in #50


Epoch #90: 1001it [00:01, 986.65it/s, env_step=90000, len=6, n/ep=14, n/st=100, player_2/loss=0.257, rew=0.50]                           


Epoch #90: test_reward: 0.390000 ± 0.904378, best_reward: 0.600000 ± 0.800000 in #50


Epoch #91: 1001it [00:00, 1024.36it/s, env_step=91000, len=6, n/ep=16, n/st=100, player_2/loss=0.244, rew=0.56]                          


Epoch #91: test_reward: 0.570000 ± 0.815537, best_reward: 0.600000 ± 0.800000 in #50


Epoch #92: 1001it [00:00, 1033.84it/s, env_step=92000, len=6, n/ep=14, n/st=100, player_2/loss=0.263, rew=-0.07]                          


Epoch #92: test_reward: 0.410000 ± 0.895489, best_reward: 0.600000 ± 0.800000 in #50


Epoch #93: 1001it [00:00, 1013.83it/s, env_step=93000, len=7, n/ep=16, n/st=100, player_2/loss=0.258, rew=0.19]                          


Epoch #93: test_reward: 0.430000 ± 0.897274, best_reward: 0.600000 ± 0.800000 in #50


Epoch #94: 1001it [00:00, 1015.41it/s, env_step=94000, len=6, n/ep=15, n/st=100, player_2/loss=0.264, rew=0.47]                          


Epoch #94: test_reward: 0.380000 ± 0.914112, best_reward: 0.600000 ± 0.800000 in #50


Epoch #95: 1001it [00:00, 1022.98it/s, env_step=95000, len=6, n/ep=8, n/st=100, player_2/loss=0.246, rew=0.25]                          


Epoch #95: test_reward: 0.250000 ± 0.942072, best_reward: 0.600000 ± 0.800000 in #50


Epoch #96: 1001it [00:00, 1008.05it/s, env_step=96000, len=6, n/ep=18, n/st=100, player_2/loss=0.246, rew=0.11]                          


Epoch #96: test_reward: 0.390000 ± 0.904378, best_reward: 0.600000 ± 0.800000 in #50


Epoch #97: 1001it [00:00, 1026.65it/s, env_step=97000, len=6, n/ep=18, n/st=100, player_2/loss=0.252, rew=0.44]                          


Epoch #97: test_reward: 0.310000 ± 0.924067, best_reward: 0.600000 ± 0.800000 in #50


Epoch #98: 1001it [00:00, 1024.15it/s, env_step=98000, len=6, n/ep=7, n/st=100, player_2/loss=0.265, rew=0.43]                          


Epoch #98: test_reward: 0.340000 ± 0.918912, best_reward: 0.600000 ± 0.800000 in #50


Epoch #99: 1001it [00:01, 999.56it/s, env_step=99000, len=6, n/ep=14, n/st=100, player_2/loss=0.232, rew=0.29]                           


Epoch #99: test_reward: 0.410000 ± 0.906587, best_reward: 0.600000 ± 0.800000 in #50


Epoch #100: 1001it [00:00, 1016.61it/s, env_step=100000, len=6, n/ep=12, n/st=100, player_2/loss=0.249, rew=0.50]                          


Epoch #100: test_reward: 0.320000 ± 0.947418, best_reward: 0.600000 ± 0.800000 in #50


In [None]:
# Open TensorBoard inline on the hard-coded log directory /content/tic_tac_toe/dqn
# (presumably where the training runs above wrote their logs — adjust if the log path differs).
%tensorboard --logdir /content/tic_tac_toe/dqn

Load best trained agents

In [None]:
# Restore the saved weights for each learned agent from its checkpoint under `path`.
# The last expression is kept as a bare call so the cell still shows the
# key-matching report of the final load.
player1_weights = torch.load(path + '/policy-player_1.pth')
agent_learn_player1.load_state_dict(player1_weights)
player2_weights = torch.load(path + '/policy-player_2.pth')
agent_learn_player2.load_state_dict(player2_weights)

<All keys matched successfully>

# Play agent against agent function


In [None]:
def play(agent1, agent2, n_episode=100):
  """Let two policies play tic-tac-toe against each other and tally the results.

  Args:
    agent1: policy controlling the first player.
    agent2: policy controlling the second player.
    n_episode: number of games to collect.

  Returns:
    A ``(won, lost, draw)`` tuple counted from the first player's point of
    view: a reward of 1 is a win, -1 is a loss, anything else is a draw.
  """
  env = get_env(render_mode=None)
  manager = MultiAgentPolicyManager([agent1, agent2], env)
  vector_env = DummyVectorEnv([lambda: env])
  collector = Collector(manager, vector_env, exploration_noise=True)

  # play number of episodes
  result = collector.collect(n_episode=n_episode, render=None)
  rews, lens = result["rews"], result["lens"]
  print(f"Final reward: {rews[:, 0].mean()}, length: {lens.mean()}")

  # Column 0 of every reward row belongs to the first player.
  first_player_rewards = [episode[0] for episode in result['rews']]
  won = sum(1 for reward in first_player_rewards if reward == 1)
  lost = sum(1 for reward in first_player_rewards if reward == -1)
  draw = len(first_player_rewards) - won - lost

  print("Win: " + str(won) + " lost: " + str(lost) + " draw: " + str(draw))

  return (won, lost, draw)

# Setup LLM Agent

In [None]:
class LLMAgent(BasePolicy):
    """Tic-tac-toe policy that asks a chat LLM which cell to play.

    The board is rendered as text, sent to a text-generation pipeline together
    with the list of legal cells, and the first legal number found in the
    reply is played. Answers are memoised per rendered board in ``self.cache``
    so a position is only sent to the model once.

    Only the first element of the incoming batch is read, so the agent is
    meant to be driven by a single environment at a time.
    """

    def __init__(self, *args, **kwargs):
        """Load the tokenizer and text-generation pipeline and create an empty move cache."""
        super().__init__(*args, **kwargs)
        models_dict = {"StableLM Zephyr 3B": "stabilityai/stablelm-zephyr-3b"}
        model_id = models_dict["StableLM Zephyr 3B"]
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.pipe = pipeline("text-generation", model=model_id, device_map="auto", tokenizer=self.tokenizer, torch_dtype=torch.bfloat16)
        # Rendered board string -> move that was chosen for that board.
        self.cache = {}

    def field_to_string(self, field, c):
        """Render one board cell.

        Args:
            field: indexable pair of flags for the cell; ``field[0] == 1`` is
                drawn as "X" and ``field[1] == 1`` as "O" (presumably the
                acting player's and the opponent's pieces — confirm against
                the environment's observation layout).
            c: the cell number, shown when the cell is empty.

        Returns:
            The cell text followed by a space, plus a "| " separator for every
            cell whose number is below 6 (i.e. not in the last column).
        """
        return_value = ""
        if field[0] == 1:
            return_value = "X "
        elif field[1] == 1:
            return_value = "O "
        else:
            return_value = str(c) + " "
        if c < 6:
            return_value += "| "
        return return_value

    def board_to_string(self, batch: Batch) -> str:
        """Render the first observation of ``batch`` as three text rows.

        The observation is indexed ``[x][y]`` and cell ``3 * x + y`` is printed
        in text row ``y``, so the first row shows cells 0, 3 and 6.
        """
        rows = batch.obs['obs']
        row1 = self.field_to_string(rows[0][0][0], 0) + self.field_to_string(rows[0][1][0], 3) + self.field_to_string(rows[0][2][0], 6)
        row2 = self.field_to_string(rows[0][0][1], 1) + self.field_to_string(rows[0][1][1], 4) + self.field_to_string(rows[0][2][1], 7)
        row3 = self.field_to_string(rows[0][0][2], 2) + self.field_to_string(rows[0][1][2], 5) + self.field_to_string(rows[0][2][2], 8)
        return row1 + '\n' + row2 + '\n' + row3

    def ask_llm_for_choice(self, board: str, possible_choices) -> int:
        """Ask the model for a move and parse the first legal number from its reply.

        Args:
            board: text rendering of the current board.
            possible_choices: iterable of the legal cell numbers (strings or
                anything ``str()`` turns into the number).

        Returns:
            The first legal cell number mentioned in the reply, or -1 when the
            reply contains none.
        """
        job_description = "You will be provided with a tic tac toe board. There are two players, X and O. An empty board looks likes this:\n0 | 3 | 6\n1 | 4 | 7\n2 | 5 | 8\nWhen a player made a move a X or O is placed on the board.\nYou are player X and should choose the best possible option."
        valid_choices = [str(c) for c in possible_choices]
        possible_choices_text = ", ".join(valid_choices)
        question = "The current board is: \n" + board + "\nThe possible numbers are " + possible_choices_text + ". Only answer best number to choose, no comments or explanation, just a number."
        output = self.generate(job_description, question, 0.1, 40)

        # Take every run of digits separately: stripping punctuation first would
        # glue "3/4" or "0|3" into the invalid tokens "34" / "03" and lose a
        # perfectly usable answer.
        extracted_choice = [int(token) for token in re.findall(r'\d+', output) if token in valid_choices]
        if extracted_choice:
            print("Choice: " + str(extracted_choice[0]))
            return extracted_choice[0]
        else:
            print("No choice, return -1")
            return -1

    def generate(self, job_description, question, temperature=0.7, max_new_tokens=512):
        """Run one system + user exchange through the pipeline and return the reply text.

        Args:
            job_description: content of the system message.
            question: content of the user message.
            temperature: sampling temperature passed to the pipeline.
            max_new_tokens: upper bound on generated tokens.

        Returns:
            The text following the last ``<|assistant|>`` marker in the
            generated text. If the marker is missing, the prompt prefix is
            stripped when present, otherwise the generated text is returned
            unchanged.
        """
        messages = [
            {
                "role": "system",
                "content": job_description,
            },
            {"role": "user", "content": question},
        ]
        prompt = self.pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = self.pipe(prompt, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_k=50, top_p=0.95)
        output = outputs[0]["generated_text"]

        assistant_marker = '<|assistant|>'
        response_start = output.rfind(assistant_marker)
        if response_start == -1:
            # Without this guard the slice below would start at
            # -1 + len(marker) and silently drop the first characters.
            if output.startswith(prompt):
                return output[len(prompt):]
            return output
        return output[response_start + len(assistant_marker):]

    def forward(
        self, batch: Batch, state: dict | Batch | np.ndarray | None = None
    ) -> Batch:
        """Choose a move for the board in ``batch``.

        Order of decisions: a cached answer for this exact board; the only
        legal move when just one is left; otherwise up to three attempts to get
        a legal move from the model, falling back to the first legal move.
        Only moves that came from the model are stored in the cache.

        Args:
            batch: batch whose ``obs`` carries the board (``obs['obs']``) and
                the legal-move mask (``obs.mask``).
            state: unused.

        Returns:
            A ``Batch`` with a single-element ``act`` list.
        """
        board = self.board_to_string(batch)

        if board in self.cache:
            print("cache hit:\n" + board + "\nChoice: " + str(self.cache[board]))
            return Batch(act=[self.cache[board]])

        all_choices = np.array(list(range(0,9)))
        mask = batch.obs.mask.flatten()
        # Legal cell numbers as strings, the form used in the prompt and in parsing.
        masked_choices = all_choices[mask.astype(bool)].astype(str)

        if len(masked_choices) == 1:
            # Nothing to decide, skip the model call.
            return Batch(act=[int(masked_choices[0])])

        choice = -1
        tries = 3
        while str(choice) not in masked_choices and tries > 0:
            choice = self.ask_llm_for_choice(board, masked_choices)
            tries -= 1

        if str(choice) not in masked_choices:
            print("Invalid choice, pick first: " + masked_choices[0])
            return Batch(act=[int(masked_choices[0])])

        print("Add to cache:\n" + board + "\nChoice: " + str(choice))
        self.cache[board] = choice
        return Batch(act=[choice])

    def learn(self, batch: Batch) -> Dict[str, Any]:
        """No-op: the agent is not trained, so there are no statistics to report."""
        return {}

# Play with agents in all possible combinations

1. Random - Random
1. Random - DQN
1. Random - LLM
1. DQN - Random
1. DQN - DQN
1. DQN - LLM
1. LLM - Random
1. LLM - DQN
1. LLM - LLM

So every agent plays as player_1 against every agent type, including its own.

In [None]:
# One independent LLM agent per seat; each keeps its own move cache.
llm_agent_1, llm_agent_2 = LLMAgent(), LLMAgent()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Each result is the (won, lost, draw) tuple from player_1's point of view.

# Random policy as player_1
random_random = play(agent1=RandomPolicy(), agent2=RandomPolicy())
random_dqn = play(agent1=RandomPolicy(), agent2=agent_learn_player2)
random_llm = play(agent1=RandomPolicy(), agent2=llm_agent_2)

# Trained DQN as player_1
dqn_random = play(agent1=agent_learn_player1, agent2=RandomPolicy())
dqn_dqn = play(agent1=agent_learn_player1, agent2=agent_learn_player2)
dqn_llm = play(agent1=agent_learn_player1, agent2=llm_agent_2)

# LLM as player_1
llm_random = play(agent1=llm_agent_1, agent2=RandomPolicy())
llm_dqn = play(agent1=llm_agent_1, agent2=agent_learn_player2)
llm_llm = play(agent1=llm_agent_1, agent2=llm_agent_2)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
cache hit:
X | 3 | 6 
1 | 4 | O 
2 | 5 | 8 
Choice: 4
cache hit:
X | 3 | 6 
1 | X | O 
2 | O | 8 
Choice: 3
cache hit:
X | X | 6 
1 | X | O 
2 | O | O 
Choice: 1
cache hit:
0 | 3 | 6 
1 | 4 | 7 
2 | 5 | 8 
Choice: 0
cache hit:
X | 3 | 6 
1 | 4 | O 
2 | 5 | 8 
Choice: 4
cache hit:
X | 3 | 6 
1 | X | O 
2 | O | 8 
Choice: 3
cache hit:
X | X | 6 
1 | X | O 
O | O | 8 
Choice: 1
cache hit:
0 | 3 | 6 
1 | 4 | 7 
2 | 5 | 8 
Choice: 0
cache hit:
X | 3 | 6 
1 | 4 | O 
2 | 5 | 8 
Choice: 4
cache hit:
X | 3 | 6 
1 | X | O 
2 | O | 8 
Choice: 3
cache hit:
X | X | 6 
1 | X | O 
2 | O | O 
Choice: 1
cache hit:
0 | 3 | 6 
1 | 4 | 7 
2 | 5 | 8 
Choice: 0
cache hit:
X | 3 | 6 
1 | 4 | O 
2 | 5 | 8 
Choice: 4
cache hit:
X | 3 | 6 
1 | X | O 
2 | O | 8 
Choice: 3
cache hit:
X | X | 6 
1 | X | O 
2 | O | O 
Choice: 1
cache hit:
0 | 3 | 6 
1 | 4 | 7 
2 | 5 | 8 
Choice: 0
cache hit:
X | 3 | 6 
1 | 4 | O 
2 | 5 | 8 
Choice: 4
cache hit:
X | 3 

In [None]:
# Report player_1's win rate for every pairing.
# The rate is computed from the games actually counted in each (won, lost, draw)
# tuple instead of a hard-coded 100, so it stays correct if the number of
# episodes passed to play() ever changes.
matchups = [
    ("Random - random", random_random),
    ("Random - DQN", random_dqn),
    ("Random - LLM", random_llm),
    ("DQN - random", dqn_random),
    ("DQN - DQN", dqn_dqn),
    ("DQN - LLM", dqn_llm),
    ("LLM - random", llm_random),
    ("LLM - DQN", llm_dqn),
    ("LLM - LLM", llm_llm),
]
for label, (won, lost, draw) in matchups:
    games = won + lost + draw
    # Guard against an empty result so the cell never divides by zero.
    print(label + ": " + str(won / games if games else 0.0))

Random - random: 0.62
Random - DQN: 0.32
Random - LLM: 0.68
DQN - random: 0.86
DQN - DQN: 0.94
DQN - LLM: 1.0
LLM - random: 0.57
LLM - DQN: 0.03
LLM - LLM: 0.0
