# Training DQN for Tic Tac Toe

Based on [RL against random policy opponent with PettingZoo](https://tianshou.org/en/stable/01_tutorials/04_tictactoe.html).

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to imported .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Install dependencies

In [None]:
!pip install gymnasium==0.29.1 pygame==2.3.0 pettingzoo==1.24.3 tianshou==0.5.1 transformers==4.39.1 accelerate==0.28.0 openai

Collecting gymnasium==0.29.1
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pygame==2.3.0
  Downloading pygame-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pettingzoo==1.24.3
  Downloading pettingzoo-1.24.3-py3-none-any.whl (847 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m847.8/847.8 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tianshou==0.5.1
  Downloading tianshou-0.5.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.1/163.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.39.1
  Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)
[2K     [90m━━━

# Import needed dependencies

In [None]:
import gymnasium as gym
import torch
from pettingzoo.classic import tictactoe_v3
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import (
    BasePolicy,
    DQNPolicy,
    MultiAgentPolicyManager,
    RandomPolicy,
)
from tianshou.trainer import OffpolicyTrainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net

# Setup environment

In [None]:
def get_env(render_mode=None):
    """Create a fresh Tic-Tac-Toe environment wrapped for Tianshou.

    Args:
        render_mode: passed through unchanged to ``tictactoe_v3.env``
            (defaults to ``None``).

    Returns:
        A new ``PettingZooEnv`` wrapping the PettingZoo tic-tac-toe environment.
    """
    raw_env = tictactoe_v3.env(render_mode=render_mode)
    return PettingZooEnv(raw_env)

# Build one environment up front, only to read the observation and action dimensions.
env = get_env()

# The observation space may be a Dict; in that case only its 'observation' entry is used.
if isinstance(env.observation_space, gym.spaces.Dict):
    observation_space = env.observation_space['observation']
else:
    observation_space = env.observation_space

# Prefer `.shape` when it is non-empty, otherwise fall back to `.n`.
state_shape = observation_space.shape or observation_space.n
action_shape = env.action_space.shape or env.action_space.n

# Setup policies

One learning policy (`DQNPolicy`) and its opponent (`RandomPolicy`).

In [None]:
# ---- Hyperparameters of the learning agent ----
# widths of the hidden layers of the MLP
hidden_sizes = [128, 128, 128, 128]
# run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# how many steps to look ahead
estimation_step = 3
# how often the target network is updated (0 means no target network is used)
target_update_freq = 320
# learning rate handed to the Adam optimizer
lr = 1e-4

# ---- Model and optimizer ----
# The MLP that drives the agent's behaviour (the network, not the agent itself).
net = Net(
    state_shape,
    action_shape,
    hidden_sizes=hidden_sizes,
    device=device,
)
net = net.to(device)

optim = torch.optim.Adam(net.parameters(), lr=lr)

# ---- Agents ----
# The agent that learns.
agent_learn = DQNPolicy(
    model=net,
    optim=optim,
    estimation_step=estimation_step,
    target_update_freq=target_update_freq,
    action_space=env.action_space,
)

# The opponent the learning agent plays against.
agent_opponent = RandomPolicy(action_space=env.action_space)

# Order matters: the learner is listed first, the opponent second.
agents = [agent_learn, agent_opponent]

# Multi-agent policy manager for Multi-Agent Reinforcement Learning
# (https://tianshou.org/en/stable/01_tutorials/07_cheatsheet.html#marl-example)
policy = MultiAgentPolicyManager(agents, env)

# Train the agent
Using the `OffpolicyTrainer`.

In [None]:
# ---- Environments ----
# how many environments are used for training
training_num = 10
# how many environments are used for testing
test_num = 10

# ---- Replay buffer / batching ----
# total size of the VectorReplayBuffer
buffer_size = 20_000
# number of sampled transitions fed to the policy network per update
batch_size = 64

# ---- Training schedule ----
# upper bound on the number of epochs; training may end earlier if stop_fn is set
epoch = 50
# transitions collected per epoch
step_per_epoch = 1_000
# transitions gathered by the collector before each round of network updates;
# within an epoch the trainer repeats "collect step_per_collect transitions, then update"
step_per_collect = 10
# policy-network updates per collected transition: with update_per_step = 0.3 and
# step_per_collect = 256 the policy is updated round(256 * 0.3 = 76.8) = 77 times
# after every 256 collected transitions. Defaults to 1.
update_per_step = 0.1

# ---- Stopping / exploration ----
# mean-reward threshold checked by the stop function
win_rate = 0.6
# epsilon of the epsilon-greedy exploration during testing and during training
eps_test = 0.05
eps_train = 0.1

# ---- Output ----
# directory for the saved results and the logs
path = '/content/tic_tac_toe/dnq'

# DummyVectorEnv exposes the vectorized-environment interface but steps its
# sub-environments in a plain for-loop, so rollouts do not run in parallel and
# throughput matches a single environment. Good enough for testing and demos.
train_envs = DummyVectorEnv([get_env] * training_num)
test_envs = DummyVectorEnv([get_env] * test_num)

# A VectorReplayBuffer holds n equally sized ReplayBuffers; it stores transitions
# coming from different environments while preserving their time order.
vectorReplayBuffer = VectorReplayBuffer(buffer_size, len(train_envs))

# Whether the collector should apply the policy's exploration noise to actions.
# When True, "policy.exploration_noise(act, batch)" is invoked automatically.
exploration_noise = True

# Collectors let the policy interact with the environments for an exact number
# of steps or episodes. Only the training collector writes to the replay buffer.
train_collector = Collector(
    policy,
    train_envs,
    buffer=vectorReplayBuffer,
    exploration_noise=exploration_noise,
)
test_collector = Collector(
    policy,
    test_envs,
    exploration_noise=exploration_noise,
)

# Collect an initial batch of transitions before training starts.
train_collector.collect(n_step=batch_size * training_num)

# Logger that records statistics during training/testing/updating.
writer = SummaryWriter(path)
logger = TensorboardLogger(writer)

# Key of the agent being trained inside `policy.policies`.
agent_id = 'player_1'

# Functions for the OffpolicyTrainer
def save_best_fn(policy):
    """Save the best model: write the trained agent's state dict to ``<path>/policy.pth``.

    Args:
        policy: object exposing a ``policies`` mapping that contains ``agent_id``.
    """
    learner_state = policy.policies[agent_id].state_dict()
    target_file = path + '/policy.pth'
    torch.save(learner_state, target_file)

def stop_fn(mean_rewards):
    """Decide when to stop training.

    Args:
        mean_rewards: the mean reward to compare against ``win_rate``.

    Returns:
        The result of ``mean_rewards >= win_rate``.
    """
    threshold_reached = mean_rewards >= win_rate
    return threshold_reached

def train_fn(epoch, env_step):
    """Hook called at the beginning of training in each epoch.

    Sets the epsilon of the trained agent's epsilon-greedy exploration to
    ``eps_train``. Both arguments are accepted but not used.
    """
    learner = policy.policies[agent_id]
    learner.set_eps(eps_train)

def test_fn(epoch, env_step):
    """Testing counterpart of ``train_fn``.

    Sets the epsilon of the trained agent's epsilon-greedy exploration to
    ``eps_test``. Both arguments are accepted but not used.
    """
    learner = policy.policies[agent_id]
    learner.set_eps(eps_test)

def reward_metric(rews):
    """Reduce per-agent rewards to the single value per episode that is monitored.

    In the multi-agent setting one scalar per episode is needed to track
    training; this picks the desired metric, here the first agent's reward.

    Args:
        rews: 2-D array of rewards indexed as ``[episode, agent]``.

    Returns:
        Column 0 of ``rews``.
    """
    first_agent = 0
    return rews[:, first_agent]

# Off-policy trainer: samples mini-batches from the replay buffer and passes them
# to the policy update. Every argument is named so the mapping is explicit.
trainer = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=epoch,
    step_per_epoch=step_per_epoch,
    step_per_collect=step_per_collect,
    episode_per_test=test_num,
    batch_size=batch_size,
    update_per_step=update_per_step,
    train_fn=train_fn,
    test_fn=test_fn,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn,
    reward_metric=reward_metric,
    logger=logger,
    test_in_train=False,  # whether to test in the training phase.
)
result = trainer.run()

Epoch #1: 1001it [00:04, 246.53it/s, env_step=1000, len=5, n/ep=1, n/st=10, player_1/loss=0.303, rew=1.00]                          


Epoch #1: test_reward: 0.300000 ± 0.781025, best_reward: 0.300000 ± 0.781025 in #1


Epoch #2: 1001it [00:01, 549.43it/s, env_step=2000, len=5, n/ep=1, n/st=10, player_1/loss=0.290, rew=1.00]                          


Epoch #2: test_reward: 1.000000 ± 0.000000, best_reward: 1.000000 ± 0.000000 in #2


# Play

Play the trained agent against the opponent for a number of episodes and print the results.

In [13]:
# Evaluate the trained agent against the opponent over a fixed number of episodes.

# Switch to evaluation mode and pin epsilon explicitly instead of relying on
# whatever state the trainer hooks (train_fn / test_fn) happened to leave behind.
policy.eval()
policy.policies[agent_id].set_eps(eps_test)

# Use a dedicated name for the vectorized env: rebinding `env` here would clobber
# the single environment that the policy-setup cell reads its action space from.
# `get_env` defaults to render_mode=None, so it can be passed as the factory directly.
play_envs = DummyVectorEnv([get_env])
collector = Collector(policy, play_envs, exploration_noise=True)

# play number of episodes
result = collector.collect(n_episode=100, render=None)
rews, lens = result["rews"], result["lens"]
print(f"Final reward: {rews[:, 0].mean()}, length: {lens.mean()}")

# Column 0 is the reward of the first entry in `agents` (the learning agent):
# 1 counts as a win, -1 as a loss, anything else as a draw.
learner_rews = rews[:, 0]
won = int((learner_rews == 1).sum())
lost = int((learner_rews == -1).sum())
draw = len(learner_rews) - won - lost

print(f"Win: {won} lost: {lost} draw: {draw}")

Final reward: 0.88, length: 5.54
Win: 94 lost: 6 draw: 0
