# Training a DQN for Blackjack

Based on [Deep Q-Network from Tianshou](https://tianshou.org/en/stable/01_tutorials/00_dqn.html).

In [None]:
%load_ext autoreload
%autoreload 2

# Install dependencies

In [None]:
#!pip install gymnasium==0.29.1 pygame==2.3.0 pettingzoo==1.24.3 tianshou==0.5.1 transformers==4.39.1 accelerate==0.28.0 openai



# Setup environment

In [None]:
import gymnasium as gym
import tianshou as ts
from gymnasium.spaces import Dict
from gymnasium.wrappers import FlattenObservation

def get_env(render_mode = None, seed = 42):
  """
  Build a Blackjack-v1 environment whose observations are flat vectors.

  BlackJack has an observation space which is a tuple, consisting of
  the player's sum, the dealer's showing card and whether or not
  the player has a usable ace. This tuple is flattened for Tianshou
  to be able to put it through the deep network layers.

  Args:
    render_mode: Forwarded unchanged to `gym.make`.
    seed: Seed passed to the initial `env.reset`. Defaults to 42, the
      value that used to be hard-coded here. Environments built with the
      same seed start from the same RNG state, so pass distinct seeds
      (or None) when several independent copies are needed.

  Returns:
    The flattened environment, already reset once with `seed`.
  """
  env = gym.make("Blackjack-v1", render_mode=render_mode)
  env = FlattenObservation(env)
  env.reset(seed=seed)

  return env

# Single reference environment; later cells read its observation and action
# spaces to size the network and configure the policy.
env = get_env()

In [None]:
# `get_env` resets every environment with the same seed, so without further
# seeding all copies in a vector env would deal identical hands. Re-seed each
# vector env so its members get distinct seeds (Tianshou derives one seed per
# member from the base value), and keep the train and test ranges disjoint.
train_envs = ts.env.DummyVectorEnv([get_env for _ in range(10)])
train_envs.seed(0)

test_envs = ts.env.DummyVectorEnv([get_env for _ in range(100)])
# Trailing semicolon keeps the returned seed list out of the cell output.
test_envs.seed(1000);

  and should_run_async(code)


# Setup PyTorch Network

In [None]:
import torch
import numpy as np
from torch import nn


class Net(nn.Module):
    """Fully connected Q-network with three hidden layers of 128 ReLU units.

    Args:
        state_shape: Shape of a single observation; the product of its
            entries is the input width.
        action_shape: Number of actions, or a shape whose product is that
            number; this is the output width.
    """

    def __init__(self, state_shape, action_shape):
        super().__init__()
        # np.prod returns a NumPy scalar; hand nn.Linear plain Python ints.
        in_features = int(np.prod(state_shape))
        out_features = int(np.prod(action_shape))
        self.model = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, out_features),
        )

    def forward(self, obs, state=None, info=None):
        """Compute one output value per action for a batch of observations.

        Args:
            obs: Batch of observations (array-like or tensor) whose first
                dimension is the batch size. Any dtype is accepted; it is
                converted to float before entering the network.
            state: Passed through unchanged.
            info: Unused; accepted for call compatibility.

        Returns:
            A tuple `(logits, state)` where `logits` has shape
            `(batch, num_actions)`.
        """
        # as_tensor also casts tensors that arrive with a non-float dtype
        # (e.g. integer observations), which the linear layers cannot consume.
        obs = torch.as_tensor(obs, dtype=torch.float)
        batch = obs.shape[0]
        # reshape (unlike view) also works for non-contiguous input.
        logits = self.model(obs.reshape(batch, -1))
        return logits, state

# Seed PyTorch (weight initialisation) and NumPy's global RNG (presumably used
# by Tianshou for exploration and buffer sampling) so that a fresh
# Restart & Run All is repeatable; the environment is already seeded with 42.
np.random.seed(42)
torch.manual_seed(42)

# Input width comes from the flattened observation space; output width is the
# number of discrete actions (falls back to `.n` when the space has no shape).
state_shape = env.observation_space.shape
action_shape = env.action_space.shape or env.action_space.n

net = Net(state_shape, action_shape)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)

# Create Policy

Create a DQN policy from the network and optimizer.

In [None]:
# Wrap the Q-network and its optimizer in Tianshou's DQN policy.
policy = ts.policy.DQNPolicy(
    model=net,
    optim=optim,
    action_space=env.action_space,
    discount_factor=0.9,  # reward discount (gamma)
    estimation_step=3,  # presumably the n of n-step returns — confirm in Tianshou docs
    target_update_freq=320,  # presumably the target-network sync interval — confirm units in Tianshou docs
)

  and should_run_async(code)


In [None]:
# Training collector: runs `policy` in `train_envs` and stores the transitions
# in a vectorised replay buffer created with arguments (20000, 10).
# NOTE(review): the second buffer argument (10) equals the number of training
# environments; presumably the two must be kept in sync — confirm.
train_collector = ts.data.Collector(
    policy, train_envs, ts.data.VectorReplayBuffer(20000, 10), exploration_noise=True
)
# Test collector: no buffer is passed; exploration noise stays enabled here too.
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

  and should_run_async(code)


# Train the agent
Using Tianshou's `OffpolicyTrainer`.

In [None]:
# Run off-policy training for up to 10 epochs and keep the summary returned by
# `.run()`; its "duration" entry is printed below.
result = ts.trainer.OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,
    step_per_epoch=10000,
    step_per_collect=10,
    update_per_step=0.1,
    episode_per_test=100,
    batch_size=64,
    # Callbacks that set the policy's epsilon: 0.1 via train_fn, 0.05 via test_fn.
    # NOTE(review): presumably invoked before each training / test phase — confirm.
    train_fn=lambda epoch, env_step: policy.set_eps(0.1),
    test_fn=lambda epoch, env_step: policy.set_eps(0.05),
).run()
print(f'Finished training! Use {result["duration"]}')

# Show the full summary as the cell's rich output.
result

  and should_run_async(code)
Epoch #1: 10001it [00:16, 623.83it/s, env_step=10000, len=1, loss=0.511, n/ep=7, n/st=10, rew=0.14]                           


Epoch #1: test_reward: 0.900000 ± 0.435890, best_reward: 0.900000 ± 0.435890 in #1


Epoch #2: 10001it [00:16, 597.43it/s, env_step=20000, len=1, loss=0.592, n/ep=8, n/st=10, rew=-0.12]                           


Epoch #2: test_reward: 0.000000 ± 0.244949, best_reward: 0.900000 ± 0.435890 in #1


Epoch #3: 10001it [00:16, 623.83it/s, env_step=30000, len=1, loss=0.616, n/ep=6, n/st=10, rew=-0.67]                           


Epoch #3: test_reward: 0.960000 ± 0.280000, best_reward: 0.960000 ± 0.280000 in #3


Epoch #4: 10001it [00:16, 605.30it/s, env_step=40000, len=1, loss=0.604, n/ep=7, n/st=10, rew=-0.29]                           


Epoch #4: test_reward: 0.960000 ± 0.280000, best_reward: 0.960000 ± 0.280000 in #3


Epoch #5: 10001it [00:16, 622.11it/s, env_step=50000, len=2, loss=0.615, n/ep=5, n/st=10, rew=0.40]                           


Epoch #5: test_reward: 0.980000 ± 0.140000, best_reward: 0.980000 ± 0.140000 in #5


Epoch #6: 10001it [00:16, 614.21it/s, env_step=60000, len=1, loss=0.611, n/ep=8, n/st=10, rew=0.00]                           


Epoch #6: test_reward: 0.920000 ± 0.391918, best_reward: 0.980000 ± 0.140000 in #5


Epoch #7: 10001it [00:16, 596.03it/s, env_step=70000, len=1, loss=0.615, n/ep=6, n/st=10, rew=0.00]                           


Epoch #7: test_reward: -0.880000 ± 0.474974, best_reward: 0.980000 ± 0.140000 in #5


Epoch #8: 10001it [00:16, 610.20it/s, env_step=80000, len=1, loss=0.605, n/ep=6, n/st=10, rew=-0.17]                           


Epoch #8: test_reward: 0.920000 ± 0.391918, best_reward: 0.980000 ± 0.140000 in #5


Epoch #9: 10001it [00:16, 620.30it/s, env_step=90000, len=1, loss=0.602, n/ep=7, n/st=10, rew=0.14]                           


Epoch #9: test_reward: 0.840000 ± 0.542586, best_reward: 0.980000 ± 0.140000 in #5


Epoch #10: 10001it [00:16, 610.78it/s, env_step=100000, len=1, loss=0.605, n/ep=7, n/st=10, rew=0.43]                           


Epoch #10: test_reward: -0.940000 ± 0.341174, best_reward: 0.980000 ± 0.140000 in #5
Finished training! Use 164.42s


{'duration': '164.42s',
 'train_time/model': '110.39s',
 'test_step': 1593,
 'test_episode': 1100,
 'test_time': '0.48s',
 'test_speed': '3314.25 step/s',
 'best_reward': 0.98,
 'best_result': '0.98 ± 0.14',
 'train_step': 100000,
 'train_episode': 68490,
 'train_time/collector': '53.55s',
 'train_speed': '610.00 step/s'}

# Play

Let the trained agent play a number of episodes against the dealer and print the results.

In [None]:
policy.eval()
# State the evaluation epsilon explicitly instead of relying on whatever the
# last trainer callback left behind (the test callback uses the same 0.05).
policy.set_eps(0.05)

# Dedicated names: the module-level `env` keeps pointing at the plain
# Gymnasium environment, so earlier cells that read its spaces can be re-run.
eval_env = get_env(render_mode=None)
eval_envs = ts.env.DummyVectorEnv([lambda: eval_env])
collector = ts.data.Collector(policy, eval_envs, exploration_noise=True)
# play number of episodes
result = collector.collect(n_episode=100, render=None)
rews, lens = result["rews"], result["lens"]

# Classify each episode by the sign of its return, so a positive reward other
# than exactly 1 is not silently counted as a draw.
won = sum(1 for r in rews if r > 0)
lost = sum(1 for r in rews if r < 0)
draw = len(rews) - won - lost

print("Win: " + str(won) + " lost: " + str(lost) + " draw: " + str(draw))

Win: 42 lost: 43 draw: 15
