# Training PPO for BlackJack

This notebook trains a Blackjack agent with the [PPOPolicy from Tianshou](https://tianshou.org/en/stable/03_api/policy/modelfree/ppo.html).

In [1]:
# Reload edited modules automatically so code changes are picked up live.
%load_ext autoreload
%autoreload 2

# Install dependencies

In [None]:
!pip install gymnasium==0.29.1 pygame==2.3.0 pettingzoo==1.24.3 tianshou==0.5.1 transformers==4.39.1 accelerate==0.28.0 openai

# Setup environment

In [19]:
import gymnasium as gym
import tianshou as ts
import torch
from gymnasium.spaces import Dict
from gymnasium.wrappers import FlattenObservation

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

def get_env(render_mode = None, seed = 42):
  """
  Build a Blackjack environment with a flattened observation space.

  BlackJack has an observation space which is a tuple, consisting of
  the player's sum, the dealer's showing card and whether or not
  the player has a usable ace. This tuple is flattened for Tianshou
  to be able to put it through the deep network layers.

  Args:
    render_mode: Forwarded to ``gym.make`` as ``render_mode``.
    seed: Seed passed to the initial ``env.reset``. Defaults to 42, the
      value that was previously hard-coded, so existing callers behave
      the same.

  Returns:
    The wrapped environment, already reset once with ``seed``.
  """
  env = gym.make("Blackjack-v1", render_mode=render_mode)
  env = FlattenObservation(env)
  # Reset once up front so the environment starts from the requested seed.
  env.reset(seed=seed)

  return env

# Reference instance; later cells read its observation and action spaces.
env = get_env()

In [20]:
train_envs = ts.env.DummyVectorEnv([get_env for _ in range(10)])
test_envs = ts.env.DummyVectorEnv([get_env for _ in range(100)])

# get_env() seeds every instance with the same default seed, so without
# re-seeding all parallel environments would start from the same random
# stream. Passing an int to seed() assigns environment i the seed base + i;
# the test base is offset by the number of training environments so the two
# seed ranges do not overlap. The trailing ';' hides the returned seed lists.
train_envs.seed(42);
test_envs.seed(42 + 10);

# Create Policy

Create a PPO policy from a shared network with actor and critic heads and an Adam optimizer.

In [21]:
# Input/output sizes for the networks: use the space's `shape` when it is
# non-empty, otherwise fall back to its number of discrete values `n`.
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n

# Feature extractor that is passed to both the actor and the critic head.
net = ts.utils.net.common.Net(state_shape=state_shape, hidden_sizes=[128, 128, 128], device=device)

# Build the actor from the `action_shape` computed above instead of reading
# the action space a second time.
actor = ts.utils.net.discrete.Actor(preprocess_net=net, action_shape=action_shape, device=device).to(device)
critic = ts.utils.net.discrete.Critic(preprocess_net=net, device=device).to(device)

# Wrap both heads so that one optimizer updates all of their parameters.
actor_critic = ts.utils.net.common.ActorCritic(actor, critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

policy = ts.policy.PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=torch.distributions.Categorical,
    action_space=env.action_space,
    action_scaling=False,
)

In [22]:
# Replay storage for the training collector: 20000 transitions in total,
# split over 10 buffers.
replay_buffer = ts.data.VectorReplayBuffer(total_size=20000, buffer_num=10)

train_collector = ts.data.Collector(
    policy=policy,
    env=train_envs,
    buffer=replay_buffer,
    exploration_noise=True,
)
test_collector = ts.data.Collector(
    policy=policy,
    env=test_envs,
    exploration_noise=True,
)

# Train the agent
Using the `OnpolicyTrainer`.

In [None]:
# Configure the on-policy trainer first, then run it to completion.
trainer = ts.trainer.OnpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=10,
    step_per_epoch=10000,
    step_per_collect=10,
    repeat_per_collect=10,
    episode_per_test=100,
    batch_size=64,
)
result = trainer.run()

print(f'Finished training! Use {result["duration"]}')

# Show the full training summary as the cell output.
result

# Play

Play a number of episodes against the dealer with the trained agent and print the results.

In [None]:
# Switch the policy to evaluation mode before playing.
policy.eval()

# Use dedicated names so that the notebook-level `env` and the training
# `result` from earlier cells are not overwritten when this cell is re-run.
play_envs = ts.env.DummyVectorEnv([get_env])
play_collector = ts.data.Collector(policy, play_envs, exploration_noise=True)

# Play a number of episodes and keep the per-episode rewards.
play_result = play_collector.collect(n_episode=100, render=None)
episode_rewards = play_result["rews"]

# Classify each episode by the sign of its reward rather than by equality
# with exactly +1 / -1, so any positive reward counts as a win and any
# negative reward as a loss; everything else is a draw.
won = sum(1 for reward in episode_rewards if reward > 0)
lost = sum(1 for reward in episode_rewards if reward < 0)
draw = len(episode_rewards) - won - lost

print(f"Win: {won} lost: {lost} draw: {draw}")