In [6]:
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
from torch import optim
import torch
import os
import time


%load_ext autoreload
%autoreload 2
import env
import network
import policy

EPISODES_PER_AGENT = 256
TRAIN_EPOCHS = 500000
MODEL_SAVE_INTERVAL = 100
SUMMARY_STATS_INTERVAL = 10
RANDOM_SEED = 42

SUMMARY_DIR = './summary'
MODEL_DIR = './models'

# create result directory
if not os.path.exists(SUMMARY_DIR):
    os.makedirs(SUMMARY_DIR)


use_cuda = torch.cuda.is_available()
torch.manual_seed(RANDOM_SEED)

cuda = torch.device("cuda")
cpu = torch.device("cpu")

if use_cuda:
    device = cuda
else:
    device = cpu

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
actor = network.Actor(env.BOARD_XSIZE, env.BOARD_YSIZE).to(device)
actor_optimizer = optim.Adam(actor.parameters(), lr=network.ACTOR_LR)

# Get Writer
writer = SummaryWriter(log_dir=SUMMARY_DIR)

step=0

In [None]:
opponent_pool:list[policy.Policy] = [
    policy.MinimaxPolicy(depth=2, randomness=0.1),
    policy.MinimaxPolicy(depth=2, randomness=0.3),
    policy.MinimaxPolicy(depth=2, randomness=0.5),
    policy.MinimaxPolicy(depth=2, randomness=0.7),
]

rewards_vs: dict[str, list[float]] = {}

In [9]:
def play(nn_policy:policy.NNPolicy, opponent_policy: policy.Policy, nn_player: env.Player) -> tuple[
    list[env.State],
    list[env.Action],
    list[float],
    list[float],
]:
    e = env.Env()
    current_player = env.PLAYER1

    s_t:list[env.State] = []
    a_t:list[env.Action] = []
    r_t:list[float] = []
    # play the game

    while not e.game_over():
        if nn_player == current_player:
            s = e.state.copy()
            chosen_action = nn_policy(s)
            reward = e.step(chosen_action)
            s_t.append(s)
            a_t.append(chosen_action)
            r_t.append(reward)
        else:
            opponent_action = opponent_policy(e.state)
            e.step(opponent_action)

        # flip turn
        current_player = env.opponent(current_player)

    # compute value
    v_t = network.compute_value(r_t)

    return s_t, a_t, r_t, v_t

In [None]:
# interrupt this cell when you're done training

for _ in range(TRAIN_EPOCHS):
    s_batch:list[env.State] = []
    a_batch:list[env.Action] = []
    p_batch:list[np.ndarray] = []
    v_batch:list[float] = []
    
    # create actor player
    actor.eval()
    nn_player = policy.NNPolicy(actor)
    
    with torch.inference_mode():
        for _ in range(EPISODES_PER_AGENT):
            # pick a random opponent
            opponent_player = opponent_pool[np.random.randint(len(opponent_pool))]

            # which player the actor will be
            actor_player_identity = env.PLAYER1 if np.random.randint(2) == 0 else env.PLAYER2

            # play the game
            s_t, a_t, r_t, v_t = play(nn_player, opponent_player, actor_player_identity)

            # now update the minibatch
            s_batch += s_t
            a_batch += a_t
            v_batch += v_t

            # statistics
            opp_name = opponent_player.fmt_config(opponent_player.model_dump())
            total_reward = np.array(r_t).sum()
            if opp_name in rewards_vs:
                rewards_vs[opp_name].append(total_reward)
            else:
                rewards_vs[opp_name] = [total_reward]

    actor.train()
    actor_losses = network.train_policygradient(
        actor,
        actor_optimizer,
        s_batch,
        a_batch,
        v_batch
    )

    for actor_loss in actor_losses:
        writer.add_scalar('actor_loss', actor_loss, step)

        for opponent_name, rewards in rewards_vs.items():
            if len(rewards) > 400:
                avg_reward = np.array(rewards).mean()
                writer.add_scalar(f'reward_against_{opponent_name}', avg_reward, step)
                rewards_vs[opponent_name] = []

        if step % MODEL_SAVE_INTERVAL == 0:
            # Save the neural net parameters to disk.
            torch.save(actor.state_dict(), f"{SUMMARY_DIR}/nn_model_ep_{step}_actor.ckpt")
        
        step += 1

KeyboardInterrupt: 

: 

In [None]:
actor.load_state_dict(torch.load('./summary/nn_model_ep_500_actor.ckpt'))

NameError: name 'actor' is not defined

In [None]:
e = env.Env()

e.step(env.Action(1), env.PLAYER1)
e.step(env.Action(1), env.PLAYER1)
e.step(env.Action(1), env.PLAYER1)

e.step(env.Action(5), env.PLAYER2)
e.step(env.Action(5), env.PLAYER2)
e.step(env.Action(5), env.PLAYER2)




o = e.observe(1)
print(e.legal_mask())
env.print_obs(o)
print('0 1 2 3 4 5 6 7')
print(actor.forward(actor.obs_batch_to_tensor([o], device))[0])

TypeError: Env.step() takes 2 positional arguments but 3 were given

In [None]:
# use this cell to observe some games from the network

s_tensor = network.state_batch_to_tensor(s_batch, device)
actor_guesses = actor.forward(s_tensor).to(cpu).detach().numpy()
for v, s, actor_guess in zip(v_batch, s_batch, actor_guesses):
    print("real_value", v)
    print("actor_probs", np.array(actor_guess))
    env.print_state(s)
    print('0 1 2 3 4 5 6 7')

real_value 0.0
actor_probs [9.99502659e-01 4.94937121e-04 1.53832517e-07 1.19036024e-07
 4.64389132e-07 9.15452574e-07 7.04528020e-07]
              
              
              
              
              
              

0 1 2 3 4 5 6 7
real_value 0.0
actor_probs [9.99566495e-01 4.31661145e-04 1.18984836e-07 9.18868537e-08
 3.64844709e-07 7.29323858e-07 5.54547739e-07]
              
              
              
              
O             
#             

0 1 2 3 4 5 6 7
real_value 0.0
actor_probs [9.9974376e-01 2.5557168e-04 4.0986315e-08 3.1064278e-08 1.3589285e-07
 2.8581084e-07 2.1142290e-07]
              
              
              
#             
O             
#         O   

0 1 2 3 4 5 6 7
real_value 0.0
actor_probs [9.9987674e-01 1.2304794e-04 9.3147907e-09 6.8830506e-09 3.3819578e-08
 7.6371926e-08 5.4053640e-08]
              
              
#             
#             
O             
#       O O   

0 1 2 3 4 5 6 7
real_value 0.0
actor_probs [9.9990082e-01 9.91