In [1]:
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
from torch import optim
import torch
import os


%load_ext autoreload
%autoreload 2
import env
import network
import player

# --- Board geometry ---------------------------------------------------------
BOARD_XSIZE = 7  # number of columns
BOARD_YSIZE = 6  # number of rows
# Board shape in (y, x) order, as handed to env.Env.
DIMS = (BOARD_YSIZE, BOARD_XSIZE)

# --- Training schedule ------------------------------------------------------
EPISODES_PER_AGENT = 40      # episodes played per training epoch
TRAIN_EPOCHS = 500000        # total number of training epochs
MODEL_SAVE_INTERVAL = 100    # checkpoint every N epochs
SUMMARY_STATS_INTERVAL = 10  # write the averaged reward every N epochs
RANDOM_SEED = 42

# --- Output locations -------------------------------------------------------
SUMMARY_DIR = './summary'
# NOTE(review): MODEL_DIR is not referenced by the visible cells; the training
# loop currently writes its checkpoints under SUMMARY_DIR.
MODEL_DIR = './models'

# settings
#os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# create result directory (exist_ok avoids the check-then-create race)
os.makedirs(SUMMARY_DIR, exist_ok=True)

# --- Reproducibility --------------------------------------------------------
# Seed every RNG the notebook draws from: torch for the networks, and NumPy's
# global generator, which the training loop uses to pick who moves first.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# --- Device selection -------------------------------------------------------
use_cuda = torch.cuda.is_available()

cuda = torch.device("cuda")
cpu = torch.device("cpu")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = cuda if use_cuda else cpu

In [2]:
# TODO: restore neural net parameters

# Build the policy (actor) and value (critic) networks for the configured
# board size, then move each one onto the selected device.
actor = network.Actor(BOARD_XSIZE, BOARD_YSIZE)
actor = actor.to(device)

critic = network.Critic(BOARD_XSIZE, BOARD_YSIZE)
critic = critic.to(device)

def detect_nans(model, input, output):
    """Forward hook that aborts the pass when ``output`` contains a NaN.

    The signature follows the ``(module, input, output)`` forward-hook
    convention; only ``output`` is inspected.

    Raises:
        ValueError: if any element of ``output`` is NaN.
    """
    if not torch.isnan(output).any():
        return
    raise ValueError("NaN found in activations pass!")

# Fail fast: raise as soon as a hooked forward pass of the actor yields NaNs.
actor.register_forward_hook(hook=detect_nans)


# One Adam optimizer per network, each with the learning rate that the
# network module defines for it.
actor_optimizer = optim.Adam(
    params=actor.parameters(),
    lr=network.ACTOR_LR,
)
critic_optimizer = optim.Adam(
    params=critic.parameters(),
    lr=network.CRITIC_LR,
)

# Get Writer (TensorBoard event files are written to SUMMARY_DIR)
writer = SummaryWriter(log_dir=SUMMARY_DIR)

# Global step used as the x-axis for every scalar logged by the training loop.
step = 0

2023-02-28 23:39:56.337438: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-28 23:39:56.625352: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-02-28 23:39:57.365888: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-28 23:39:57.366099: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [3]:
# Final per-episode values gathered since the last summary write.
summary_reward_buf:list[float] = []
# Never appended to in this cell.
entropy_buf:list[float] = []

for epoch in range(TRAIN_EPOCHS):
    # Minibatch for this epoch: the actor's transitions from every episode
    # played below, concatenated in play order.
    s_batch:list[env.Observation] = []
    a_batch:list[env.Action] = []
    p_batch:list[np.ndarray] = []
    d_batch:list[env.Advantage] = []
    v_batch:list[env.Value] = []

    for _ in range(EPISODES_PER_AGENT):
        e = env.Env(DIMS)

        # Per-episode trajectory; only the actor's own moves are recorded.
        s_t:list[env.Observation] = []
        a_t:list[env.Action] = []
        p_t:list[np.ndarray] = []
        r_t:list[env.Reward] = []

        # Coin flip decides whether the actor or the opponent moves first.
        actor_turn = np.random.random() > 0.5

        actor_player = player.ActorPlayer(actor, env.PLAYER1)
        opponent_player = player.RandomPlayer(env.PLAYER2)

        while not e.game_over():
            if not actor_turn:
                opponent_player.play(e)
            else:
                obs, action_probs, chosen_action, reward = actor_player.play(e)
                s_t.append(obs)
                p_t.append(action_probs)
                a_t.append(chosen_action)
                r_t.append(reward)

            # flip turn
            actor_turn = not actor_turn

        # Per-step value targets and advantages for the finished episode, as
        # computed by the helpers in the network module.
        v_t = network.compute_value(r_t)
        d_t = network.compute_advantage(critic, s_t, r_t)

        # now update the minibatch
        s_batch.extend(s_t)
        a_batch.extend(a_t)
        p_batch.extend(p_t)
        d_batch.extend(d_t)
        v_batch.extend(v_t)

        # statistics: keep the value of the actor's last recorded step
        summary_reward_buf.append(float(v_t[-1]))

    # One PPO update over everything collected this epoch.
    actor_loss, critic_loss = network.train_ppo(
        actor, critic,
        actor_optimizer, critic_optimizer,
        s_batch, a_batch, p_batch, d_batch, v_batch,
    )

    writer.add_scalar('actor_loss', actor_loss, step)
    writer.add_scalar('critic_loss', critic_loss, step)

    if not epoch % SUMMARY_STATS_INTERVAL:
        avg_reward = sum(summary_reward_buf)/len(summary_reward_buf)
        writer.add_scalar('avg_reward', avg_reward, step)
        # clear
        summary_reward_buf = []

    if not epoch % MODEL_SAVE_INTERVAL:
        # Save the neural net parameters to disk.
        actor_path = f"{SUMMARY_DIR}/nn_model_ep_{epoch}_actor.ckpt"
        critic_path = f"{SUMMARY_DIR}/nn_model_ep_{epoch}_critic.ckpt"
        torch.save(actor.state_dict(), actor_path)
        torch.save(critic.state_dict(), critic_path)

    step += 1


ValueError: operands could not be broadcast together with shapes (6,) (7,) 

In [None]:
# Restore both networks from the checkpoints the training loop wrote at this
# epoch (same directory and naming scheme as the save code).
RESUME_EPOCH = 1500

# map_location remaps the stored tensors onto the currently selected device, so
# a checkpoint saved on a CUDA machine also loads on a CPU-only one.
# NOTE: torch.load unpickles the file; only load checkpoints you produced.
actor.load_state_dict(
    torch.load(f"{SUMMARY_DIR}/nn_model_ep_{RESUME_EPOCH}_actor.ckpt", map_location=device)
)
critic.load_state_dict(
    torch.load(f"{SUMMARY_DIR}/nn_model_ep_{RESUME_EPOCH}_critic.ckpt", map_location=device)
)

<All keys matched successfully>

In [None]:
# Override the learning rate in place on the existing optimizers. Every param
# group is updated, so this stays correct if an optimizer is ever built with
# more than one group.
FINE_TUNE_LR = 1e-5

for optimizer in (actor_optimizer, critic_optimizer):
    for param_group in optimizer.param_groups:
        param_group['lr'] = FINE_TUNE_LR

In [130]:
# Hand-built position used to sanity-check the networks: three moves in
# column 1 for player 1 and three moves in column 5 for player 2.
# NOTE(review): e.step appears to take (column, player) — confirm against env.
e = env.Env(DIMS)

for column, player_id in ((1, 1), (5, 2)):
    for _ in range(3):
        e.step(column, player_id)

o = e.observe(1)
print(e.legal_mask())
env.print_obs(o)
# One label per board column (0 .. BOARD_XSIZE - 1); the previous hard-coded
# line printed one label too many.
print(' '.join(str(col) for col in range(BOARD_XSIZE)))
# Call the modules rather than .forward() so registered forward hooks (such as
# the NaN detector on the actor) also run here.
print(actor(network.obs_to_tensor(o, device))[0])
print(critic(network.obs_to_tensor(o, device))[0])

[ True  True  True  True  True  True  True]
              
              
              
  #       O   
  #       O   
  #       O   

0 1 2 3 4 5 6 7
tensor([0.0248, 0.0144, 0.6293, 0.0128, 0.0111, 0.0146, 0.2930],
       device='cuda:0', grad_fn=<SelectBackward0>)
tensor([[[[False,  True, False, False, False, False, False],
          [False,  True, False, False, False, False, False],
          [False,  True, False, False, False, False, False],
          [False, False, False, False, False, False, False],
          [False, False, False, False, False, False, False],
          [False, False, False, False, False, False, False]],

         [[False, False, False, False, False,  True, False],
          [False, False, False, False, False,  True, False],
          [False, False, False, False, False,  True, False],
          [False, False, False, False, False, False, False],
          [False, False, False, False, False, False, False],
          [False, False, False, False, False, False, False]]

In [131]:
# Compare the networks' predictions with the targets of the most recent
# training minibatch (s_batch / v_batch are left over from the training loop).
s_tensor = network.obs_batch_to_tensor(s_batch, device)

# Inference only: skip autograd bookkeeping for the whole batch. The modules
# are called directly (not via .forward()) so registered forward hooks run.
with torch.no_grad():
    critic_guesses = critic(s_tensor).to(cpu).detach().numpy()
    actor_guesses = actor(s_tensor).to(cpu).detach().numpy()

# One label per board column (0 .. BOARD_XSIZE - 1); the previous hard-coded
# line printed one label too many.
column_labels = ' '.join(str(col) for col in range(BOARD_XSIZE))

for v, obs, critic_guess, actor_guess in zip(v_batch, s_batch, critic_guesses, actor_guesses):
    print("real_value", v)
    # .item() works whether each critic row is a 0-d scalar or a length-1
    # array; float() on a 1-element array is deprecated in recent NumPy.
    print("pred_value", float(np.asarray(critic_guess).item()))
    print("actor_probs", actor_guess)
    env.print_obs(obs)
    print(column_labels)

tensor([[[[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False]],

         [[False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False]]],


        [[[False, False, False,  ..., False, False,  True],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  ..., False, False, False],
          [False, False, False,  .