In [18]:
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
import concurrent.futures
from torch import optim
import torch
import os
import copy


%load_ext autoreload
%autoreload 2
import env
import network
import player

# --- Board geometry ---------------------------------------------------------
BOARD_XSIZE = 7
BOARD_YSIZE = 6
# Board dimensions in (y, x) order; this tuple is what gets handed to env.Env.
DIMS = (BOARD_YSIZE, BOARD_XSIZE)

# --- Training schedule ------------------------------------------------------
EPISODES_PER_AGENT = 50          # games submitted to the thread pool per epoch
TRAIN_EPOCHS = 500000            # iterations of the outer training loop
MODEL_SAVE_INTERVAL = 100        # write checkpoints when step % N == 0
MAKE_OPPONENT_INTERVAL = 10000   # add a frozen copy to the opponent pool when step % N == 0
SUMMARY_STATS_INTERVAL = 10      # log per-opponent average reward when step % N == 0
RANDOM_SEED = 42

# --- Output locations -------------------------------------------------------
SUMMARY_DIR = './summary'
# NOTE(review): MODEL_DIR is not referenced by any cell visible in this
# notebook; checkpoints are currently written under SUMMARY_DIR.
MODEL_DIR = './models'

# create result directory (exist_ok avoids a check-then-create race)
os.makedirs(SUMMARY_DIR, exist_ok=True)

# --- Reproducibility --------------------------------------------------------
# Seed both RNGs the notebook draws from: torch, and numpy's global generator,
# which the training loop uses to pick opponents and who moves first.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# --- Device selection -------------------------------------------------------
use_cuda = torch.cuda.is_available()

cuda = torch.device("cuda")
cpu = torch.device("cpu")

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = cuda if use_cuda else cpu

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# TODO: restore neural net parameters

# Instantiate both networks for the configured board size, then move each one
# onto the device selected in the configuration cell.
actor = network.Actor(BOARD_XSIZE, BOARD_YSIZE)
actor = actor.to(device)

critic = network.Critic(BOARD_XSIZE, BOARD_YSIZE)
critic = critic.to(device)

def detect_nans(model, input, output):
    """Forward hook that raises as soon as a module produces NaN output.

    The three-argument signature is the one used for forward hooks; only
    ``output`` is inspected, ``model`` and ``input`` are ignored.

    Raises:
        ValueError: if any element of ``output`` is NaN.
    """
    contains_nan = torch.isnan(output).any()
    if contains_nan:
        raise ValueError("NaN found in activations pass!")

# Fail fast: the hook runs whenever the actor module is called and raises if
# the output contains NaNs.
actor.register_forward_hook(detect_nans)


# One Adam optimizer per network; the learning rates are defined in the
# network module.
actor_optimizer = optim.Adam(params=actor.parameters(), lr=network.ACTOR_LR)
critic_optimizer = optim.Adam(params=critic.parameters(), lr=network.CRITIC_LR)

# TensorBoard writer; event files go to SUMMARY_DIR.
writer = SummaryWriter(log_dir=SUMMARY_DIR)

# Global step counter used by the training loop for logging, checkpoint names
# and opponent snapshots.
step = 0

In [20]:
# NOTE(review): entropy_buf is not referenced by any other cell visible here.
entropy_buf: list[float] = list()

# Initial opponents: two minimax players on the PLAYER2 side. They differ only
# in the last constructor argument (0.5 vs 0.3).
# NOTE(review): the second/third MinimaxPlayer arguments are presumably search
# depth and move randomness - confirm against player.py.
opponent_pool: list[player.Player] = [
    player.MinimaxPlayer(env.PLAYER2, 2, last_arg)
    for last_arg in (0.5, 0.3)
]

# Final value of each finished game, grouped by opponent name; filled in and
# periodically reset by the training loop.
rewards_vs: dict[str, list[float]] = dict()

In [21]:
def play(actor:player.ActorPlayer, opponent: player.Player, actor_turn:bool) -> tuple[
    list[env.Observation],
    list[env.Action],
    list[np.ndarray],
    list[env.Reward],
    list[env.Advantage],
    list[env.Reward],
    player.Player,
]:
    """Play one full game between ``actor`` and ``opponent`` on a fresh board.

    The two sides alternate moves until the environment reports the game is
    over. Only the actor's moves are recorded; the opponent's are just applied
    to the environment.

    Args:
        actor: the learning player; its ``play`` result and ``critic`` are used.
        opponent: the player the actor is matched against.
        actor_turn: True if the actor makes the first move.

    Returns:
        A 7-tuple of (observations, chosen actions, action probabilities,
        rewards, advantages, values, opponent). The first four lists hold one
        entry per actor move; advantages and values are whatever
        ``network.compute_advantage`` / ``network.compute_value`` return for
        those moves. ``opponent`` is passed back so the caller can tell which
        opponent a finished game belongs to.
    """
    game = env.Env(DIMS)

    observations: list[env.Observation] = []
    actions: list[env.Action] = []
    action_dists: list[np.ndarray] = []
    rewards: list[env.Reward] = []

    actors_move = actor_turn
    while not game.game_over():
        if not actors_move:
            opponent.play(game)
        else:
            obs, action_probs, chosen_action, reward = actor.play(game)
            observations.append(obs)
            action_dists.append(action_probs)
            actions.append(chosen_action)
            rewards.append(reward)

        # hand the move to the other side
        actors_move = not actors_move

    # Post-process the finished trajectory into advantage / value targets.
    advantages = network.compute_advantage(actor.critic, observations, rewards)
    values = network.compute_value(rewards)

    return observations, actions, action_dists, rewards, advantages, values, opponent

In [49]:
# Main training loop. Each epoch plays EPISODES_PER_AGENT games concurrently on
# a thread pool, runs one PPO update on the pooled trajectories, then handles
# logging, opponent snapshots and checkpoints once per returned loss pair.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for _ in range(TRAIN_EPOCHS):
        # Fresh per-epoch buffers. These names are read by the inspection cell
        # further down, so they stay at notebook scope.
        s_batch: list[env.Observation] = []
        a_batch: list[env.Action] = []
        p_batch: list[np.ndarray] = []
        d_batch: list[env.Advantage] = []
        v_batch: list[env.Value] = []

        # Wrap the current networks in a player for this epoch's games.
        actor_player = player.ActorPlayer(actor, critic, step, env.PLAYER1)

        # Schedule the games: uniformly random opponent, coin flip for who
        # moves first.
        futures = []
        for _ in range(EPISODES_PER_AGENT):
            opponent_index = np.random.randint(len(opponent_pool))
            opponent_player = opponent_pool[opponent_index]
            actor_turn = np.random.randint(2) == 0
            futures.append(
                executor.submit(play, actor_player, opponent_player, actor_turn)
            )

        # Collect results in completion order.
        for finished in concurrent.futures.as_completed(futures):
            s_t, a_t, p_t, r_t, d_t, v_t, opp = finished.result()

            # grow the minibatch
            s_batch.extend(s_t)
            a_batch.extend(a_t)
            p_batch.extend(p_t)
            d_batch.extend(d_t)
            v_batch.extend(v_t)

            # statistics: record the last value of the game under the
            # opponent's name
            opp_name = opp.name()
            final_value = float(v_t[-1])
            rewards_vs.setdefault(opp_name, []).append(final_value)

        actor_losses, critic_losses = network.train_ppo(
            actor,
            critic,
            actor_optimizer,
            critic_optimizer,
            s_batch,
            a_batch,
            p_batch,
            d_batch,
            v_batch,
        )

        # `step` advances once per (actor_loss, critic_loss) pair, so the
        # interval checks below are evaluated for every pair.
        for actor_loss, critic_loss in zip(actor_losses, critic_losses):
            writer.add_scalar('actor_loss', actor_loss, step)
            writer.add_scalar('critic_loss', critic_loss, step)

            if step % SUMMARY_STATS_INTERVAL == 0:
                for opponent_name, rewards in rewards_vs.items():
                    # only report once more than 50 results have accumulated
                    if len(rewards) <= 50:
                        continue
                    avg_reward = np.mean(rewards)
                    writer.add_scalar(f'reward_against_{opponent_name}', avg_reward, step)
                    # reset the window (existing key, so safe while iterating)
                    rewards_vs[opponent_name] = []

            if step % MAKE_OPPONENT_INTERVAL == 0:
                # Snapshot the current networks as a new PLAYER2 opponent.
                frozen_actor, frozen_critic = (
                    copy.deepcopy(net) for net in (actor, critic)
                )
                for frozen_net in (frozen_actor, frozen_critic):
                    frozen_net.eval()
                    frozen_net.to(device)
                opponent_pool.append(
                    player.ActorPlayer(frozen_actor, frozen_critic, step, env.PLAYER2)
                )

            if step % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                torch.save(actor.state_dict(), f"{SUMMARY_DIR}/nn_model_ep_{step}_actor.ckpt")
                torch.save(critic.state_dict(), f"{SUMMARY_DIR}/nn_model_ep_{step}_critic.ckpt")

            step += 1


In [None]:
# Restore actor weights from a checkpoint file in the summary directory.
# map_location remaps the stored tensors onto the device chosen in the
# configuration cell, so a checkpoint saved on a CUDA machine also loads on a
# CPU-only one instead of failing during deserialization.
actor.load_state_dict(torch.load('./summary/nn_model_ep_500_actor.ckpt', map_location=device))
#critic.load_state_dict(torch.load('./summary/nn_model_ep_1500_critic.ckpt', map_location=device))

<All keys matched successfully>

In [31]:
# Manual learning-rate override for the critic. The optimizer was built from a
# plain parameter iterable, so index 0 is its only parameter group.
critic_optimizer.param_groups[0].update({'lr': 1e-4})

In [None]:
# Manual learning-rate override for the actor. The optimizer was built from a
# plain parameter iterable, so index 0 is its only parameter group.
actor_optimizer.param_groups[0].update({'lr': 5e-5})

In [46]:
# Sanity probe: build a hand-made position and look at what the networks
# predict for it.
e = env.Env(DIMS)

# Three pieces stacked in column 1 and three in column 5.
# NOTE(review): the second argument to step() is presumably the player id
# (1 and 2) - confirm against env.py.
for column, piece_owner in ((1, 1), (5, 2)):
    for _move in range(3):
        e.step(column, piece_owner)

o = e.observe(1)
print(e.legal_mask())
env.print_obs(o)
# One label per board column (0 .. BOARD_XSIZE - 1), matching the width of the
# legal-move mask and of the actor's output.
print(' '.join(str(col) for col in range(BOARD_XSIZE)))

# Call the modules themselves rather than .forward() so registered forward
# hooks (such as the NaN check on the actor) run during the probe as well.
probe_tensor = network.obs_to_tensor(o, device)
print(actor(probe_tensor)[0])
print(critic(probe_tensor)[0])

[ True  True  True  True  True  True  True]
              
              
              
  #       O   
  #       O   
  #       O   

0 1 2 3 4 5 6 7
tensor([0.0756, 0.1005, 0.2424, 0.2119, 0.2252, 0.0839, 0.0604],
       device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.5012, device='cuda:0', grad_fn=<SelectBackward0>)


In [48]:
# Compare the networks' predictions with the values recorded in the most recent
# training batch (s_batch / v_batch are left behind by the training loop).
s_tensor = network.obs_batch_to_tensor(s_batch, device)

# Inspection only: skip autograd bookkeeping, and call the modules directly so
# registered forward hooks still run.
with torch.no_grad():
    critic_guesses = critic(s_tensor).detach().to(cpu).numpy()
    actor_guesses = actor(s_tensor).detach().to(cpu).numpy()

# One label per board column (0 .. BOARD_XSIZE - 1).
column_header = ' '.join(str(col) for col in range(BOARD_XSIZE))

for v, obs, critic_guess, actor_guess in zip(v_batch, s_batch, critic_guesses, actor_guesses):
    print("real_value", v)
    print("pred_value", float(critic_guess))
    print("actor_probs", np.array(actor_guess))
    env.print_obs(obs)
    print(column_header)

real_value 0.0
pred_value 0.3193468451499939
actor_probs [0.12851158 0.10153183 0.14984994 0.22754751 0.17518383 0.12657927
 0.09079602]
              
              
              
              
              
O             

0 1 2 3 4 5 6 7
real_value 0.0
pred_value 0.3723761737346649
actor_probs [0.23418276 0.08959237 0.12540083 0.2116242  0.14798588 0.1128426
 0.07837135]
              
              
              
              
O             
O     #       

0 1 2 3 4 5 6 7
real_value 0.0
pred_value 0.4415988326072693
actor_probs [0.17712872 0.10017444 0.12390843 0.23100036 0.163973   0.11994864
 0.0838664 ]
              
              
              
              
O   O         
O   # #       

0 1 2 3 4 5 6 7
real_value 0.0
pred_value 0.5782452821731567
actor_probs [0.14529796 0.0655526  0.05218471 0.4924556  0.08195963 0.10450933
 0.05804019]
              
              
              
              
O   O #       
O   # #   O   

0 1 2 3 4 5 6 7
real_value 0.0
pred_value