In [21]:
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
from torch import optim
import torch
import os
import copy


%load_ext autoreload
%autoreload 2
import env
import network
import player

BOARD_XSIZE=7
BOARD_YSIZE=6
DIMS=(BOARD_YSIZE,BOARD_XSIZE)

EPISODES_PER_AGENT = 40
TRAIN_EPOCHS = 500000
MODEL_SAVE_INTERVAL = 100
MAKE_OPPONENT_INTERVAL = 1000
SUMMARY_STATS_INTERVAL = 10
RANDOM_SEED = 42

SUMMARY_DIR = './summary'
MODEL_DIR = './models'

# settings
#os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# create result directory
if not os.path.exists(SUMMARY_DIR):
    os.makedirs(SUMMARY_DIR)


use_cuda = torch.cuda.is_available()
torch.manual_seed(RANDOM_SEED)

cuda = torch.device("cuda")
cpu = torch.device("cpu")

if use_cuda:
    device = cuda
else:
    device = cpu

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# TODO: restore neural net parameters

actor = network.Actor(BOARD_XSIZE, BOARD_YSIZE).to(device)
critic = network.Critic(BOARD_XSIZE, BOARD_YSIZE).to(device)

def detect_nans(model, input, output):
    if output.isnan().any():
        raise ValueError("NaN found in activations pass!")

actor.register_forward_hook(detect_nans)


actor_optimizer = optim.Adam(actor.parameters(), lr=network.ACTOR_LR)
critic_optimizer = optim.Adam(critic.parameters(), lr=network.CRITIC_LR)

# Get Writer
writer = SummaryWriter(log_dir=SUMMARY_DIR)

step=0

In [None]:
entropy_buf:list[float] = []

opponent_pool:list[player.Player] = [
    player.RandomPlayer(env.PLAYER2)
]

rewards_vs: dict[str, list[float]] = {}

In [24]:
for epoch in range(TRAIN_EPOCHS):
    s_batch:list[env.Observation] = []
    a_batch:list[env.Action] = []
    p_batch:list[np.ndarray] = []
    d_batch:list[env.Advantage] = []
    v_batch:list[env.Value] = []
    for _ in range(EPISODES_PER_AGENT):
        e = env.Env(DIMS)

        s_t:list[env.Observation] = []
        a_t:list[env.Action] = []
        p_t:list[np.ndarray] = []
        r_t:list[env.Reward] = []

        # create actor player        
        actor_player = player.ActorPlayer(actor, step, env.PLAYER1)
        
        # pick a random opponent
        opponent_player = opponent_pool[np.random.randint(len(opponent_pool))]

        # 50% chance to start
        actor_turn = np.random.random() > 0.5

        # play the game
        while not e.game_over():
            if actor_turn:
                obs, action_probs, chosen_action, reward = actor_player.play(e)
                s_t += [obs]
                p_t += [action_probs]
                a_t += [chosen_action]
                r_t += [reward]
            else:
                opponent_player.play(e)

            # flip turn
            actor_turn = not actor_turn

        # compute advantage and value
        d_t = network.compute_advantage(critic, s_t, r_t)
        v_t = network.compute_value(r_t)

        # now update the minibatch
        s_batch += s_t
        a_batch += a_t
        p_batch += p_t
        d_batch += d_t
        v_batch += v_t

        # statistics
        opp_name = opponent_player.name()
        if opp_name in rewards_vs:
            rewards_vs[opp_name].append(float(v_t[-1]))
        else:
            rewards_vs[opp_name] = [float(v_t[-1])]

    actor_loss, critic_loss = network.train_ppo(
        actor,
        critic,
        actor_optimizer,
        critic_optimizer,
        s_batch,
        a_batch,
        p_batch,
        d_batch,
        v_batch
    )

    writer.add_scalar('actor_loss', actor_loss, step)
    writer.add_scalar('critic_loss', critic_loss, step)

    if epoch % SUMMARY_STATS_INTERVAL == 0:
        for opponent_name, rewards in rewards_vs.items():
            if len(rewards) > 10:
                avg_reward = np.array(rewards).mean()
                writer.add_scalar(f'reward_against_{opponent_name}', avg_reward, step)
                rewards_vs[opponent_name] = []

    if epoch % MAKE_OPPONENT_INTERVAL == 0:
        # create a new opponent
        frozen_actor = copy.deepcopy(actor)
        frozen_actor.eval()
        frozen_actor.to(device)
        opponent_pool.append(player.ActorPlayer(frozen_actor, epoch, env.PLAYER2))

    if epoch % MODEL_SAVE_INTERVAL == 0:
        # Save the neural net parameters to disk.
        torch.save(actor.state_dict(), f"{SUMMARY_DIR}/nn_model_ep_{epoch}_actor.ckpt")
        torch.save(critic.state_dict(), f"{SUMMARY_DIR}/nn_model_ep_{epoch}_critic.ckpt")
    
    step += 1


KeyboardInterrupt: 

In [None]:
actor.load_state_dict(torch.load('./summary/nn_model_ep_500_actor.ckpt'))
#critic.load_state_dict(torch.load('./summary/nn_model_ep_1500_critic.ckpt'))

<All keys matched successfully>

In [None]:
actor_optimizer.param_groups[0]['lr'] = 1e-5
critic_optimizer.param_groups[0]['lr'] = 1e-5

In [25]:
e = env.Env(DIMS)

e.step(1, 1)
e.step(1, 1)
e.step(1, 1)

e.step(5, 2)
e.step(5, 2)
e.step(5, 2)






o = e.observe(1)
print(e.legal_mask())
env.print_obs(o)
print('0 1 2 3 4 5 6 7')
print(actor.forward(network.obs_to_tensor(o, device))[0])
print(critic.forward(network.obs_to_tensor(o, device))[0])

[ True  True  True  True  True  True  True]
              
              
              
  #       O   
  #       O   
  #       O   

0 1 2 3 4 5 6 7
tensor([0.0781, 0.0785, 0.0908, 0.5243, 0.0794, 0.0661, 0.0827],
       device='cuda:0', grad_fn=<SelectBackward0>)
tensor(0.7750, device='cuda:0', grad_fn=<SelectBackward0>)


In [26]:
s_tensor = network.obs_batch_to_tensor(s_batch, device)
critic_guesses = critic.forward(s_tensor).to(cpu).detach().numpy()
actor_guesses = actor.forward(s_tensor).to(cpu).detach().numpy()
for v, obs, critic_guess, actor_guess in zip(v_batch, s_batch, critic_guesses, actor_guesses):
    print("real_value", v)
    print("pred_value", float(critic_guess))
    print("actor_probs", np.array(actor_guess))
    env.print_obs(obs)
    print('0 1 2 3 4 5 6 7')

real_value 0.6634204312890623
pred_value 0.6541104912757874
actor_probs [0.06844878 0.06737773 0.06775846 0.59212756 0.06863001 0.06765473
 0.06800277]
              
              
              
              
              
      O       

0 1 2 3 4 5 6 7
real_value 0.6983372960937497
pred_value 0.6963827013969421
actor_probs [0.06664877 0.06346399 0.07108135 0.6021465  0.06949881 0.06062622
 0.06653439]
              
              
              
      O       
      #       
      O       

0 1 2 3 4 5 6 7
real_value 0.7350918906249998
pred_value 0.6906959414482117
actor_probs [0.06503165 0.06426599 0.05157686 0.06854744 0.61358696 0.07280879
 0.06418227]
              
              
      #       
      O       
      #       
      O O     

0 1 2 3 4 5 6 7
real_value 0.7737809374999999
pred_value 0.7198836207389832
actor_probs [0.06260257 0.06099626 0.05128277 0.06607842 0.6254535  0.06462774
 0.06895877]
              
              
      #       
      O O     
      # #  