In [1]:
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
import concurrent.futures
from torch import optim
import torch
import os
import copy


%load_ext autoreload
%autoreload 2
import env
import network
import player

# Board geometry: BOARD_XSIZE columns by BOARD_YSIZE rows.
# DIMS is ordered (BOARD_YSIZE, BOARD_XSIZE) and is what env.Env is built with.
BOARD_XSIZE = 7
BOARD_YSIZE = 6
DIMS = (BOARD_YSIZE, BOARD_XSIZE)

# Training schedule.
EPISODES_PER_AGENT = 200      # games played per training epoch
TRAIN_EPOCHS = 500000         # upper bound; the training cell is meant to be interrupted
MODEL_SAVE_INTERVAL = 100     # checkpoint every this many optimizer steps
SUMMARY_STATS_INTERVAL = 10
RANDOM_SEED = 42

# Output locations.
SUMMARY_DIR = './summary'
# NOTE(review): MODEL_DIR is not referenced by the visible cells; checkpoints
# are currently written under SUMMARY_DIR.
MODEL_DIR = './models'

# Create the result directory. exist_ok makes this idempotent on re-run and
# avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(SUMMARY_DIR, exist_ok=True)

# Seed every RNG the notebook draws from. The training cell picks opponents
# and turn order with np.random, so seeding torch alone is not enough for
# reproducible runs.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

use_cuda = torch.cuda.is_available()

cuda = torch.device("cuda")
cpu = torch.device("cpu")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = cuda if use_cuda else cpu

In [2]:
# Policy network, moved onto the selected device, plus its Adam optimizer.
actor = network.Actor(BOARD_XSIZE, BOARD_YSIZE)
actor = actor.to(device)
actor_optimizer = optim.Adam(params=actor.parameters(), lr=network.ACTOR_LR)

# TensorBoard writer; event files are written under SUMMARY_DIR.
writer = SummaryWriter(log_dir=SUMMARY_DIR)

# Global step counter: used as the x-axis for logged scalars and in
# checkpoint file names by the training cell.
step = 0

In [8]:
# Opponents the actor trains against; the training cell draws one at random
# for every episode. The two entries differ only in the third constructor
# argument (presumably a randomness/strength setting - confirm in
# player.MinimaxPlayer).
opponent_pool: list[player.Player] = [
    player.MinimaxPlayer(env.PLAYER2, 2, setting) for setting in (0.5, 0.7)
]

# Per-opponent buffer of episode reward totals, keyed by the opponent's name().
rewards_vs: dict[str, list[float]] = dict()

In [4]:
def play(actor:player.ActorPlayer, opponent: player.Player, actor_turn:bool) -> tuple[
    list[env.Observation],
    list[env.Action],
    list[env.Reward],
    list[env.Value],
]:
    """Play one game on a fresh board and record the actor's trajectory.

    The two players alternate moves until the environment reports that the
    game is over. Only the actor's moves are recorded; the opponent's moves
    are applied to the board but not stored.

    Args:
        actor: player whose (observation, action, reward) triples are recorded.
        opponent: player that moves on the alternate turns.
        actor_turn: True if the actor makes the first move.

    Returns:
        A tuple (observations, actions, rewards, values). The first three
        lists hold one entry per actor move; values is whatever
        network.compute_value returns for the reward list.
    """
    game = env.Env(DIMS)

    observations: list[env.Observation] = []
    actions: list[env.Action] = []
    rewards: list[env.Reward] = []

    actors_move = actor_turn
    while not game.game_over():
        if not actors_move:
            opponent.play(game)
        else:
            observation, action, reward = actor.play(game)
            observations.append(observation)
            actions.append(action)
            rewards.append(reward)

        # hand the move to the other side
        actors_move = not actors_move

    return observations, actions, rewards, network.compute_value(rewards)

In [9]:
# interrupt this cell when you're done training

# A per-opponent mean reward is logged (and its buffer reset) once strictly
# more than this many games against that opponent have been collected.
REWARD_LOG_THRESHOLD = 400

try:
    for _epoch in range(TRAIN_EPOCHS):
        # Minibatch for this epoch: observations, actions and values of every
        # actor move across all episodes played below.
        s_batch: list[env.Observation] = []
        a_batch: list[env.Action] = []
        v_batch: list[env.Value] = []

        # create actor player
        actor_player = player.ActorPlayer(actor, step, env.PLAYER1)

        for _episode in range(EPISODES_PER_AGENT):
            # pick a random opponent
            opponent_player = opponent_pool[np.random.randint(len(opponent_pool))]

            # whether we or our opponent goes first
            go_first = np.random.randint(2) == 0

            # play the game
            s_t, a_t, r_t, v_t = play(actor_player, opponent_player, go_first)

            # now update the minibatch
            s_batch += s_t
            a_batch += a_t
            v_batch += v_t

            # statistics: total episode reward, bucketed by opponent name
            total_reward = np.array(r_t).sum()
            rewards_vs.setdefault(opponent_player.name(), []).append(total_reward)

        actor_losses = network.train_policygradient(
            actor,
            actor_optimizer,
            s_batch,
            a_batch,
            v_batch
        )

        # Reward statistics only change between epochs, so log them once here
        # (at the step of this epoch's first loss) instead of re-checking them
        # for every loss value.
        for opponent_name, rewards in rewards_vs.items():
            if len(rewards) > REWARD_LOG_THRESHOLD:
                avg_reward = np.array(rewards).mean()
                writer.add_scalar(f'reward_against_{opponent_name}', avg_reward, step)
                # Re-binding an existing key does not resize the dict, so this
                # is safe while iterating over items().
                rewards_vs[opponent_name] = []

        # One global step per reported loss value.
        for actor_loss in actor_losses:
            writer.add_scalar('actor_loss', actor_loss, step)

            if step % MODEL_SAVE_INTERVAL == 0:
                # Save the neural net parameters to disk.
                torch.save(actor.state_dict(), f"{SUMMARY_DIR}/nn_model_ep_{step}_actor.ckpt")

            step += 1
finally:
    # The cell is normally ended with a KeyboardInterrupt; make sure buffered
    # TensorBoard events reach disk. The interrupt itself still propagates.
    writer.flush()

KeyboardInterrupt: 

In [None]:
# Restore actor weights from a checkpoint written by the training cell.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
CHECKPOINT_STEP = 500
actor.load_state_dict(
    torch.load(f"{SUMMARY_DIR}/nn_model_ep_{CHECKPOINT_STEP}_actor.ckpt", map_location=device)
)

<All keys matched successfully>

In [6]:
# Build a hand-made position and show what the actor predicts for it.
e = env.Env(DIMS)

# Three pieces for PLAYER1 in column 1, then three for PLAYER2 in column 5.
for mover, column in ((env.PLAYER1, 1), (env.PLAYER2, 5)):
    for _ in range(3):
        e.step(env.Action(column), mover)

o = e.observe(1)
print(e.legal_mask())
env.print_obs(o)
# Column header: exactly one label per board column (0 .. BOARD_XSIZE-1).
print(' '.join(str(col) for col in range(BOARD_XSIZE)))
# Inference only: call the module (not .forward) and skip autograd tracking.
with torch.no_grad():
    print(actor(network.obs_batch_to_tensor([o], device))[0])

[ True  True  True  True  True  True  True]
              
              
              
  #       O   
  #       O   
  #       O   

0 1 2 3 4 5 6 7
tensor([0.1263, 0.1432, 0.1656, 0.1687, 0.1430, 0.1245, 0.1287],
       device='cuda:0', grad_fn=<SelectBackward0>)


In [7]:
# use this cell to observe some games from the network
# Requires s_batch and v_batch from the training cell (the most recent epoch).

s_tensor = network.obs_batch_to_tensor(s_batch, device)
# Inference only: no autograd graph is needed to print the outputs.
with torch.no_grad():
    actor_guesses = actor(s_tensor).detach().cpu().numpy()

# Column header: exactly one label per board column (0 .. BOARD_XSIZE-1).
column_header = ' '.join(str(col) for col in range(BOARD_XSIZE))

for v, obs, actor_guess in zip(v_batch, s_batch, actor_guesses):
    print("real_value", v)
    print("actor_probs", actor_guess)
    env.print_obs(obs)
    print(column_header)

real_value 0.0
actor_probs [0.14220831 0.14131944 0.14886895 0.15207864 0.1360455  0.14132166
 0.13815747]
              
              
              
              
              
O             

0 1 2 3 4 5 6 7
real_value 0.0
actor_probs [0.13906722 0.1445992  0.15301582 0.1504908  0.13614886 0.14249201
 0.13418609]
              
              
              
              
              
O   # O       

0 1 2 3 4 5 6 7
real_value 0.0
actor_probs [0.12964503 0.14736031 0.1539631  0.16093862 0.14231832 0.14279637
 0.12297823]
              
              
              
              
    #         
O O # O       

0 1 2 3 4 5 6 7
real_value 0.0
actor_probs [0.12661782 0.1462453  0.15647042 0.16732733 0.13755561 0.14332336
 0.12246013]
              
              
              
              
    #         
O O # O O #   

0 1 2 3 4 5 6 7
real_value 0.0
actor_probs [0.12457642 0.1425182  0.15907541 0.16654514 0.14075242 0.14309889
 0.12343348]
              
              
       