In [16]:
from io import TextIOWrapper
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
from torch import optim
import torch
import os
from scipy.special import softmax
import sys

%load_ext autoreload
%autoreload 2
import env
import network

# Board geometry. DIMS is what env.Env receives, in (y size, x size) order.
BOARD_XSIZE = 7
BOARD_YSIZE = 6
DIMS = (BOARD_YSIZE, BOARD_XSIZE)

# Training schedule.
EPISODES_PER_AGENT = 20      # episodes collected per training epoch
TRAIN_EPOCHS = 500000        # number of collect-then-train iterations
MODEL_SAVE_INTERVAL = 100    # checkpoint every N epochs
SUMMARY_STATS_INTERVAL = 10  # log the average reward every N epochs
RANDOM_SEED = 42

# Output locations.
SUMMARY_DIR = './summary'
MODEL_DIR = './models'

# Create the output directories up front. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(SUMMARY_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Seed every RNG the notebook draws from: torch for the networks, and numpy
# because the rollouts sample their exploration noise and the random
# opponent's moves through np.random.
use_cuda = torch.cuda.is_available()
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

cuda = torch.device("cuda")
cpu = torch.device("cpu")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = cuda if use_cuda else cpu

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# TODO: restore neural net parameters

# Actor (action probabilities) and critic (state-value estimate) networks,
# both placed on the selected device.
actor = network.Actor(BOARD_XSIZE, BOARD_YSIZE).to(device)
critic = network.Critic(BOARD_XSIZE, BOARD_YSIZE).to(device)

# Each optimizer must own the parameters of its own network. The critic
# optimizer was previously built from actor.parameters(), which meant the
# critic was never updated and the actor was stepped by two optimizers.
actor_optimizer = optim.Adam(actor.parameters(), lr=network.ACTOR_LR)
critic_optimizer = optim.Adam(critic.parameters(), lr=network.CRITIC_LR)

# TensorBoard writer for the loss and reward curves.
writer = SummaryWriter(log_dir=SUMMARY_DIR)

# Global step used as the x-axis of the logged scalars; incremented once per
# training epoch.
step = 0

In [21]:
def agent(actor:network.Actor, critic:network.Critic) -> tuple[
    list[env.Observation],
    list[env.Action],
    list[env.Advantage],
    list[env.Value]
]:
    """Play one episode against a uniformly random opponent and collect training data.

    The actor (player 1) moves first and alternates with the opponent
    (player 2) until ``e.game_over()`` is true. Only the actor's turns are
    recorded.

    Args:
        actor: network producing action probabilities for an observation.
        critic: network handed to ``network.compute_advantage``.

    Returns:
        ``(s_batch, a_batch, d_batch, v_batch)``: the observations the actor
        saw before each of its moves, the actions it chose, the advantages
        from ``network.compute_advantage(critic, s_batch, r_batch)`` and the
        values from ``network.compute_value(r_batch)``.

    Raises:
        ValueError: if the actor outputs NaN action probabilities.
    """
    e = env.Env(DIMS)

    ACTOR_ID = np.int8(1)
    OPPONENT_ID = np.int8(2)

    # Lower bound applied before taking the log so that a probability of
    # exactly 0 becomes a very low, but finite, score instead of -inf.
    PROB_FLOOR = 1e-12

    s_batch:list[env.Observation] = []
    a_batch:list[env.Action] = []
    r_batch:list[env.Reward] = []
    actor_turn = True
    while not e.game_over():
        if actor_turn:
            obs = e.observe(ACTOR_ID)

            action_probs = actor(network.obs_to_tensor(obs, device))[0].to(cpu).detach().numpy()
            if np.isnan(action_probs).any():
                raise ValueError("NaN found!")

            action_logprobs = np.log(np.clip(action_probs, PROB_FLOOR, None))

            # Perturb the log-probabilities with scaled Gumbel noise for exploration.
            noise = 0.1*np.random.gumbel(size=len(action_logprobs))

            # Mask illegal moves with -inf instead of multiplying probabilities
            # by the mask: with a multiplicative mask, legal moves whose
            # probability is 0 tie with illegal ones and argmax falls back to
            # column 0 even when that column is illegal. Softmax is monotonic,
            # so taking the argmax of the noisy log-probabilities directly
            # selects the same move it would.
            legal_mask = np.asarray(e.legal_mask(), dtype=bool)
            scores = np.where(legal_mask, action_logprobs + noise, -np.inf)
            chosen_action: env.Action = np.argmax(scores)

            s_batch.append(obs)

            # The follow-up observation returned by step() is not needed here.
            reward, _ = e.step(chosen_action, ACTOR_ID)

            a_batch.append(chosen_action)
            r_batch.append(reward)
        else:
            # Opponent: uniformly random legal move (highest random score wins).
            # NOTE(review): the return value of the opponent's step is
            # discarded, so whether a game lost on the opponent's move is
            # reflected in r_batch depends on env.step -- confirm.
            # TODO: self-play -- let the opponent pick its move from the actor
            # instead of playing at random.
            legal_mask = np.asarray(e.legal_mask(), dtype=bool)
            action_prob = np.random.random(size=BOARD_XSIZE)
            chosen_action: env.Action = np.argmax(np.where(legal_mask, action_prob, -np.inf))
            e.step(chosen_action, OPPONENT_ID)

        # flip turn
        actor_turn = not actor_turn

    v_batch = network.compute_value(r_batch)
    d_batch = network.compute_advantage(critic, s_batch, r_batch)

    return (s_batch, a_batch, d_batch, v_batch)


# Training loop: every epoch collects EPISODES_PER_AGENT episodes with the
# current networks, runs one network.train() update on the pooled data, and
# logs the losses to TensorBoard.

# Checkpoints go to MODEL_DIR so they are not mixed with the TensorBoard event
# files in SUMMARY_DIR; make sure the directory exists before the first save.
os.makedirs(MODEL_DIR, exist_ok=True)

# Last value of each episode since the previous summary, averaged into 'avg_reward'.
summary_reward_buf:list[float] = []
for epoch in range(TRAIN_EPOCHS):
    s_batch:list[env.Observation] = []
    a_batch:list[env.Action] = []
    d_batch:list[env.Advantage] = []
    v_batch:list[env.Value] = []
    for _ in range(EPISODES_PER_AGENT):
        s_, a_, d_, v_ = agent(actor, critic)
        s_batch.extend(s_)
        a_batch.extend(a_)
        d_batch.extend(d_)
        v_batch.extend(v_)
        # The value at the actor's final move stands in for the episode outcome.
        summary_reward_buf.append(float(v_[-1]))

    actor_loss, critic_loss = network.train(actor, critic, actor_optimizer, critic_optimizer, s_batch, a_batch, d_batch, v_batch)
    writer.add_scalar('actor_loss', actor_loss, step)
    writer.add_scalar('critic_loss', critic_loss, step)

    if epoch % SUMMARY_STATS_INTERVAL == 0:
        avg_reward = sum(summary_reward_buf)/len(summary_reward_buf)
        writer.add_scalar('avg_reward', avg_reward, step)
        # Start a fresh averaging window.
        summary_reward_buf = []

    if epoch % MODEL_SAVE_INTERVAL == 0:
        # Save the neural net parameters to disk.
        actor_path = os.path.join(MODEL_DIR, f"nn_model_ep_{epoch}_actor.ckpt")
        critic_path = os.path.join(MODEL_DIR, f"nn_model_ep_{epoch}_critic.ckpt")
        torch.save(actor.state_dict(), actor_path)
        torch.save(critic.state_dict(), critic_path)

    step += 1






ValueError: NaN found!

In [26]:
# Debug: materialise every actor parameter tensor to look for NaN weights.
[param for param in actor.parameters()]

[Parameter containing:
 tensor([[[[    nan,     nan,     nan],
           [    nan,     nan,     nan],
           [    nan,     nan,     nan]]],
 
 
         [[[    nan,     nan,     nan],
           [    nan,     nan,     nan],
           [    nan,     nan,     nan]]],
 
 
         [[[    nan,     nan,     nan],
           [    nan,     nan,     nan],
           [    nan,     nan,     nan]]],
 
 
         [[[    nan,     nan,     nan],
           [    nan,     nan,     nan],
           [    nan,     nan,     nan]]],
 
 
         [[[    nan,     nan,     nan],
           [    nan,     nan,     nan],
           [    nan,     nan,     nan]]],
 
 
         [[[    nan,     nan,     nan],
           [    nan,     nan,     nan],
           [    nan,     nan,     nan]]],
 
 
         [[[-0.2162, -0.3306, -0.2304],
           [-0.1490, -0.2258,  0.1059],
           [ 0.0896,  0.0538, -0.3124]]],
 
 
         [[[    nan,     nan,     nan],
           [    nan,     nan,     nan],
           [   

In [22]:
# Manual smoke test: set up a two-move position and look at what the actor
# predicts for it.
e = env.Env(DIMS)

# Player 1, then player 2, each play column 3.
for player_id in (1, 2):
    e.step(3, player_id)

# Observe the board from player 1's side, then show the legal-move mask and the board.
o = e.observe(1)
print(e.legal_mask())
env.print_obs(o)

# The cell's displayed value is the first row of the actor's output.
obs_tensor = network.obs_to_tensor(o, device)
actor.forward(obs_tensor)[0]

[ True  True  True  True  True  True  True]
              
              
              
              
      O       
      #       



tensor([nan, nan, nan, nan, nan, nan, nan], device='cuda:0',
       grad_fn=<SelectBackward0>)

In [None]:
# Debug: compare the critic's value predictions and the actor's action
# probabilities with the computed values for every recorded state.
# NOTE: relies on s_batch / v_batch left over from the last training epoch.
s_tensor = network.obs_batch_to_tensor(s_batch, device)
critic_guesses = critic.forward(s_tensor).to(cpu).detach().numpy()
actor_guesses = actor.forward(s_tensor).to(cpu).detach().numpy()

# Column indices printed under each board: 0 .. BOARD_XSIZE-1. The previous
# hard-coded footer listed eight columns for a seven-column board.
column_labels = ' '.join(str(col) for col in range(BOARD_XSIZE))

for v, obs, critic_guess, actor_guess in zip(v_batch, s_batch, critic_guesses, actor_guesses):
    print("real_value", v)
    # .item() turns a NumPy scalar or a size-1 array into a Python float
    # (presumably the critic emits one value per state -- confirm).
    print("pred_value", critic_guess.item())
    print("actor_probs", np.array(actor_guess))
    env.print_obs(obs)
    print(column_labels)

real_value 0.0
pred_value -0.0017231465317308903
actor_probs [nan nan nan nan nan nan nan]
              
              
              
              
              
              

0 1 2 3 4 5 6 7
real_value 0.0
pred_value -0.007051178719848394
actor_probs [nan nan nan nan nan nan nan]
              
              
              
              
              
#         O   

0 1 2 3 4 5 6 7
real_value 0.0
pred_value -0.01274698693305254
actor_probs [nan nan nan nan nan nan nan]
              
              
              
              
#             
#   O     O   

0 1 2 3 4 5 6 7
real_value 0.0
pred_value -0.014531351625919342
actor_probs [nan nan nan nan nan nan nan]
              
              
O             
#             
#             
#   O     O   

0 1 2 3 4 5 6 7
real_value 0.0
pred_value -0.015428484417498112
actor_probs [nan nan nan nan nan nan nan]
              
#             
O             
#             
#             
#   O O   O   

0 1 2 3 4 5 6 7
real_value 0.0
