In [1]:
from io import TextIOWrapper
import multiprocessing as mp
import numpy as np
import numpy.typing as npt
import logging
import tensorflow as tf
import os
from scipy.special import softmax
import sys

%load_ext autoreload
%autoreload 2
import env
import network

# --- Board geometry ---------------------------------------------------------
# DIMS is ordered (rows, columns) and is what gets handed to env.Env.
BOARD_XSIZE = 7
BOARD_YSIZE = 6
DIMS = (BOARD_YSIZE, BOARD_XSIZE)

# --- Training schedule ------------------------------------------------------
EPISODES_PER_AGENT = 20      # episodes collected before each training call
TRAIN_EPOCHS = 500000        # iterations of the outer training loop
MODEL_SAVE_INTERVAL = 100    # save a checkpoint every N epochs
SUMMARY_STATS_INTERVAL = 10  # write the avg_reward summary every N epochs
# NOTE(review): RANDOM_SEED is defined but not applied to any RNG in the
# visible cells, so runs are not reproducible as written.
RANDOM_SEED = 42

# --- Output locations -------------------------------------------------------
# SUMMARY_DIR receives the summary-writer output and the model checkpoints.
# NOTE(review): MODEL_DIR, TRAIN_TRACES, TEST_LOG_FOLDER and LOG_FILE are not
# referenced by the visible cells.
SUMMARY_DIR = './summary'
MODEL_DIR = './models'
TRAIN_TRACES = './train/'
TEST_LOG_FOLDER = './test_results/'
LOG_FILE = SUMMARY_DIR + '/log'

# settings
#os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# Create the result directory. exist_ok=True makes this cell safe to re-run
# and avoids the check-then-create race of testing os.path.exists first.
os.makedirs(SUMMARY_DIR, exist_ok=True)


2023-02-23 21:33:04.458994: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-23 21:33:04.577822: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-02-23 21:33:05.116533: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-23 21:33:05.116601: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# TODO: restore neural net parameters
# Build a freshly constructed PPO agent for the configured board size.
actor = network.PPOAgent(BOARD_XSIZE, BOARD_YSIZE)

# Summary writer that emits event files into SUMMARY_DIR.
writer = tf.summary.create_file_writer(SUMMARY_DIR)

# Running step counter; the training loop passes it to actor.train and
# advances it by that call's return value.
step = 0

2023-02-23 21:33:11.385863: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-23 21:33:11.401356: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-23 21:33:11.401622: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-23 21:33:11.402373: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [3]:
def _argmax_legal(scores, legal_mask):
    """Return the index of the best-scoring entry among the legal actions.

    Illegal entries are replaced with -inf instead of being multiplied by the
    mask. With multiplication, a legal action whose score is exactly zero ties
    with the zeroed-out illegal entries, and np.argmax then returns the first
    index whether or not it is legal.
    """
    mask = np.asarray(legal_mask, dtype=bool)
    masked_scores = np.where(mask, np.asarray(scores, dtype=np.float64), -np.inf)
    return np.argmax(masked_scores)


def agent(actor:network.PPOAgent, opponent_random_prob:float = 1.0) -> tuple[
    list[env.Observation],
    list[env.Action],
    list[npt.NDArray[np.float32]],
    list[env.Advantage],
    list[env.Value]
]:
    """Play one episode on a fresh env.Env(DIMS) and collect the actor's rollout.

    The actor (player id 1) moves first and alternates with an opponent
    (player id 2) until e.game_over() is true. Only the actor's moves are
    recorded; opponent moves just advance the environment.

    Args:
        actor: agent used for predict_batch, compute_value and
            compute_advantage.
        opponent_random_prob: probability that an opponent move is chosen
            uniformly at random among legal actions; otherwise the opponent
            plays the actor's own greedy move (self-play). The default of 1.0
            keeps the opponent purely random.

    Returns:
        A tuple (s_batch, a_batch, p_batch, d_batch, v_batch):
        s_batch - observation seen by the actor before each of its moves
        a_batch - action index chosen for each of those observations
        p_batch - the un-noised output of actor.predict_batch per observation
        d_batch - actor.compute_advantage(s_batch, r_batch)
        v_batch - actor.compute_value(r_batch)
    """
    e = env.Env(DIMS)

    ACTOR_ID = np.int8(1)
    OPPONENT_ID = np.int8(2)

    s_batch:list[env.Observation] = []
    a_batch:list[env.Action] = []
    p_batch:list[npt.NDArray[np.float32]]  = []
    r_batch:list[env.Reward] = []

    chosen_action: env.Action
    actor_turn = True
    while not e.game_over():
        if actor_turn:
            obs = e.observe(ACTOR_ID)

            action_prob = actor.predict_batch([obs])[0]

            # Perturb the log-probabilities with small Gumbel noise so the
            # actor does not always play its single most likely move.
            noise = 0.1*np.random.gumbel(size=len(action_prob))
            adjusted_action_probs = softmax(np.log(action_prob) + noise)

            chosen_action = _argmax_legal(adjusted_action_probs, e.legal_mask())

            # The next observation returned by step is not needed here.
            reward, _ = e.step(chosen_action, ACTOR_ID)

            s_batch.append(obs)
            a_batch.append(chosen_action)
            r_batch.append(reward)
            # Store the un-noised probabilities, as the original rollout did.
            p_batch.append(action_prob)
        else:
            if np.random.random() < opponent_random_prob:
                # Random opponent: uniform random scores, best legal one wins.
                action_prob = np.random.random(size=BOARD_XSIZE)
            else:
                # Self-play opponent: greedy move from the actor's own policy.
                obs = e.observe(OPPONENT_ID)
                action_prob = actor.predict_batch([obs])[0]
            chosen_action = _argmax_legal(action_prob, e.legal_mask())
            e.step(chosen_action, OPPONENT_ID)

        # flip turn
        actor_turn = not actor_turn

    v_batch = actor.compute_value(r_batch)
    d_batch = actor.compute_advantage(s_batch, r_batch)

    return (s_batch, a_batch, p_batch, d_batch, v_batch)


# Last value-target of every episode played since the previous summary write.
summary_reward_buf:list[float] = []

with writer.as_default():
    for epoch in range(TRAIN_EPOCHS):
        # Per-epoch rollout buffers, refilled from EPISODES_PER_AGENT episodes.
        s_batch:list[env.Observation] = []
        a_batch:list[env.Action] = []
        p_batch:list[npt.NDArray[np.float32]]  = []
        d_batch:list[env.Advantage] = []
        v_batch:list[env.Value] = []

        for _ in range(EPISODES_PER_AGENT):
            s_, a_, p_, d_, v_  = agent(actor)
            # Append this episode's data to the matching epoch buffer.
            s_batch.extend(s_)
            a_batch.extend(a_)
            p_batch.extend(p_)
            d_batch.extend(d_)
            v_batch.extend(v_)

            summary_reward_buf.append(float(v_[-1]))

        # actor.train returns the amount by which the step counter advances.
        step += actor.train(s_batch, a_batch, d_batch, v_batch, p_batch, step)

        if not epoch % SUMMARY_STATS_INTERVAL:
            # Log the mean of the buffered values, then start a new window.
            avg_reward = sum(summary_reward_buf)/len(summary_reward_buf)
            tf.summary.scalar('avg_reward', avg_reward, step=step)
            summary_reward_buf = []

        if not epoch % MODEL_SAVE_INTERVAL:
            # Persist actor and critic parameters next to the summaries.
            actor_path = f"{SUMMARY_DIR}/nn_model_ep_{epoch}_actor.ckpt"
            critic_path = f"{SUMMARY_DIR}/nn_model_ep_{epoch}_critic.ckpt"
            save_path = actor.save(actor_path, critic_path)



2023-02-23 21:33:16.329334: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8600
2023-02-23 21:33:16.825381: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


KeyboardInterrupt: 

In [None]:
# Sanity check: play two fixed moves on a fresh board and inspect the critic.
e = env.Env(DIMS)

# Player 1, then player 2, each play action 3.
for player_id in (1, 2):
    e.step(3, player_id)

# View the position from player 1's side.
o = e.observe(1)
print(e.legal_mask())
env.print_obs(o)
# Last expression of the cell: the critic's output for this single observation.
actor.critic_batch([o])[0]

[ True  True  True  True  True  True  True]
              
              
              
              
      O       
      #       



<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.03745661], dtype=float32)>

In [None]:
# Dump, for every state of the most recent training batch, the value target
# next to the critic's estimate and the actor's action probabilities.
critic_guesses = actor.critic_batch(s_batch)
actor_guesses = actor.predict_batch(s_batch)

# Column-index footer printed under each board. It is derived from
# BOARD_XSIZE so it lists exactly one label per column (0..BOARD_XSIZE-1);
# the previous hard-coded string listed one column too many.
column_footer = ' '.join(str(col) for col in range(BOARD_XSIZE))

for v, obs, critic_guess, actor_guess in zip(v_batch, s_batch, critic_guesses, actor_guesses):
    print("real_value", v)
    print("pred_value", float(critic_guess[0]))
    print("actor_probs", np.array(actor_guess))
    env.print_obs(obs)
    print(column_footer)

real_value 0.0
pred_value 0.004199341870844364
actor_probs [0.14269835 0.1427772  0.14304662 0.14269224 0.14289969 0.14298207
 0.1429038 ]
              
              
              
              
              
              

0 1 2 3 4 5 6 7
real_value 0.0
pred_value -0.00491247558966279
actor_probs [0.14819911 0.14306787 0.14059506 0.14109488 0.14158984 0.14284289
 0.14261028]
              
              
              
              
              
        #   O 

0 1 2 3 4 5 6 7
real_value 0.0
pred_value -0.01695859059691429
actor_probs [0.15275525 0.14252272 0.13701543 0.143052   0.14089507 0.1408881
 0.14287148]
              
              
              
              
O             
#       #   O 

0 1 2 3 4 5 6 7
real_value 0.0
pred_value 0.020738760009407997
actor_probs [0.14546241 0.14796333 0.14648548 0.13906989 0.14352448 0.13984652
 0.1376479 ]
              
              
              
              
O         O   
#       # # O 

0 1 2 3 4 5 6 7
real_value 0.0
pr