In [6]:
from io import TextIOWrapper
import multiprocessing as mp
import numpy as np
import numpy.typing as npt
import logging
import tensorflow as tf
import os
import sys
import importlib

EPISODES_PER_AGENT = 20
TRAIN_EPOCHS = 500000
MODEL_SAVE_INTERVAL = 100
SUMMARY_STATS_INTERVAL = 10
RANDOM_SEED = 42

SUMMARY_DIR = './summary'
MODEL_DIR = './models'
TRAIN_TRACES = './train/'
TEST_LOG_FOLDER = './test_results/'
LOG_FILE = SUMMARY_DIR + '/log'

# settings
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# create result directory
if not os.path.exists(SUMMARY_DIR):
    os.makedirs(SUMMARY_DIR)


In [7]:
import env
import network

importlib.reload(env)
importlib.reload(network)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# TODO: restore neural net parameters
actor = network.PPOAgent()

# Get Writer
writer = tf.summary.create_file_writer(SUMMARY_DIR);

step=0

In [9]:

def agent(actor:network.PPOAgent) -> tuple[
    list[env.Observation],
    list[env.Action],
    list[npt.NDArray[np.float32]],
    list[env.Advantage],
    list[env.Value]
]:
    e = env.Env()

    ACTOR_ID = np.int8(1)
    OPPONENT_ID = np.int8(2)

    s_batch:list[env.Observation] = []
    s_prime_batch:list[env.Observation] = []
    a_batch:list[env.Action] = []
    p_batch:list[npt.NDArray[np.float32]]  = []
    r_batch:list[env.Reward] = []
    actor_turn = True
    while not e.game_over():
        if actor_turn:
            obs = e.observe(ACTOR_ID)

            action_prob = actor.predict_batch([obs])[0]

            noise = 0.01*np.random.gumbel(size=len(action_prob))
            adjusted_probs = (np.log(action_prob) + noise)*e.legal_mask(ACTOR_ID)
            chosen_action: env.Action = np.argmax(adjusted_probs)

            s_batch.append(obs)

            reward,obs_prime = e.step(chosen_action, ACTOR_ID)
            
            s_prime_batch.append(obs_prime)
            a_batch.append(chosen_action)
            r_batch.append(reward)
            p_batch.append(action_prob)
        else:
            # random opponent for now
            opponent_probs = 0.01*np.random.gumbel(size=9)*e.legal_mask(OPPONENT_ID)
            opponent_action = opponent_probs.argmax()
            e.step(opponent_action, OPPONENT_ID)
        # flip turn
        actor_turn = not actor_turn

    v_batch = actor.compute_value(r_batch)
    d_batch = actor.compute_advantage(s_batch, r_batch)

    return (s_batch, a_batch, p_batch, d_batch, v_batch)

summary_reward_buf:list[float] = []

with writer.as_default():
    for epoch in range(TRAIN_EPOCHS):
        s_batch:list[env.Observation] = []
        a_batch:list[env.Action] = []
        p_batch:list[npt.NDArray[np.float32]]  = []
        d_batch:list[env.Advantage] = []
        v_batch:list[env.Value] = []
        for _ in range(EPISODES_PER_AGENT):
            s_, a_, p_, d_, v_  = agent(actor)
            s_batch += s_
            a_batch += a_
            p_batch += p_
            d_batch += d_
            v_batch += v_

            summary_reward_buf.append(float(v_[-1]))

        step += actor.train(s_batch, a_batch, d_batch, v_batch, p_batch, step)

        if epoch % SUMMARY_STATS_INTERVAL == 0:
            avg_reward = sum(summary_reward_buf)/len(summary_reward_buf)
            tf.summary.scalar('avg_reward', avg_reward, step=step)
            # clear
            summary_reward_buf = []

        if epoch % MODEL_SAVE_INTERVAL == 0:
            # Save the neural net parameters to disk.
            actor_path = f"{SUMMARY_DIR}/nn_model_ep_{epoch}_actor.ckpt"
            critic_path = f"{SUMMARY_DIR}/nn_model_ep_{epoch}_critic.ckpt"
            save_path = actor.save(actor_path, critic_path)


KeyboardInterrupt: 

In [10]:
p_batch

[<tf.Tensor: shape=(9,), dtype=float32, numpy=
 array([0.11008313, 0.11587206, 0.11087313, 0.11072966, 0.11040531,
        0.11049209, 0.11054732, 0.11049438, 0.11050291], dtype=float32)>,
 <tf.Tensor: shape=(9,), dtype=float32, numpy=
 array([0.10992572, 0.1174943 , 0.11083517, 0.10942519, 0.11042286,
        0.11086432, 0.11032386, 0.11048271, 0.11022585], dtype=float32)>,
 <tf.Tensor: shape=(9,), dtype=float32, numpy=
 array([0.11065196, 0.11321422, 0.11077045, 0.1116381 , 0.11105046,
        0.11052281, 0.11080979, 0.11030626, 0.11103599], dtype=float32)>,
 <tf.Tensor: shape=(9,), dtype=float32, numpy=
 array([0.1082436 , 0.11383949, 0.11027901, 0.11178917, 0.11304598,
        0.11174852, 0.11250384, 0.10932403, 0.10922636], dtype=float32)>,
 <tf.Tensor: shape=(9,), dtype=float32, numpy=
 array([0.10921537, 0.11680456, 0.11177598, 0.10946206, 0.11151192,
        0.10942218, 0.11147845, 0.11040717, 0.1099223 ], dtype=float32)>,
 <tf.Tensor: shape=(9,), dtype=float32, numpy=
 array([

In [None]:
e = env.Env()
e.step(0, 2)
e.step(2, 2)
e.step(3, 2)
e.step(8, 1)
#e.step(3, 1)
#e.step(3, 2)
o = e.observe(2)
env.print_obs(o)
#actor.critic_batch([o])[0]

X   X 
X     
    O 



In [None]:
list(zip(v_batch, s_batch))

[(0.0,
  Observation(board=array([[0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], dtype=int8))),
 (0.0,
  Observation(board=array([[0, 0, 0, 1, 0, 2, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], dtype=int8))),
 (0.0,
  Observation(board=array([[1, 0, 0, 1, 0, 2, 0],
         [0, 0, 0, 0, 0, 2, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], dtype=int8))),
 (0.0,
  Observation(board=array([[1, 0, 0, 1, 1, 2, 0],
         [0, 0, 0, 0, 0, 2, 0],
         [0, 0, 0, 0, 0, 2, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], dtype=int8))),
 (0.0,
  Observation(board=array([[1, 0, 0, 1, 1, 2, 1],
         [0