In [1]:
from io import TextIOWrapper
import multiprocessing as mp
import numpy as np
import numpy.typing as npt
import logging
import tensorflow as tf
import os
import sys
import importlib

# Board geometry. DIMS is ordered (rows, columns), i.e. (y, x).
BOARD_XSIZE = 7
BOARD_YSIZE = 6
DIMS = (BOARD_YSIZE, BOARD_XSIZE)

# Training schedule.
EPISODES_PER_AGENT = 20       # episodes played per training epoch
TRAIN_EPOCHS = 500000         # iterations of the outer training loop
MODEL_SAVE_INTERVAL = 100     # checkpoints are written every this many epochs
SUMMARY_STATS_INTERVAL = 10   # the avg_reward summary is written every this many epochs
RANDOM_SEED = 42

# Output locations.
SUMMARY_DIR = './summary'
MODEL_DIR = './models'
TRAIN_TRACES = './train/'
TEST_LOG_FOLDER = './test_results/'
LOG_FILE = SUMMARY_DIR + '/log'

# settings
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

# Seed NumPy's global RNG: the rollout code draws its Gumbel noise and the
# random opponent's moves from it, so without this no two runs are comparable.
np.random.seed(RANDOM_SEED)

# create result directory (exist_ok avoids the check-then-create race and
# keeps the cell safe to re-run)
os.makedirs(SUMMARY_DIR, exist_ok=True)


2022-11-13 17:14:04.700229: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-13 17:14:04.817683: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-13 17:14:05.217423: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-13 17:14:05.217499: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
import env
import network

# Re-execute the already-imported project modules so that source edits made
# since the first import take effect without restarting the kernel.
importlib.reload(env)
importlib.reload(network)

<module 'network' from '/home/fidgetsinner/myworkspace/connect4/ppo/network.py'>

In [3]:
def agent(actor:network.PPOAgent) -> tuple[
    list[env.Observation],
    list[env.Action],
    list[npt.NDArray[np.float32]],
    list[env.Advantage],
    list[env.Value]
]:
    """Play one episode against a uniformly random opponent and collect rollout data.

    The actor plays as player 1 and always moves first; the opponent plays as
    player 2 and picks a column uniformly at random. Only the actor's turns
    are recorded.

    Args:
        actor: agent used to obtain action probabilities (``predict_batch``)
            and, once the episode is over, values and advantages
            (``compute_value`` / ``compute_advantage``).

    Returns:
        A tuple ``(s_batch, a_batch, p_batch, d_batch, v_batch)``:
          * ``s_batch`` - observation seen by the actor before each of its moves,
          * ``a_batch`` - action the actor chose for that observation,
          * ``p_batch`` - action probabilities the choice was sampled from,
          * ``d_batch`` - result of ``actor.compute_advantage(s_batch, r_batch)``,
          * ``v_batch`` - result of ``actor.compute_value(r_batch)``.
    """
    ACTOR_ID = np.int8(1)
    OPPONENT_ID = np.int8(2)

    e = env.Env(DIMS)

    s_batch:list[env.Observation] = []
    a_batch:list[env.Action] = []
    p_batch:list[npt.NDArray[np.float32]]  = []
    r_batch:list[env.Reward] = []

    actor_turn = True
    while not e.game_over():
        if actor_turn:
            obs = e.observe(ACTOR_ID)
            action_prob = actor.predict_batch([obs])[0]

            # Gumbel-max trick: argmax(log p + Gumbel noise) is a sample from
            # the categorical distribution described by p.
            noise = np.random.gumbel(size=len(action_prob))
            chosen_action: env.Action = np.argmax(np.log(action_prob) + noise)

            # The post-move observation returned by step() is not needed for
            # training, so it is discarded.
            # NOTE(review): nothing here masks illegal moves (e.g. a full
            # column) - confirm that env.Env.step copes with them.
            reward, _ = e.step(chosen_action, ACTOR_ID)

            s_batch.append(obs)
            a_batch.append(chosen_action)
            r_batch.append(reward)
            p_batch.append(action_prob)
        else:
            # random opponent for now
            # NOTE(review): the result of the opponent's step is dropped, so an
            # opponent win only reaches r_batch if env accounts for it on the
            # actor's side - confirm against env.Env.step.
            opponent_action = np.int8(np.random.randint(0, BOARD_XSIZE))
            e.step(opponent_action, OPPONENT_ID)
        # flip turn
        actor_turn = not actor_turn

    v_batch = actor.compute_value(r_batch)
    d_batch = actor.compute_advantage(s_batch, r_batch)

    return (s_batch, a_batch, p_batch, d_batch, v_batch)


In [4]:
# Seed TensorFlow's global RNG before the networks are built so that TF-side
# randomness (presumably the weight initialisation inside network.PPOAgent)
# is repeatable from run to run.
tf.random.set_seed(RANDOM_SEED)

# TODO: restore neural net parameters
actor = network.PPOAgent(BOARD_XSIZE, BOARD_YSIZE)

# Summary writer; the training loop logs its scalars through it.
writer = tf.summary.create_file_writer(SUMMARY_DIR)

# Running step counter: passed to actor.train(), advanced by its return value,
# and used as the x-axis of the logged summaries.
step = 0

2022-11-13 17:14:05.973674: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2022-11-13 17:14:05.973704: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: susinator
2022-11-13 17:14:05.973709: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: susinator
2022-11-13 17:14:05.973803: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 520.56.6
2022-11-13 17:14:05.973819: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 520.56.6
2022-11-13 17:14:05.973823: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 520.56.6
2022-11-13 17:14:05.974107: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical 

In [5]:
# Last value of every episode played since the previous summary was written.
summary_reward_buf:list[float] = []

with writer.as_default():
    for epoch in range(TRAIN_EPOCHS):
        # Fresh rollout buffers; each one is the concatenation of the matching
        # per-episode list returned by agent().
        s_batch:list[env.Observation] = []
        a_batch:list[env.Action] = []
        p_batch:list[npt.NDArray[np.float32]]  = []
        d_batch:list[env.Advantage] = []
        v_batch:list[env.Value] = []

        for _ in range(EPISODES_PER_AGENT):
            s_, a_, p_, d_, v_  = agent(actor)

            s_batch.extend(s_)
            a_batch.extend(a_)
            p_batch.extend(p_)
            d_batch.extend(d_)
            v_batch.extend(v_)

            # The final entry of the episode's value list feeds the summary.
            summary_reward_buf.append(float(v_[-1]))

        # One training call per epoch; its return value advances the step counter.
        step += actor.train(s_batch, a_batch, d_batch, v_batch, p_batch, step)

        if epoch % SUMMARY_STATS_INTERVAL == 0:
            # Log the mean over everything buffered, then start a new window.
            avg_reward = sum(summary_reward_buf)/len(summary_reward_buf)
            tf.summary.scalar('avg_reward', avg_reward, step=step)
            summary_reward_buf = []

        if epoch % MODEL_SAVE_INTERVAL == 0:
            # Periodic checkpoint of the actor and critic parameters.
            actor_path = f"{SUMMARY_DIR}/nn_model_ep_{epoch}_actor.ckpt"
            critic_path = f"{SUMMARY_DIR}/nn_model_ep_{epoch}_critic.ckpt"
            save_path = actor.save(actor_path, critic_path)



In [None]:
# Show the advantage values left in `d_batch` by the most recent iteration of
# the training loop.
d_batch

[0.039396627200000034,
 0.04924578400000004,
 0.061557230000000046,
 0.07694653750000005,
 0.09618317187500006,
 0.12022896484375006,
 0.15028620605468757,
 0.18785775756835943,
 0.23482219696044926,
 0.29352774620056155,
 0.3669096827507019,
 0.09221519769600006,
 0.11526899712000008,
 0.1440862464000001,
 0.1801078080000001,
 0.2251347600000001,
 0.2814184500000001,
 0.3517730625000001,
 0.4397163281250001,
 0.5496454101562501,
 0.6870567626953126,
 0.8588209533691407,
 1.0735261917114258,
 1.3419077396392822,
 0.039505056000000024,
 0.04938132000000003,
 0.06172665000000003,
 0.07715831250000003,
 0.09644789062500003,
 0.12055986328125004,
 0.15069982910156254,
 0.18837478637695315,
 0.2354684829711914,
 0.29433560371398926,
 0.3679195046424866,
 0.04318856500000002,
 0.05398570625000002,
 0.06748213281250003,
 0.08435266601562502,
 0.10544083251953128,
 0.1318010406494141,
 0.1647513008117676,
 0.20593912601470948,
 0.25742390751838684,
 0.31639062500000004,
 0.39548828125,
 0.4943

In [None]:
# Hand-built position: on a fresh board, player 2 drops a piece into
# columns 1, 2 and 4.
e = env.Env(DIMS)
for column in (1, 2, 4):
    e.step(column, 2)
# Extra moves kept around for experimentation:
#   e.step(3, 1)
#   e.step(3, 2)

# Observe the board as player 1, print it, and display the critic's output
# for that single observation.
o = e.observe(1)
env.print_obs(o)
actor.critic_batch([o])[0]

              
              
              
              
              
  O O   O     



<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.11904492], dtype=float32)>

In [None]:
# Pair each entry of `v_batch` with the observation at the same index in
# `s_batch` so they can be eyeballed side by side.
list(zip(v_batch, s_batch))

[(0.0,
  Observation(board=array([[0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], dtype=int8))),
 (0.0,
  Observation(board=array([[0, 0, 0, 1, 0, 2, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], dtype=int8))),
 (0.0,
  Observation(board=array([[1, 0, 0, 1, 0, 2, 0],
         [0, 0, 0, 0, 0, 2, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], dtype=int8))),
 (0.0,
  Observation(board=array([[1, 0, 0, 1, 1, 2, 0],
         [0, 0, 0, 0, 0, 2, 0],
         [0, 0, 0, 0, 0, 2, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]], dtype=int8))),
 (0.0,
  Observation(board=array([[1, 0, 0, 1, 1, 2, 1],
         [0