In [1]:
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
import concurrent.futures
from torch import optim
import torch
import os
import copy


%load_ext autoreload
%autoreload 2
import env
import network
import player


# Board dimensions are read from the environment module so both stay in sync.
BOARD_XSIZE, BOARD_YSIZE = env.BOARD_XSIZE, env.BOARD_YSIZE
DIMS = (BOARD_XSIZE, BOARD_YSIZE)

# --- Training schedule -------------------------------------------------------
TRAIN_EPOCHS = 500000           # iterations of the outer training loop
EPISODES_PER_AGENT = 100        # games submitted per epoch (roles alternate game by game)
SUMMARY_STATS_INTERVAL = 10     # reward averages are logged when step is a multiple of this
MODEL_SAVE_INTERVAL = 100       # checkpoints are written when step is a multiple of this
MAKE_OPPONENT_INTERVAL = 1000   # a frozen copy of the networks is pooled when step is a multiple of this
RANDOM_SEED = 42

# --- Output locations --------------------------------------------------------
SUMMARY_DIR = './summary'       # TensorBoard log dir; checkpoints are also written here
MODEL_DIR = './models'          # NOTE(review): not referenced by any visible cell

# Create the result directory. `exist_ok=True` removes the check-then-create
# race of `os.path.exists` + `os.makedirs` and keeps the cell safe to re-run.
os.makedirs(SUMMARY_DIR, exist_ok=True)

use_cuda = torch.cuda.is_available()

# Seed every global RNG the notebook draws from. torch alone is not enough:
# player placement and the impostor draw use NumPy's global generator.
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

cuda = torch.device("cuda")
cpu = torch.device("cpu")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = cuda if use_cuda else cpu

In [2]:
# TODO: restore neural net parameters


def _make_agent():
    """Build one actor/critic pair on `device` plus an Adam optimizer for each.

    Returns:
        (actor, critic, actor_optimizer, critic_optimizer)
    """
    actor_net = network.Actor().to(device)
    critic_net = network.Critic().to(device)
    actor_opt = optim.Adam(actor_net.parameters(), lr=network.ACTOR_LR)
    critic_opt = optim.Adam(critic_net.parameters(), lr=network.CRITIC_LR)
    return actor_net, critic_net, actor_opt, critic_opt


# The impostor networks are constructed before the crewmate networks.
(impostor_actor,
 impostor_critic,
 impostor_actor_optimizer,
 impostor_critic_optimizer) = _make_agent()

(crewmate_actor,
 crewmate_critic,
 crewmate_actor_optimizer,
 crewmate_critic_optimizer) = _make_agent()

# TensorBoard writer for training statistics.
writer = SummaryWriter(log_dir=SUMMARY_DIR)

# One training-step counter per role.
impostor_step = 0
crewmate_step = 0

2023-03-16 00:56:29.607022: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-16 00:56:29.935301: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-16 00:56:30.881474: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-16 00:56:30.881754: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [3]:
# Summed episode rewards, collected separately for each role.
crewmate_reward_buf: list[float] = list()
impostor_reward_buf: list[float] = list()

# Player pools, one per role; each starts with a single random-policy player.
crewmate_pool: list[player.Player] = [player.RandomPlayer()]
impostor_pool: list[player.Player] = [player.RandomPlayer()]

AttributeError: module 'env' has no attribute 'PLAYER2'

In [None]:

def random_valid_location() -> tuple[int, int]:
    """Draw a uniformly random in-bounds board coordinate.

    Returns:
        (x, y) with 0 <= x < BOARD_XSIZE and 0 <= y < BOARD_YSIZE, drawn from
        NumPy's global generator (x first, then y).
    """
    col = np.random.randint(low=0, high=BOARD_XSIZE)
    row = np.random.randint(low=0, high=BOARD_YSIZE)
    return (col, row)


def play(actor: player.ActorPlayer, actor_is_impostor: bool, others: list[player.Player]) -> tuple[
    list[env.Observation],
    list[env.Action],
    list[np.ndarray],
    list[env.Reward],
    list[env.Advantage],
    list[env.Reward],
    bool
]:
    """Play one episode and return the learning actor's trajectory.

    The actor is seated first (player id 0) and `others` fill the following
    seats. Every player starts at a random board location.

    Args:
        actor: the player whose observations, actions, action probabilities
            and rewards are recorded on each of its turns.
        actor_is_impostor: whether the actor's seat is the impostor. When
            False, one of the other seats is chosen at random to be the
            impostor.
        others: the remaining players; each acts through its own `play`.

    Returns:
        `(s_t, a_t, p_t, r_t, d_t, v_t, actor_is_impostor)`: the per-turn
        observations, chosen actions, action probabilities and rewards of the
        actor, followed by `network.compute_advantage(actor.critic, s_t, r_t)`,
        `network.compute_value(r_t)`, and the role flag passed in.

    Raises:
        ValueError: if the actor is a crewmate and `others` is empty, since
            there is then no seat that could hold the impostor.
    """
    if not actor_is_impostor and not others:
        raise ValueError(
            "a crewmate actor needs at least one other player to be the impostor")

    e = env.Env()

    # create the players at random locations on the board.
    # NOTE(review): the third PlayerState argument is False for the actor and
    # True for the others; its meaning is not visible from this cell.
    e.state.players = [env.PlayerState(
        random_valid_location(), actor_is_impostor, False)]
    e.state.players += [env.PlayerState(random_valid_location(), False, True)
                        for _ in others]
    # If the actor is not an impostor, then the impostor is randomly chosen from the others.
    if not actor_is_impostor:
        e.state.players[np.random.randint(
            1, len(e.state.players))].impostor = True

    participants = [actor] + others

    s_t: list[env.Observation] = []
    a_t: list[env.Action] = []
    p_t: list[np.ndarray] = []
    r_t: list[env.Reward] = []

    # play the game
    actor_out = False
    while not (actor_out or e.game_over()):
        # `participant` (not `player`) so the imported `player` module is not shadowed.
        for seat, participant in enumerate(participants):
            if participant == actor:
                if e.game_over_for(env.Player(seat)):
                    # Nothing more can be recorded once the actor is out, so
                    # end the episode instead of stepping the environment
                    # round after round with nobody acting.
                    actor_out = True
                    break
                obs, action_probs, chosen_action, reward = actor.play(env.Player(seat), e)
                s_t.append(obs)
                p_t.append(action_probs)
                a_t.append(chosen_action)
                r_t.append(reward)
            else:
                if e.game_over_for(env.Player(seat)):
                    continue
                participant.play(env.Player(seat), e)
        e.step()

    # compute advantage and value
    d_t = network.compute_advantage(actor.critic, s_t, r_t)
    v_t = network.compute_value(r_t)

    return s_t, a_t, p_t, r_t, d_t, v_t, actor_is_impostor


In [None]:
OPPONENTS_PER_GAME = 3     # random-policy players seated next to the learning actor
MIN_REWARD_SAMPLES = 50    # episode returns required before an average is logged


def _train_and_log(role, actor, critic, actor_optimizer, critic_optimizer,
                   batch, reward_buf, snapshot_pool, step):
    """Run one PPO update for a role and do its per-step bookkeeping.

    Args:
        role: 'crewmate' or 'impostor'; used in TensorBoard tags and
            checkpoint file names so the two roles do not overwrite each other.
        actor, critic: the networks being trained.
        actor_optimizer, critic_optimizer: their optimizers.
        batch: five lists in the order expected by `network.train_ppo`
            (observations, actions, action probabilities, advantages, values).
        reward_buf: summed episode rewards for this role; cleared after each
            logged average.
        snapshot_pool: list that receives frozen copies of the networks
            wrapped in `player.ActorPlayer`.
        step: this role's step counter before the update.

    Returns:
        The step counter after the update (one increment per loss pair
        returned by `network.train_ppo`).
    """
    if not batch[0]:
        # No transitions were collected for this role in this epoch.
        return step

    actor_losses, critic_losses = network.train_ppo(
        actor, critic, actor_optimizer, critic_optimizer, *batch)

    for actor_loss, critic_loss in zip(actor_losses, critic_losses):
        writer.add_scalar(f'{role}_actor_loss', actor_loss, step)
        writer.add_scalar(f'{role}_critic_loss', critic_loss, step)

        if step % SUMMARY_STATS_INTERVAL == 0 and len(reward_buf) > MIN_REWARD_SAMPLES:
            writer.add_scalar(f'{role}_reward', np.array(reward_buf).mean(), step)
            reward_buf.clear()

        if step % MAKE_OPPONENT_INTERVAL == 0:
            # Freeze a copy of the current networks as a future opponent.
            # NOTE(review): the pools are filled here but games below still
            # seat only RandomPlayer opponents.
            frozen_actor = copy.deepcopy(actor)
            frozen_actor.eval()
            frozen_actor.to(device)
            frozen_critic = copy.deepcopy(critic)
            frozen_critic.eval()
            frozen_critic.to(device)
            snapshot_pool.append(player.ActorPlayer(frozen_actor, frozen_critic, step))

        if step % MODEL_SAVE_INTERVAL == 0:
            # Save the neural net parameters to disk.
            torch.save(actor.state_dict(), f"{SUMMARY_DIR}/nn_model_ep_{step}_{role}_actor.ckpt")
            torch.save(critic.state_dict(), f"{SUMMARY_DIR}/nn_model_ep_{step}_{role}_critic.ckpt")

        step += 1

    return step


with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for _epoch in range(TRAIN_EPOCHS):
        # Per-role minibatches. Column order: observations, actions,
        # action probabilities, advantages, values.
        crewmate_batch = ([], [], [], [], [])
        impostor_batch = ([], [], [], [], [])

        # create actor players, each tagged with its own role's step counter
        crewmate_nn_player = player.ActorPlayer(crewmate_actor, crewmate_critic, crewmate_step)
        impostor_nn_player = player.ActorPlayer(impostor_actor, impostor_critic, impostor_step)

        futures = []
        for i in range(EPISODES_PER_AGENT):
            # Alternate roles: even episodes train the impostor, odd ones the crewmate.
            is_impostor = (i % 2) == 0
            actor_player = impostor_nn_player if is_impostor else crewmate_nn_player
            others = [player.RandomPlayer() for _ in range(OPPONENTS_PER_GAME)]

            # play the game
            futures.append(executor.submit(play, actor_player, is_impostor, others))

        for future in concurrent.futures.as_completed(futures):
            s_t, a_t, p_t, r_t, d_t, v_t, was_impostor = future.result()

            # now update the minibatch of the role that played this episode
            batch = impostor_batch if was_impostor else crewmate_batch
            for column, values in zip(batch, (s_t, a_t, p_t, d_t, v_t)):
                column.extend(values)

            # statistics
            reward_buf = impostor_reward_buf if was_impostor else crewmate_reward_buf
            reward_buf.append(np.sum(r_t))

        crewmate_step = _train_and_log(
            'crewmate',
            crewmate_actor,
            crewmate_critic,
            crewmate_actor_optimizer,
            crewmate_critic_optimizer,
            crewmate_batch,
            crewmate_reward_buf,
            crewmate_pool,
            crewmate_step,
        )

        impostor_step = _train_and_log(
            'impostor',
            impostor_actor,
            impostor_critic,
            impostor_actor_optimizer,
            impostor_critic_optimizer,
            impostor_batch,
            impostor_reward_buf,
            impostor_pool,
            impostor_step,
        )