In [1]:
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
from dataclasses import dataclass
import concurrent.futures
from collections import defaultdict
import typing
from torch import optim
import torch
import os
import random
import copy


%load_ext autoreload
%autoreload 2
import env
import network
import player
import visualize

BOARD_XSIZE = env.BOARD_XSIZE
BOARD_YSIZE = env.BOARD_YSIZE

DIMS=(BOARD_XSIZE,BOARD_YSIZE)

RANDOM_SEED = 42

SUMMARY_DIR = './summary'

# create result directory
if not os.path.exists(SUMMARY_DIR):
    os.makedirs(SUMMARY_DIR)

use_cuda = torch.cuda.is_available()
torch.manual_seed(RANDOM_SEED)

cuda = torch.device("cuda")
cpu = torch.device("cpu")

if use_cuda:
    device = cuda
else:
    device = cpu

2023-06-01 22:10:25.799695: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# how we saved the  models:
#         # Save the neural net parameters to disk.
#         if impostor_step % MODEL_SAVE_INTERVAL == 0:
#             torch.save(impostor_actor.state_dict(), f"{SUMMARY_DIR}/impostor_model_ep_{impostor_step}_actor.ckpt")
#             torch.save(impostor_critic.state_dict(), f"{SUMMARY_DIR}/impostor_model_ep_{impostor_step}_critic.ckpt")
# 
#         # Save the neural net parameters to disk.
#         if crewmate_step % MODEL_SAVE_INTERVAL == 0:
#             torch.save(crewmate_actor.state_dict(), f"{SUMMARY_DIR}/crewmate_model_ep_{crewmate_step}_actor.ckpt")
#             torch.save(crewmate_critic.state_dict(), f"{SUMMARY_DIR}/crewmate_model_ep_{crewmate_step}_critic.ckpt")
        
CREWMATE_STEP = 1000
IMPOSTOR_STEP = 1000

# load models
impostor_actor = network.Actor().to(device)
impostor_actor.load_state_dict(torch.load(f"{SUMMARY_DIR}/impostor_model_ep_{IMPOSTOR_STEP}_actor.ckpt"))
impostor_actor.eval()

impostor_critic = network.Critic().to(device)
impostor_critic.load_state_dict(torch.load(f"{SUMMARY_DIR}/impostor_model_ep_{IMPOSTOR_STEP}_critic.ckpt"))
impostor_critic.eval()

crewmate_actor = network.Actor().to(device)
crewmate_actor.load_state_dict(torch.load(f"{SUMMARY_DIR}/crewmate_model_ep_{CREWMATE_STEP}_actor.ckpt"))
crewmate_actor.eval()

crewmate_critic = network.Critic().to(device)
crewmate_critic.load_state_dict(torch.load(f"{SUMMARY_DIR}/crewmate_model_ep_{CREWMATE_STEP}_critic.ckpt"))
crewmate_critic.eval()

Critic(
  (conv1): Conv2d(5, 100, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (fc1): Linear(in_features=4901, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=1, bias=True)
)

In [3]:
# create nn player
nn_player = player.ActorPlayer(
    impostor_actor,
    impostor_critic,
    IMPOSTOR_STEP,
    crewmate_actor,
    crewmate_critic,
    CREWMATE_STEP,
)

In [4]:
def random_valid_location() -> tuple[int, int]:
    x = np.random.randint(0, BOARD_XSIZE)
    y = np.random.randint(0, BOARD_YSIZE)
    return (x, y)

def play_benchmark(
    actor_engine: player.Player,
    actor_is_impostor: bool,
    other_engines: list[player.Player],
) ->     list[float]:
    # create environment
    initial_state = env.State(
        {},
        np.zeros((BOARD_XSIZE, BOARD_YSIZE), dtype=np.int8),
        np.zeros((BOARD_XSIZE, BOARD_YSIZE), dtype=np.int8),
    )

    # randomize task location
    for _ in range(10):
        location = random_valid_location()
        initial_state.tasks[location] += 3

    # create actor player at random location
    actor_state = env.PlayerState(random_valid_location(), actor_is_impostor)
    # create other players at random locations
    other_state = [
        env.PlayerState(random_valid_location(), False) for _ in other_engines
    ]

    # set the players in the environment
    initial_state.players = {str(i): s for i, s in enumerate(other_state)}
    initial_state.players["actor"] = actor_state

    # set the player data
    agent_engines = {str(i): e for i, e in enumerate(other_engines)}
    agent_engines["actor"] = actor_engine

    impostor = (
        str(np.random.randint(0, len(other_engines))) if actor_is_impostor else "actor"
    )

    e = env.AmogusEnv(initial_state)

    r_t: list[float] = []
    # play the game
    last_obs = e.reset()

    done = False
    while not done:
        # gather actions
        actions = {}
        for agent, agent_engine in agent_engines.items():
            chosen_action = agent_engine.play(agent == impostor, last_obs[agent])
            actions[agent] = chosen_action
        
        # step
        last_obs, rewards, terminateds, truncateds, _ = e.step(actions)

        # add rewards
        r_t += [rewards["actor"]]

        for agent in last_obs.keys():
            if terminateds[agent] or truncateds[agent]:
                del agent_engines[agent]
                # if the actor we're gathering data for is dead, then we need to stop
                if agent == "actor":
                    done = True
    return r_t

In [5]:
SAMPLES = 50

nn_crewmate_vs_random_impostor = []
random_crewmate_vs_random_impostor = []
greedy_crewmate_vs_random_impostor = []

# run a couple simulations to find how well the learned policy does against a random impostor
for _ in range(SAMPLES):
    r_t = play_benchmark(nn_player, False, [player.RandomPlayer()]*3)
    nn_crewmate_vs_random_impostor.append(np.sum(r_t))

# run a couple simulations to find how well the random policy does against a random impostor
for _ in range(SAMPLES):
    r_t = play_benchmark(player.RandomPlayer(), False, [player.RandomPlayer()]*3)
    random_crewmate_vs_random_impostor.append(np.sum(r_t))

# run a couple simulations to find how well the greedy policy does against a random impostor
for _ in range(SAMPLES):
    r_t = play_benchmark(player.GreedyPlayer(), False, [player.RandomPlayer()]*3)
    greedy_crewmate_vs_random_impostor.append(np.sum(r_t))


nn_impostor_vs_random_crewmate = []
random_impostor_vs_random_crewmate = []
greedy_impostor_vs_random_crewmate = []

# run a couple simulations to find how well the learned policy does against a random crewmate
for _ in range(SAMPLES):
    r_t = play_benchmark(nn_player, True, [player.RandomPlayer()]*3)
    nn_impostor_vs_random_crewmate.append(np.sum(r_t))

# run a couple simulations to find how well the random policy does against a random crewmate
for _ in range(SAMPLES):
    r_t = play_benchmark(player.RandomPlayer(), True, [player.RandomPlayer()]*3)
    random_impostor_vs_random_crewmate.append(np.sum(r_t))

# run a couple simulations to find how well the greedy policy does against a random crewmate
for _ in range(SAMPLES):
    r_t = play_benchmark(player.GreedyPlayer(), True, [player.RandomPlayer()]*3)
    greedy_impostor_vs_random_crewmate.append(np.sum(r_t))

In [6]:
print("nn_crewmate_vs_random_impostor", np.mean(nn_crewmate_vs_random_impostor))
print("random_crewmate_vs_random_impostor", np.mean(random_crewmate_vs_random_impostor))
print("engineered_crewmate_vs_random_impostor", np.mean(greedy_crewmate_vs_random_impostor))
print()
print("nn_impostor_vs_random_crewmate", np.mean(nn_impostor_vs_random_crewmate))
print("random_impostor_vs_random_crewmate", np.mean(random_impostor_vs_random_crewmate))
print("engineered_impostor_vs_random_crewmate", np.mean(greedy_impostor_vs_random_crewmate))

nn_crewmate_vs_random_impostor 2.05
random_crewmate_vs_random_impostor 1.73
engineered_crewmate_vs_random_impostor 0.28

nn_impostor_vs_random_crewmate 1.02
random_impostor_vs_random_crewmate 0.92
engineered_impostor_vs_random_crewmate 0.64


In [8]:
print("nn_policy_vs_random_policy (crewmate):" , np.mean(nn_crewmate_vs_random_impostor)/np.mean(random_crewmate_vs_random_impostor))
print("nn_policy_vs_engineered_policy (crewmate):" , np.mean(nn_crewmate_vs_random_impostor)/np.mean(greedy_crewmate_vs_random_impostor))
print()
print("nn_policy_vs_random_policy (impostor):" , np.mean(nn_impostor_vs_random_crewmate)/np.mean(random_impostor_vs_random_crewmate))
print("nn_policy_vs_engineered_policy (impostor):" , np.mean(nn_impostor_vs_random_crewmate)/np.mean(greedy_impostor_vs_random_crewmate))

nn_policy_vs_random_policy (crewmate): 1.1849710982658959
nn_policy_vs_engineered_policy (crewmate): 7.32142857142857

nn_policy_vs_random_policy (impostor): 1.108695652173913
nn_policy_vs_engineered_policy (impostor): 1.59375
