In [1]:
from copy import deepcopy
import os
import ray
from ray import tune
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.env import PettingZooEnv
from rlskyjo.environment import simple_skyjo_env
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env
from gym.spaces import Box
from ray.rllib.agents.dqn.dqn_torch_model import DQNTorchModel
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.torch_utils import FLOAT_MAX
from supersuit.multiagent_wrappers import pad_action_space_v0

torch, nn = try_import_torch()


In [2]:
ray.init(num_cpus=2)

{'node_ip_address': '172.30.80.93',
 'raylet_ip_address': '172.30.80.93',
 'redis_address': '172.30.80.93:6379',
 'object_store_address': '/tmp/ray/session_2022-01-26_15-40-00_717476_11940/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-01-26_15-40-00_717476_11940/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-01-26_15-40-00_717476_11940',
 'metrics_export_port': 61266,
 'node_id': '30c940d017b6306028bb97f1373daae1667d052aab24ba1b8eb22356'}

In [5]:
class TorchMaskedActions(DQNTorchModel):
    """PyTorch version of above ParametricActionsModel."""

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 **kw):
        DQNTorchModel.__init__(self, obs_space, action_space, num_outputs,
                               model_config, name, **kw)

        obs_len = obs_space.shape[0]-action_space.n

        orig_obs_space = Box(shape=(obs_len,), low=obs_space.low[:obs_len], high=obs_space.high[:obs_len])
        self.action_embed_model = TorchFC(orig_obs_space, action_space, action_space.n, model_config, name + "_action_embed")

    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions tensor from the observation.
        print("input_dict",input_dict)
        action_mask = input_dict["obs"]["action_mask"]

        # Compute the predicted action embedding
        action_logits, _ = self.action_embed_model({
            "obs": input_dict["obs"]['observation']
        })
        # turns probit action mask into logit action mask
        inf_mask = torch.clamp(torch.log(action_mask), -1e10, FLOAT_MAX)

        return action_logits + inf_mask, state

    def value_function(self):
        return self.action_embed_model.value_function()

In [6]:
if __name__ == "__main__":
    alg_name = "DQN"
    env_name  = "pettingzoo_skyjo"
    ModelCatalog.register_custom_model(
        "pa_model", TorchMaskedActions
    )
    # function that outputs the environment you wish to register.

    def env_creator():
        env = simple_skyjo_env.env(**{"name":env_name, "num_players": 2})
        return env


    config = deepcopy(get_agent_class(alg_name)._default_config)

    register_env(env_name,
                 lambda config: PettingZooEnv(env_creator()))

    test_env = PettingZooEnv(env_creator())
    obs_space = test_env.observation_space
    print("obs_space", obs_space)
    act_space = test_env.action_space
    print("act_space", act_space)

    config["multiagent"] = {
        "policies": {
            "draw": (None, obs_space, act_space, {}),
            "place": (None, obs_space, act_space, {}),
        },
        "policy_mapping_fn": lambda agent_id: agent_id.split("_")[0]
    }

    config["num_gpus"] = int(os.environ.get("RLLIB_NUM_GPUS", "0"))
    config["log_level"] = "DEBUG"
    config["num_workers"] = 1
    config["rollout_fragment_length"] = 30
    config["train_batch_size"] = 200
    config["horizon"] = 200
    config["no_done_at_end"] = False
    config["framework"] = "torch"
    config["model"] = {
        "custom_model": "pa_model",
    }
    config['n_step'] = 1

    config["exploration_config"] = {
        # The Exploration class to use.
        "type": "EpsilonGreedy",
        # Config for the Exploration class' constructor:
        "initial_epsilon": 0.1,
        "final_epsilon": 0.0,
        "epsilon_timesteps": 100000,  # Timesteps over which to anneal epsilon.
    }
    config['hiddens'] = []
    config['dueling'] = False
    config['env'] = env_name

    

    tune.run(
        alg_name,
        name="DQN",
        stop={"timesteps_total": 10000000},
        checkpoint_freq=10,
        config=config
        )

observe agent draw_player_1
obs_space Dict(action_mask:Box([0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1], (14,), int8), observation:Box([-16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16], [16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
 16 16 16 16 16 16 16], (31,), int8))
act_space Discrete(14)




Trial name,status,loc
DQN_pettingzoo_skyjo_672af_00000,PENDING,


[2m[36m(DQN pid=12158)[0m 2022-01-26 15:44:04,087	INFO dqn.py:141 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.


[2m[36m(RolloutWorker pid=12186)[0m observe agent draw_player_1


[2m[36m(RolloutWorker pid=12186)[0m 2022-01-26 15:44:09,520	INFO rollout_worker.py:1705 -- Validating sub-env at vector index=0 ... (ok)


[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(32: ['obs', 'new_obs', 'actions', 'prev_actions', 'rewards', 'prev_rewards', 'dones', 'infos', 'eps_id', 'unroll_id', 'agent_index', 't', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m {'obs': OrderedDict([('action_mask', tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(RolloutWorker pid=12186)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(RolloutWorker pid=12186)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(RolloutWorker pid=12186)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(RolloutWorker pid=12186)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(RolloutWorker pid=12186)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(RolloutWorker pid=12186)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(RolloutWorker pi

[2m[36m(RolloutWorker pid=12186)[0m 2022-01-26 15:44:09,715	DEBUG rollout_worker.py:1534 -- Creating policy for draw
[2m[36m(RolloutWorker pid=12186)[0m 2022-01-26 15:44:09,715	DEBUG preprocessors.py:262 -- Creating sub-preprocessor for Box([0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1], (14,), int8)
[2m[36m(RolloutWorker pid=12186)[0m 2022-01-26 15:44:09,716	DEBUG preprocessors.py:262 -- Creating sub-preprocessor for Box([-16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
[2m[36m(RolloutWorker pid=12186)[0m  -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16], [16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
[2m[36m(RolloutWorker pid=12186)[0m  16 16 16 16 16 16 16], (31,), int8)
[2m[36m(RolloutWorker pid=12186)[0m 2022-01-26 15:44:09,717	DEBUG catalog.py:706 -- Created preprocessor <ray.rllib.models.preprocessors.DictFlatteningPreprocessor object at 0x7fafa8c599a0>: Dict(action_mask:Box([0 0 0 0 0 0 0 0 0 0

Trial name,status,loc
DQN_pettingzoo_skyjo_672af_00000,RUNNING,172.30.80.93:12158


[2m[36m(DQN pid=12158)[0m SampleBatch(32: ['obs', 'new_obs', 'actions', 'prev_actions', 'rewards', 'prev_rewards', 'dones', 'infos', 'eps_id', 'unroll_id', 'agent_index', 't', 'obs_flat'])
[2m[36m(DQN pid=12158)[0m {'obs': OrderedDict([('action_mask', tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[2m[36m(D

[2m[36m(DQN pid=12158)[0m 2022-01-26 15:44:09,770	INFO worker_set.py:104 -- Inferred observation/action spaces from remote worker (local worker has no env): {'draw': (Dict(action_mask:Box([0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1], (14,), int8), observation:Box([-16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
[2m[36m(DQN pid=12158)[0m  -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16], [16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
[2m[36m(DQN pid=12158)[0m  16 16 16 16 16 16 16], (31,), int8)), Discrete(14)), 'place': (Dict(action_mask:Box([0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1], (14,), int8), observation:Box([-16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16
[2m[36m(DQN pid=12158)[0m  -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16 -16], [16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
[2m[36m(DQN pid=12158)[0m  16 16 16 16 16 16 16]


[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
[2m[36m(DQN pid=12158)[0m          0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
[2m[36m(DQN pid=12158)[0m          0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
[2m[36m(DQN pid=12158)[0m          0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
[2m[36m(DQN pid=12158)[0m          0., 0., 0., 0., 0., 0., 0.],
[2m[36m(DQN pid=12158)[0m         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
[2m[36m(DQN pid=12158)[0m          0., 0., 0., 0., 0., 0.,

[2m[36m(DQN pid=12158)[0m 2022-01-26 15:44:10,052	INFO replay_buffer.py:48 -- Estimated max memory usage for replay buffer is 0.02065 GB (50000.0 batches of size 1, 413 bytes each), available system memory is 20.649459712 GB
[2m[36m(RolloutWorker pid=12186)[0m 2022-01-26 15:44:10,037	INFO simple_list_collector.py:781 -- Trajectory fragment after postprocess_trajectory():
[2m[36m(RolloutWorker pid=12186)[0m 
[2m[36m(RolloutWorker pid=12186)[0m { 'draw_player_0': { 'actions': np.ndarray((7,), dtype=int32, min=12.0, max=13.0, mean=12.429),
[2m[36m(RolloutWorker pid=12186)[0m                      'agent_index': np.ndarray((7,), dtype=int64, min=2.0, max=2.0, mean=2.0),
[2m[36m(RolloutWorker pid=12186)[0m                      'dones': np.ndarray((7,), dtype=bool, min=0.0, max=0.0, mean=0.0),
[2m[36m(RolloutWorker pid=12186)[0m                      'eps_id': np.ndarray((7,), dtype=int64, min=825675331.0, max=825675331.0, mean=825675331.0),
[2m[36m(RolloutWorker pid=121

[2m[36m(RolloutWorker pid=12186)[0m observe agent draw_player_0
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent place_player_0
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent draw_player_1
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent place_player_1
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent draw_player_0
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent place_player_0
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent draw_player_1
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2

Trial name,status,loc
DQN_pettingzoo_skyjo_672af_00000,ERROR,172.30.80.93:12158

Trial name,# failures,error file
DQN_pettingzoo_skyjo_672af_00000,1,/home/michi/ray_results/DQN/DQN_pettingzoo_skyjo_672af_00000_0_2022-01-26_15-44-02/error.txt


[2m[36m(DQN pid=12158)[0m 2022-01-26 15:44:10,082	INFO trainer.py:876 -- Worker crashed during call to train(). To attempt to continue training without the failed worker, set `'ignore_worker_failures': True`.


[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent place_player_0
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent draw_player_1
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent place_player_1
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent draw_player_0
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent place_player_0
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent draw_player_1
[2m[36m(RolloutWorker pid=12186)[0m SampleBatch(1: ['obs', 'obs_flat'])
[2m[36m(RolloutWorker pid=12186)[0m observe agent place_player_1
[

TuneError: ('Trials did not complete', [DQN_pettingzoo_skyjo_672af_00000])