In [1]:
import argparse
import random
import copy
import numpy as np
from gym.spaces import Discrete
from gym import spaces

from ray import tune
from ray.rllib.agents.pg.pg import PGTrainer
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.policy.policy import Policy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import try_import_tf

parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=1000)

tf = try_import_tf()

ROCK = 0
PAPER = 1
SCISSORS = 2


class RockPaperScissorsEnv(MultiAgentEnv):
    """Two-player environment for rock paper scissors.
    The observation is simply the last opponent action."""

    def __init__(self, _):
        self.action_space = Discrete(3)
        self.observation_space = Discrete(3)
        self.player1 = "player1"
        self.player2 = "player2"
        self.last_move = None
        self.num_moves = 0

    def reset(self):
        self.last_move = (0, 0)
        self.num_moves = 0
        return {
            self.player1: self.last_move[1],
            self.player2: self.last_move[0],
        }

    def step(self, action_dict):
        move1 = action_dict[self.player1]
        move2 = action_dict[self.player2]
        self.last_move = (move1, move2)
        obs = {
            self.player1: self.last_move[1],
            self.player2: self.last_move[0],
        }
        r1, r2 = {
            (ROCK, ROCK): (0, 0),
            (ROCK, PAPER): (-1, 1),
            (ROCK, SCISSORS): (1, -1),
            (PAPER, ROCK): (1, -1),
            (PAPER, PAPER): (0, 0),
            (PAPER, SCISSORS): (-1, 1),
            (SCISSORS, ROCK): (-1, 1),
            (SCISSORS, PAPER): (1, -1),
            (SCISSORS, SCISSORS): (0, 0),
        }[move1, move2]
        rew = {
            self.player1: r1,
            self.player2: r2,
        }
        self.num_moves += 1
        done = {
            "__all__": self.num_moves >= 10,
        }
        return obs, rew, done, {}

  from ._conv import register_converters as _register_converters


In [18]:
class SimpleEnv(MultiAgentEnv):
    """
    Simple test environment
    The agents have a state represented by 2 variables. The agents can move the
    value of any agent’s state variable (including their own) up or down by 1. 
    
    Action space 
    List of number of points to add to each variable in range [0, 5)
    [agent1.var1, agent1.var2, agent2.var1, agent2.var2]
    
    Observation space
    The value of every variable for every agent. The last value is the id of the agent.
    [agent1.var1, agent1.var2, agent2.var1, agent2.var2, id]
    
    Reward
    """

    def __init__(self, _):
        self.n_agents = 2
        self.n_vars = 2
        self._step_count = None
        
        action_space = tuple([(spaces.Box(low=0, high=5, shape=(self.n_vars,))) for i in range(self.n_agents)])
        self.action_space = spaces.Tuple(action_space)        
        
        observation_space = [(spaces.Box(low=-np.inf, high=np.inf, shape=(self.n_vars,))) for i in range(self.n_agents)]
        observation_space.append(spaces.Discrete(self.n_agents))
        observation_space = tuple(observation_space)
        self.observation_space = spaces.Tuple(observation_space)

        self.agents_var = None
        self._agent_dones = None
        self._total_episode_reward = None
        self.steps_beyond_done = None
        
        self.agent_ids = []
        self.agent_idx = {}
        for i in range(self.n_agents):
            self.agent_ids.append('agent_' + str(i))
            self.agent_idx['agent_' + str(i)] = i
        
    def get_agent_obs(self):
        _obs = {}
        for agent_i in range(self.n_agents):
            # add state
            _agent_i_obs = copy.copy(self.agent_var)

            #add agent id
            _agent_i_obs.append(agent_i)

            _obs[self.agent_ids[agent_i]] = _agent_i_obs

        return _obs
    
    def reset(self):
        self.agent_var = [([0] * self.n_vars) for i in range(self.n_agents)]
        self._step_count = 0
        self._total_episode_reward = [0 for _ in range(self.n_agents)]
        self._agent_dones = [False for _ in range(self.n_agents)]
        
        return self.get_agent_obs()

    def __update_agent_action(self, agent_i, action):
        action = (action - np.mean(action)).astype(int)
        # Make the actions have a bigger affect on the other agent than itself.
        scale = [2] * self.n_agents
        scale[self.agent_idx[agent_i]] = 1
        action *= scale
        self.agent_var = (np.array(self.agent_var) + action).tolist()
        #print(self.agent_var)

    def get_first_values(self, agent_var):
        """Gives the first value for each agent."""
        return np.array(agent_var)[:,0]
    
    def get_rewards(self, pre_agent_var):
        """Rewards"""
        rewards = self.get_first_values(self.agent_var) - self.get_first_values(pre_agent_var)
        reward_dict = {}
        for i, r in enumerate(rewards):
            reward_dict[self.agent_ids[i]] = r
        for i in range(self.n_agents):
            self._total_episode_reward[i] += rewards[i]
        return reward_dict
    
    def step(self, action_dict):
        assert len(action_dict) == self.n_agents

        self._step_count += 1

        pre_agent_var = self.agent_var

        for agent_i, action in action_dict.items():
            self.__update_agent_action(agent_i, action)

        rewards = self.get_rewards(pre_agent_var)
        
        
        done = {
            "__all__": False,
        }

        return self.get_agent_obs(), rewards, done, {}

In [19]:
env = SimpleEnv(None)
env.reset()
new_obs, rewards, dones, infos = env.step(action_dict={'agent_0': [0, 0], 'agent_1': [0, 0]})
env.n_agents



2

In [None]:
from ray.rllib.agents import ppo, pg
trainer = pg.PGTrainer(env=SimpleEnv)
#trainer = ppo.PPOTrainer(env=SimpleEnv)
while True:
    print(trainer.train())  # distributed training step

Log sync requires rsync to be installed.
Install gputil for GPU system monitoring.


{'episode_reward_max': nan, 'episode_reward_min': nan, 'episode_reward_mean': nan, 'episode_len_mean': nan, 'episodes_this_iter': 0, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [], 'episode_lengths': []}, 'sampler_perf': {}, 'off_policy_estimator': {}, 'num_healthy_workers': 0, 'timesteps_total': 400, 'timers': {'sample_time_ms': 315.185, 'sample_throughput': 1269.097, 'learn_time_ms': 161.834, 'learn_throughput': 2471.672}, 'info': {'learner': {'model': {}}, 'num_steps_sampled': 400, 'num_steps_trained': 400}, 'done': False, 'episodes_total': 0, 'training_iteration': 1, 'experiment_id': 'f7f492935452438d80a6ccb66444ea54', 'date': '2020-04-28_00-52-06', 'timestamp': 1588049526, 'time_this_iter_s': 0.4772932529449463, 'time_total_s': 0.4772932529449463, 'pid': 8897, 'hostname': 'victor-solus', 'node_ip': '192.168.1.236', 'config': {'num_workers': 0, 'num_envs_per_worker': 1, 'rollout_fragment_length':