# Chapter 6: Finding optimal restrictions via Reinforcement Learning

## Setup and Definitions

### Imports

In [5]:
import glob
import os
from operator import itemgetter

import numpy as np
import tensorflow as tf

from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from ray.rllib.env import MultiAgentEnv

from gym.spaces import Discrete, Box, Tuple, MultiDiscrete, Dict, MultiBinary

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import matplotlib.pyplot as plt

### Definitions

#### Agent

In [6]:
class ParametricAgentModel(TFModelV2):
    """
    Agent model that suppresses actions which are not allowed.

    A fully connected network maps the 3-element 'obs' part of the observation
    to one logit per action; the 'allowed_actions' part is then used to push
    the logits of forbidden actions to the smallest float32 value.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name, *args, **kwargs):
        super().__init__(obs_space, action_space, num_outputs, model_config, name, *args, **kwargs)

        assert isinstance(action_space, Discrete), f'action_space is a {type(action_space)}, but should be Discrete!'

        true_obs_shape = (3, )
        action_embed_size = action_space.n

        # Inner network only sees the raw observation, never the action mask
        self.action_embed_model = FullyConnectedNetwork(Box(0, 1, shape=true_obs_shape), action_space, action_embed_size, model_config, name + '_action_embedding')

    def forward(self, input_dict, state, seq_lens):
        """
        Return masked action logits and the unchanged state.

        :param input_dict: must contain input_dict['obs']['obs'] and input_dict['obs']['allowed_actions']
        :param state: passed through untouched
        :param seq_lens: unused
        """
        action_mask = input_dict['obs']['allowed_actions']

        # The embedding already has one entry per action, so it can be used as logits directly
        # (the former expand_dims + reduce_sum over the new axis was an identity operation)
        action_logits, _ = self.action_embed_model({ 'obs': input_dict['obs']['obs'] })

        # log(1) = 0 leaves allowed logits unchanged; log(0) = -inf is clamped to float32.min
        inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)

        return action_logits + inf_mask, state

    def value_function(self):
        """Delegate to the value function of the inner network."""
        return self.action_embed_model.value_function()

#### Governance

In [7]:
class PassiveGovernancePolicy(Policy):
    """
    Always allows all actions
    """

    def __init__(self, observation_space, action_space, config=None):
        # Use a fresh dict per instance instead of one mutable default shared by all policies
        super().__init__(observation_space, action_space, {} if config is None else config)

        assert isinstance(action_space, MultiDiscrete), f'action_space is not MultiDiscrete ({type(action_space)})'
        # One binary switch per agent action
        self.NUMBER_OF_ACTIONS = len(action_space.nvec)

    def compute_actions(self, obs_batch, state_batches=None, prev_action_batch=None, prev_reward_batch=None,
                        info_batch=None, episodes=None, **kwargs):
        """
        Return an all-True mask for every observation in the batch.

        The observations themselves are ignored; state_batches is handed back unchanged
        and no extra fetches are produced.
        """
        return [np.ones(self.NUMBER_OF_ACTIONS).astype(bool) for _ in obs_batch], state_batches, {}

    def get_weights(self):
        """The policy has no trainable parameters."""
        return {}

    def set_weights(self, weights) -> None:
        """Nothing to restore; weights are ignored."""
        pass

#### Logger

In [8]:
class CustomMetricsLogger(DefaultCallbacks):
    """
    Collects governance metrics per episode and redistributes governance rewards.

    The environment attaches (state_reward, restriction_reward, degree_of_restriction)
    to the governance info on reward steps; these are averaged per episode and
    written to the episode's custom metrics.
    """

    def __init__(self):
        super().__init__()

        # Unknown until on_episode_start
        self.NUMBER_OF_AGENTS = self.NUMBER_OF_ACTIONS = self.NUMBER_OF_STEPS_PER_EPISODE = self.ALPHA = None

    def on_episode_start(self, *, worker, base_env, policies, episode, env_index, **kwargs):
        """Cache the environment constants (once) and reset the per-episode info list."""
        if self.NUMBER_OF_AGENTS is None:
            env = base_env.get_unwrapped()[env_index]

            self.NUMBER_OF_AGENTS = env.NUMBER_OF_AGENTS
            self.NUMBER_OF_ACTIONS = env.NUMBER_OF_ACTIONS
            self.NUMBER_OF_STEPS_PER_EPISODE = env.NUMBER_OF_STEPS_PER_EPISODE
            self.ALPHA = env.ALPHA

        episode.user_data['gov_info'] = []

    def on_episode_step(self, *, worker, base_env, policies=None, episode, env_index, **kwargs):
        """Record the governance info whenever the environment has just paid out a reward."""
        # Look at the sub-environment that actually stepped, not always the first one
        env = base_env.get_unwrapped()[env_index]

        if env.is_reward_step:
            episode.user_data['gov_info'].append(np.array(episode.last_info_for('gov')))

    def on_episode_end(self, *, worker, base_env, policies, episode, env_index, **kwargs):
        """Average the recorded governance infos and publish them as custom metrics."""
        gov_info = np.vstack(episode.user_data['gov_info'])
        # Column order matches the tuple recorded in on_episode_step
        episode_state_reward, episode_restriction_reward, episode_degree_of_restriction = np.mean(gov_info, axis=0)

        episode.custom_metrics['episode_degree_of_restriction/gov'] = episode_degree_of_restriction
        episode.custom_metrics['episode_state_reward/gov'] = episode_state_reward
        episode.custom_metrics['episode_restriction_reward/gov'] = episode_restriction_reward
        episode.custom_metrics['episode_reward/gov'] = episode_state_reward + episode_restriction_reward

    def on_postprocess_trajectory(self, *, worker, episode, agent_id, policy_id, policies, postprocessed_batch, original_batches, **kwargs):
        """Spread each non-zero governance reward evenly over the preceding governance actions (in place)."""
        if agent_id == 'gov':
            rewards = postprocessed_batch['rewards']
            assert len(rewards.shape) == 1

            # Distribute rewards over all gov_actions in one environment step, not just the last one
            for i in range(len(rewards)):
                if rewards[i] != 0:
                    # max(0, ...) keeps the window inside the batch if it starts mid-step
                    window_start = max(0, i - self.NUMBER_OF_AGENTS + 1)
                    rewards[window_start:(i + 1)] = rewards[i] / self.NUMBER_OF_AGENTS

#### Environment

In [9]:
class CC_Environment(MultiAgentEnv):
    """
    Turn-based environment in which the governance first fixes the allowed
    actions of every agent (one agent per step) and then all agents act at once.

    Governance and agent reward is only 1 if all actions coincide.
    """

    def __init__(self, env_config: dict = {}):
        for required_key in ('NUMBER_OF_ACTIONS', 'NUMBER_OF_AGENTS', 'NUMBER_OF_STEPS_PER_EPISODE', 'ALPHA'):
            assert required_key in env_config

        self.NUMBER_OF_ACTIONS = env_config['NUMBER_OF_ACTIONS']
        self.NUMBER_OF_STEPS_PER_EPISODE = env_config['NUMBER_OF_STEPS_PER_EPISODE']
        self.ALPHA = env_config['ALPHA']
        self.NUMBER_OF_AGENTS = env_config['NUMBER_OF_AGENTS']
        self.agents = [str(i) for i in range(self.NUMBER_OF_AGENTS)]

        # Each agent observes the state entries of its left neighbour, itself and its right neighbour (cyclic)
        self.agent_observations = {}
        for index, agent in enumerate(self.agents):
            neighbourhood = [index - 1, index, index + 1]
            self.agent_observations[agent] = np.array(np.mod(neighbourhood, self.NUMBER_OF_AGENTS))

        # Observation and action spaces of the agents
        self.observation_space = Dict({
            'obs': Box(0, self.NUMBER_OF_ACTIONS - 1, shape=(3,)),
            'allowed_actions': MultiBinary(self.NUMBER_OF_ACTIONS)
        })
        self.action_space = Discrete(self.NUMBER_OF_ACTIONS)

        # Everything below is initialised in reset() / step()
        self.state = None
        self.current_step = None
        self.current_agent_index = None
        self.allowed_actions = None
        self.is_reward_step = None

    def step(self, actions):
        """
        Advance by one governance decision or one joint agent action.

        Returns the usual (observations, rewards, dones, infos) dictionaries.
        """
        self.is_reward_step = False

        if 'gov' not in actions:
            # All agents have acted
            assert len(actions) == len(self.agents)

            # Execute transition
            self.state = np.array([actions[agent] for agent in self.agents])

            state_reward = 1 if np.all(self.state == self.state[0]) else 0
            restriction_per_agent = [1 - (np.sum(mask) / self.NUMBER_OF_ACTIONS) for mask in self.allowed_actions.values()]
            degree_of_restriction = np.mean(restriction_per_agent)
            restriction_reward = -self.ALPHA * degree_of_restriction
            gov_reward = state_reward + restriction_reward

            self.current_step += 1
            self.current_agent_index = 0
            self.allowed_actions = { } # Could be removed for speed-up
            self.is_reward_step = True

            # Governance needs to decide on allowed actions for first agent on the list (if episode is not over)
            next_agent = self.agents[self.current_agent_index]
            observations = { 'gov': { 'state': self.state, 'obs': self.get_observation(next_agent) } }
            rewards = { 'gov': gov_reward }
            dones = { '__all__': self.current_step >= self.NUMBER_OF_STEPS_PER_EPISODE }
            infos = { 'gov': (state_reward, restriction_reward, degree_of_restriction) }
            return observations, rewards, dones, infos

        # Only the governance has acted
        assert len(actions) == 1

        # Governance action is a set of allowed actions; save it for later
        restricted_agent = self.agents[self.current_agent_index]
        self.allowed_actions[restricted_agent] = actions['gov'].astype(bool)
        self.current_agent_index += 1

        if self.current_agent_index >= len(self.agents):
            # Every agent has its restriction, so the agents need to act
            agent_reward = 1 if np.all(self.state == self.state[0]) else 0

            observations = {}
            rewards = {}
            for agent in self.agents:
                observations[agent] = { 'obs': self.get_observation(agent), 'allowed_actions': self.allowed_actions[agent] }
                rewards[agent] = agent_reward
            return observations, rewards, { '__all__': False }, { }

        # Governance needs to decide on allowed actions for next agent on the list
        next_agent = self.agents[self.current_agent_index]
        observations = { 'gov': { 'state': self.state, 'obs': self.get_observation(next_agent) } }
        return observations, { 'gov': 0 }, { '__all__': False }, { 'gov': { } }

    def reset(self):
        """Start a new episode with a random state and return the first governance observation."""
        self.current_step = 0
        self.current_agent_index = 0
        self.allowed_actions = { }

        self.state = np.random.randint(0, self.NUMBER_OF_ACTIONS, (len(self.agents), ))

        first_agent = self.agents[self.current_agent_index]
        return { 'gov': { 'state': self.state, 'obs': self.get_observation(first_agent) } }

    def get_observation(self, agent: str):
        """Return the three state entries visible to `agent`."""
        visible_indices = self.agent_observations[agent]
        return self.state[visible_indices]

In [10]:
class GMAS_Environment(MultiAgentEnv):
    """
    Turn-based environment in which the governance first fixes the allowed
    actions of every agent (one agent per step) and then all agents act at once.

    Governance reward is only 1 if all actions coincide.
    Agents get their rewards if their observations coincide.
    """

    def __init__(self, env_config: dict = {}):
        for required_key in ('NUMBER_OF_ACTIONS', 'NUMBER_OF_AGENTS', 'NUMBER_OF_STEPS_PER_EPISODE', 'ALPHA'):
            assert required_key in env_config

        self.NUMBER_OF_ACTIONS = env_config['NUMBER_OF_ACTIONS']
        self.NUMBER_OF_STEPS_PER_EPISODE = env_config['NUMBER_OF_STEPS_PER_EPISODE']
        self.ALPHA = env_config['ALPHA']
        self.NUMBER_OF_AGENTS = env_config['NUMBER_OF_AGENTS']
        self.agents = [str(i) for i in range(self.NUMBER_OF_AGENTS)]

        # Each agent observes the state entries of its left neighbour, itself and its right neighbour (cyclic)
        self.agent_observations = {}
        for index, agent in enumerate(self.agents):
            neighbourhood = [index - 1, index, index + 1]
            self.agent_observations[agent] = np.array(np.mod(neighbourhood, self.NUMBER_OF_AGENTS))

        # Observation and action spaces of the agents
        self.observation_space = Dict({
            'obs': Box(0, self.NUMBER_OF_ACTIONS - 1, shape=(3,)),
            'allowed_actions': MultiBinary(self.NUMBER_OF_ACTIONS)
        })
        self.action_space = Discrete(self.NUMBER_OF_ACTIONS)

        # Everything below is initialised in reset() / step()
        self.state = None
        self.current_step = None
        self.current_agent_index = None
        self.allowed_actions = None
        self.is_reward_step = None

    def step(self, actions):
        """
        Advance by one governance decision or one joint agent action.

        Returns the usual (observations, rewards, dones, infos) dictionaries.
        """
        self.is_reward_step = False

        if 'gov' not in actions:
            # All agents have acted
            assert len(actions) == len(self.agents)

            # Execute transition
            self.state = np.array([actions[agent] for agent in self.agents])

            state_reward = 1 if np.all(self.state == self.state[0]) else 0
            restriction_per_agent = [1 - (np.sum(mask) / self.NUMBER_OF_ACTIONS) for mask in self.allowed_actions.values()]
            degree_of_restriction = np.mean(restriction_per_agent)
            restriction_reward = -self.ALPHA * degree_of_restriction
            gov_reward = state_reward + restriction_reward

            self.current_step += 1
            self.current_agent_index = 0
            self.allowed_actions = { } # Could be removed for speed-up
            self.is_reward_step = True

            # Governance needs to decide on allowed actions for first agent on the list (if episode is not over)
            next_agent = self.agents[self.current_agent_index]
            observations = { 'gov': { 'state': self.state, 'obs': self.get_observation(next_agent) } }
            rewards = { 'gov': gov_reward }
            dones = { '__all__': self.current_step >= self.NUMBER_OF_STEPS_PER_EPISODE }
            infos = { 'gov': (state_reward, restriction_reward, degree_of_restriction) }
            return observations, rewards, dones, infos

        # Only the governance has acted
        assert len(actions) == 1

        # Governance action is a set of allowed actions; save it for later
        restricted_agent = self.agents[self.current_agent_index]
        self.allowed_actions[restricted_agent] = actions['gov'].astype(bool)
        self.current_agent_index += 1

        if self.current_agent_index >= len(self.agents):
            # Every agent has its restriction, so the agents need to act.
            # An agent is rewarded when the three state entries it observes are equal.
            observations = {}
            rewards = {}
            for agent in self.agents:
                local_view = self.state[self.agent_observations[agent]]
                rewards[agent] = 1 if np.all(local_view == local_view[0]) else 0
                observations[agent] = { 'obs': self.get_observation(agent), 'allowed_actions': self.allowed_actions[agent] }
            return observations, rewards, { '__all__': False }, { }

        # Governance needs to decide on allowed actions for next agent on the list
        next_agent = self.agents[self.current_agent_index]
        observations = { 'gov': { 'state': self.state, 'obs': self.get_observation(next_agent) } }
        return observations, { 'gov': 0 }, { '__all__': False }, { 'gov': { } }

    def reset(self):
        """Start a new episode with a random state and return the first governance observation."""
        self.current_step = 0
        self.current_agent_index = 0
        self.allowed_actions = { }

        self.state = np.random.randint(0, self.NUMBER_OF_ACTIONS, (len(self.agents), ))

        first_agent = self.agents[self.current_agent_index]
        return { 'gov': { 'state': self.state, 'obs': self.get_observation(first_agent) } }

    def get_observation(self, agent: str):
        """Return the three state entries visible to `agent`."""
        visible_indices = self.agent_observations[agent]
        return self.state[visible_indices]


#### Methods

In [14]:
def run_experiment_without_governance(config):
    """
    Train the agents with PPO while a PassiveGovernancePolicy allows every action.

    :param config: dict with the keys NUMBER_OF_AGENTS, NUMBER_OF_ACTIONS, NUMBER_OF_STEPS_PER_EPISODE,
                   ALPHA, ENV, NUMBER_OF_TIMESTEPS, NUMBER_OF_SAMPLES, NAME and LOG_DIR
    :return: the result of tune.run
    """
    n_agents = config['NUMBER_OF_AGENTS']
    n_actions = config['NUMBER_OF_ACTIONS']
    steps_per_episode = config['NUMBER_OF_STEPS_PER_EPISODE']
    alpha = config['ALPHA']
    env_class = config['ENV']
    total_timesteps = config['NUMBER_OF_TIMESTEPS']
    n_samples = config['NUMBER_OF_SAMPLES']
    experiment_name = config['NAME']
    log_directory = config['LOG_DIR']

    gov_obs_space = Dict({
        'state': MultiDiscrete([n_actions] * n_agents),
        'obs': Box(0, n_actions - 1, shape=(3,))
    })
    gov_action_space = MultiDiscrete([2] * n_actions)

    def policy_mapping_fn(agent_id, episode, **kwargs):
        # Every agent id doubles as its policy id
        if 'agent' not in agent_id:
            return agent_id

        # TODO: Why is agent0 sometimes called?
        print(f'Invalid agent_id ({agent_id})!')
        return agent_id[5:]

    agent_ids = [str(i) for i in range(n_agents)]

    # Agents use the default policy; the governance is passive and therefore not trained
    policies = { agent_id: (None, None, None, { }) for agent_id in agent_ids }
    policies['gov'] = (PassiveGovernancePolicy, gov_obs_space, gov_action_space, { })

    env_config = {
        'NUMBER_OF_STEPS_PER_EPISODE': steps_per_episode,
        'NUMBER_OF_AGENTS': n_agents,
        'NUMBER_OF_ACTIONS': n_actions,
        'ALPHA': alpha
    }
    multiagent_config = {
        'policies': policies,
        'policy_mapping_fn': policy_mapping_fn,
        'policies_to_train': list(agent_ids)
    }
    run_config = {
        'env': env_class,
        'env_config': env_config,
        'multiagent': multiagent_config,
        'callbacks': CustomMetricsLogger
    }

    return tune.run(
        'PPO',
        verbose=1,
        config=run_config,
        stop={'timesteps_total': total_timesteps},
        num_samples=n_samples,
        checkpoint_at_end=True,
        name=experiment_name,
        local_dir=log_directory
    )

In [21]:
def run_experiment_with_governance(config):
    """
    Train the agents (with ParametricAgentModel) and the governance policy together using PPO.

    :param config: dict with the keys NUMBER_OF_AGENTS, NUMBER_OF_ACTIONS, NUMBER_OF_STEPS_PER_EPISODE,
                   ALPHA, ENV, NUMBER_OF_TIMESTEPS, NUMBER_OF_SAMPLES, NAME and LOG_DIR
    :return: the result of tune.run
    """
    n_agents = config['NUMBER_OF_AGENTS']
    n_actions = config['NUMBER_OF_ACTIONS']
    steps_per_episode = config['NUMBER_OF_STEPS_PER_EPISODE']
    alpha = config['ALPHA']
    env_class = config['ENV']
    total_timesteps = config['NUMBER_OF_TIMESTEPS']
    n_samples = config['NUMBER_OF_SAMPLES']
    experiment_name = config['NAME']
    log_directory = config['LOG_DIR']

    gov_obs_space = Dict({
        'state': MultiDiscrete([n_actions] * n_agents),
        'obs': Box(0, n_actions - 1, shape=(3,))
    })
    gov_action_space = MultiDiscrete([2] * n_actions)

    def policy_mapping_fn(agent_id, episode, **kwargs):
        # Every agent id doubles as its policy id
        if 'agent' not in agent_id:
            return agent_id

        # TODO: Why is agent0 sometimes called?
        print(f'Invalid agent_id ({agent_id})!')
        return agent_id[5:]

    agent_ids = [str(i) for i in range(n_agents)]

    # Agents respect the action mask via the custom model; the governance is trained as well
    policies = {
        agent_id: (None, None, None, { 'model': {'custom_model': ParametricAgentModel }, 'framework': 'tf' })
        for agent_id in agent_ids
    }
    policies['gov'] = (None, gov_obs_space, gov_action_space, { })

    env_config = {
        'NUMBER_OF_STEPS_PER_EPISODE': steps_per_episode,
        'NUMBER_OF_AGENTS': n_agents,
        'NUMBER_OF_ACTIONS': n_actions,
        'ALPHA': alpha
    }
    multiagent_config = {
        'policies': policies,
        'policy_mapping_fn': policy_mapping_fn,
        'policies_to_train': agent_ids + ['gov']
    }
    run_config = {
        'env': env_class,
        'env_config': env_config,
        'multiagent': multiagent_config,
        'callbacks': CustomMetricsLogger
    }

    return tune.run(
        'PPO',
        verbose=1,
        config=run_config,
        stop={'timesteps_total': total_timesteps},
        num_samples=n_samples,
        checkpoint_at_end=True,
        name=experiment_name,
        local_dir=log_directory
    )

## Experiments

### Test

In [22]:
# Short smoke test: GMAS environment with a trainable governance
config = dict(
    NUMBER_OF_AGENTS=5,
    NUMBER_OF_ACTIONS=3,
    NUMBER_OF_STEPS_PER_EPISODE=100,
    ALPHA=0.0,
    NUMBER_OF_TIMESTEPS=1_000,
    NUMBER_OF_SAMPLES=3,
    NAME='chapter_6',
    LOG_DIR='/Users/michael/Documents/Professional/Promotion/InES/Research/PhD Thesis/thesis/chapter_6/data',
)
config['ENV'] = GMAS_Environment

run_experiment_with_governance(config)

TypeError: '<' not supported between instances of 'Version' and 'ValueError'

### Simulation

In [None]:
# Shared settings for all three scenarios; only the environment / governance differs
config = dict(
    NUMBER_OF_AGENTS=10,
    NUMBER_OF_ACTIONS=5,
    NUMBER_OF_STEPS_PER_EPISODE=100,
    ALPHA=0.0,
    NUMBER_OF_TIMESTEPS=12_000,
    NUMBER_OF_SAMPLES=3,
    NAME='aamas2022',
    LOG_DIR='./data/',
)

# UMAS: GMAS environment, but the governance is passive
config['ENV'] = GMAS_Environment
run_experiment_without_governance(config)

# CC: trained governance on the CC environment
config['ENV'] = CC_Environment
run_experiment_with_governance(config)

# GMAS: trained governance on the GMAS environment
config['ENV'] = GMAS_Environment
run_experiment_with_governance(config)

### Visualization

In [None]:
# CUSTOMIZE IDENTIFIERS
# Maps (configuration, scenario) to the identifier of the corresponding experiment run.
# Every '<id>' is a placeholder that has to be replaced before the charts can be created;
# create_aamas_charts looks for log folders whose name contains the identifier.
ids = {
        ('tiny', 'umas'): '<id>',
        ('tiny', 'cc'): '<id>',
        ('tiny', 'gmas'): '<id>',
        ('small', 'umas'): '<id>',
        ('small', 'cc'): '<id>',
        ('small', 'gmas'): '<id>',
        ('medium', 'umas'): '<id>',
        ('medium', 'cc'): '<id>',
        ('medium', 'gmas'): '<id>',
        ('large', 'umas'): '<id>',
        ('large', 'cc'): '<id>',
        ('large', 'gmas'): '<id>'
    }

def create_aamas_charts(ids):
    """
    Read the Tensorboard logs of all experiments and draw one chart per configuration and KPI.

    Every chart shows, for each scenario, the individual samples as thin lines and their mean
    as a labelled line. Charts are shown and saved as PNG files in '<log_dir>/aamas2022/charts'.
    Relies on the module-level variable `log_dir`.

    :param ids: dict mapping (configuration, scenario) to the experiment identifier that is
                part of the names of the corresponding log folders
    :raises FileNotFoundError: if no log folder matches one of the identifiers
    """
    outer_path = f'{log_dir}/aamas2022'

    # dict.fromkeys removes duplicates but keeps the order of `ids` (a set would not)
    configurations = list(dict.fromkeys(configuration for configuration, _ in ids))
    scenarios = list(dict.fromkeys(scenario for _, scenario in ids))
    kpis = ['governance_reward', 'degree_of_restriction']

    metrics = {
        'governance_reward': 'ray/tune/custom_metrics/episode_state_reward/gov_mean',
        'degree_of_restriction': 'ray/tune/custom_metrics/episode_degree_of_restriction/gov_mean'
    }

    experiment_folders = { key: glob.glob(f'{outer_path}/*{experiment_id}*/') for key, experiment_id in ids.items() }

    # Fail early with a readable message instead of an IndexError further down
    missing = [key for key, folders in experiment_folders.items() if not folders]
    if missing:
        raise FileNotFoundError(f'No log folders found in {outer_path} for: {missing}')

    event_accumulators = { key: [EventAccumulator(f) for f in folders] for key, folders in experiment_folders.items() }

    current, total = 1, sum(len(experiment) for experiment in event_accumulators.values())
    for experiment in event_accumulators.values():
        for ea in experiment:
            print(f'\rLoading EventAccumulator {current}/{total}...', end='')
            ea.Reload()
            current += 1

    # zip(*scalars) transposes the scalar events into columns; column 1 is used as x and column 2 as y below
    raw_data = { (configuration, scenario, kpi): [list(zip(*ea.Scalars(metrics[kpi]))) for ea in experiment] for kpi in kpis for (configuration, scenario), experiment in event_accumulators.items() }
    # The x values of the first sample are used for all samples of an experiment
    processed_data = { key: { 'x': np.array(experiment[0][1]), 'y': [np.array(sample[2]) for sample in experiment] } for key, experiment in raw_data.items() }
    final_data = { key: { 'x': experiment['x'], 'y': experiment['y'], 'mean': np.mean(experiment['y'], axis=0) } for key, experiment in processed_data.items() }

    print('Finished!')

    save_path = f'{log_dir}/aamas2022/charts'
    # savefig does not create missing directories
    os.makedirs(save_path, exist_ok=True)
    plt.style.use({'figure.facecolor':'white'})

    scenario_names = {
        'umas': 'UMAS',
        'cc': 'CC',
        'gmas': 'GMAS'
    }

    colors = {
        'umas': 'blue',
        'cc': 'red',
        'gmas': 'green'
    }

    for configuration in configurations:
        for kpi in kpis:
            # Start from an empty figure so curves of previous charts cannot leak into this one
            plt.figure()

            for scenario in scenarios:
                x, ys, mean = itemgetter('x', 'y', 'mean')(final_data[(configuration, scenario, kpi)])
                color = colors[scenario]
                for y in ys:
                    plt.plot(x, y, color=color, alpha=0.4, linewidth=0.5)

                plt.plot(x, mean, color=color, label=scenario_names[scenario])

            plt.ticklabel_format(axis='x', useMathText=True)
            plt.xlabel('$t$')
            plt.legend()

            plt.savefig(f'{save_path}/{configuration}_{kpi}.png', format='png', bbox_inches='tight')

            plt.show()
            # Release the figure on backends where show() does not do so
            plt.close()

# Deprecated

### Instructions
Using this notebook, you can reproduce the experiments shown in the submitted paper.
Please read the paper before running the experiments as it defines the terminology
required for the setup of the notebook.

The notebook contains three cells:
- _Setup_ - Contains all imports and function definitions
- _Run_ - Runs the experiments and saves the results as Tensorboard log files
- _Visualize_ - Reads existing Tensorboard log files and creates matplotlib charts

To facilitate parallel execution on different machines, a single run only executes one
configuration (this might take several hours to complete).

Please execute the following steps to reproduce the experiments:
1. Install all packages which are imported in the _Setup_ cell
2. Choose an appropriate value for the log directory (marked with the comment `# CUSTOMIZE FOLDER`)
3. Execute the _Setup_ cell
4. Choose appropriate values for the configuration (marked with the comment `# CUSTOMIZE CONFIGURATION`); exemplary values are provided in the comments
5. Execute the _Run_ cell
6. Repeat steps 4 and 5 with different configurations 
7. Start Tensorboard and look for the five-letter experiment identifiers of all scenarios
   (e.g., when the log folder is named *PPO_GMAS_4_GA_Environment_aabc7_00002_2_2021-09-28_13-46-08*,
   the identifier is _aabc7_), or check the log directory for the identifiers
8. Insert the identifiers in the variables marked with the comment `# CUSTOMIZE IDENTIFIERS`
9. Execute the _Visualize_ cell

The charts are both shown in the notebook and saved as PNG files in the subfolder _/charts/_.

### Original experiments
Unfortunately, the log files of the experiments shown in the paper cannot be included here
(a single sample of one scenario in one configuration is ~50 MB in size, and there are 120 such files).
Nevertheless, the log files will be made accessible in a public repository after publication of the paper
in order to provide full transparency.


In [None]:
import glob
from operator import itemgetter

import numpy as np
import tensorflow as tf

from ray import tune
from ray.rllib.policy.policy import Policy
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.env import MultiAgentEnv

from gym.spaces import Discrete, Box, Tuple, MultiDiscrete, Dict, MultiBinary

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import matplotlib.pyplot as plt


###########################################################
# CUSTOMIZE FOLDER
# Log directory; '<log_dir>' is a placeholder that has to be replaced with a real path.
# create_aamas_charts reads from and writes to '<log_dir>/aamas2022'.
log_dir = '<log_dir>'
###########################################################

# Parametric-action agent model
class ParametricAgentModel(TFModelV2):
    """
    Agent model that suppresses actions which are not allowed.

    A fully connected network maps the 3-element 'obs' part of the observation
    to one logit per action; the 'allowed_actions' part is then used to push
    the logits of forbidden actions to the smallest float32 value.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name, *args, **kwargs):
        super().__init__(obs_space, action_space, num_outputs, model_config, name, *args, **kwargs)

        assert isinstance(action_space, Discrete), f'action_space is a {type(action_space)}, but should be Discrete!'

        true_obs_shape = (3, )
        action_embed_size = action_space.n

        # Inner network only sees the raw observation, never the action mask
        self.action_embed_model = FullyConnectedNetwork(Box(0, 1, shape=true_obs_shape), action_space, action_embed_size, model_config, name + '_action_embedding')

    def forward(self, input_dict, state, seq_lens):
        """
        Return masked action logits and the unchanged state.

        :param input_dict: must contain input_dict['obs']['obs'] and input_dict['obs']['allowed_actions']
        :param state: passed through untouched
        :param seq_lens: unused
        """
        action_mask = input_dict['obs']['allowed_actions']

        # The embedding already has one entry per action, so it can be used as logits directly
        # (the former expand_dims + reduce_sum over the new axis was an identity operation)
        action_logits, _ = self.action_embed_model({ 'obs': input_dict['obs']['obs'] })

        # log(1) = 0 leaves allowed logits unchanged; log(0) = -inf is clamped to float32.min
        inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)

        return action_logits + inf_mask, state

    def value_function(self):
        """Delegate to the value function of the inner network."""
        return self.action_embed_model.value_function()

# Passive governance policy (for UMAS scenario)
class PassiveGovernancePolicy(Policy):
    """
    Always allows all actions
    """

    def __init__(self, observation_space, action_space, config=None):
        # Use a fresh dict per instance instead of one mutable default shared by all policies
        super().__init__(observation_space, action_space, {} if config is None else config)

        assert isinstance(action_space, MultiDiscrete), f'action_space is not MultiDiscrete ({type(action_space)})'
        # One binary switch per agent action
        self.NUMBER_OF_ACTIONS = len(action_space.nvec)

    def compute_actions(self, obs_batch, state_batches=None, prev_action_batch=None, prev_reward_batch=None,
                        info_batch=None, episodes=None, **kwargs):
        """
        Return an all-True mask for every observation in the batch.

        The observations themselves are ignored; state_batches is handed back unchanged
        and no extra fetches are produced.
        """
        return [np.ones(self.NUMBER_OF_ACTIONS).astype(bool) for _ in obs_batch], state_batches, {}

    def get_weights(self):
        """The policy has no trainable parameters."""
        return {}

    def set_weights(self, weights) -> None:
        """Nothing to restore; weights are ignored."""
        pass

# Custom metrics logger
class CustomMetricsLogger(DefaultCallbacks):
    """
    Collects governance metrics per episode and redistributes governance rewards.

    The environment attaches (state_reward, restriction_reward, degree_of_restriction)
    to the governance info on reward steps; these are averaged per episode and
    written to the episode's custom metrics.
    """

    def __init__(self):
        super().__init__()

        # Unknown until on_episode_start
        self.NUMBER_OF_AGENTS = self.NUMBER_OF_ACTIONS = self.NUMBER_OF_STEPS_PER_EPISODE = self.ALPHA = None

    def on_episode_start(self, *, worker, base_env, policies, episode, env_index, **kwargs):
        """Cache the environment constants (once) and reset the per-episode info list."""
        if self.NUMBER_OF_AGENTS is None:
            env = base_env.get_unwrapped()[env_index]

            self.NUMBER_OF_AGENTS = env.NUMBER_OF_AGENTS
            self.NUMBER_OF_ACTIONS = env.NUMBER_OF_ACTIONS
            self.NUMBER_OF_STEPS_PER_EPISODE = env.NUMBER_OF_STEPS_PER_EPISODE
            self.ALPHA = env.ALPHA

        episode.user_data['gov_info'] = []

    def on_episode_step(self, *, worker, base_env, policies=None, episode, env_index, **kwargs):
        """Record the governance info whenever the environment has just paid out a reward."""
        # Look at the sub-environment that actually stepped, not always the first one
        env = base_env.get_unwrapped()[env_index]

        if env.is_reward_step:
            episode.user_data['gov_info'].append(np.array(episode.last_info_for('gov')))

    def on_episode_end(self, *, worker, base_env, policies, episode, env_index, **kwargs):
        """Average the recorded governance infos and publish them as custom metrics."""
        gov_info = np.vstack(episode.user_data['gov_info'])
        # Column order matches the tuple recorded in on_episode_step
        episode_state_reward, episode_restriction_reward, episode_degree_of_restriction = np.mean(gov_info, axis=0)

        episode.custom_metrics['episode_degree_of_restriction/gov'] = episode_degree_of_restriction
        episode.custom_metrics['episode_state_reward/gov'] = episode_state_reward
        episode.custom_metrics['episode_restriction_reward/gov'] = episode_restriction_reward
        episode.custom_metrics['episode_reward/gov'] = episode_state_reward + episode_restriction_reward

    def on_postprocess_trajectory(self, *, worker, episode, agent_id, policy_id, policies, postprocessed_batch, original_batches, **kwargs):
        """Spread each non-zero governance reward evenly over the preceding governance actions (in place)."""
        if agent_id == 'gov':
            rewards = postprocessed_batch['rewards']
            assert len(rewards.shape) == 1

            # Distribute rewards over all gov_actions in one environment step, not just the last one
            for i in range(len(rewards)):
                if rewards[i] != 0:
                    # max(0, ...) keeps the window inside the batch if it starts mid-step
                    window_start = max(0, i - self.NUMBER_OF_AGENTS + 1)
                    rewards[window_start:(i + 1)] = rewards[i] / self.NUMBER_OF_AGENTS

# Dining diplomats environments
class CC_Environment(MultiAgentEnv):
    """
    Turn-based environment in which the governance first fixes the allowed
    actions of every agent (one agent per step) and then all agents act at once.

    Governance and agent reward is only 1 if all actions coincide.
    """

    def __init__(self, env_config: dict = {}):
        for required_key in ('NUMBER_OF_ACTIONS', 'NUMBER_OF_AGENTS', 'NUMBER_OF_STEPS_PER_EPISODE', 'ALPHA'):
            assert required_key in env_config

        self.NUMBER_OF_ACTIONS = env_config['NUMBER_OF_ACTIONS']
        self.NUMBER_OF_STEPS_PER_EPISODE = env_config['NUMBER_OF_STEPS_PER_EPISODE']
        self.ALPHA = env_config['ALPHA']
        self.NUMBER_OF_AGENTS = env_config['NUMBER_OF_AGENTS']
        self.agents = [str(i) for i in range(self.NUMBER_OF_AGENTS)]

        # Each agent observes the state entries of its left neighbour, itself and its right neighbour (cyclic)
        self.agent_observations = {}
        for index, agent in enumerate(self.agents):
            neighbourhood = [index - 1, index, index + 1]
            self.agent_observations[agent] = np.array(np.mod(neighbourhood, self.NUMBER_OF_AGENTS))

        # Observation and action spaces of the agents
        self.observation_space = Dict({
            'obs': Box(0, self.NUMBER_OF_ACTIONS - 1, shape=(3,)),
            'allowed_actions': MultiBinary(self.NUMBER_OF_ACTIONS)
        })
        self.action_space = Discrete(self.NUMBER_OF_ACTIONS)

        # Everything below is initialised in reset() / step()
        self.state = None
        self.current_step = None
        self.current_agent_index = None
        self.allowed_actions = None
        self.is_reward_step = None

    def step(self, actions):
        """
        Advance by one governance decision or one joint agent action.

        Returns the usual (observations, rewards, dones, infos) dictionaries.
        """
        self.is_reward_step = False

        if 'gov' not in actions:
            # All agents have acted
            assert len(actions) == len(self.agents)

            # Execute transition
            self.state = np.array([actions[agent] for agent in self.agents])

            state_reward = 1 if np.all(self.state == self.state[0]) else 0
            restriction_per_agent = [1 - (np.sum(mask) / self.NUMBER_OF_ACTIONS) for mask in self.allowed_actions.values()]
            degree_of_restriction = np.mean(restriction_per_agent)
            restriction_reward = -self.ALPHA * degree_of_restriction
            gov_reward = state_reward + restriction_reward

            self.current_step += 1
            self.current_agent_index = 0
            self.allowed_actions = { } # Could be removed for speed-up
            self.is_reward_step = True

            # Governance needs to decide on allowed actions for first agent on the list (if episode is not over)
            next_agent = self.agents[self.current_agent_index]
            observations = { 'gov': { 'state': self.state, 'obs': self.get_observation(next_agent) } }
            rewards = { 'gov': gov_reward }
            dones = { '__all__': self.current_step >= self.NUMBER_OF_STEPS_PER_EPISODE }
            infos = { 'gov': (state_reward, restriction_reward, degree_of_restriction) }
            return observations, rewards, dones, infos

        # Only the governance has acted
        assert len(actions) == 1

        # Governance action is a set of allowed actions; save it for later
        restricted_agent = self.agents[self.current_agent_index]
        self.allowed_actions[restricted_agent] = actions['gov'].astype(bool)
        self.current_agent_index += 1

        if self.current_agent_index >= len(self.agents):
            # Every agent has its restriction, so the agents need to act
            agent_reward = 1 if np.all(self.state == self.state[0]) else 0

            observations = {}
            rewards = {}
            for agent in self.agents:
                observations[agent] = { 'obs': self.get_observation(agent), 'allowed_actions': self.allowed_actions[agent] }
                rewards[agent] = agent_reward
            return observations, rewards, { '__all__': False }, { }

        # Governance needs to decide on allowed actions for next agent on the list
        next_agent = self.agents[self.current_agent_index]
        observations = { 'gov': { 'state': self.state, 'obs': self.get_observation(next_agent) } }
        return observations, { 'gov': 0 }, { '__all__': False }, { 'gov': { } }

    def reset(self):
        """Start a new episode with a random state and return the first governance observation."""
        self.current_step = 0
        self.current_agent_index = 0
        self.allowed_actions = { }

        self.state = np.random.randint(0, self.NUMBER_OF_ACTIONS, (len(self.agents), ))

        first_agent = self.agents[self.current_agent_index]
        return { 'gov': { 'state': self.state, 'obs': self.get_observation(first_agent) } }

    def get_observation(self, agent: str):
        """Return the three state entries visible to `agent`."""
        visible_indices = self.agent_observations[agent]
        return self.state[visible_indices]

class GMAS_Environment(MultiAgentEnv):
    """
    Turn-based environment in which a governance restricts the actions of the agents.

    Within one time step, the governance is asked once per agent for the set of actions
    this agent is allowed to take. Afterwards, all agents act at once and their joint
    action becomes the new state.

    The governance earns 1 if all actions coincide, minus ALPHA times the mean share of forbidden actions.
    Every agent earns 1 if the three state entries it observes coincide.
    """

    def __init__(self, env_config: dict = {}):
        # All four settings are mandatory
        for required_key in ('NUMBER_OF_ACTIONS', 'NUMBER_OF_AGENTS', 'NUMBER_OF_STEPS_PER_EPISODE', 'ALPHA'):
            assert required_key in env_config

        self.NUMBER_OF_AGENTS = env_config['NUMBER_OF_AGENTS']
        self.NUMBER_OF_ACTIONS = env_config['NUMBER_OF_ACTIONS']
        self.NUMBER_OF_STEPS_PER_EPISODE = env_config['NUMBER_OF_STEPS_PER_EPISODE']
        self.ALPHA = env_config['ALPHA']

        # Agents are simply named '0', '1', ...
        self.agents = [str(i) for i in range(self.NUMBER_OF_AGENTS)]

        # Every agent sees its left neighbour, itself and its right neighbour (indices wrap around)
        self.agent_observations = { }
        for index, agent in enumerate(self.agents):
            neighbourhood = [index - 1, index, index + 1]
            self.agent_observations[agent] = np.array(np.mod(neighbourhood, self.NUMBER_OF_AGENTS))

        # Define obs and action spaces for agents
        agent_obs_space = Box(0, self.NUMBER_OF_ACTIONS - 1, shape=(3,))
        agent_mask_space = MultiBinary(self.NUMBER_OF_ACTIONS)
        self.observation_space = Dict({ 'obs': agent_obs_space, 'allowed_actions': agent_mask_space })
        self.action_space = Discrete(self.NUMBER_OF_ACTIONS)

        # Episode-specific values are filled in by reset() and step()
        self.state = None
        self.current_step = None
        self.current_agent_index = None
        self.allowed_actions = None
        self.is_reward_step = None

    def step(self, actions):
        """
        Advance the environment by one turn.

        `actions` either holds the single key 'gov' (a mask of allowed actions for the agent
        whose turn it is) or one action per agent. Returns the usual tuple of observation,
        reward, done and info dictionaries for whoever has to act next.
        """
        self.is_reward_step = False

        if 'gov' not in actions:
            # All agents have acted
            assert len(actions) == len(self.agents)

            # Execute transition: the joint action is the new state
            self.state = np.array([actions[agent] for agent in self.agents])

            state_reward = 1 if np.all(self.state == self.state[0]) else 0
            forbidden_shares = [1 - (np.sum(mask) / self.NUMBER_OF_ACTIONS) for mask in self.allowed_actions.values()]
            degree_of_restriction = np.mean(forbidden_shares)
            restriction_reward = -self.ALPHA * degree_of_restriction

            self.current_step += 1
            self.current_agent_index = 0
            self.allowed_actions = { } # Could be removed for speed-up

            # Governance needs to decide on allowed actions for first agent on the list (if episode is not over)
            self.is_reward_step = True
            first_agent = self.agents[self.current_agent_index]
            observations = { 'gov': { 'state': self.state, 'obs': self.get_observation(first_agent) } }
            rewards = { 'gov': state_reward + restriction_reward }
            dones = { '__all__': self.current_step >= self.NUMBER_OF_STEPS_PER_EPISODE }
            infos = { 'gov': (state_reward, restriction_reward, degree_of_restriction) }
            return observations, rewards, dones, infos

        # Only the governance has acted
        assert len(actions) == 1

        # Governance action is a set of allowed actions; save it for later
        allowed_mask = actions['gov'].astype(bool)
        self.allowed_actions[self.agents[self.current_agent_index]] = allowed_mask
        self.current_agent_index += 1

        if self.current_agent_index >= len(self.agents):
            # Every agent has its mask now, so the agents need to act.
            # Their rewards are computed from the state as it is before they act.
            observations, rewards = { }, { }
            for agent in self.agents:
                observed = self.state[self.agent_observations[agent]]
                rewards[agent] = 1 if np.all(observed == observed[0]) else 0
            for agent in self.agents:
                observations[agent] = { 'obs': self.get_observation(agent), 'allowed_actions': self.allowed_actions[agent] }

            return observations, rewards, { '__all__': False }, { }

        # Governance needs to decide on allowed actions for next agent on the list
        next_agent = self.agents[self.current_agent_index]
        observations = { 'gov': { 'state': self.state, 'obs': self.get_observation(next_agent) } }
        return observations, { 'gov': 0 }, { '__all__': False }, { 'gov': { } }

    def reset(self):
        """
        Start a new episode with a random state and return the first governance observation.
        """
        self.current_step, self.current_agent_index = 0, 0

        number_of_agents = len(self.agents)
        self.state = np.random.randint(0, self.NUMBER_OF_ACTIONS, (number_of_agents, ))

        self.allowed_actions = { }

        first_agent = self.agents[self.current_agent_index]
        return { 'gov': { 'state': self.state, 'obs': self.get_observation(first_agent) } }

    def get_observation(self, agent: str):
        """
        Return the entries of the current state that the given agent observes.
        """
        observed_indices = self.agent_observations[agent]
        return self.state[observed_indices]

# Experiment without governance
def run_experiment_without_governance(config):
    """
    Train the agents with PPO while the governance stays passive.

    The 'gov' policy is a PassiveGovernancePolicy and is not trained; only the agent
    policies '0' ... 'NUMBER_OF_AGENTS - 1' are.

    `config` must provide NUMBER_OF_AGENTS, NUMBER_OF_ACTIONS, NUMBER_OF_STEPS_PER_EPISODE,
    ALPHA, ENV, NUMBER_OF_TIMESTEPS, NUMBER_OF_SAMPLES and NAME.
    Returns the result of tune.run.
    """
    number_of_agents = config['NUMBER_OF_AGENTS']
    number_of_actions = config['NUMBER_OF_ACTIONS']
    steps_per_episode = config['NUMBER_OF_STEPS_PER_EPISODE']
    alpha = config['ALPHA']
    environment = config['ENV']
    timesteps_total = config['NUMBER_OF_TIMESTEPS']
    number_of_samples = config['NUMBER_OF_SAMPLES']
    experiment_name = config['NAME']

    # The governance sees the full state plus the observation of the agent it decides for,
    # and answers with one allow/forbid flag per action
    gov_obs_space = Dict({
        'state': MultiDiscrete([number_of_actions] * number_of_agents),
        'obs': Box(0, number_of_actions - 1, shape=(3,))
    })
    gov_action_space = MultiDiscrete([2] * number_of_actions)

    def policy_mapping_fn(agent_id, episode, **kwargs):
        # TODO: Why is agent0 sometimes called?
        if 'agent' not in agent_id:
            return agent_id

        print(f'Invalid agent_id ({agent_id})!')
        return agent_id[5:]

    agent_ids = [str(i) for i in range(number_of_agents)]

    # One policy per agent (spaces inferred from the environment) plus the passive governance
    policies = { agent_id: (None, None, None, { }) for agent_id in agent_ids }
    policies['gov'] = (PassiveGovernancePolicy, gov_obs_space, gov_action_space, { })

    env_config = {
        'NUMBER_OF_STEPS_PER_EPISODE': steps_per_episode,
        'NUMBER_OF_AGENTS': number_of_agents,
        'NUMBER_OF_ACTIONS': number_of_actions,
        'ALPHA': alpha
    }

    multiagent_config = {
        'policies': policies,
        'policy_mapping_fn': policy_mapping_fn,
        'policies_to_train': list(agent_ids)
    }

    run_config = {
        'env': environment,
        'env_config': env_config,
        'multiagent': multiagent_config,
        'callbacks': CustomMetricsLogger
    }

    return tune.run(
        'PPO',
        verbose=1,
        config=run_config,
        stop={'timesteps_total': timesteps_total},
        num_samples=number_of_samples,
        checkpoint_at_end=True,
        name=experiment_name,
        local_dir=log_dir
    )

# Experiment with governance
def run_experiment_with_governance(config):
    """
    Train the agents and the governance together with PPO.

    The agents use ParametricAgentModel as custom model; the 'gov' policy uses the
    default policy class and is trained alongside the agent policies.

    `config` must provide NUMBER_OF_AGENTS, NUMBER_OF_ACTIONS, NUMBER_OF_STEPS_PER_EPISODE,
    ALPHA, ENV, NUMBER_OF_TIMESTEPS, NUMBER_OF_SAMPLES and NAME.
    Returns the result of tune.run.
    """
    number_of_agents = config['NUMBER_OF_AGENTS']
    number_of_actions = config['NUMBER_OF_ACTIONS']
    steps_per_episode = config['NUMBER_OF_STEPS_PER_EPISODE']
    alpha = config['ALPHA']
    environment = config['ENV']
    timesteps_total = config['NUMBER_OF_TIMESTEPS']
    number_of_samples = config['NUMBER_OF_SAMPLES']
    experiment_name = config['NAME']

    # The governance sees the full state plus the observation of the agent it decides for,
    # and answers with one allow/forbid flag per action
    gov_obs_space = Dict({
        'state': MultiDiscrete([number_of_actions] * number_of_agents),
        'obs': Box(0, number_of_actions - 1, shape=(3,))
    })
    gov_action_space = MultiDiscrete([2] * number_of_actions)

    def policy_mapping_fn(agent_id, episode, **kwargs):
        # TODO: Why is agent0 sometimes called?
        if 'agent' not in agent_id:
            return agent_id

        print(f'Invalid agent_id ({agent_id})!')
        return agent_id[5:]

    agent_ids = [str(i) for i in range(number_of_agents)]

    # One masked-action policy per agent (spaces inferred from the environment) plus the governance
    policies = { }
    for agent_id in agent_ids:
        agent_policy_config = { 'model': {'custom_model': ParametricAgentModel }, 'framework': 'tf' }
        policies[agent_id] = (None, None, None, agent_policy_config)
    policies['gov'] = (None, gov_obs_space, gov_action_space, { })

    env_config = {
        'NUMBER_OF_STEPS_PER_EPISODE': steps_per_episode,
        'NUMBER_OF_AGENTS': number_of_agents,
        'NUMBER_OF_ACTIONS': number_of_actions,
        'ALPHA': alpha
    }

    multiagent_config = {
        'policies': policies,
        'policy_mapping_fn': policy_mapping_fn,
        'policies_to_train': agent_ids + ['gov']
    }

    run_config = {
        'env': environment,
        'env_config': env_config,
        'multiagent': multiagent_config,
        'callbacks': CustomMetricsLogger
    }

    return tune.run(
        'PPO',
        verbose=1,
        config=run_config,
        stop={'timesteps_total': timesteps_total},
        num_samples=number_of_samples,
        checkpoint_at_end=True,
        name=experiment_name,
        local_dir=log_dir
    )

# Chart creation
def create_aamas_charts(ids):
    """
    Plot governance reward and degree of restriction over time for every configuration.

    `ids` maps (configuration, scenario) tuples to an identifier string. Every folder below
    `{log_dir}/aamas2022` whose name contains that identifier is loaded as one sample of the
    experiment. Scenarios must be one of 'umas', 'cc' and 'gmas' (they select colour and label).
    `ids` does not have to contain every configuration/scenario combination; missing
    combinations are simply left out of the chart.

    For each configuration and KPI, one chart with all samples (thin lines) and their mean
    (thick line) per scenario is saved as PNG to `{log_dir}/aamas2022/charts` and shown.

    Raises FileNotFoundError if an identifier does not match any folder.
    """
    import os  # Local import: only needed here to create the chart folder

    outer_path = f'{log_dir}/aamas2022'
    save_path = f'{outer_path}/charts'

    # dict.fromkeys removes duplicates but keeps the order given in ids,
    # so charts and legend entries come out in the same order on every run
    configurations = list(dict.fromkeys(configuration for configuration, _ in ids))
    scenarios = list(dict.fromkeys(scenario for _, scenario in ids))
    kpis = ['governance_reward', 'degree_of_restriction']

    metrics = {
        'governance_reward': 'ray/tune/custom_metrics/episode_state_reward/gov_mean',
        'degree_of_restriction': 'ray/tune/custom_metrics/episode_degree_of_restriction/gov_mean'
    }

    experiment_folders = { key: glob.glob(f'{outer_path}/*{identifier}*/') for key, identifier in ids.items() }

    # Fail early with a readable message instead of an IndexError further down
    missing = [key for key, folders in experiment_folders.items() if not folders]
    if missing:
        raise FileNotFoundError(f'No experiment folders found in {outer_path} for {missing}')

    event_accumulators = { key: [EventAccumulator(f) for f in folders] for key, folders in experiment_folders.items() }

    current, total = 1, sum(len(experiment) for experiment in event_accumulators.values())
    for experiment in event_accumulators.values():
        for ea in experiment:
            print(f'\rLoading EventAccumulator {current}/{total}...', end='')
            ea.Reload()
            current += 1

    # Transpose the scalar events of each sample into columns; column 1 is used as x axis
    # and column 2 as value (presumably step and value of a ScalarEvent - TODO confirm)
    raw_data = { (configuration, scenario, kpi): [list(zip(*ea.Scalars(metrics[kpi]))) for ea in experiment] for kpi in kpis for (configuration, scenario), experiment in event_accumulators.items() }
    processed_data = { key: { 'x': np.array(experiment[0][1]), 'y': [np.array(sample[2]) for sample in experiment] } for key, experiment in raw_data.items() }
    # NOTE(review): the mean assumes that all samples of an experiment logged the same number of points
    final_data = { key: { 'x': experiment['x'], 'y': experiment['y'], 'mean': np.mean(experiment['y'], axis=0) } for key, experiment in processed_data.items() }

    print('Finished!')

    # savefig does not create missing folders
    os.makedirs(save_path, exist_ok=True)
    plt.style.use({'figure.facecolor':'white'})

    scenario_names = {
        'umas': 'UMAS',
        'cc': 'CC',
        'gmas': 'GMAS'
    }

    colors = {
        'umas': 'blue',
        'cc': 'red',
        'gmas': 'green'
    }

    for configuration in configurations:
        for kpi in kpis:
            # Fresh figure per chart so that curves cannot pile up on backends where show() does not clear
            figure = plt.figure()

            for scenario in scenarios:
                if (configuration, scenario, kpi) not in final_data:
                    continue

                x, ys, mean = itemgetter('x', 'y', 'mean')(final_data[(configuration, scenario, kpi)])
                color = colors[scenario]
                for y in ys:
                    plt.plot(x, y, color=color, alpha=0.4, linewidth=0.5)

                plt.plot(x, mean, color=color, label=scenario_names[scenario])

            plt.ticklabel_format(axis='x', useMathText=True)
            plt.xlabel('$t$')
            plt.legend()

            plt.savefig(f'{save_path}/{configuration}_{kpi}.png', format='png', bbox_inches='tight')

            plt.show()
            plt.close(figure)


In [None]:
def run():
    """
    Run the three scenarios (UMAS, CC and GMAS) one after another with the same settings.
    """
    # Values for original experiments:
    # Tiny: 'NUMBER_OF_AGENTS': 5, 'NUMBER_OF_ACTIONS': 3
    # Small: 'NUMBER_OF_AGENTS': 10, 'NUMBER_OF_ACTIONS': 5
    # Medium: 'NUMBER_OF_AGENTS': 15, 'NUMBER_OF_ACTIONS': 7
    # Large: 'NUMBER_OF_AGENTS': 20, 'NUMBER_OF_ACTIONS': 10

    # All other values stay the same (to see the general functionality and save time,
    # you can reduce the number of steps and/or number of samples)

    ###########################################################
    # CUSTOMIZE CONFIGURATION
    config = dict(
        NUMBER_OF_AGENTS=10,
        NUMBER_OF_ACTIONS=5,
        NUMBER_OF_STEPS_PER_EPISODE=100,
        ALPHA=0.0,
        NUMBER_OF_TIMESTEPS=12_000,
        NUMBER_OF_SAMPLES=3,
        NAME='aamas2022',
    )
    ###########################################################

    # Environment and launcher per scenario, in execution order
    scenarios = (
        (GMAS_Environment, run_experiment_without_governance),  # UMAS
        (CC_Environment, run_experiment_with_governance),       # CC
        (GMAS_Environment, run_experiment_with_governance),     # GMAS
    )

    for environment, launch_experiment in scenarios:
        config['ENV'] = environment
        launch_experiment(config)

run()


In [None]:
def visualize():
    """
    Create the charts for the experiments whose identifiers are entered below.
    """
    ###########################################################
    # CUSTOMIZE IDENTIFIERS
    ids_per_configuration = {
        'tiny': { 'umas': '<id>', 'cc': '<id>', 'gmas': '<id>' },
        'small': { 'umas': '<id>', 'cc': '<id>', 'gmas': '<id>' },
        'medium': { 'umas': '<id>', 'cc': '<id>', 'gmas': '<id>' },
        'large': { 'umas': '<id>', 'cc': '<id>', 'gmas': '<id>' }
    }
    ###########################################################

    # Flatten to the (configuration, scenario) -> identifier mapping expected by the chart function
    ids = { }
    for configuration, ids_per_scenario in ids_per_configuration.items():
        for scenario, identifier in ids_per_scenario.items():
            ids[(configuration, scenario)] = identifier

    create_aamas_charts(ids)

visualize()