# Chapter 6: Finding optimal restrictions via Reinforcement Learning

## Setup and Definitions

### Imports

In [5]:
import glob
import os
from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
from gymnasium.spaces import Box, MultiDiscrete, Dict
from ray import tune
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

from src.agent import ParametricAgentModel
from src.governance import PassiveGovernancePolicy
from src.logger import CustomMetricsLogger
from src.env import FMAS_Environment, GMAS_Environment

### Definitions

In [14]:
def run_experiment_without_governance(config):
    """Run a PPO experiment in which only the agent policies are trained.

    The 'gov' policy is a ``PassiveGovernancePolicy`` and is excluded from
    ``policies_to_train``; every agent ``i`` gets its own default policy
    registered under the id ``str(i)``.

    Args:
        config: Mapping providing the keys ``NUMBER_OF_AGENTS``,
            ``NUMBER_OF_ACTIONS``, ``NUMBER_OF_STEPS_PER_EPISODE``, ``ALPHA``,
            ``ENV``, ``NUMBER_OF_TIMESTEPS``, ``NUMBER_OF_SAMPLES``, ``NAME``
            and ``LOG_DIR``.

    Returns:
        Whatever ``tune.run`` returns for the experiment.

    Raises:
        KeyError: If one of the keys listed above is missing from ``config``.
    """
    n_agents = config['NUMBER_OF_AGENTS']
    n_actions = config['NUMBER_OF_ACTIONS']
    steps_per_episode = config['NUMBER_OF_STEPS_PER_EPISODE']
    alpha = config['ALPHA']
    env = config['ENV']
    total_timesteps = config['NUMBER_OF_TIMESTEPS']
    n_samples = config['NUMBER_OF_SAMPLES']
    experiment_name = config['NAME']
    log_dir = config['LOG_DIR']

    # Spaces of the governance policy: one discrete entry per agent as
    # 'state', a 3-vector as 'obs', and one binary flag per action as output.
    state_space = MultiDiscrete([n_actions] * n_agents)
    obs_space = Box(0, n_actions - 1, shape=(3,))
    gov_obs_space = Dict({'state': state_space, 'obs': obs_space})
    gov_action_space = MultiDiscrete([2] * n_actions)

    def policy_mapping_fn(agent_id, episode, **kwargs):
        # Ids without 'agent' in them already are policy ids.
        if 'agent' not in agent_id:
            return agent_id

        # TODO: Why is agent0 sometimes called?
        print(f'Invalid agent_id ({agent_id})!')
        # Drop the first five characters (the 'agent' prefix).
        return agent_id[5:]

    agent_policy_ids = [str(index) for index in range(n_agents)]

    # Agents first, governance last; each agent gets its own empty config dict.
    policies = {}
    for policy_id in agent_policy_ids:
        policies[policy_id] = (None, None, None, {})
    policies['gov'] = (PassiveGovernancePolicy, gov_obs_space, gov_action_space, {})

    env_config = {
        'NUMBER_OF_STEPS_PER_EPISODE': steps_per_episode,
        'NUMBER_OF_AGENTS': n_agents,
        'NUMBER_OF_ACTIONS': n_actions,
        'ALPHA': alpha,
    }
    multiagent_config = {
        'policies': policies,
        'policy_mapping_fn': policy_mapping_fn,
        'policies_to_train': list(agent_policy_ids),
    }
    run_config = {
        'env': env,
        'env_config': env_config,
        'multiagent': multiagent_config,
        'callbacks': CustomMetricsLogger,
    }

    return tune.run(
        'PPO',
        verbose=1,
        config=run_config,
        stop={'timesteps_total': total_timesteps},
        num_samples=n_samples,
        checkpoint_at_end=True,
        name=experiment_name,
        local_dir=log_dir,
    )

In [21]:
def run_experiment_with_governance(config):
    """Run a PPO experiment in which agents and the governance are trained.

    Every agent ``i`` gets a policy registered under the id ``str(i)`` that
    uses ``ParametricAgentModel`` as custom model (framework 'tf'); the 'gov'
    policy uses the default policy class and is part of
    ``policies_to_train``.

    Args:
        config: Mapping providing the keys ``NUMBER_OF_AGENTS``,
            ``NUMBER_OF_ACTIONS``, ``NUMBER_OF_STEPS_PER_EPISODE``, ``ALPHA``,
            ``ENV``, ``NUMBER_OF_TIMESTEPS``, ``NUMBER_OF_SAMPLES``, ``NAME``
            and ``LOG_DIR``.

    Returns:
        Whatever ``tune.run`` returns for the experiment.

    Raises:
        KeyError: If one of the keys listed above is missing from ``config``.
    """
    n_agents = config['NUMBER_OF_AGENTS']
    n_actions = config['NUMBER_OF_ACTIONS']
    steps_per_episode = config['NUMBER_OF_STEPS_PER_EPISODE']
    alpha = config['ALPHA']
    env = config['ENV']
    total_timesteps = config['NUMBER_OF_TIMESTEPS']
    n_samples = config['NUMBER_OF_SAMPLES']
    experiment_name = config['NAME']
    log_dir = config['LOG_DIR']

    # Spaces of the governance policy: one discrete entry per agent as
    # 'state', a 3-vector as 'obs', and one binary flag per action as output.
    state_space = MultiDiscrete([n_actions] * n_agents)
    obs_space = Box(0, n_actions - 1, shape=(3,))
    gov_obs_space = Dict({'state': state_space, 'obs': obs_space})
    gov_action_space = MultiDiscrete([2] * n_actions)

    def policy_mapping_fn(agent_id, episode, **kwargs):
        # Ids without 'agent' in them already are policy ids.
        if 'agent' not in agent_id:
            return agent_id

        # TODO: Why is agent0 sometimes called?
        print(f'Invalid agent_id ({agent_id})!')
        # Drop the first five characters (the 'agent' prefix).
        return agent_id[5:]

    agent_policy_ids = [str(index) for index in range(n_agents)]

    # Agents first, governance last; the per-agent config dict is built anew
    # for every agent so that no two policies share one dict object.
    policies = {}
    for policy_id in agent_policy_ids:
        agent_policy_config = {
            'model': {'custom_model': ParametricAgentModel},
            'framework': 'tf',
        }
        policies[policy_id] = (None, None, None, agent_policy_config)
    policies['gov'] = (None, gov_obs_space, gov_action_space, {})

    env_config = {
        'NUMBER_OF_STEPS_PER_EPISODE': steps_per_episode,
        'NUMBER_OF_AGENTS': n_agents,
        'NUMBER_OF_ACTIONS': n_actions,
        'ALPHA': alpha,
    }
    multiagent_config = {
        'policies': policies,
        'policy_mapping_fn': policy_mapping_fn,
        'policies_to_train': agent_policy_ids + ['gov'],
    }
    run_config = {
        'env': env,
        'env_config': env_config,
        'multiagent': multiagent_config,
        'callbacks': CustomMetricsLogger,
    }

    return tune.run(
        'PPO',
        verbose=1,
        config=run_config,
        stop={'timesteps_total': total_timesteps},
        num_samples=n_samples,
        checkpoint_at_end=True,
        name=experiment_name,
        local_dir=log_dir,
    )

In [None]:
def create_charts(ids, log_dir=None):
    """Plot the governance KPIs of finished experiments and save them as PNGs.

    For every ``(configuration, scenario)`` key in ``ids``, all folders
    matching ``<log_dir>/chapter_6/*<id>*/`` are loaded with
    ``EventAccumulator`` (one folder per sample). For each configuration and
    each KPI ('governance_reward', 'degree_of_restriction') one figure is
    drawn: a thin line per sample and a solid line for the mean over the
    samples, coloured by scenario. Each figure is written to
    ``<log_dir>/chapter_6/charts/<configuration>_<kpi>.png`` and shown.

    Args:
        ids: Dict mapping ``(configuration, scenario)`` tuples to the id
            (a substring of the run folder name) of the experiment.
            ``scenario`` must be one of 'umas', 'fmas' or 'gmas'.
        log_dir: Root directory containing the ``chapter_6`` folder. If
            omitted, a notebook-level ``log_dir`` variable is used when one
            exists (the previous behaviour), otherwise './data'.

    Raises:
        FileNotFoundError: If an id does not match any run folder.
        ValueError: If a run contains no data points for a KPI.
    """
    if log_dir is None:
        # Previously this function silently depended on a global `log_dir`
        # and failed with a NameError on a fresh kernel. Honour the global
        # if it exists, but no longer require it.
        log_dir = globals().get('log_dir', './data')

    outer_path = f'{log_dir}/chapter_6'
    save_path = f'{outer_path}/charts'

    # dict.fromkeys keeps first-seen order, so the order of charts and legend
    # entries is reproducible (a set has arbitrary iteration order).
    configurations = list(dict.fromkeys(configuration for configuration, _ in ids))
    scenarios = list(dict.fromkeys(scenario for _, scenario in ids))
    kpis = ['governance_reward', 'degree_of_restriction']

    metrics = {
        'governance_reward': 'ray/tune/custom_metrics/episode_state_reward/gov_mean',
        'degree_of_restriction': 'ray/tune/custom_metrics/episode_degree_of_restriction/gov_mean'
    }

    scenario_names = {
        'umas': 'UMAS',
        'fmas': 'FMAS',
        'gmas': 'GMAS'
    }

    colors = {
        'umas': 'blue',
        'fmas': 'red',
        'gmas': 'green'
    }

    # Escape the id so glob metacharacters in it are matched literally;
    # sort the matches so the sample order does not depend on the file system.
    experiment_folders = {
        key: sorted(glob.glob(f'{outer_path}/*{glob.escape(run_id)}*/'))
        for key, run_id in ids.items()
    }

    missing = [key for key, folders in experiment_folders.items() if not folders]
    if missing:
        raise FileNotFoundError(
            f'No run folders found in {outer_path!r} for: '
            + ', '.join(f'{key} (id {ids[key]!r})' for key in missing)
        )

    total = sum(len(folders) for folders in experiment_folders.values())
    current = 0
    event_accumulators = {}
    for key, folders in experiment_folders.items():
        event_accumulators[key] = []
        for folder in folders:
            current += 1
            print(f'\rLoading EventAccumulator {current}/{total}...', end='')
            accumulator = EventAccumulator(folder)
            accumulator.Reload()
            event_accumulators[key].append(accumulator)

    final_data = {}
    for (configuration, scenario), experiment in event_accumulators.items():
        for kpi in kpis:
            samples = [accumulator.Scalars(metrics[kpi]) for accumulator in experiment]

            # Samples may have logged a different number of points; cut all
            # of them to the shortest one so they can be averaged point-wise.
            length = min(len(events) for events in samples)
            if length == 0:
                raise ValueError(
                    f'No data points for {metrics[kpi]!r} in a run of {(configuration, scenario)}'
                )

            # The x axis is taken from the steps of the first sample.
            x = np.array([event.step for event in samples[0][:length]])
            ys = [np.array([event.value for event in events[:length]]) for events in samples]
            final_data[(configuration, scenario, kpi)] = {
                'x': x,
                'y': ys,
                'mean': np.mean(ys, axis=0)
            }

    print('Finished!')

    # savefig does not create missing directories.
    os.makedirs(save_path, exist_ok=True)
    plt.style.use({'figure.facecolor':'white'})

    for configuration in configurations:
        for kpi in kpis:
            fig, ax = plt.subplots()

            for scenario in scenarios:
                data = final_data.get((configuration, scenario, kpi))
                if data is None:
                    # `ids` has no entry for this configuration/scenario pair.
                    continue

                color = colors[scenario]
                for y in data['y']:
                    ax.plot(data['x'], y, color=color, alpha=0.4, linewidth=0.5)

                ax.plot(data['x'], data['mean'], color=color, label=scenario_names[scenario])

            ax.ticklabel_format(axis='x', useMathText=True)
            ax.set_xlabel('$t$')
            ax.legend()

            fig.savefig(f'{save_path}/{configuration}_{kpi}.png', format='png', bbox_inches='tight')

            plt.show()
            # Release the figure; otherwise every chart stays in memory.
            plt.close(fig)

## Experiments

### Simulation

In [None]:
# Shared settings of all three runs; only 'ENV' differs between them.
# NOTE(review): tune.run receives name=NAME and local_dir=LOG_DIR, while
# create_charts reads '<log_dir>/chapter_6' - confirm that these paths line up.
config = {
    'NUMBER_OF_AGENTS': 10,
    'NUMBER_OF_ACTIONS': 5,
    'NUMBER_OF_STEPS_PER_EPISODE': 100,
    'ALPHA': 0.0,
    'NUMBER_OF_TIMESTEPS': 12_000,
    'NUMBER_OF_SAMPLES': 3,
    'NAME': 'dining_diplomats',
    'LOG_DIR': './data/',
}

# UMAS: agents only, governance stays passive.
# NOTE(review): this run uses GMAS_Environment (no UMAS environment is
# imported) - confirm that this is intended.
config['ENV'] = GMAS_Environment
run_experiment_without_governance(config)

# FMAS: agents and governance are trained on FMAS_Environment.
config['ENV'] = FMAS_Environment
run_experiment_with_governance(config)

# GMAS: agents and governance are trained on GMAS_Environment.
config['ENV'] = GMAS_Environment
run_experiment_with_governance(config)

### Visualization

In [None]:
# Maps (configuration, scenario) to the id of the corresponding experiment.
# Replace every '<id>' placeholder with a substring of the run folder names
# before executing this cell.
ids = {
    ('tiny', 'umas'): '<id>',
    ('tiny', 'fmas'): '<id>',
    ('tiny', 'gmas'): '<id>',

    ('small', 'umas'): '<id>',
    ('small', 'fmas'): '<id>',
    ('small', 'gmas'): '<id>',

    ('medium', 'umas'): '<id>',
    ('medium', 'fmas'): '<id>',
    ('medium', 'gmas'): '<id>',

    ('large', 'umas'): '<id>',
    ('large', 'fmas'): '<id>',
    ('large', 'gmas'): '<id>',
}

create_charts(ids)