# Application: Learning optimal restrictions in a continuous-action game

This notebook corresponds to Section 5.3 of the paper "Grams & Oesterle (forthcoming). _DRAMA at the PettingZoo: Dynamically Restricted Action Spaces for Multi-Agent Reinforcement Learning Frameworks_."

## Setup

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os, sys
sys.path.append(f'{os.getcwd()}/../../')

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from gymnasium.spaces import Discrete, Box, Space

from src.restrictions import DiscreteVectorRestriction
from src.wrapper import RestrictionWrapper
from src.restrictors import Restrictor, RestrictorActionSpace, DiscreteVectorActionSpace
from src.utils import flatdim, flatten, unflatten

from examples.utils import play, ReplayBuffer
from examples.traffic.env import TrafficEnvironment
from examples.traffic.agent import TrafficAgent
from examples.traffic.restrictor import TrafficRestrictor

from examples.traffic.utils import create_graph, analyze_graph, edge_path_to_node_path

In [None]:
# Build the road network from (edge, parameters) pairs: each entry names an
# edge by its two end nodes, followed by a 3-tuple of edge parameters.
# NOTE(review): the 3-tuples are presumably latency-function coefficients
# consumed by create_graph (a "latency" edge attribute is read further down)
# — confirm against examples.traffic.utils.
graph = create_graph([
    ((0, 1), (0, 8, 1)), 
    ((0, 2), (11, 0, 0)), 
    ((1, 2), (1, 0, 0)), 
    ((1, 3), (11, 0, 0)), 
    ((2, 3), (0, 8, 1))
])

In [None]:
# (source, target) node pairs that agents travel between; used below both for
# reachability checks and for enumerating the concrete routes.
possible_agent_routes = [(0, 3)]

In [None]:
import itertools
import networkx as nx

def powerset(iterable):
    """Iterate over all subsets of *iterable* as tuples, smallest subsets first.

    The input is materialised once, so one-shot iterators are accepted.
    Elements keep their input order inside each subset.
    """
    items = list(iterable)
    subsets_by_size = (
        itertools.combinations(items, size) for size in range(len(items) + 1)
    )
    return itertools.chain.from_iterable(subsets_by_size)

# Basic size of the road network.
number_of_nodes = graph.number_of_nodes()
number_of_edges = graph.number_of_edges()

# Fix an edge order once so that every edge can be referred to by its position.
edge_list = list(graph.edges)
edge_indices = {edge: position for position, edge in enumerate(edge_list)}
edge_latencies = {
    position: graph[tail][head]["latency"]
    for position, (tail, head) in enumerate(edge_list)
}

# Every node that appears as an endpoint of some agent route.
minimum_node_set = {node for endpoints in possible_agent_routes for node in endpoints}


def _keeps_routes_connected(edge_ids):
    """Tell whether the subgraph spanned by *edge_ids* still serves every agent route."""
    candidate = graph.edge_subgraph(edge_list[edge_id] for edge_id in edge_ids)
    # Check that all route endpoints survive before asking for paths between them.
    if not minimum_node_set.issubset(candidate.nodes):
        return False
    return all(nx.has_path(candidate, source, target) for source, target in possible_agent_routes)


# An edge restriction (set of allowed edge positions) is valid if it leaves at
# least one path for each agent route.
valid_edge_restrictions = [
    set(edge_ids)
    for edge_ids in powerset(range(number_of_edges))
    if _keeps_routes_connected(edge_ids)
]

# Enumerate every simple path per agent route as a tuple of edge positions, and
# remember which (source, target) pair each enumerated route belongs to.
route_list = []
source_target_map = []
for source, target in possible_agent_routes:
    for edge_path in nx.all_simple_edge_paths(graph, source, target):
        route_list.append(tuple(edge_indices[edge] for edge in edge_path))
        source_target_map.append((source, target))
number_of_routes = len(route_list)

# One boolean mask over route_list per valid edge restriction: a route is
# allowed exactly when all of its edges are allowed.
valid_route_restrictions = [
    np.array([set(route) <= allowed_edges for route in route_list])
    for allowed_edges in valid_edge_restrictions
]

## Test: Without governance

In [None]:
number_of_agents = 2

# analyze_graph supplies the edge and route lookup tables the agents are built from.
# NOTE(review): this rebinds edge_list, edge_indices, edge_latencies and
# route_list, which the setup section also defines and later cells reuse —
# confirm both definitions agree before running the notebook top to bottom.
edge_list, edge_indices, edge_latencies, routes, route_list, route_indices = analyze_graph(graph)

agents = {f'agent_{i}': TrafficAgent(routes, route_indices, edge_indices) for i in range(number_of_agents)}
# The (source, target) pairs are defined as `possible_agent_routes` in the
# setup section; the previously used name `possible_routes` does not exist.
env = TrafficEnvironment(graph, list(agents), possible_agent_routes, number_of_steps=100)
# Each agent's policy is simply its own `act` method.
policies = {agent_id: agent.act for agent_id, agent in agents.items()}

# Play without any restrictor and keep the trajectory for the plots below.
trajectory = play(env, policies, max_iter=50, verbose=False, record_trajectory=True, render_mode=None)

In [None]:
# One reward curve per agent over the recorded trajectory.
trajectory.groupby('agent')['reward'].plot(legend=True, xlabel='Time step', ylabel='Reward');

In [None]:
# Scatter the chosen action per time step, one series per agent; all series
# share one axes object, so the first returned entry is enough.
ax, *_ = trajectory.groupby('agent')['action'].plot(style='.', legend=True)

# Put one tick at every route index, labelled via edge_path_to_node_path.
tick_positions = list(route_indices.values())
tick_labels = [
    edge_path_to_node_path(route, edge_list)
    for route in route_indices.keys()
]
ax.set_yticks(tick_positions, tick_labels)
ax.set_ylabel('Route taken')
ax.set_xlabel('Time step')

## With governance

In [None]:
number_of_agents = 2

# analyze_graph supplies the edge and route lookup tables the agents are built from.
# NOTE(review): this rebinds edge_list, edge_indices, edge_latencies and
# route_list, which the setup section also defines and later cells reuse.
edge_list, edge_indices, edge_latencies, routes, route_list, route_indices = analyze_graph(graph)
number_of_edges = graph.number_of_edges()

agents = {f'agent_{i}': TrafficAgent(routes, route_indices, edge_indices) for i in range(number_of_agents)}
# The (source, target) pairs are defined as `possible_agent_routes` in the
# setup section; the previously used name `possible_routes` does not exist.
env = TrafficEnvironment(graph, list(agents), possible_agent_routes, number_of_steps=100)

# The restrictor is built from a non-negative Box with one entry per edge and
# a discrete-vector action space with one entry per route.
restrictor = TrafficRestrictor(Box(0, np.inf, shape=(number_of_edges, )), DiscreteVectorActionSpace(Discrete(len(routes))))
# restrictor_0 is rewarded with the reward of the agent currently selected in
# the wrapped environment.
wrapper = RestrictionWrapper(env, restrictor, restrictor_reward_fns={'restrictor_0': lambda env, rewards: rewards[env.agent_selection]})

# Agent policies plus the restrictor's own policy, keyed by participant id.
policies = {**{agent_id: agent.act for agent_id, agent in agents.items()}, 'restrictor_0': restrictor.act}

trajectory = play(wrapper, policies, max_iter=50, verbose=False, record_trajectory=True, render_mode=None)

In [None]:
# One reward curve per participant (agents and restrictor) over the trajectory.
trajectory.groupby('agent')['reward'].plot(legend=True, xlabel='Time step', ylabel='Reward');

In [None]:
# Only the traffic agents choose routes, so drop the restrictor's rows first.
agent_rows = trajectory[trajectory['agent'] != 'restrictor_0']
ax, *_ = agent_rows.groupby('agent')['action'].plot(style='.', legend=True)

# Put one tick at every route index, labelled via edge_path_to_node_path.
tick_positions = list(route_indices.values())
tick_labels = [
    edge_path_to_node_path(route, edge_list)
    for route in route_indices.keys()
]
ax.set_yticks(tick_positions, tick_labels)
ax.set_ylabel('Route taken')
ax.set_xlabel('Time step')

## With self-learning restrictor

In [None]:
number_of_agents = 2
total_timesteps = 100_000
number_of_edges = graph.number_of_edges()

# All agents are built from the same route table and the same
# route-to-(source, target) mapping.
agents = {
    f'agent_{index}': TrafficAgent(route_list, source_target_map)
    for index in range(number_of_agents)
}

# The learning restrictor chooses among the precomputed valid route restrictions.
restrictor = TrafficRestrictor(
    number_of_edges,
    number_of_routes,
    valid_route_restrictions,
    total_timesteps=total_timesteps,
)

# Build the bare traffic environment and wrap it with the restrictor.
env = RestrictionWrapper(
    TrafficEnvironment(
        graph,
        list(agents),
        possible_agent_routes,
        number_of_routes,
        edge_latencies,
        route_list,
        number_of_steps=100,
    ),
    restrictor,
)

In [None]:
# One row per recorded (non-terminal) step of any participant.
history = pd.DataFrame(columns=['episode', 'episode_step', 'agent', 'observation', 'reward', 'action'], index=(range(total_timesteps)))
replay_buffer = ReplayBuffer(state_dim=flatdim(restrictor.observation_space), action_dim=flatdim(restrictor.action_space))

# Do not render during training
env.unwrapped.render_mode = None

current_timestep = 0
current_episode = 0

# The context manager closes the bar when training ends or is interrupted.
with tqdm(total=total_timesteps) as t:
    # The budget is only checked between episodes, so the last episode always
    # runs to completion and may record a few steps beyond total_timesteps.
    while current_timestep < total_timesteps:
        env.reset()
        current_episode += 1
        current_episode_timestep = 0
        previous_restrictor_observation = None

        for agent in env.agent_iter():
            observation, reward, termination, truncation, info = env.last()

            if agent == 'restrictor_0':
                # The reward seen now belongs to the restrictor's previous
                # action, so learning lags one restrictor turn behind acting.
                if previous_restrictor_observation is not None:
                    restrictor.learn(previous_restrictor_observation, previous_restrictor_action, observation, reward, termination or truncation)

                action = restrictor.act(observation)

                previous_restrictor_observation = observation
                previous_restrictor_action = action
            else:
                action = agents[agent].act(observation)

            if termination or truncation:
                # A finished participant is stepped with None; the action
                # computed above is discarded and nothing is recorded.
                action = None
            else:
                # print(f'{agent=}, {observation=}, {reward=}, {action=}')

                history.loc[current_timestep] = pd.Series({'episode': current_episode,
                                                           'episode_step': current_episode_timestep,
                                                           'agent': agent,
                                                           'observation': observation,
                                                           'reward': reward,
                                                           'action': action}
                                                          )

                current_timestep += 1
                current_episode_timestep += 1

                # Advance the bar only for recorded steps so that it measures
                # the same quantity as the loop condition and `total`.
                t.update()

            env.step(action)

In [None]:
# Keep only the rows recorded for the restrictor and display them.
restrictor_history = history[history.agent == 'restrictor_0']
restrictor_history

In [None]:
# Cast the restrictor's recorded actions to int so they can be used as
# indices and plotted numerically below.
restrictor_actions = restrictor_history['action'].astype(int)
restrictor_actions

In [None]:
# One possible restrictor action per valid edge restriction. Deriving the
# count from the list keeps this cell correct when the graph or the agent
# routes change (it was hard-coded to 15, the count for the graph above).
actions = list(range(len(valid_edge_restrictions)))

# Row i holds how often each action has been chosen up to and including the
# i-th restrictor step.
restrictor_action_counts = np.empty((len(restrictor_actions), len(actions)))
counts = np.zeros(len(actions))

for i, action in enumerate(restrictor_actions):
    counts[action] += 1
    # Row assignment copies the running totals, so later increments do not
    # alter rows that were already written.
    restrictor_action_counts[i] = counts

# Label each cumulative-count curve with the edge set it stands for.
pd.DataFrame(restrictor_action_counts, columns=[str(restriction) for restriction in valid_edge_restrictions]).plot()

In [None]:
# Scatter the chosen restriction index over time with very small markers.
ax = restrictor_actions.astype(int).plot(style='.', ms=0.5)

# One tick per valid edge restriction, labelled with the allowed edge set.
restriction_ticks = range(len(valid_edge_restrictions))
ax.set_yticks(restriction_ticks, valid_edge_restrictions)
ax.set_ylabel('Allowed edges')
ax.set_xlabel('Time step')

# Write the figure to disk with a tight bounding box.
figure = ax.get_figure()
figure.savefig('result.pdf', bbox_inches='tight')

In [None]:
# Show which edge indices the restriction at position 11 allows.
valid_edge_restrictions[11]

In [None]:
# Restriction index chosen at the restrictor's 389th recorded step (position 388).
restrictor_actions.iloc[388]

In [None]:
# Restrictor reward averaged over a rolling window of 1000 of its own steps.
# NOTE(review): `history` is created without explicit dtypes; if rolling()
# rejects the reward column as non-numeric, cast it with .astype(float) first.
history[history.agent == 'restrictor_0'].reward.rolling(1000).mean().plot()

In [None]:
# First and last 10,000 recorded rows of the history (rows of all
# participants interleaved), for comparing early against late behaviour.
history_begin = history[:10000]
history_end = history[-10000:]

In [None]:
# Reward per participant over the first slice of training.
history_begin.groupby('agent')['reward'].plot();

In [None]:
# Reward per participant over the last slice of training.
history_end.groupby('agent')['reward'].plot();

In [None]:
# Routes taken by the traffic agents (restrictor rows excluded) at the end of training.
late_agent_rows = history_end[history_end['agent'] != 'restrictor_0']
ax, *_ = late_agent_rows.groupby('agent')['action'].plot(style='.', legend=True)

# One tick per route, labelled via edge_path_to_node_path.
route_ticks = range(len(route_list))
route_labels = [edge_path_to_node_path(route, edge_list) for route in route_list]
ax.set_yticks(route_ticks, route_labels)
ax.set_ylabel('Route taken')
ax.set_xlabel('Time step');

In [None]:
# Restriction index chosen by the restrictor over the whole training run.
is_restrictor_row = history['agent'] == 'restrictor_0'
chosen_restrictions = history[is_restrictor_row]['action'].astype(int)
ax = chosen_restrictions.plot(style='.', legend=True)

# One tick per valid edge restriction, labelled with the allowed edge set.
restriction_ticks = range(len(valid_edge_restrictions))
ax.set_yticks(restriction_ticks, valid_edge_restrictions)
ax.set_ylabel('Route restriction')
ax.set_xlabel('Time step');

In [None]:
# Small hand-written table with columns E, T, A and R, used by the cells
# below to try out re-indexing and reshaping.
df = pd.DataFrame({'E': [0, 0, 0, 0, 1, 1], 'T': [0, 0, 1, 1, 0, 0], 'A': [0, 1, 0, 1, 0, 1], 'R': [5, 4, 4, 3, 3, 2]})

In [None]:
# Index every row by its (E, T) pair, then display the table.
df.index = list(zip(df['E'], df['T']))
df

In [None]:
# Place the R values for each distinct value of A side by side, aligned on the index.
pd.concat((df[df.A == a]['R'] for a in df.A.unique()), axis=1)
    


In [None]:
# Rows of df with A == 0. The original compared against `a`, which is not
# defined at cell level (the generator variable of the concat cell does not
# leak), so the cell raised NameError on a fresh run.
# NOTE(review): 0 is inferred from the name `df0` — confirm the intended value.
df0 = df[df.A == 0]

In [None]:
# Index every row of the filtered table by its (E, T) pair.
df0.index = list(zip(df0['E'], df0['T']))

In [None]:
# Display the filtered, re-indexed table.
df0