## State shaping

In [1]:
import numpy as np
from maro.rl import AbstractStateShaping
from maro.simulator.scenarios.citi_bike import DecisionType, DecisionType, Action

class CitiBikeStateShaping(AbstractStateShaping):
    def __init__(self, *, relative_tick_list, max_neighbor, station_attribute_list):
        super().__init__()
        self._relative_tick_list = relative_tick_list
        self._station_attribute_list = station_attribute_list
        self._max_neighbor = max_neighbor
        self._dim = len(relative_tick_list) * (max_neighbor + 1) * len(station_attribute_list) + max_neighbor + 2 # 2: current station idx, current decision event type

    def __call__(self, decision_event, snapshot_list):
        tick, station_idx, decision_type, action_scope = decision_event.tick, decision_event.station_idx, decision_event.type, decision_event.action_scope
        decision_type = 0 if decision_type == DecisionType.Supply else 1 
        tick_list = [tick + rt for rt in self._relative_tick_list]
        station_idx_list = action_scope.keys()
        features = snapshot_list["stations"][tick_list:station_idx_list:self._station_attribute_list]
        neighbor_idx_list = [i for i in station_idx_list if i != station_idx]
        features = np.concatenate((features, np.array(neighbor_idx_list), np.array([station_idx, decision_type])))

        return features.astype(np.float32)

    @property
    def dim(self):
        return self._dim
    

state_shaping = CitiBikeStateShaping(relative_tick_list=[-2, -1, 0], max_neighbor=20,\
                                     station_attribute_list=["bikes", "shortage", "trip_requirement", "capacity", "weekday", 
                                                             "temperature", "weather", "holiday", "extra_cost", "transfer_cost"])

## Build model

In [2]:
from collections import defaultdict

from torch.nn.functional import smooth_l1_loss
from torch.optim import Adam

from maro.rl.models.torch.learning_model import LearningModel
from maro.rl.models.torch.mlp_representation import MLPRepresentation
from maro.rl.models.torch.q_decision import QDecision
from maro.rl.algorithms.torch.dqn import DQN


# TODO: env.get_agent_idx_list
agent_idx_list = [i for i in range(449)]
neighbor_action_dim = 21
reposition_action_dim = 21

supply_alg_list = []
demand_alg_list = []

# build algorithm list
for idx in agent_idx_list:
    shared_representation_layers = MLPRepresentation(name="shared_representation",\
                                                     input_dim=state_shaping.dim,\
                                                     hidden_dims=[state_shaping.dim//2, state_shaping.dim//4],
                                                     output_dim=state_shaping.dim//4,
                                                     dropout_p=0.1)

    supply_neighbor_decision_layers = QDecision(name="supply_neighbor_decision",\
                                                input_dim=shared_representation_layers.output_dim,
                                                hidden_dims=None,
                                                output_dim=neighbor_action_dim)

    demand_neighbor_decision_layers = QDecision(name="demand_neighbor_decision",\
                                                input_dim=shared_representation_layers.output_dim,
                                                hidden_dims=None,
                                                output_dim=neighbor_action_dim)

    supply_reposition_decision_layers = QDecision(name="supply_reposition_decision",\
                                                  input_dim=shared_representation_layers.output_dim,
                                                  hidden_dims=None,
                                                  output_dim=reposition_action_dim)

    demand_reposition_decision_layers = QDecision(name="demand_reposition_decision",\
                                                  input_dim=shared_representation_layers.output_dim,
                                                  hidden_dims=None,
                                                  output_dim=reposition_action_dim)

    # build supply neighbor algorithm
    supply_neighbor_learning_model = LearningModel(representation_layers=shared_representation_layers,
                                                   decision_layers=supply_neighbor_decision_layers)
    supply_neighbor_alg = DQN(num_actions=neighbor_action_dim, gamma=0.9, eval_model=supply_neighbor_learning_model,\
                              loss_func=smooth_l1_loss, optimizer_cls=Adam, optimizer_params={"lr":1e-3},
                              replace_target_frequency=2)
    
    # build demand neighbor algorithm
    demand_neighbor_learning_model = LearningModel(representation_layers=shared_representation_layers,
                                                   decision_layers=demand_neighbor_decision_layers)
    demand_neighbor_alg = DQN(num_actions=neighbor_action_dim, gamma=0.9, eval_model=demand_neighbor_learning_model,\
                              loss_func=smooth_l1_loss, optimizer_cls=Adam, optimizer_params={"lr":1e-3},
                              replace_target_frequency=2)

    # build supply reposition algorithm
    supply_reposition_learning_model = LearningModel(representation_layers=shared_representation_layers,
                                                     decision_layers=supply_reposition_decision_layers)
    supply_reposition_alg = DQN(num_actions=reposition_action_dim, gamma=0.9, eval_model=supply_reposition_learning_model,\
                              loss_func=smooth_l1_loss, optimizer_cls=Adam, optimizer_params={"lr":1e-3},
                              replace_target_frequency=2)

    # build demand reposition algorithm
    demand_reposition_learning_model = LearningModel(representation_layers=shared_representation_layers,
                                                     decision_layers=demand_reposition_decision_layers)
    demand_reposition_alg = DQN(num_actions=reposition_action_dim, gamma=0.9, eval_model=demand_reposition_learning_model,\
                              loss_func=smooth_l1_loss, optimizer_cls=Adam, optimizer_params={"lr":1e-3},
                              replace_target_frequency=2)
    
    supply_alg_list.append({
        "neighbor": supply_neighbor_alg,
        "reposition": supply_reposition_alg
    })
    demand_alg_list.append({
        "neighbor": demand_neighbor_alg,
        "reposition": demand_reposition_alg
    })

## Action shaping

In [3]:
import math

from maro.rl import AbstractActionShaping
from maro.simulator.scenarios.citi_bike import DecisionType, Action

class CitiBikeActionShaping(AbstractActionShaping):
    def __init__(self, action_space: [float]):
        super().__init__(action_space)
        self._action_space = action_space

    def __call__(self, model_action, decision_event) -> Action:
        neighbor_action, reposition_action = model_action
    
        action = None
        if neighbor_action == 0 or reposition_action == 0:
            return action

        if decision_event.type == DecisionType.Supply:
            self_bike = decision_event.action_scope[decision_event.station_idx]
            target_idx = neighbor_action
            target_dock = list(decision_event.action_scope.keys())[target_idx]
            available_supplied_bike = min(self_bike, target_dock)
            supplied_bike = math.ceil(self._action_space[reposition_action] * available_supplied_bike)
            action = Action(decision_event.station_idx, target_idx, supplied_bike)
        elif decision_event.type == DecisionType.Demand:
            self_dock = decision_event.action_scope[decision_event.station_idx]
            target_idx = neighbor_action
            target_bike = list(decision_event.action_scope.keys())[target_idx]
            available_demanded_bike = min(self_dock, target_bike)
            demanded_bike = math.ceil(self._action_space[reposition_action] * available_demanded_bike)
            action = Action(target_idx, decision_event.station_idx, demanded_bike)
        else:
            pass
        
        return action

    @property
    def action_space(self):
        return self._action_space
    
    
action_shaping = CitiBikeActionShaping(action_space=[i/(neighbor_action_dim-1) for i in range(neighbor_action_dim)])

## Reward shaping

In [4]:
import numpy as np
import pickle

from maro.rl import AbstractRewardShaping, ReplayBuffer
from maro.simulator.scenarios.citi_bike import DecisionEvent

import numpy as np

from maro.rl import AbstractRewardShaping, ReplayBuffer
from maro.simulator.scenarios.citi_bike import DecisionEvent


class CitiBikeRewardShaping(AbstractRewardShaping):
    def __init__(self, fulfillment_factor: float = 1.0, shortage_factor: float = 1.0, cost_factor: float = 0.01,
                 time_window: int = 10, time_decay: float = 0.97):
        super().__init__()
        self._fulfillment_factor = fulfillment_factor
        self._shortage_factor = shortage_factor
        self._cost_factor = cost_factor
        self._time_window = time_window
        self._time_decay = time_decay

    def __call__(self, snapshot_list, replay_buffer: ReplayBuffer, state_shaping):
        for i in range(replay_buffer.size):
            decision_type = replay_buffer.get(i, "decision_type")
            station_idx = replay_buffer.get(i, "station_idx")
            action_scope = replay_buffer.get(i, "action_scope")
            neighbor_idxs = [ne for ne in action_scope.keys() if ne != station_idx]

            start_tick = replay_buffer.get(i, "tick")
            end_tick = start_tick + self._time_window

            ticks = list(range(start_tick, end_tick))
            # broadcast
            global_decay_list = [self._time_decay ** i for i in range(self._time_window)
                                 for _ in range(449)]
            local_decay_list = [self._time_decay ** i for i in range(self._time_window)
                                for _ in range(len(neighbor_idxs))]

            global_fulfillment = np.dot(snapshot_list["stations"][ticks::"fulfillment"], global_decay_list)
            local_fulfillment = np.dot(snapshot_list["stations"][ticks:neighbor_idxs:"fulfillment"], local_decay_list)

            global_shortage = np.dot(snapshot_list["stations"][ticks::"shortage"], global_decay_list)
            local_shortage = np.dot(snapshot_list["stations"][ticks:neighbor_idxs:"shortage"], local_decay_list)

            global_cost = np.dot(snapshot_list["stations"][ticks::"transfer_cost"], global_decay_list)
            local_cost = np.dot(snapshot_list["stations"][ticks:neighbor_idxs:"transfer_cost"], local_decay_list)

            replay_buffer.put(i, "decision_type", decision_type)
            replay_buffer.put(i, "global_reward", np.float32(self._fulfillment_factor * global_fulfillment -
                                                             self._shortage_factor * global_shortage -
                                                             self._cost_factor * global_cost))

            replay_buffer.put(i, "local_reward", np.float32(self._fulfillment_factor * local_fulfillment -
                                                            self._shortage_factor * local_shortage -
                                                            self._cost_factor * local_cost))

            hack_decision_event = DecisionEvent(station_idx=station_idx, tick=end_tick,
                                                frame_index=end_tick, action_scope_func=None,
                                                decision_type=decision_type)
            hack_decision_event._action_scope = action_scope

            replay_buffer.put(i, "next_state", state_shaping(hack_decision_event, snapshot_list))

                  
reward_shaping = CitiBikeRewardShaping()

## Init replay buffer

In [5]:
from maro.utils import ExperiencePool, ExperiencePoolType
from maro.rl import ReplayBuffer
from maro.simulator.scenarios.citi_bike import DecisionType

class CitiBikeReplayBuffer(ReplayBuffer):
    def __init__(self, experience_pool: ExperiencePool):
        super().__init__(experience_pool)

    def sample_from_experience_pool(self, batch_size, is_local=True):
        neighbor_idx_list = self._experience_store.apply_multi_samplers(
            category_samplers=[("neighbor_td_error", [(lambda i, o: (i, o), batch_size)])])["neighbor_td_error"]
        reposition_idx_list = self._experience_store.apply_multi_samplers(
            category_samplers=[("reposition_td_error", [(lambda i, o: (i, o), batch_size)])])["reposition_td_error"]

        if is_local:
            neighbor_sample_dict = self._experience_store.get(category_idx_batches=[
                ('state', neighbor_idx_list),
                ('local_reward', neighbor_idx_list),
                ('neighbor_action', neighbor_idx_list),
                ('next_state', neighbor_idx_list)
            ])
            reposition_sample_dict = self._experience_store.get(category_idx_batches=[
                ('state', reposition_idx_list),
                ('local_reward', reposition_idx_list),
                ('reposition_action', reposition_idx_list),
                ('next_state', reposition_idx_list)
            ])
        else:
            neighbor_sample_dict = self._experience_store.get(category_idx_batches=[
                ('state', neighbor_idx_list),
                ('global_reward', neighbor_idx_list),
                ('neighbor_action', neighbor_idx_list),
                ('next_state', neighbor_idx_list)
            ])
            reposition_sample_dict = self._experience_store.get(category_idx_batches=[
                ('state', reposition_idx_list),
                ('global_reward', reposition_idx_list),
                ('reposition_action', reposition_idx_list),
                ('next_state', reposition_idx_list)
            ])

        neighbor_res_dict = {}
        for key in neighbor_sample_dict.keys():
            if key == "local_reward" or key == "global_reward":
                neighbor_res_dict["reward"] = neighbor_sample_dict[key]
            elif key == "neighbor_action":
                neighbor_res_dict["action"] = neighbor_sample_dict[key]
            else:
                neighbor_res_dict[key] = neighbor_sample_dict[key]
                
        reposition_res_dict = {}
        for key in reposition_sample_dict.keys():
            if key == "local_reward" or key == "global_reward":
                reposition_res_dict["reward"] = reposition_sample_dict[key]
            elif key == "reposition_action":
                reposition_res_dict["action"] = reposition_sample_dict[key]
            else:
                reposition_res_dict[key] = reposition_sample_dict[key]

        for k in neighbor_res_dict.keys():
            neighbor_res_dict[k] = np.asarray(neighbor_res_dict[k])

        for k in reposition_res_dict.keys():
            reposition_res_dict[k] = np.asarray(reposition_res_dict[k])

        return neighbor_idx_list, neighbor_res_dict, reposition_idx_list, reposition_res_dict

supply_experience_pool_list = [ExperiencePool(ExperiencePoolType.FixedSize, 20000, "random") for _ in agent_idx_list]
supply_replay_buffer_list = [CitiBikeReplayBuffer(supply_experience_pool_list[idx]) for idx in agent_idx_list]
demand_experience_pool_list = [ExperiencePool(ExperiencePoolType.FixedSize, 20000, "random") for _ in agent_idx_list]
demand_replay_buffer_list = [CitiBikeReplayBuffer(demand_experience_pool_list[idx]) for idx in agent_idx_list]

## Create agents

In [6]:
from maro.rl import Agent, AgentMode, TrainingLoopHyperparameters
from maro.simulator.scenarios.citi_bike import DecisionEvent

class CitiBikeAgent(Agent):
    def __init__(self, name, algorithm, replay_buffer, training_loop_hyperparams,
                 mode: AgentMode = AgentMode.TRAIN_INFERENCE, state_shaping=None, action_shaping=None,
                 reward_shaping=None):
        super().__init__(name, algorithm, replay_buffer, training_loop_hyperparams, mode, state_shaping,
                         action_shaping, reward_shaping)

    def choose_action(self, decision_event: DecisionEvent, snapshot_list, exploration_rate=0):
        self._assert_inference_mode()
        model_state = self._state_shaping(decision_event, snapshot_list)
        neighbor_action = self._algorithm["neighbor"].choose_action(model_state, exploration_rate)
        reposition_action = self._algorithm["reposition"].choose_action(model_state, exploration_rate)
        env_action = self._action_shaping((neighbor_action, reposition_action), decision_event)

        self._current_transition = {"state": model_state,
                                    "next_state": None,
                                    "neighbor_action": neighbor_action,
                                    "reposition_action": reposition_action,
                                    "tick": decision_event.tick,
                                    "neighbor_td_error": 1e7,
                                    "reposition_td_error": 1e7,
                                    "local_reward": None,
                                    "global_reward": None,
                                    "decision_type": decision_event.type,
                                    "station_idx": decision_event.station_idx,
                                    "action_scope": decision_event.action_scope,
                                    }

        return env_action

    def postprocess(self, snapshot_list):
        self._assert_inference_mode()
        if self._reward_shaping is not None:
            self._reward_shaping(snapshot_list, self._trajectory, self._state_shaping)

    def train(self):
        self._assert_train_mode()
        if self._trajectory.num_experiences < self._min_experiences_to_train:
            return

        for _ in range(self._num_steps):
            neighbor_idx_list, neighbor_bath, reposition_idx_list, reposition_bath = self._trajectory.sample_from_experience_pool(
                self._batch_size)

            neighbor_loss = self._algorithm["neighbor"].train_on_batch(neighbor_bath)
            self._trajectory.update_experience_pool(neighbor_idx_list, "neighbor_td_error", neighbor_loss)
            reposition_loss = self._algorithm["reposition"].train_on_batch(reposition_bath)
            self._trajectory.update_experience_pool(reposition_idx_list, "reposition_td_error", reposition_loss)

    
training_loop_hyperparams = TrainingLoopHyperparameters(num_steps=10, batch_size=128, min_experiences_to_train=1024)

supply_agent_dict = {}
demand_agent_dict = {}
agent_idx_list = [i for i in range(449)]
for agent_idx in agent_idx_list :
    supply_agent_dict[agent_idx] = CitiBikeAgent(name=agent_idx,
                                                 algorithm=supply_alg_list[agent_idx],
                                                 replay_buffer=supply_replay_buffer_list[agent_idx],
                                                 training_loop_hyperparams=training_loop_hyperparams,
                                                 mode=AgentMode.TRAIN_INFERENCE,
                                                 state_shaping=state_shaping,
                                                 action_shaping=action_shaping,
                                                 reward_shaping=reward_shaping)

    demand_agent_dict[agent_idx] = CitiBikeAgent(name=agent_idx,
                                                 algorithm=demand_alg_list[agent_idx],
                                                 replay_buffer=demand_replay_buffer_list[agent_idx],
                                                 training_loop_hyperparams=training_loop_hyperparams,
                                                 mode=AgentMode.TRAIN_INFERENCE,
                                                 state_shaping=state_shaping,
                                                 action_shaping=action_shaping,
                                                 reward_shaping=reward_shaping)


## Train/Test

In [None]:
# training phase
max_train_ep = 100
initial_train_seed = 1024
max_eps = 0.4

import time
from random import randint

from tqdm import tqdm

from maro.simulator import Env
from maro.simulator.scenarios.citi_bike.common import Action, DecisionEvent, DecisionType

# 1st step: init CitiBike environment, tick unit is minute, resolution unit is tick
train_env = Env(scenario="citi_bike", topology="arthur_test", start_tick=0, durations=3600, snapshot_resolution=10)
test_env = Env(scenario="citi_bike", topology="arthur_test", start_tick=0, durations=3600, snapshot_resolution=10)

def exploration_schedule(ep, max_eps, max_train_ep):
    step = max_eps/max_train_ep
    return max_eps - (ep + 1) * step

for ep in range(max_train_ep):
    is_done: bool = False
    reward: int = None
    decision_event: DecisionEvent = None
    action: Action = None
    decision_count = 0
    epsilon = exploration_schedule(ep, max_eps, max_train_ep)
    
    env_start = time.time()
    reward, decision_event, is_done = train_env.step(action)
    while not is_done:
        if decision_event.type == DecisionType.Supply:
            cur_agent = supply_agent_dict[decision_event.station_idx]
        elif decision_event.type == DecisionType.Demand:
            cur_agent = demand_agent_dict[decision_event.station_idx]

        action = cur_agent.choose_action(decision_event, train_env.snapshot_list, epsilon)
        reward, decision_event, is_done = train_env.step(action)
        cur_agent.on_env_feedback(reward, train_env.snapshot_list)
        decision_count += 1
        
    env_end = time.time()
    print(f"ep-{ep} env run time: {(env_end-env_start)/60} min")
    print(f"ep-{ep} epsilon: {epsilon}")
    print(f"ep-{ep} total decision number: {decision_count}")
    no_action_total_shortage = train_env.snapshot_list["stations"][::"shortage"].sum()
    print(f"ep-{ep} total shortage: {no_action_total_shortage}")
    no_action_total_fulfillment = train_env.snapshot_list["stations"][::"fulfillment"].sum()
    print(f"ep-{ep} total fulfillment: {no_action_total_fulfillment}")
    no_action_total_trip_requirement = train_env.snapshot_list["stations"][::"trip_requirement"].sum()
    print(f"ep-{ep} total trip requirement: {no_action_total_trip_requirement}")
    
    for supply_agent, demand_agent in tqdm(zip(supply_agent_dict.values(), demand_agent_dict.values())):
        supply_train_start = time.time()
        supply_agent.postprocess(train_env.snapshot_list)
        supply_agent.flush_replay_buffer()
        supply_agent.train()
        supply_train_end = time.time()
        # print(f"ep-{ep} supply agent {supply_agent._agent_name} train time: {(supply_train_end-supply_train_start)/60} min")
        demand_train_start = time.time()
        demand_agent.postprocess(train_env.snapshot_list)
        demand_agent.flush_replay_buffer()
        demand_agent.train()
        demand_train_end = time.time()
        # print(f"ep-{ep} demand agent {demand_agent._agent_name} train time: {(demand_train_end-demand_train_start)/60} min")

    train_env.reset()


# Test
is_done: bool = False
reward: int = None
decision_event: DecisionEvent = None
action: Action = None
decision_count = 0
epsilon = 0

reward, decision_event, is_done = test_env.step(action)
while not is_done:
    if decision_event.type == DecisionType.Supply:
        cur_agent = supply_agent_dict[decision_event.station_idx]
    elif decision_event.type == DecisionType.Demand:
        cur_agent = demand_agent_dict[decision_event.station_idx]

    action = cur_agent.choose_action(decision_event, test_env.snapshot_list, epsilon)
    reward, decision_event, is_done = train_env.step(action)
    cur_agent.on_env_feedback(reward, train_env.snapshot_list)
    decision_count += 1

print(f"test phase total decision number: {decision_count}")
no_action_total_shortage = test_env.snapshot_list["stations"][::"shortage"].sum()
print(f"test phase total shortage: {no_action_total_shortage}")
no_action_total_fulfillment = test_env.snapshot_list["stations"][::"fulfillment"].sum()
print(f"test phase total fulfillment: {no_action_total_fulfillment}")
no_action_total_trip_requirement = test_env.snapshot_list["stations"][::"trip_requirement"].sum()
print(f"test phase total trip requirement: {no_action_total_trip_requirement}")

2it [00:00, 17.56it/s]

ep-0 env run time: 0.4954318086306254 min
ep-0 epsilon: 0.396
ep-0 total decision number: 2159
ep-0 total shortage: 2204.0
ep-0 total fulfillment: 50631.0
ep-0 total trip requirement: 52835.0


449it [00:06, 70.14it/s] 
2it [00:00, 15.70it/s]

ep-1 env run time: 0.48678418397903445 min
ep-1 epsilon: 0.392
ep-1 total decision number: 2101
ep-1 total shortage: 2391.0
ep-1 total fulfillment: 50444.0
ep-1 total trip requirement: 52835.0


449it [00:05, 84.40it/s] 
0it [00:00, ?it/s]

ep-2 env run time: 0.5125433882077535 min
ep-2 epsilon: 0.388
ep-2 total decision number: 2210
ep-2 total shortage: 2505.0
ep-2 total fulfillment: 50330.0
ep-2 total trip requirement: 52835.0


449it [00:05, 78.18it/s] 
