Solving the package-delivery problem using single-agent PPO with a naive feature representation: all features are concatenated into a single state vector, and the actions of the multiple robots are modeled as a multi-discrete distribution.

In [1]:
%%capture
# Fetch the environment code and make the repository the working directory,
# so that `env.py`, `requirements.txt` and the map files resolve by relative path.
!git clone https://github.com/cuongtv312/marl-delivery.git
%cd marl-delivery
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -r requirements.txt

In [2]:
%%capture
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install stable-baselines3

In [3]:
from env import Environment
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [16]:
# TODO: Modify this one to add more information to the Agents

# def convert_state(state):
#     ret_state = {}
#     # state["time_step"] = np.array([state["time_step"]]).astype(np.float32).flatten(0)
#     # state["map"] = np.array(state["map"]).astype(np.float32)
#     ret_state["robots"] = np.array(state["robots"]).astype(np.float32).flatten()
#     ret_state["packages"] = np.array(state["packages"]).astype(np.float32).flatten()[:100]
#     if len(ret_state["packages"]) < 1000:
#         ret_state["packages"] = np.concatenate((ret_state["packages"], np.zeros(100-len(ret_state["packages"]))))
#     return np.concatenate(list(ret_state.values()))

import numpy as np

def convert_state(state):
    """Flatten an environment state dict into a single 1-D float32 vector.

    The vector is the concatenation, in this order, of:

    1. ``state["robots"]`` flattened and divided by 100,
    2. ``state["packages"]`` flattened, divided by 100, then zero-padded or
       truncated to exactly 100 values,
    3. ``state["time_step"]`` divided by 1000 (only if the key is present),
    4. ``state["map"]`` flattened and divided by 10 (only if the key is
       present and the map has fewer than 1000 cells).

    Args:
        state: Mapping with at least the keys ``"robots"`` and ``"packages"``,
            each convertible to a numeric NumPy array.

    Returns:
        A 1-D ``np.float32`` array. The dtype is always float32 so that it
        matches an observation space declared with ``dtype=np.float32``.
    """
    # Length of the package segment, counted in flattened VALUES (not packages).
    max_package_values = 100

    parts = []

    # Robots: ravel() copes with any dimensionality, including an empty list.
    # The /100 scale assumes coordinates lie in [0, 100] -- TODO confirm.
    robots = np.asarray(state["robots"], dtype=np.float32).ravel() / 100.0
    parts.append(robots)

    # Packages: copy into a pre-zeroed float32 buffer. This pads or truncates
    # without the float64 upcast that concatenating with np.zeros(...) causes.
    packages = np.asarray(state["packages"], dtype=np.float32).ravel() / 100.0
    package_segment = np.zeros(max_package_values, dtype=np.float32)
    n_kept = min(packages.size, max_package_values)
    package_segment[:n_kept] = packages[:n_kept]
    parts.append(package_segment)

    # Optional: current time step, scaled by 1/1000.
    if "time_step" in state:
        parts.append(np.array([state["time_step"]], dtype=np.float32) / 1000.0)

    # Optional: the map, only when it is small enough to keep the vector short.
    if "map" in state:
        map_data = np.asarray(state["map"], dtype=np.float32)
        if map_data.size < 1000:
            parts.append(map_data.ravel() / 10.0)

    return np.concatenate(parts).astype(np.float32, copy=False)

In [None]:
# TODO: Modify this one to make the agent learn faster

# def reward_shaping(r, env, state, action):
#     return r

def reward_shaping(r, env, state, action):
    """Return the environment reward ``r`` adjusted by shaping terms.

    Shaping terms applied:

    * -1.0 when every robot is idle,
    * -5.0 when ``env`` exposes a truthy ``collision_occurred`` attribute,
    * -0.01 on every call, as a time penalty.

    Args:
        r: Raw reward returned by the environment for this step.
        env: The underlying environment; only ``collision_occurred`` is read,
            and only if the attribute exists.
        state: State observed before the step (currently unused).
        action: Either the decoded joint action, a sequence of
            ``(move, package_op)`` string pairs as built by ``Env.step``, or a
            raw numeric action array.

    Returns:
        The shaped reward as a number.
    """
    shaped_reward = r

    # # Reward for picking up or delivering a package
    # if env.last_event == "delivery":
    #     shaped_reward += 10.0
    # elif env.last_event == "pickup":
    #     shaped_reward += 2.0

    # Small negative reward for idling.
    # Env.step passes the decoded action (strings), for which the comparison
    # `== 0` is never true, so string-coded actions are checked by label.
    joint_action = np.asarray(action)
    if joint_action.dtype.kind == "U":
        pairs = joint_action.reshape(-1, 2)
        # 'S' / '0' are presumed to be the "stay" / "no package operation"
        # codes -- TODO confirm against Environment.
        all_idle = bool(np.all(pairs[:, 0] == "S") and np.all(pairs[:, 1] == "0"))
    else:
        # Raw numeric action: keep the original "all components are zero" test.
        all_idle = bool(np.all(joint_action == 0))
    if all_idle:
        shaped_reward -= 1.0

    # Penalty for collisions, if the environment reports them.
    if getattr(env, "collision_occurred", False):
        shaped_reward -= 5.0

    # Slight negative reward on every step to encourage faster completion.
    shaped_reward -= 0.01

    return shaped_reward

In [18]:
# Avoid modifying the Env class.
# If it is necessary, describe those changes clearly in the report and in the code.
class Env(gym.Env):
    """Gymnasium adapter that exposes the project ``Environment`` to a single agent.

    The joint action of all robots is one ``MultiDiscrete`` vector holding a
    ``(move, package_op)`` integer pair per robot. Observations are the flat
    vectors produced by ``convert_state``.
    """

    def __init__(self, *args, **kwargs):
        """Build the wrapped environment; all arguments are forwarded to ``Environment``."""
        super().__init__()
        self.env = Environment(*args, **kwargs)

        # Integer <-> label codecs for the two action components.
        # NOTE(review): LabelEncoder keeps its classes in sorted order, so integer
        # k decodes to the k-th label alphabetically, not to the k-th label in
        # the lists below.
        from sklearn.preprocessing import LabelEncoder
        self.le1 = LabelEncoder()
        self.le2 = LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0', '1', '2'])

        # One (5-way, 3-way) pair of discrete choices per robot.
        per_robot_choices = [5, 3]
        self.action_space = spaces.multi_discrete.MultiDiscrete(
            per_robot_choices * self.env.n_robots
        )

        # Reset once so that the observation length can be measured from a
        # real converted state.
        self.prev_state = self.env.reset()
        sample_observation = convert_state(self.prev_state)
        self.observation_space = spaces.Box(
            low=0,
            high=100,
            shape=sample_observation.shape,
            dtype=np.float32,
        )

    def reset(self, *args, **kwargs):
        """Reset the wrapped environment and return ``(observation, info)``.

        Positional and keyword arguments are accepted but not forwarded.
        """
        self.prev_state = self.env.reset()
        observation = convert_state(self.prev_state)
        return observation, {}

    def render(self, *args, **kwargs):
        """Delegate rendering to the wrapped environment."""
        return self.env.render()

    def step(self, action):
        """Apply one joint action and return the Gymnasium 5-tuple.

        ``action`` is the flat integer vector sampled from ``action_space``. It
        is decoded into one ``(move, package_op)`` label pair per robot before
        being handed to the wrapped environment. The ``truncated`` flag is
        always ``False``.
        """
        per_robot = action.reshape(-1, 2)
        move_labels = self.le1.inverse_transform(per_robot[:, 0])
        package_labels = self.le2.inverse_transform(per_robot[:, 1])
        action = list(zip(move_labels, package_labels))

        # You should not modify the infos object
        next_state, reward, done, infos = self.env.step(action)
        shaped = reward_shaping(reward, self.env, self.prev_state, action)
        self.prev_state = next_state

        return convert_state(next_state), shaped, done, False, infos

In [None]:
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback


# Training configuration (kept in one place so that it is easy to tune).
N_ENVS = 10                     # number of parallel training environments
TOTAL_TIMESTEPS = 100000        # total environment steps used for training
EVAL_EVERY_TIMESTEPS = 5000     # evaluate roughly every this many training timesteps
# Positional arguments forwarded unchanged to Environment(...).
ENV_ARGS = ('map1.txt', 100, 5, 20, -0.01, 10., 1., 10)

# Parallel environments
vec_env = make_vec_env(lambda: Env(*ENV_ARGS), n_envs=N_ENVS)
eval_env = Monitor(Env(*ENV_ARGS), "ppo_delivery")

# EvalCallback counts calls to the vectorised env.step(), and each call advances
# N_ENVS timesteps, so the frequency is divided by N_ENVS to evaluate every
# EVAL_EVERY_TIMESTEPS timesteps rather than every EVAL_EVERY_TIMESTEPS * N_ENVS.
eval_callback = EvalCallback(eval_env, best_model_save_path="./best_model/",
                             log_path="./logs/",
                             eval_freq=max(EVAL_EVERY_TIMESTEPS // N_ENVS, 1),
                             deterministic=True, render=False)

model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=eval_callback)
model.save("ppo_delivery")


Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -4.08    |
| time/              |          |
|    fps             | 3643     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 20480    |
---------------------------------


In [22]:
# Roll out one episode with the trained model and print the final step's info.
# `reset_info` / `truncated` are named explicitly instead of being bound to `_`,
# which IPython uses for the previous cell output.
obs, reset_info = eval_env.reset()
while True:
    # NOTE(review): predict() is called without deterministic=True, whereas the
    # EvalCallback evaluated deterministically -- confirm that sampling is intended here.
    action, _states = model.predict(obs)
    obs, rewards, dones, truncated, info = eval_env.step(action)
    #print('='*10)
    #eval_env.unwrapped.env.render()
    # Stop on either end-of-episode signal so that a finished episode is never stepped again.
    if dones or truncated:
        break

print(info)

{'total_reward': -3.2599999999999976, 'total_time_steps': 100, 'episode': {'r': -4.26, 'l': 100, 't': 38.620815}}
