### **Imports**

In [52]:
# Unity Environment
from mlagents_envs.environment import UnityEnvironment
from util import UnityParallelEnv
from wrappers import ConcatParallelEnv

# Utils
import numpy as np
from copy import copy

## **Analyze Environment**

In [2]:
ENV_NAME = "SoccerTwos"

def create_env(no_graphics=False):
    """Start the SoccerTwos Unity build and wrap it in ``UnityParallelEnv``.

    Args:
        no_graphics: Forwarded unchanged to ``UnityEnvironment``.

    Returns:
        The ``UnityParallelEnv`` built around the newly started Unity environment.
    """
    # A fresh random worker id is drawn on every call
    # (presumably so concurrent instances do not collide — TODO confirm).
    worker_id = np.random.randint(10000)
    unity_env = UnityEnvironment(
        file_name="../../../envs/SoccerTwos",
        worker_id=worker_id,
        no_graphics=no_graphics,
    )

    # Expose the Unity environment through the project's parallel-API adapter
    # (the PettingZoo-style parallel interface used by the rest of this notebook).
    return UnityParallelEnv(unity_env)

### **Test**

In [3]:
def test_env(env):
    # Agents
    print("Number of Agents:", len(env.agents))

    # Get example agent
    agent = env.agents[0]
    print("Example Agent:", agent)

    # Print action space
    action_space = env.action_space(agent)
    print("Agent Action Space:", action_space, type(action_space))

    # Print observation space
    observation_space = env.observation_space(agent)
    print("Agent Observation Space:", observation_space, type(observation_space))

    # Each observation/action is a dict keyed with agent with value the individual action/obs space of each agent
    # Each agent in this case should share the same policy (-> all have the same obs/action space)

    obs, info = env.reset()
    print("Observation Example:", obs[agent])

    # Simple test
    while env.agents:
        actions = {a: env.action_space(a).sample() for a in env.agents}
        print("Action Example:", actions[agent], type(actions[agent]))
        obs, rew, term, trunc, info = env.step(actions)
        print("Reward:", term)
        print("Term:", term)
        print("Trunc:", trunc)
        print("Info:", info)
        break

    # Should do nothing, test to see if it exists
    env.render() 

In [4]:
env = create_env(no_graphics=True)
try:
    test_env(env)
finally:
    # Always close the environment, even if the smoke test raises, so the
    # Unity instance started by create_env is not left running.
    env.close()

'''OUTPUT:
Number of Agents: 32
Example Agent: SoccerTwos?team=0?agent_id=10
Agent Action Space: MultiDiscrete([3 3 3]) <class 'gymnasium.spaces.multi_discrete.MultiDiscrete'>
Agent Observation Space: Tuple(Box(-inf, inf, (264,), float32), Box(-inf, inf, (72,), float32)) <class 'gymnasium.spaces.tuple.Tuple'>
'''

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

"OUTPUT:\nNumber of Agents: 32\nExample Agent: SoccerTwos?team=0?agent_id=10\nAgent Action Space: MultiDiscrete([3 3 3]) <class 'gymnasium.spaces.multi_discrete.MultiDiscrete'>\nAgent Observation Space: Tuple(Box(-inf, inf, (264,), float32), Box(-inf, inf, (72,), float32)) <class 'gymnasium.spaces.tuple.Tuple'>\n"

### Reduce Observation/Action Space
Currently the observation space is split into two 1D vectors, and the action space into three discrete outputs with three options each. For our model, and for full compatibility with PettingZoo, it is better not to unnecessarily divide things that can be concatenated, especially the action space, which is really a single choice among 27 options rather than three independent sets of three options.
- Concat observation vector.
- Flatten 3D action space into 1D. (including action_mask)

In [4]:
def create_concat_env(no_graphics=False):
    """Create the SoccerTwos parallel env and wrap it in ``ConcatParallelEnv``.

    The wrapper concatenates the observation vectors and flattens the action
    space (in the recorded run: ``Discrete(27)`` actions and a ``Dict`` of
    ``action_mask`` / ``observation`` with a 336-wide observation).

    Args:
        no_graphics: Forwarded to ``create_env`` (and from there to
            ``UnityEnvironment``).

    Returns:
        The ``ConcatParallelEnv`` wrapping the parallel Unity environment.
    """
    # Reuse create_env so the Unity launch settings (build path, worker id)
    # are defined in exactly one place instead of being copy-pasted here.
    parallel_env = create_env(no_graphics=no_graphics)

    # Concat observation and flatten action
    return ConcatParallelEnv(parallel_env)

In [6]:
env = create_concat_env(no_graphics=True)
try:
    test_env(env)
finally:
    # Always close the environment, even if the smoke test raises, so the
    # Unity instance behind the wrapper is not left running.
    env.close()

'''OUTPUT:
Number of Agents: 32
Example Agent: SoccerTwos?team=0?agent_id=10
Agent Action Space: Discrete(27) <class 'gymnasium.spaces.discrete.Discrete'>
Agent Observation Space: Dict('action_mask': MultiBinary(27), 'observation': Box(-inf, inf, (336,), float32)) <class 'gymnasium.spaces.dict.Dict'>
'''

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

"OUTPUT:\nNumber of Agents: 32\nExample Agent: SoccerTwos?team=0?agent_id=10\nAgent Action Space: Discrete(27) <class 'gymnasium.spaces.discrete.Discrete'>\nAgent Observation Space: Dict('action_mask': MultiBinary(27), 'observation': Box(-inf, inf, (336,), float32)) <class 'gymnasium.spaces.dict.Dict'>\n"

### Official Test

In [None]:
# PettingZoo's parallel API conformance test, currently disabled.
# NOTE(review): the output recorded under this cell is
# `RuntimeError: Observation was not a dict: [...]`; confirm whether it still
# reproduces with the current wrappers before re-enabling.
# from pettingzoo.test import parallel_api_test
# env = create_concat_env(no_graphics=True)
# parallel_api_test(env, num_cycles=10000)
# env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

RuntimeError: Observation was not a dict: [array([0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.9228603 , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.4873765 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.16673389, 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.43415108, 0.        , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.15238026,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.3524268 , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.98099804, 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.30792153, 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.7885321 , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.28443366,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.60545486, 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.92139846, 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.47987977, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 1.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.4157082 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.16901653, 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.33745557, 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.16291979, 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.29484093, 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.81018656,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.27235085, 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.6220816 , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.7194396 , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.4762687 , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.952659  ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.36797518, 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.25066006, 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.3108753 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.16567215, 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.27969792,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.17185207, 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.26471898, 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.702773  ], dtype=float32), array([0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.6577469 , 0.        , 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.25705233, 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.53172666, 0.        ,
       1.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.65244246, 0.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.24826813,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.5214516 , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.67442834, 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.47944722, 0.        ,
       0.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 0.52287793], dtype=float32)]

### **Random Policy Game**

In [9]:
# env = create_concat_env(no_graphics=False)
# obs, info = env.reset()

# timestamps = 0
# while env.agents:
#     timestamps += 1
#     print(f"STEP: {timestamps}, AGENTS: {env.agents}")
#     agent = env.agents[0]

#     if "action_mask" in obs[agent]:
#         action_mask = obs[agent]["action_mask"]
#         print(f"ACTION MASK of {agent}: {action_mask}")

#     actions = {a: env.action_space(a).sample() for a in env.agents}
#     print(f"ACTION of {agent}: {actions[agent]}, INFO: {info[agent]}")
#     obs, rew, term, trunc, info = env.step(actions)

#     for agent, t in term.items():
#         if t:
#             print(f"AGENT TERMINATED: {agent}, REWARD: {rew[agent]}, INFO: {info[agent]}")
#     for agent, t in trunc.items():
#         if t:
#             print(f"AGENT TRUNCATED: {agent}, REWARD: {rew[agent]}, INFO: {info[agent]}")

# env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

# Torch RL

In [90]:
# Torch
import torch
import torch.nn as nn

# Model
from ffn import MLP

### Torch RL
# Env
from torchrl.envs import PettingZooWrapper
from torchrl.envs import TransformedEnv
from torchrl.envs.utils import MarlGroupMapType


# Modules
from tensordict.nn import TensorDictModule, TensorDictSequential, ProbabilisticTensorDictModule
from torchrl.modules import MaskedCategorical

# Data Collection
from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyTensorStorage, ReplayBuffer

In [91]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using Device: {device}")

Using Device: cpu


### Inspect TorchRL Environment

In [92]:
def create_torch_env(no_graphics=True):
    """Build the concatenated SoccerTwos env and wrap it for TorchRL.

    Args:
        no_graphics: Forwarded to ``create_concat_env``.

    Returns:
        A ``PettingZooWrapper`` around the concatenated env, with every agent
        placed in one group (``MarlGroupMapType.ALL_IN_ONE_GROUP``).
    """
    base_env = create_concat_env(no_graphics=no_graphics)

    return PettingZooWrapper(
        env=base_env,
        # use_mask=True,
        group_map=MarlGroupMapType.ALL_IN_ONE_GROUP,
    )

In [93]:
env = create_torch_env()
# Label each line with the attribute that is actually printed: these are the
# env's specs, not lists of keys.
print("action_spec:", env.action_spec)
print("reward_spec:", env.reward_spec)
print("done_spec:", env.done_spec)
print("observation_spec:", env.observation_spec)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

In [94]:
# Run TorchRL's spec check on the wrapped env; the recorded run logged
# "check_env_specs succeeded!".
from torchrl.envs.utils import check_env_specs
check_env_specs(env)

[92m2025-09-04 00:12:43,641 [torchrl][INFO][0m    check_env_specs succeeded![92m [END][0m


## Model

### Model HyperParameters

In [77]:
# Passed to MLP as `hidden_dim` through MODEL_CONFIG.
HIDDEN_DIM = 256
# Passed to MLP as `n_blocks` through MODEL_CONFIG.
N_BLOCKS = 3

In [98]:
# Width of one agent's observation vector. Index the last axis so the value is
# still the feature size if the spec carries additional leading dimensions.
observation_shape = env.observation_spec["agents", "observation", "observation"].shape[-1]
# Number of discrete actions available to one agent.
action_shape = env.action_spec["agents", "action"].space.n

print(f"obs shape: {observation_shape}, action shape: {action_shape}")

obs shape: 336, action shape: 27


In [99]:
# Keyword arguments for MLP, shared by the policy and (minus `out_features`)
# the critic.
MODEL_CONFIG = dict(
    hidden_dim=HIDDEN_DIM,
    n_blocks=N_BLOCKS,
    in_features=observation_shape,
    out_features=action_shape,
)

def create_policy(config):
    """Assemble the masked categorical policy as a ``TensorDictSequential``.

    Args:
        config: Keyword arguments used to construct the ``MLP`` backbone.

    Returns:
        A ``TensorDictSequential`` that first writes ``"logits"`` from the
        per-agent observation and then samples ``("agents", "action")`` from a
        ``MaskedCategorical`` distribution, storing the log-probability under
        ``("agents", "log_prob")``.
    """
    backbone = MLP(**config)

    # Network stage: per-agent observation -> "logits".
    logits_module = TensorDictModule(
        backbone,
        in_keys=[("agents", "observation", "observation")],
        out_keys=["logits"],
    )

    # Sampling stage: logits plus the env's action mask -> action and log-prob.
    dist_in_keys = {"logits": "logits", "mask": ("agents", "action_mask")}
    action_sampler = ProbabilisticTensorDictModule(
        in_keys=dist_in_keys,
        out_keys=[("agents", "action")],
        distribution_class=MaskedCategorical,
        return_log_prob=True,
        log_prob_key=("agents", "log_prob"),
        cache_dist=True,
    )

    return TensorDictSequential(logits_module, action_sampler)

In [102]:
# Sanity-check the policy with a short 5-step rollout and display the result.
policy = create_policy(MODEL_CONFIG)
data = env.rollout(max_steps=5, policy=policy)
data

TensorDict(
    fields={
        agents: TensorDict(
            fields={
                action: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.int64, is_shared=False),
                action_mask: Tensor(shape=torch.Size([5, 32, 27]), device=cpu, dtype=torch.bool, is_shared=False),
                done: Tensor(shape=torch.Size([5, 32, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                log_prob: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.float32, is_shared=False),
                mask: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: TensorDict(
                    fields={
                        observation: Tensor(shape=torch.Size([5, 32, 336]), device=cpu, dtype=torch.float32, is_shared=False)},
                    batch_size=torch.Size([5, 32]),
                    device=None,
                    is_shared=False),
                terminated: Tensor(shape=torch.Size([5, 32, 1

In [103]:
def create_critic(config):
    """Build the state-value critic as a ``TensorDictModule``.

    Args:
        config: Keyword arguments for ``MLP``. Any ``"out_features"`` entry is
            ignored; the caller's mapping is not modified.

    Returns:
        A ``TensorDictModule`` reading ``("agents", "observation", "observation")``
        and writing ``("agents", "state_value")`` from an ``MLP`` with a single
        output feature.
    """
    # Work on a filtered copy so the critic head can be fixed to one output
    # without touching the dict the caller passed in.
    critic_kwargs = {key: value for key, value in config.items() if key != "out_features"}

    critic_net = MLP(out_features=1, **critic_kwargs)
    return TensorDictModule(
        critic_net,
        in_keys=[("agents", "observation", "observation")],
        out_keys=[("agents", "state_value")],
    )

In [104]:
# Build a critic and show its module structure as the cell output.
create_critic(MODEL_CONFIG)

TensorDictModule(
    module=MLP(
      (proj_in): Linear(in_features=336, out_features=256, bias=True)
      (mlp_blocks): ModuleList(
        (0-2): 3 x MLPBlock(
          (norm): RMSNorm((256,), eps=None, elementwise_affine=True)
          (geglu): GeGLU(
            (linear): Linear(in_features=256, out_features=1364, bias=True)
            (gelu): GELU(approximate='none')
          )
          (proj_down): Linear(in_features=682, out_features=256, bias=True)
        )
      )
      (proj_out): Sequential(
        (0): RMSNorm((256,), eps=None, elementwise_affine=True)
        (1): Linear(in_features=256, out_features=1, bias=True)
      )
    ),
    device=cpu,
    in_keys=[('agents', 'observation', 'observation')],
    out_keys=[('agents', 'state_value')])

### Training Hyperparameters

In [108]:
policy = create_policy(MODEL_CONFIG)
# NOTE(review): total_frames=5001 is not a multiple of frames_per_batch=500;
# confirm this is intended.
collector = SyncDataCollector(create_torch_env(), policy, frames_per_batch=500, total_frames=5001)
# replay_buffer = ReplayBuffer(storage=LazyTensorStorage(frames_per_batch), sampler=SamplerWithoutReplacement(), batch_size=minibatch_size)

try:
    for i, tensordict_data in enumerate(collector):
        print(f"Iteration: {i} {tensordict_data}")
finally:
    # The env was created inline, so the collector holds the only handle to it.
    # Shut the collector down even on error or interrupt so the env it owns is
    # closed rather than leaked.
    collector.shutdown()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz



Iteration: 0 TensorDict(
    fields={
        agents: TensorDict(
            fields={
                action: Tensor(shape=torch.Size([500, 32]), device=cpu, dtype=torch.int64, is_shared=False),
                action_mask: Tensor(shape=torch.Size([500, 32, 27]), device=cpu, dtype=torch.bool, is_shared=False),
                done: Tensor(shape=torch.Size([500, 32, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                log_prob: Tensor(shape=torch.Size([500, 32]), device=cpu, dtype=torch.float32, is_shared=False),
                mask: Tensor(shape=torch.Size([500, 32]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: TensorDict(
                    fields={
                        observation: Tensor(shape=torch.Size([500, 32, 336]), device=cpu, dtype=torch.float32, is_shared=False)},
                    batch_size=torch.Size([500, 32]),
                    device=None,
                    is_shared=False),
                terminated: Tensor

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x103a82020>>
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/envs/mlagents3/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 