## **Analyze Environment**

### **Imports**

In [1]:
# Unity Environment
from mlagents_envs.environment import UnityEnvironment
from util import UnityParallelEnv
from wrappers import ConcatParallelEnv

# Utils
import numpy as np
from copy import copy

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


### **Unity -> PettingZoo Parallel Env**

In [2]:
ENV_NAME = "SoccerTwos"


def create_env(no_graphics=False):
    """Start the Unity build named by ``ENV_NAME`` and wrap it as a parallel env.

    Args:
        no_graphics: Forwarded unchanged to ``UnityEnvironment``.

    Returns:
        A ``UnityParallelEnv`` wrapping the newly created ``UnityEnvironment``.
        The caller owns the returned env and should ``close()`` it when done.
    """
    # The build path is derived from ENV_NAME so the name lives in one place.
    # worker_id is drawn uniformly from [0, 10000) -- presumably so that several
    # instances can run at once without sharing an id; confirm against ML-Agents.
    unity_env = UnityEnvironment(
        file_name=f"../../../envs/{ENV_NAME}",
        worker_id=np.random.randint(10000),
        no_graphics=no_graphics,
    )

    # Wrap it into the PettingZoo Parallel API
    return UnityParallelEnv(unity_env)

### **Test/Inspect**

In [3]:
def test_env(env):
    """Print a short inspection report for a parallel multi-agent environment.

    The report covers the number of agents, one example agent with its action
    and observation spaces, an example observation after ``reset()``, and the
    reward / termination / truncation / info dicts produced by one random step.
    Finally ``render()`` is called once to check that the method exists.

    Args:
        env: Object exposing ``agents``, ``action_space(agent)``,
            ``observation_space(agent)``, ``reset()``, ``step(actions)`` and
            ``render()``.

    Returns:
        None. Everything is written to stdout.
    """
    # Agents
    print("Number of Agents:", len(env.agents))

    # Use the first listed agent as the running example
    agent = env.agents[0]
    print("Example Agent:", agent)

    # Print action space
    action_space = env.action_space(agent)
    print("Agent Action Space:", action_space, type(action_space))

    # Print observation space
    observation_space = env.observation_space(agent)
    print("Agent Observation Space:", observation_space, type(observation_space))

    # Observations and actions are dicts keyed by agent; every agent is
    # expected to share the same policy (-> same obs/action space).
    obs, info = env.reset()
    print("Observation Example:", obs[agent])

    # Simple test: take exactly one random step, provided agents are active
    if env.agents:
        actions = {a: env.action_space(a).sample() for a in env.agents}
        print("Action Example:", actions[agent], type(actions[agent]))
        obs, rew, term, trunc, info = env.step(actions)
        # Report the rewards themselves (previously the termination dict was
        # printed under the "Reward:" label).
        print("Reward:", rew)
        print("Term:", term)
        print("Trunc:", trunc)
        print("Info:", info)

    # Should do nothing, test to see if it exists
    env.render()

In [4]:
env = create_env(no_graphics=True)
try:
    test_env(env)
finally:
    # Close the environment even when the inspection raises, so a failed run
    # does not leave the env open.
    env.close()

'''OUTPUT:
Number of Agents: 32
Example Agent: SoccerTwos?team=0?agent_id=10
Agent Action Space: MultiDiscrete([3 3 3]) <class 'gymnasium.spaces.multi_discrete.MultiDiscrete'>
Agent Observation Space: Tuple(Box(-inf, inf, (264,), float32), Box(-inf, inf, (72,), float32)) <class 'gymnasium.spaces.tuple.Tuple'>
'''

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

"OUTPUT:\nNumber of Agents: 32\nExample Agent: SoccerTwos?team=0?agent_id=10\nAgent Action Space: MultiDiscrete([3 3 3]) <class 'gymnasium.spaces.multi_discrete.MultiDiscrete'>\nAgent Observation Space: Tuple(Box(-inf, inf, (264,), float32), Box(-inf, inf, (72,), float32)) <class 'gymnasium.spaces.tuple.Tuple'>\n"

### **Reduce Observation/Action Space**
Currently the observation space is split into two 1D vectors, and the action space into 3 discrete outputs with 3 options each. For our model, and for full compatibility with PettingZoo, it is better not to unnecessarily divide things that can be concatenated, especially the action space, which is really a single choice among 27 options rather than 3 independent sets of 3 options.
- Concat observation vector.
- Flatten 3D action space into 1D. (including action_mask)

In [5]:
def create_concat_env(no_graphics=False):
    """Create the Unity parallel env and wrap it in ``ConcatParallelEnv``.

    Args:
        no_graphics: Forwarded unchanged to ``create_env``.

    Returns:
        A ``ConcatParallelEnv`` around the env returned by ``create_env``.
        The caller owns the returned env and should ``close()`` it when done.
    """
    # Reuse the shared factory (Unity env wrapped into the PettingZoo Parallel
    # API) instead of repeating its construction here.
    parallel_env = create_env(no_graphics=no_graphics)

    # Concat observation and flatten action
    return ConcatParallelEnv(parallel_env)

In [6]:
env = create_concat_env(no_graphics=True)
try:
    test_env(env)
finally:
    # Close the environment even when the inspection raises, so a failed run
    # does not leave the env open.
    env.close()

'''OUTPUT:
Number of Agents: 32
Example Agent: SoccerTwos?team=0?agent_id=10
Agent Action Space: Discrete(27) <class 'gymnasium.spaces.discrete.Discrete'>
Agent Observation Space: Dict('action_mask': MultiBinary(27), 'observation': Box(-inf, inf, (336,), float32)) <class 'gymnasium.spaces.dict.Dict'>
'''

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

"OUTPUT:\nNumber of Agents: 32\nExample Agent: SoccerTwos?team=0?agent_id=10\nAgent Action Space: Discrete(27) <class 'gymnasium.spaces.discrete.Discrete'>\nAgent Observation Space: Dict('action_mask': MultiBinary(27), 'observation': Box(-inf, inf, (336,), float32)) <class 'gymnasium.spaces.dict.Dict'>\n"

### **Official Test**

In [7]:
# from pettingzoo.test import parallel_api_test
# env = create_concat_env(no_graphics=True)
# parallel_api_test(env, num_cycles=10000)
# env.close()

### **Random Policy Game**

In [8]:
# env = create_concat_env(no_graphics=False)
# obs, info = env.reset()

# timestamps = 0
# while env.agents:
#     timestamps += 1
#     print(f"STEP: {timestamps}, AGENTS: {env.agents}")
#     agent = env.agents[0]

#     if "action_mask" in obs[agent]:
#         action_mask = obs[agent]["action_mask"]
#         print(f"ACTION MASK of {agent}: {action_mask}")

#     actions = {a: env.action_space(a).sample() for a in env.agents}
#     print(f"ACTION of {agent}: {actions[agent]}, INFO: {info[agent]}")
#     obs, rew, term, trunc, info = env.step(actions)

#     for agent, t in term.items():
#         if t:
#             print(f"AGENT TERMINATED: {agent}, REWARD: {rew[agent]}, INFO: {info[agent]}")
#     for agent, t in trunc.items():
#         if t:
#             print(f"AGENT TRUNCATED: {agent}, REWARD: {rew[agent]}, INFO: {info[agent]}")

# env.close()

# **Torch RL**

### **Imports**

In [171]:
# Torch
import torch
from torch import nn
from torch import optim

# Model
from ffn import MLP

### Torch RL
# Env
from util import SafePettingZooWrapper
from torchrl.envs.utils import MarlGroupMapType


# Modules
from tensordict.nn import TensorDictModule
from torchrl.modules import ProbabilisticActor, MaskedCategorical

# Data Collection
from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyTensorStorage, ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement

# Loss
from torchrl.objectives import ClipPPOLoss, ValueEstimators

In [142]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using Device: {device}")

Using Device: cpu


### **Inspect TorchRL Environment**

The environment is wrapped to work with tensordicts. The tensordicts have a batch dimension along the agents, so the set of agents is expected to be consistent and always present. If it is not (which is possible), we need to pass `use_mask=True`, which keeps the batch dimension consistent even when some of its entries are undefined.

In [143]:
def create_torch_env(no_graphics=True):
    """Build the concatenated env and wrap it in ``SafePettingZooWrapper``.

    Args:
        no_graphics: Forwarded unchanged to ``create_concat_env``.

    Returns:
        The ``SafePettingZooWrapper`` instance, constructed with
        ``use_mask=True`` and every agent placed in one group
        (``MarlGroupMapType.ALL_IN_ONE_GROUP``).
    """
    base_env = create_concat_env(no_graphics=no_graphics)
    return SafePettingZooWrapper(
        env=base_env,
        use_mask=True,
        group_map=MarlGroupMapType.ALL_IN_ONE_GROUP,
    )

In [144]:
env = create_torch_env()
# Each label names the attribute that is actually printed: these are the env's
# specs, not lists of keys.
print("action_spec:", env.action_spec)
print("reward_spec:", env.reward_spec)
print("done_spec:", env.done_spec)
print("observation_spec:", env.observation_spec)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz



Thread 0x16d003000 may have been prematurely finalized
Setting up 7 worker threads for Enlighten.
Thread 0x16baef000 may have been prematurely finalized
Memory Statistics:
[ALLOC_TEMP_TLS] TLS Allocator
  StackAllocators : 
    [ALLOC_TEMP_MAIN]
      Peak usage frame count: [0-1.0 KB]: 1284 frames, [2.0 MB-4.0 MB]: 1 frames
      Initial Block Size 4.0 MB
      Current Block Size 4.0 MB
      Peak Allocated Bytes 2.1 MB
      Overflow Count 0
    [ALLOC_TEMP_Background Job.worker 4]
      Initial Block Size 32.0 KB
      Current Block Size 32.0 KB
      Peak Allocated Bytes 0 B
      Overflow Count 0
    [ALLOC_TEMP_Loading.PreloadManager]
      Initial Block Size 256.0 KB
      Current Block Size 256.0 KB
      Peak Allocated Bytes 88.1 KB
      Overflow Count 5
    [ALLOC_TEMP_Background Job.worker 3]
      Initial Block Size 32.0 KB
      Current Block Size 32.0 KB
      Peak Allocated Bytes 0 B
      Overflow Count 0
    [ALLOC_TEMP_Background Job.worker 11]
      Initial Block Si

In [145]:
# Run TorchRL's spec checker on the wrapped env created above; in the recorded
# run it logged "check_env_specs succeeded!".
from torchrl.envs.utils import check_env_specs
check_env_specs(env)

[92m2025-09-05 10:52:21,010 [torchrl][INFO][0m    check_env_specs succeeded![92m [END][0m


In [146]:
# Size at index 1 of the nested observation spec's shape (336 in the recorded run).
obs_spec = env.observation_spec["agents", "observation", "observation"]
observation_shape = obs_spec.shape[1]

# All observation keys exposed by the env.
observation_key = env.observation_keys

# Number of discrete actions reported by the action spec (27 in the recorded run).
action_spec = env.action_spec["agents", "action"]
action_shape = action_spec.space.n

print(f"obs shape: {observation_shape}, action shape: {action_shape}, observation keys: {observation_key}")

obs shape: 336, action shape: 27, observation keys: [('agents', 'action_mask'), ('agents', 'mask'), ('agents', 'observation', 'observation')]


## **Model**

### **Model Hyperparameters**

In [147]:
HIDDEN_DIM = 256
N_BLOCKS = 3

# Keyword arguments unpacked into MLP(...) by create_policy / create_value.
# Input and output widths come from the env specs inspected earlier.
MODEL_CONFIG = dict(
    hidden_dim=HIDDEN_DIM,
    n_blocks=N_BLOCKS,
    in_features=observation_shape,
    out_features=action_shape,
)

### **Policy Network**

In [148]:
def create_policy(config):
    """Build the masked categorical actor from an ``MLP`` configuration.

    Args:
        config: Keyword arguments unpacked into ``MLP(**config)``.

    Returns:
        A ``ProbabilisticActor`` whose inner module reads
        ``("agents", "observation", "observation")`` and writes
        ``("agents", "logits")``. The actor builds a ``MaskedCategorical``
        from those logits and ``("agents", "action_mask")``, and writes
        ``("agents", "action")`` and ``("agents", "log_prob")``.
    """
    obs_key = ("agents", "observation", "observation")
    logits_key = ("agents", "logits")

    # Observation -> logits
    backbone = MLP(**config)
    logits_model = TensorDictModule(backbone, in_keys=[obs_key], out_keys=[logits_key])

    # Logits + action mask -> sampled action (+ log-probability)
    return ProbabilisticActor(
        module=logits_model,
        in_keys={"logits": logits_key, "mask": ("agents", "action_mask")},
        out_keys=[("agents", "action")],
        distribution_class=MaskedCategorical,
        return_log_prob=True,
        log_prob_key=("agents", "log_prob"),
        cache_dist=True,
    )

In [149]:
policy = create_policy(MODEL_CONFIG)
# This rollout is only inspected, never back-propagated through, so run it
# without autograd like the other rollouts in this notebook.
with torch.no_grad():
    data = env.rollout(5, policy=policy)
data



TensorDict(
    fields={
        agents: TensorDict(
            fields={
                action: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.int64, is_shared=False),
                action_mask: Tensor(shape=torch.Size([5, 32, 27]), device=cpu, dtype=torch.bool, is_shared=False),
                done: Tensor(shape=torch.Size([5, 32, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                log_prob: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.float32, is_shared=False),
                logits: Tensor(shape=torch.Size([5, 32, 27]), device=cpu, dtype=torch.float32, is_shared=False),
                mask: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: TensorDict(
                    fields={
                        observation: Tensor(shape=torch.Size([5, 32, 336]), device=cpu, dtype=torch.float32, is_shared=False)},
                    batch_size=torch.Size([5, 32]),
                   

### **Value Network**

In [151]:
def create_value(config):
    """Build the critic: the policy's ``MLP`` settings with a single output.

    Args:
        config: Keyword arguments for ``MLP``. Any ``"out_features"`` entry is
            ignored and replaced by ``1``. The passed dict is not modified.

    Returns:
        A ``TensorDictModule`` reading ``("agents", "observation", "observation")``
        and writing ``("agents", "state_value")``.
    """
    # Drop out_features without touching the caller's dict
    critic_config = {key: val for key, val in config.items() if key != "out_features"}

    critic = MLP(out_features=1, **critic_config)
    return TensorDictModule(
        critic,
        in_keys=[("agents", "observation", "observation")],
        out_keys=[("agents", "state_value")],
    )

In [152]:
# Build a critic only to display its module structure as the cell output.
create_value(MODEL_CONFIG)

TensorDictModule(
    module=MLP(
      (proj_in): Linear(in_features=336, out_features=256, bias=True)
      (mlp_blocks): ModuleList(
        (0-2): 3 x MLPBlock(
          (norm): RMSNorm((256,), eps=None, elementwise_affine=True)
          (geglu): GeGLU(
            (linear): Linear(in_features=256, out_features=1364, bias=True)
            (gelu): GELU(approximate='tanh')
          )
          (proj_down): Linear(in_features=682, out_features=256, bias=True)
        )
      )
      (proj_out): Sequential(
        (0): RMSNorm((256,), eps=None, elementwise_affine=True)
        (1): Linear(in_features=256, out_features=1, bias=True)
      )
    ),
    device=cpu,
    in_keys=[('agents', 'observation', 'observation')],
    out_keys=[('agents', 'state_value')])

### **Env Rollout -> Advantage Estimate -> PPO Loss**

In [153]:
def make_loss_module(policy, value, epsilon, entropy_coef, gamma, lmbda):
    """Assemble a ``ClipPPOLoss`` wired to this notebook's tensordict keys.

    Args:
        policy: Actor network passed as ``actor_network``.
        value: Critic network passed as ``critic_network``.
        epsilon: Passed as ``clip_epsilon``.
        entropy_coef: Passed as ``entropy_coeff``.
        gamma: Passed to the GAE value estimator.
        lmbda: Passed to the GAE value estimator.

    Returns:
        The configured ``ClipPPOLoss`` with a GAE value estimator attached.
    """
    # normalize_advantage is deliberately not passed (library default applies).
    ppo_loss = ClipPPOLoss(
        actor_network=policy,
        critic_network=value,
        clip_epsilon=epsilon,
        entropy_coeff=entropy_coef,
    )

    # Where the loss reads and writes its entries. Per-agent entries live under
    # "agents"; advantage and value_target are kept at the root.
    # truncated can be left out; PPO uses done/terminated for bootstrapping.
    tensordict_keys = {
        "action": ("agents", "action"),
        "sample_log_prob": ("agents", "log_prob"),
        "value": ("agents", "state_value"),
        "advantage": "advantage",
        "value_target": "value_target",
        "reward": ("agents", "reward"),
        "done": ("agents", "done"),
        "terminated": ("agents", "terminated"),
    }
    ppo_loss.set_keys(**tensordict_keys)

    ppo_loss.make_value_estimator(ValueEstimators.GAE, gamma=gamma, lmbda=lmbda)
    return ppo_loss

In [165]:
# Fresh actor / critic on the selected device
policy = create_policy(MODEL_CONFIG).to(device)
value = create_value(MODEL_CONFIG).to(device)

# PPO loss with the settings used for this smoke test
ppo_settings = dict(epsilon=0.1, entropy_coef=0.01, gamma=0.99, lmbda=0.95)
loss_module = make_loss_module(policy, value, **ppo_settings).to(device)

# Short rollout, then advantages (GAE) and value targets (returns), all
# without tracking gradients
with torch.no_grad():
    data = env.rollout(5, policy=policy).to(device)
    loss_module.value_estimator(data)

# Flatten the batch dimensions before feeding the loss
data = data.reshape(-1)
print(data)

# Compute Loss
loss_data = loss_module(data)
print(loss_data)

TensorDict(
    fields={
        advantage: Tensor(shape=torch.Size([5, 32, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        agents: TensorDict(
            fields={
                action: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.int64, is_shared=False),
                action_mask: Tensor(shape=torch.Size([5, 32, 27]), device=cpu, dtype=torch.bool, is_shared=False),
                done: Tensor(shape=torch.Size([5, 32, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                log_prob: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.float32, is_shared=False),
                logits: Tensor(shape=torch.Size([5, 32, 27]), device=cpu, dtype=torch.float32, is_shared=False),
                mask: Tensor(shape=torch.Size([5, 32]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: TensorDict(
                    fields={
                        observation: Tensor(shape=torch.Size([5, 32, 336]), device=cpu, dtyp



### PROBLEM: Aggregated Fields (root done, others) Turn Batch_Dim from (T, A) -> (A,). Get rid of them

## **Training**

### **Training Hyperparameters**

In [180]:
### Training Loop Params
# Device holding the collected data and the replay-buffer storage. Set it to
# "cpu" to keep the dataset in RAM (better for large datasets) and move only
# the minibatches to VRAM.
STORAGE_DEVICE = device
TIMESTAMPS = 5000  # passed to the collector as total_frames
GENERATION_SIZE = 1000  # collector frames_per_batch and replay-buffer capacity
EPOCHS = 10  # optimisation passes over each collected batch

# GD Params
MINIBATCH_SIZE = 64  # replay-buffer batch_size / samples per optimisation step
LR = 3e-4  # AdamW learning rate
MAX_GRAD_NORM = 1.0  # max_norm for clip_grad_norm_
WEIGHT_DECAY = 1e-5  # AdamW weight decay

### RL Params

# ENV Params (None)

# PPO Params
GAMMA = 0.99  # gamma of the GAE value estimator
GAE_LAMBDA = 0.95  # lmbda of the GAE value estimator
EPSILON = 0.1  # ClipPPOLoss clip_epsilon
ENTROPY_COEF = 0.1  # ClipPPOLoss entropy_coeff

### **Training Loop**

In [181]:
# Create env (stepped by the collector below)
env = create_torch_env()

# Create Models
policy = create_policy(MODEL_CONFIG).to(device)
value = create_value(MODEL_CONFIG).to(device)

# Create Collector (iterates through the environment) + Replay Buffer (data loader).
# The collector is given `env` itself; creating another env here would start a
# second Unity instance and leave the one above unused.
collector = SyncDataCollector(
    env,
    policy,
    frames_per_batch=GENERATION_SIZE,
    total_frames=TIMESTAMPS,
    device=device,
    storing_device=STORAGE_DEVICE,
)
replay_buffer = ReplayBuffer(
    storage=LazyTensorStorage(GENERATION_SIZE, device=STORAGE_DEVICE),
    sampler=SamplerWithoutReplacement(),
    batch_size=MINIBATCH_SIZE,
)

# Create Training Helpers
loss_module = make_loss_module(policy, value, epsilon=EPSILON, entropy_coef=ENTROPY_COEF, gamma=GAMMA, lmbda=GAE_LAMBDA)
optimizer = optim.AdamW(loss_module.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

### TRAINING LOOP
try:
    for i, tensordict_data in enumerate(collector):
        # 1. Compute Advantages and Value Target for the batch just collected
        #    (not for the leftover `data` of an earlier cell).
        with torch.no_grad():
            loss_module.value_estimator(tensordict_data)

        # 2. Minibatch Gradient Descent Loop
        dataset = tensordict_data.reshape(-1)
        replay_buffer.empty()
        replay_buffer.extend(dataset)
        for _ in range(EPOCHS):
            for _ in range(GENERATION_SIZE // MINIBATCH_SIZE):
                # 3. Optimization Step
                batch = replay_buffer.sample(MINIBATCH_SIZE).to(device)
                loss_data = loss_module(batch)
                loss = loss_data["loss_objective"] + loss_data["loss_critic"] + loss_data["loss_entropy"]
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(loss_module.parameters(), max_norm=MAX_GRAD_NORM)
                optimizer.step()

        # 4. Log results
        print(f"Iteration: {i}")
finally:
    # Release the collector even if training fails part-way.
    collector.shutdown()
    # NOTE(review): shutdown() is expected to close the env it was given; only
    # close it here if it still reports itself as open. Confirm against the
    # installed torchrl version.
    if not getattr(env, "is_closed", True):
        env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

In [182]:
# Inference: roll the trained policy out with graphics enabled
env = create_torch_env(no_graphics=False)
try:
    with torch.no_grad():
        data = env.rollout(500, policy=policy).to(device)
    print(data)
finally:
    # Close the environment even if the rollout raises
    env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz