# **3DBallEnv Reinforcement Learning**

### **Imports**

In [295]:
### Utility
import numpy as np
import pandas as pd

### Torch
import torch
from torch import nn
from torch import optim

# Model
from ffn import MLP

### Torch RL
# Env
from torchrl.envs.libs import UnityMLAgentsEnv
from torchrl.envs.utils import MarlGroupMapType, step_mdp, check_env_specs

from torchrl.envs import Transform, TransformedEnv, Compose, Stack, RenameTransform, ExcludeTransform


# Modules
from tensordict.nn import TensorDictModule
from tensordict.nn.distributions import NormalParamExtractor
from torchrl.modules import ProbabilisticActor, TanhNormal
from torch.distributions import Categorical

# Data Collection
from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyTensorStorage, ReplayBuffer, Composite
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from tensordict import TensorDict, TensorDictBase

# Loss
from torchrl.objectives import ClipPPOLoss, ValueEstimators

In [247]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using Device: {device}")

Using Device: cpu


## **Create Torch Env**

In [249]:
def create_unity_env(graphics=False):
    """Construct the 3DBall Unity environment wrapped in a ``TransformedEnv``.

    Args:
        graphics: If True the build is started with rendering enabled; if
            False (default) it is started with ``no_graphics=True``.

    Returns:
        A ``TransformedEnv`` around a new ``UnityMLAgentsEnv``; no transforms
        are passed to it here.

    Note:
        Nothing is closed by this function. The caller owns the returned
        environment and should call ``env.close()`` once it is no longer
        needed.
    """
    # The earlier version started with `try: env.close()` / `except: pass`.
    # Because `env` is assigned later in this function it is a local name, so
    # that call always raised UnboundLocalError and the bare `except` hid it:
    # the block never closed anything. It is removed instead of being kept as
    # a misleading no-op that could also mask real errors.

    # A fresh random worker_id is drawn for every launch
    # (presumably so several instances can coexist; confirm against ML-Agents).
    unity_env = UnityMLAgentsEnv(
        file_name="../../../envs/3DBall",
        worker_id=np.random.randint(10000),
        no_graphics=(not graphics),
    )
    return TransformedEnv(unity_env)

### **Agents not Batched**

In [250]:
def print_specs(env):
    """Print the action, reward, done and observation specs of ``env``, in that order."""
    for label in ("action_spec", "reward_spec", "done_spec", "observation_spec"):
        # Each line reads "<attribute name>: <spec>".
        print(f"{label}:", getattr(env, label))

# Build the environment without any agent batching, validate its specs,
# then print them for inspection.
env = create_unity_env()
check_env_specs(env)
print_specs(env)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

In [251]:
# The first element of the first observation key is the group that holds the agents.
agent_root_key = env.observation_keys[0][0]
# Agent count and agent names are both read from that group's action spec.
n_agents = len(env.action_spec[agent_root_key])
agents = list(env.action_spec[agent_root_key].keys())

# Report group name, agent count and agent names, one per line.
print(agent_root_key, n_agents, agents, sep="\n")

group_0
12
['agent_0', 'agent_1', 'agent_2', 'agent_3', 'agent_4', 'agent_5', 'agent_6', 'agent_7', 'agent_8', 'agent_9', 'agent_10', 'agent_11']


In [252]:
# Rollout of up to 100 steps with no policy supplied; displayed to show that
# the data is still keyed per agent under the group at this point.
env.rollout(100)

  source[group_name][agent_name]["truncated"] = torch.tensor(


TensorDict(
    fields={
        group_0: TensorDict(
            fields={
                agent_0: TensorDict(
                    fields={
                        VectorSensor_size8: Tensor(shape=torch.Size([15, 8]), device=cpu, dtype=torch.float32, is_shared=False),
                        continuous_action: Tensor(shape=torch.Size([15, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                        done: Tensor(shape=torch.Size([15, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                        terminated: Tensor(shape=torch.Size([15, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                        truncated: Tensor(shape=torch.Size([15, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
                    batch_size=torch.Size([15]),
                    device=None,
                    is_shared=False),
                agent_10: TensorDict(
                    fields={
                        VectorSensor_size8: Tensor(shape=torch.Size

### **Batch Agents**

In [348]:
def batch_agents(env, out_key="agents"):
    """Append a ``Stack`` transform that gathers the per-agent entries under one key.

    The agent names are taken from the action spec of the group named by the
    first observation key. Each ``(group, agent)`` entry is an input of the
    stack and ``out_key`` is its output; the inverse mapping goes from
    ``out_key`` back to the same ``(group, agent)`` keys.

    Args:
        env: Environment to extend; the transform is appended in place.
        out_key: Name of the stacked entry (default ``"agents"``).

    Returns:
        The same ``env`` object, with the transform appended.
    """
    group_key = env.observation_keys[0][0]
    per_agent_keys = [(group_key, name) for name in env.action_spec[group_key].keys()]

    env.append_transform(
        Stack(
            in_keys=per_agent_keys,
            out_key=(out_key,),
            in_key_inv=(out_key,),
            # Separate list object, as in the original, in case Stack keeps or edits its arguments.
            out_keys_inv=list(per_agent_keys),
        )
    )
    return env

def create_base_env(graphics=False):
    """Create the Unity environment and batch its agents under the default ``"agents"`` key.

    Args:
        graphics: Forwarded to ``create_unity_env``.

    Returns:
        The environment returned by ``batch_agents``.
    """
    return batch_agents(create_unity_env(graphics))

In [254]:
# Rebinds `env` to a new environment with batched agents; the environment
# previously bound to `env` is not closed here.
env = create_base_env()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

In [255]:
# Validate the batched environment and print its specs; the agents now
# appear under a single "agents" entry.
check_env_specs(env)
print_specs(env)

ERROR: Shader UI/Default shader is not supported on this GPU (none of subshaders/fallbacks are suitable)
[92m2025-09-22 18:02:16,231 [torchrl][INFO][0m    check_env_specs succeeded![92m [END][0m
action_spec: Composite(
    agents: Composite(
        continuous_action: BoundedContinuous(
            shape=torch.Size([12, 2]),
            space=ContinuousBox(
                low=Tensor(shape=torch.Size([12, 2]), device=cpu, dtype=torch.float32, contiguous=True),
                high=Tensor(shape=torch.Size([12, 2]), device=cpu, dtype=torch.float32, contiguous=True)),
            device=cpu,
            dtype=torch.float32,
            domain=continuous),
        device=None,
        shape=torch.Size([12]),
        data_cls=None),
    device=None,
    shape=torch.Size([]),
    data_cls=None)
reward_spec: Composite(
    agents: Composite(
        reward: UnboundedContinuous(
            shape=torch.Size([12, 1]),
            space=ContinuousBox(
                low=Tensor(shape=torch.S

In [256]:
# Rollout of up to 100 steps on the batched environment, kept in `td` and displayed.
td = env.rollout(100)
td

TensorDict(
    fields={
        agents: TensorDict(
            fields={
                VectorSensor_size8: Tensor(shape=torch.Size([16, 12, 8]), device=cpu, dtype=torch.float32, is_shared=False),
                continuous_action: Tensor(shape=torch.Size([16, 12, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                done: Tensor(shape=torch.Size([16, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                terminated: Tensor(shape=torch.Size([16, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=torch.Size([16, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
            batch_size=torch.Size([16, 12]),
            device=None,
            is_shared=False),
        next: TensorDict(
            fields={
                agents: TensorDict(
                    fields={
                        VectorSensor_size8: Tensor(shape=torch.Size([16, 12, 8]), device=cpu, dtype=torch.float32, is_shared=Fals

## **Env Data Preprocessing**

In [169]:
# New batched environment for the preprocessing inspection below
# (the environment previously bound to `env` is not closed here).
env = create_base_env()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

### **Inspect Keys**

In [188]:
# Second element of the first observation key and of the action key:
# the entry names that sit under "agents".
observation_key, action_key = env.observation_keys[0][1], env.action_key[1]
print(f"observation_key: {observation_key}, action_key: {action_key}")

# Spec shapes of those two entries.
observation_shape, action_shape = (
    env.observation_spec["agents", observation_key].shape,
    env.action_spec["agents", action_key].shape,
)
print(f"observation_shape: {observation_shape}, action_shape: {action_shape}")

observation_key: VectorSensor_size8, action_key: continuous_action
observation_shape: torch.Size([12, 8]), action_shape: torch.Size([12, 2])


### **Inspect Action Space**

Actions lie in [-1, 1], as expected, so no further processing is needed.

In [206]:
# Collect a rollout and flatten every action component into a single column
# so that describe() summarises the whole action distribution.
td = env.rollout(100)
actions_df = pd.DataFrame(dict(action=td["agents", action_key].reshape(-1)))
actions_df.describe()

  source[group_name][agent_name]["truncated"] = torch.tensor(


Unnamed: 0,action
count,312.0
mean,0.014681
std,0.587857
min,-0.997469
25%,-0.494984
50%,0.03497
75%,0.535219
max,0.990092


In [229]:
# Read the declared action bounds from the spec. Only element [0, 0] of the
# low/high tensors is printed, i.e. a single entry rather than the full bounds.
space = env.action_spec["agents", action_key].space
print("low:", float(space.low[0, 0]), "high:", float(space.high[0, 0]))

low: -1.0 high: 1.0


### **Inspect Observation Space**

Observations are already roughly z-score normalized, so no further processing is needed.

In [213]:
# Flatten every observation component of the rollout into a single column
# and summarise its distribution.
obs_df = pd.DataFrame(dict(obs=td["agents", observation_key].reshape(-1)))
obs_df.describe()

Unnamed: 0,obs
count,1248.0
mean,0.243541
std,1.418613
min,-4.120198
25%,-0.249621
50%,0.0
75%,0.473056
max,4.008891


In [232]:
# Display the spec of the batched observation entry.
env.observation_spec["agents", observation_key]

UnboundedContinuous(
    shape=torch.Size([12, 8]),
    space=ContinuousBox(
        low=Tensor(shape=torch.Size([12, 8]), device=cpu, dtype=torch.float32, contiguous=True),
        high=Tensor(shape=torch.Size([12, 8]), device=cpu, dtype=torch.float32, contiguous=True)),
    device=cpu,
    dtype=torch.float32,
    domain=continuous)

### **Inspect Reward Space**

It looks like `group_reward` is always 0 here and can be safely ignored, while `reward` is -1 on failure and 0.1 for each step survived.

In [214]:
# One column per reward signal found under ("next", "agents"), each flattened to 1-D.
reward_df = pd.DataFrame({
    name: td["next", "agents", name].reshape(-1)
    for name in ("reward", "group_reward")
})
reward_df.describe()

Unnamed: 0,reward,group_reward
count,156.0,156.0
mean,0.092949,0.0
std,0.08807,0.0
min,-1.0,0.0
25%,0.1,0.0
50%,0.1,0.0
75%,0.1,0.0
max,0.1,0.0


In [217]:
# Display the individual reward rows (the notebook shows a truncated view).
reward_df

Unnamed: 0,reward,group_reward
0,0.1,0.0
1,0.1,0.0
2,0.1,0.0
3,0.1,0.0
4,0.1,0.0
...,...,...
151,0.1,0.0
152,0.1,0.0
153,-1.0,0.0
154,0.1,0.0


### **Finalize Environment**
The environment is already in a good state; the only thing left to do is exclude `group_reward`.

In [342]:
def create_env(graphics=False):
    """Create the batched environment with the ``("agents", "group_reward")`` entry excluded.

    Args:
        graphics: Forwarded to ``create_base_env``.

    Returns:
        The environment from ``create_base_env`` with an ``ExcludeTransform``
        appended in place.
    """
    env = create_base_env(graphics)
    drop_group_reward = ExcludeTransform(("agents", "group_reward"))
    env.append_transform(drop_group_reward)
    return env

In [276]:
# Build the finalized environment and validate its specs.
env = create_env()
check_env_specs(env)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

In [278]:
# Rollout of up to 20 steps passed through step_mdp and displayed;
# the displayed output has `reward` under "agents" and no `group_reward`.
step_mdp(env.rollout(20))

  source[group_name][agent_name]["truncated"] = torch.tensor(


TensorDict(
    fields={
        agents: TensorDict(
            fields={
                VectorSensor_size8: Tensor(shape=torch.Size([12, 12, 8]), device=cpu, dtype=torch.float32, is_shared=False),
                continuous_action: Tensor(shape=torch.Size([12, 12, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                done: Tensor(shape=torch.Size([12, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                reward: Tensor(shape=torch.Size([12, 12, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([12, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=torch.Size([12, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
            batch_size=torch.Size([12, 12]),
            device=None,
            is_shared=False)},
    batch_size=torch.Size([12]),
    device=None,
    is_shared=False)

## **Create Models**

### **Config**

In [306]:
# Environment used to read the feature sizes for the networks.
env = create_env()
observation_key, action_key = env.observation_keys[0][1], env.action_key[1]

# Only the trailing dimension of each spec (per-agent feature size) is kept, as a plain int.
observation_shape = int(env.observation_spec["agents", observation_key].shape[-1])
action_shape = int(env.action_spec["agents", action_key].shape[-1])

HIDDEN_DIM = 256
N_BLOCKS = 3

# Keyword arguments later unpacked into MLP(**config) by the model factories.
MODEL_CONFIG = dict(
    hidden_dim=HIDDEN_DIM,
    n_blocks=N_BLOCKS,
    in_features=observation_shape,
    out_features=action_shape,
)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

In [314]:
# Display the resolved model configuration.
MODEL_CONFIG

{'hidden_dim': 256, 'n_blocks': 3, 'in_features': 8, 'out_features': 2}

### **Policy**

In [312]:
def create_policy(config):
    """Build the stochastic actor: MLP -> ``NormalParamExtractor`` -> ``TanhNormal``.

    Reads the module-level ``observation_key`` and ``action_key``.

    Args:
        config: Keyword arguments for ``MLP``; must contain ``"out_features"``.
            The mapping passed in is not modified.

    Returns:
        A ``ProbabilisticActor`` that reads ``("agents", observation_key)``,
        writes ``("agents", "loc")`` and ``("agents", "scale")``, writes the
        action to ``("agents", action_key)`` and its log-probability to
        ``("agents", "log_prob")``.
    """
    # The network emits a loc and a scale per action dimension, hence twice the outputs.
    actor_config = {**config, "out_features": config["out_features"] * 2}
    params_net = nn.Sequential(MLP(**actor_config), NormalParamExtractor())

    loc_key, scale_key = ("agents", "loc"), ("agents", "scale")
    params_module = TensorDictModule(
        params_net,
        in_keys=[("agents", observation_key)],
        out_keys=[loc_key, scale_key],
    )

    return ProbabilisticActor(
        module=params_module,
        distribution_class=TanhNormal,
        in_keys=[loc_key, scale_key],
        out_keys=[("agents", action_key)],
        return_log_prob=True,
        log_prob_key=("agents", "log_prob"),
        cache_dist=True,
    )

### **Value**

In [320]:
def create_value(config):
    """Build the critic: an MLP with a single output per agent.

    Reads the module-level ``observation_key``.

    Args:
        config: Keyword arguments for ``MLP``. The mapping passed in is not
            modified; its ``"out_features"`` entry is overridden with 1.

    Returns:
        A ``TensorDictModule`` that reads ``("agents", observation_key)`` and
        writes ``("agents", "state_value")``.
    """
    # Work on a copy and force one output unit instead of the action size.
    critic_config = config.copy()
    critic_config["out_features"] = 1

    return TensorDictModule(
        MLP(**critic_config),
        in_keys=[("agents", observation_key)],
        out_keys=[("agents", "state_value")],
    )

### **PPO Loss Module**

In [321]:
def make_loss_module(policy, value, epsilon, entropy_coef, gamma, lmbda):
    """Assemble a ``ClipPPOLoss`` whose keys all live under ``"agents"``, with a GAE estimator.

    Reads the module-level ``action_key``.

    Args:
        policy: Actor network.
        value: Critic network.
        epsilon: Passed as ``clip_epsilon``.
        entropy_coef: Passed as ``entropy_coeff``.
        gamma: Passed to the GAE value estimator.
        lmbda: Passed to the GAE value estimator.

    Returns:
        The configured ``ClipPPOLoss`` module.
    """
    ppo_loss = ClipPPOLoss(
        actor_network=policy,
        critic_network=value,
        clip_epsilon=epsilon,
        entropy_coeff=entropy_coef,
        # normalize_advantage=True,
    )

    # Entries whose loss-key name equals their entry name under "agents".
    # `truncated` is left at its default, as in the original configuration.
    same_name_keys = {
        name: ("agents", name)
        for name in ("advantage", "value_target", "reward", "done", "terminated")
    }
    ppo_loss.set_keys(
        action=("agents", action_key),
        sample_log_prob=("agents", "log_prob"),
        value=("agents", "state_value"),
        **same_name_keys,
    )

    # The estimator is created after the keys are set, matching the original order.
    ppo_loss.make_value_estimator(ValueEstimators.GAE, gamma=gamma, lmbda=lmbda)
    return ppo_loss

### **Inspect**

In [360]:
# Fresh actor and critic on the target device, plus a PPO loss built with
# hyperparameters used only for this inspection.
policy = create_policy(MODEL_CONFIG).to(device)
value = create_value(MODEL_CONFIG).to(device)
loss_module = make_loss_module(
    policy, value, epsilon=0.1, entropy_coef=0.01, gamma=0.99, lmbda=0.95
).to(device)

# Roll out the policy and run the value estimator on the rollout, without gradients.
with torch.no_grad():
    td = env.rollout(100, policy=policy)
    loss_module.value_estimator(td)

# Apply step_mdp and keep only the per-agent entries for display.
data = step_mdp(td)["agents"]
data

  source[group_name][agent_name]["truncated"] = torch.tensor(


TensorDict(
    fields={
        agents: TensorDict(
            fields={
                VectorSensor_size8: Tensor(shape=torch.Size([14, 12, 8]), device=cpu, dtype=torch.float32, is_shared=False),
                advantage: Tensor(shape=torch.Size([14, 12, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                continuous_action: Tensor(shape=torch.Size([14, 12, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                done: Tensor(shape=torch.Size([14, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                loc: Tensor(shape=torch.Size([14, 12, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                log_prob: Tensor(shape=torch.Size([14, 12]), device=cpu, dtype=torch.float32, is_shared=False),
                scale: Tensor(shape=torch.Size([14, 12, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                state_value: Tensor(shape=torch.Size([14, 12, 1]), device=cpu, dtype=torch.float32, is_shared=False),
 

## **Training**

### **Training Hyperparameters**

In [398]:
### Training Loop Params
STORAGE_DEVICE = device # Use "cpu" to keep dataset in RAM, this is better if you have large datasets, then move only minibatches to VRAM.
TIMESTAMPS = 20_000  # passed to the collector as total_frames
GENERATION_SIZE = 512  # frames per collector batch; also the replay buffer capacity
EPOCHS = 10  # passes over each collected batch

# GD Params
MINIBATCH_SIZE = 32  # replay buffer sample size per optimization step
LR = 3e-4  # AdamW learning rate
MAX_GRAD_NORM = 0.5  # max_norm for gradient-norm clipping
WEIGHT_DECAY = 1e-5  # AdamW weight decay

### RL Params

# ENV Params (None)

# PPO Params
GAMMA = 0.999  # gamma for the GAE value estimator
GAE_LAMBDA = 0.95  # lmbda for the GAE value estimator
EPSILON = 0.2  # clip_epsilon of ClipPPOLoss
ENTROPY_COEF = 1e-4  # entropy_coeff of ClipPPOLoss

### **Training Loop**

In [399]:
# When True, the training cell keeps the existing `policy` and `value`
# instead of creating new ones.
CONTINUE=False

In [None]:
# Create env. This single instance is handed to the collector below. Previously
# the collector was given `create_env` and started a second Unity instance,
# while this one sat unused and was the only one closed at the end.
env = create_env(graphics=False)

# Create Models (skipped when CONTINUE is set, so the existing policy/value keep training)
if not CONTINUE:
    policy = create_policy(MODEL_CONFIG).to(device)
    value = create_value(MODEL_CONFIG).to(device)

# Create Collector (iterates through the environment) + Replay Buffer (data loader + storage)
collector = SyncDataCollector(env, policy, frames_per_batch=GENERATION_SIZE, total_frames=TIMESTAMPS, device=device, storing_device=STORAGE_DEVICE)
replay_buffer = ReplayBuffer(storage=LazyTensorStorage(GENERATION_SIZE, device=STORAGE_DEVICE), sampler=SamplerWithoutReplacement(), batch_size=MINIBATCH_SIZE)

# Loss + Optimizer (the loss module is moved to `device`, as in the inspection cell)
loss_module = make_loss_module(policy, value, epsilon=EPSILON, entropy_coef=ENTROPY_COEF, gamma=GAMMA, lmbda=GAE_LAMBDA).to(device)
optimizer = optim.AdamW(loss_module.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

### TRAINING LOOP
# try/finally so the collector and the Unity environment are released even if
# training fails or is interrupted part-way.
try:
    for i, tensordict_data in enumerate(collector):
        # 1. Compute Advantages and Value Target (written into tensordict_data)
        with torch.no_grad():
            loss_module.value_estimator(tensordict_data)

        # 2. Minibatch Gradient Descent Loop: the buffer only ever holds the latest batch
        replay_buffer.empty()
        replay_buffer.extend(tensordict_data)
        for epoch in range(EPOCHS):
            for _ in range(GENERATION_SIZE // MINIBATCH_SIZE):
                # 3. Optimization Step
                batch = replay_buffer.sample(MINIBATCH_SIZE).to(device)
                loss_data = loss_module(batch)
                loss = loss_data["loss_objective"] + loss_data["loss_critic"] + loss_data["loss_entropy"]
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(loss_module.parameters(), max_norm=MAX_GRAD_NORM)
                optimizer.step()

        # 4. Log results
        dataset = step_mdp(tensordict_data)["agents"]
        mean_step_reward = dataset["reward"].mean()
        mean_return = dataset["value_target"].mean()
        action_std = dataset[action_key].std()
        n_done = dataset["done"].sum()
        print(f"PROGRESS: {(i+1)*GENERATION_SIZE}/{TIMESTAMPS}, MEAN RETURN: {mean_return}, MEAN STEP REWARD: {mean_step_reward}, ACTION STD: {action_std}, DONE: {n_done}")
finally:
    collector.shutdown()
    # Close the env ourselves only if the collector shutdown did not already do so.
    if not env.is_closed:
        env.close()

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz



ERROR: Shader UI/Default shader is not supported on this GPU (none of subshaders/fallbacks are suitable)
Thread 0x171dcf000 may have been prematurely finalized
Setting up 7 worker threads for Enlighten.
Thread 0x17082f000 may have been prematurely finalized
Memory Statistics:
[ALLOC_TEMP_TLS] TLS Allocator
  StackAllocators : 
    [ALLOC_TEMP_MAIN]
      Peak usage frame count: [0-1.0 KB]: 6910301 frames, [1.0 KB-2.0 KB]: 37432 frames, [2.0 MB-4.0 MB]: 1 frames
      Initial Block Size 4.0 MB
      Current Block Size 4.0 MB
      Peak Allocated Bytes 2.1 MB
      Overflow Count 0
    [ALLOC_TEMP_Background Job.worker 4]
      Initial Block Size 32.0 KB
      Current Block Size 32.0 KB
      Peak Allocated Bytes 0 B
      Overflow Count 0
    [ALLOC_TEMP_Loading.PreloadManager]
      Initial Block Size 256.0 KB
      Current Block Size 256.0 KB
      Peak Allocated Bytes 78.9 KB
      Overflow Count 4
    [ALLOC_TEMP_Background Job.worker 3]
      Initial Block Size 32.0 KB
      Curren

  source[group_name][agent_name]["truncated"] = torch.tensor(


## **Test**

In [378]:
# New environment with graphics enabled so the rollout below can be watched.
env = create_env(graphics=True)

[UnityMemory] Configuration Parameters - Can be set up in boot.config
    "memorysetup-bucket-allocator-granularity=16"
    "memorysetup-bucket-allocator-bucket-count=8"
    "memorysetup-bucket-allocator-block-size=4194304"
    "memorysetup-bucket-allocator-block-count=1"
    "memorysetup-main-allocator-block-size=16777216"
    "memorysetup-thread-allocator-block-size=16777216"
    "memorysetup-gfx-main-allocator-block-size=16777216"
    "memorysetup-gfx-thread-allocator-block-size=16777216"
    "memorysetup-cache-allocator-block-size=4194304"
    "memorysetup-typetree-allocator-block-size=2097152"
    "memorysetup-profiler-bucket-allocator-granularity=16"
    "memorysetup-profiler-bucket-allocator-bucket-count=8"
    "memorysetup-profiler-bucket-allocator-block-size=4194304"
    "memorysetup-profiler-bucket-allocator-block-count=1"
    "memorysetup-profiler-allocator-block-size=16777216"
    "memorysetup-profiler-editor-allocator-block-size=1048576"
    "memorysetup-temp-allocator-siz

In [393]:
# Roll out the current `policy` for up to 100 steps without tracking gradients,
# then display the collected data.
with torch.no_grad():
    data = env.rollout(100, policy=policy)
data

TensorDict(
    fields={
        agents: TensorDict(
            fields={
                VectorSensor_size8: Tensor(shape=torch.Size([23, 12, 8]), device=cpu, dtype=torch.float32, is_shared=False),
                continuous_action: Tensor(shape=torch.Size([23, 12, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                done: Tensor(shape=torch.Size([23, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                loc: Tensor(shape=torch.Size([23, 12, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                log_prob: Tensor(shape=torch.Size([23, 12]), device=cpu, dtype=torch.float32, is_shared=False),
                scale: Tensor(shape=torch.Size([23, 12, 2]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([23, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=torch.Size([23, 12, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
       

In [354]:
# Close the environment currently bound to `env`.
env.close()