In [17]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip3 install torchrl
!pip3 install "gymnasium[mujoco]"
!pip3 install -U 'mujoco-py<2.2,>=2.1'
!pip3 install tqdm

In [24]:
import torch
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs import ObservationNorm, TransformedEnv, Compose, DoubleToFloat, StepCounter
from torchrl.envs.utils import check_env_specs

import matplotlib.pyplot as plt
from tqdm import tqdm

In [10]:
# Run on the first CUDA device when one is available; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [11]:
# Hyperparameters for the run.
# NOTE(review): none of these constants is consumed in the cells visible here; the
# groupings below are inferred from the names only — confirm against the cells
# that actually use them.

# -- presumably network / optimiser settings --
MODEL_SIZE = 256
LEARNING_RATE = 3e-4
MAX_GRAD_NORM = 1.0

# -- presumably data-collection and batching settings --
MAX_FRAME_NUMBER = 50_000
BATCH_SIZE = 1_000
SUB_BATCH_SIZE = 64
EPOCHS = 10

# -- presumably loss / advantage-estimation settings --
CLIP_EPSILON = 0.2
GAMMA = 0.99
LAMBDA = 0.95
ENTROPY_EPS = 1e-4

In [21]:
# Environment definition

# Base Gym environment, created on the device selected earlier.
base_env = GymEnv('InvertedDoublePendulum-v4', device=device)

# Wrap the base environment in a chain of transforms, listed in this order.
env = TransformedEnv(
    base_env,
    Compose(
        ObservationNorm(in_keys=['observation']),  # normalise observations
        DoubleToFloat(),
        StepCounter(),
    ),
)

# The ObservationNorm transform sits at index 0 of the Compose; initialise its
# statistics over 1000 iterations (reduce_dim=0, cat_dim=0).
env.transform[0].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)

print(f'env: {env}')
print(f'normalisation shape: {env.transform[0].loc.shape}')

env: TransformedEnv(
    env=GymEnv(env=InvertedDoublePendulum-v4, batch_size=torch.Size([]), device=cpu),
    transform=Compose(
            ObservationNorm(keys=['observation']),
            DoubleToFloat(in_keys=['observation'], out_keys=['observation'], in_keys_inv=[], out_keys_inv=[]),
            StepCounter(keys=[])))
normalisation shape: torch.Size([11])


In [22]:
# Environment specs: print the spec objects exposed by the transformed environment.
print('observation spec:', env.observation_spec)
print('reward spec:', env.reward_spec)
print('input spec:', env.input_spec)
print('action spec:', env.action_spec)

observation spec: Composite(
    observation: UnboundedContinuous(
        shape=torch.Size([11]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([11]), device=cpu, dtype=torch.float32, contiguous=True),
            high=Tensor(shape=torch.Size([11]), device=cpu, dtype=torch.float32, contiguous=True)),
        device=cpu,
        dtype=torch.float32,
        domain=continuous),
    step_count: BoundedDiscrete(
        shape=torch.Size([1]),
        space=ContinuousBox(
            low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, contiguous=True),
            high=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, contiguous=True)),
        device=cpu,
        dtype=torch.int64,
        domain=discrete),
    device=cpu,
    shape=torch.Size([]),
    data_cls=None)
reward spec: UnboundedContinuous(
    shape=torch.Size([1]),
    space=ContinuousBox(
        low=Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.float32, contiguous=True),

In [25]:
# Ensure specs are properly defined: check_env_specs validates the environment's
# declared specs and logs "check_env_specs succeeded!" when the check passes.
# NOTE(review): presumably it does this by comparing a short rollout against the
# specs — confirm in the TorchRL docs.
check_env_specs(env)

[92m2026-02-09 13:44:29,588 [torchrl][INFO][0m    check_env_specs succeeded![92m [END][0m


In [26]:
# Collect a three-step rollout (no policy is supplied here) and display the result.
rollout = env.rollout(max_steps=3)
rollout

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([3, 11]), device=cpu, dtype=torch.float32, is_shared=False),
                reward: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                step_count: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.int64, is_shared=False),
                terminated: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
            batch_size=torch.Size([3]),
            device=cpu,
            is_s