In [32]:
from torchrl.envs import GymEnv

# Build the Pendulum-v1 environment via GymEnv; the following cells read its
# observation/action specs and roll it out.
env = GymEnv("Pendulum-v1")

In [33]:
from torchrl.modules import Actor, MLP, ValueOperator
from torchrl.objectives import DDPGLoss

In [34]:
# Feature widths, read from the last dimension of the environment specs.
n_obs = env.observation_spec["observation"].shape[-1]
n_act = env.action_spec.shape[-1]

# Policy: an MLP from observation features to action features, wrapped in Actor.
_policy_mlp = MLP(in_features=n_obs, out_features=n_act, num_cells=[32, 32])
actor = Actor(_policy_mlp)

# Critic: an MLP over the combined observation and action width that outputs a
# single value; it reads the "observation" and "action" entries.
_critic_mlp = MLP(in_features=n_obs + n_act, out_features=1, num_cells=[32, 32])
value_net = ValueOperator(_critic_mlp, in_keys=["observation", "action"])

# DDPG loss module built from the actor and the value network.
ddpg_loss = DDPGLoss(actor_network=actor, value_network=value_net)

In [35]:
# Run the loss module on a freshly collected trajectory.

# Roll the environment out for at most 100 steps, with `actor` choosing the actions.
rollout = env.rollout(max_steps=100, policy=actor)
# The result is a TensorDict; per the printed output it holds scalar
# "loss_actor" / "loss_value" entries plus per-step diagnostics such as "td_error".
loss_vals = ddpg_loss(rollout)
print(loss_vals)

TensorDict(
    fields={
        loss_actor: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
        loss_value: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
        pred_value: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False),
        pred_value_max: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
        target_value: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False),
        target_value_max: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False),
        td_error: Tensor(shape=torch.Size([100]), device=cpu, dtype=torch.float32, is_shared=False)},
    batch_size=torch.Size([]),
    device=None,
    is_shared=False)




In [36]:
# Inspect the (key, tensor) pairs contained in the loss output.
loss_vals.items()

dict_items([('loss_actor', tensor(0.0336, grad_fn=<MeanBackward0>)), ('loss_value', tensor(37.1705, grad_fn=<MeanBackward0>)), ('td_error', tensor([2.9938e-03, 6.1555e-03, 1.2844e-02, 2.7106e-02, 5.7696e-02, 1.2357e-01,
        2.6547e-01, 5.6952e-01, 1.2126e+00, 2.5447e+00, 5.2299e+00, 1.0464e+01,
        2.0245e+01, 3.7568e+01, 6.6202e+01, 1.0962e+02, 1.6910e+02, 2.3765e+02,
        1.7324e+02, 1.1808e+02, 7.4906e+01, 4.4340e+01, 2.4691e+01, 1.3081e+01,
        6.6768e+00, 3.3278e+00, 1.6468e+00, 8.3325e-01, 4.5807e-01, 3.0407e-01,
        2.7401e-01, 3.4700e-01, 5.6797e-01, 1.0676e+00, 2.1239e+00, 4.2886e+00,
        8.5981e+00, 1.6859e+01, 3.1895e+01, 5.7461e+01, 9.7398e+01, 1.5387e+02,
        2.2555e+02, 1.8133e+02, 1.2609e+02, 8.1694e+01, 4.9398e+01, 2.8071e+01,
        1.5131e+01, 7.8104e+00, 3.8968e+00, 1.8965e+00, 9.0811e-01, 4.3129e-01,
        2.0487e-01, 9.8533e-02, 4.9079e-02, 2.6447e-02, 1.6609e-02, 1.3359e-02,
        1.4642e-02, 2.1364e-02, 3.8040e-02, 7.5725e-02, 1.59

In [37]:
# Add up every entry whose key starts with "loss_" into a single objective;
# the other entries of the loss output are left out of the sum.
total_loss = sum(
    value for name, value in loss_vals.items() if name.startswith("loss_")
)

In [38]:
# Show the summed loss; per the output it is a scalar tensor that still carries a grad_fn.
total_loss

tensor(37.2041, grad_fn=<AddBackward0>)

In [39]:
# Get the trainable parameters with parameters() and take one optimization step.

from torch.optim import Adam
# Adam (default hyperparameters) over the parameters exposed by the loss module.
optim = Adam(ddpg_loss.parameters())
total_loss.backward()  # compute gradients of the summed loss
optim.step()
optim.zero_grad()  # reset gradients so they do not accumulate into a later step


In [40]:
ddpg_loss.parameters()  # a generator (see output); wrap it in list(...) to inspect the parameters

<generator object LossModule.parameters at 0x168da92e0>

In [41]:
# Using target parameters for policy training often proves to be significantly more efficient.
# It is the user's responsibility to update these values.

from torchrl.objectives import SoftUpdate

# Updater bound to the loss module, configured with eps=0.99.
updater = SoftUpdate(ddpg_loss, eps=0.99)

# In the training loop, the target parameters need to be updated at each optimization step.

updater.step()

In [42]:
# Data Collection and Storage

import tempfile

In [43]:
# In TorchRL, the counterpart of a DataLoader is called a data collector.

# SyncDataCollector takes the environment, the policy, the size of each batch
# (frames_per_batch) and the length of the iterator (total_frames, possibly infinite).

import torch

from torchrl.collectors import SyncDataCollector
from torchrl.envs import GymEnv
from torchrl.envs.utils import RandomPolicy

# Seed torch and the environment so that the collected data is reproducible.
torch.manual_seed(0)

# NOTE: `env` is rebound here, from Pendulum-v1 to CartPole-v1.
env = GymEnv("CartPole-v1")
env.set_seed(0)

# Policy built from the environment's action spec.
policy = RandomPolicy(env.action_spec)
# Batches of 200 frames; total_frames=-1 is used here for a never-ending collector.
collector = SyncDataCollector(env, policy, frames_per_batch=200, total_frames=-1)

In [44]:
# Pull a single batch from the collector and stop; `data` stays bound for the cells below.
for data in collector:
    print(data) # May have multiple trajectories in batch since size is fixed to 200
    break

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([200, 2]), device=cpu, dtype=torch.int64, is_shared=False),
        collector: TensorDict(
            fields={
                traj_ids: Tensor(shape=torch.Size([200]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([200]),
            device=None,
            is_shared=False),
        done: Tensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([200, 4]), device=cpu, dtype=torch.float32, is_shared=False),
                reward: Tensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor

In [45]:
# Data is augmented with collector-specific metadata, e.g. a trajectory id for each frame.

print(data["collector", "traj_ids"])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9,
        9, 9, 9, 9, 9, 9, 9, 9])


In [46]:
# Replay buffers - collected data is stored temporarily in a ReplayBuffer.

# Sketch of a typical training loop (pseudo-code kept in a string; it is not executed).
"""
for data in collector:
    storage.store(data)
    for i in range(n_optim):
        sample = storage.sample()
        loss_val = loss_fn(sample)
        loss_val.backward()
        optim.step() # etc
"""

# The storage type, the sampling strategy, the transforms, etc. can be customized.
# LazyMemmapStorage: no need to describe what the data looks like in advance; only the size of the buffer has to be specified.

'\nfor data in collector:\n    storage.store(data)\n    for i in range(n_optim):\n        sample = storage.sample()\n        loss_val = loss_fn(sample)\n        loss_val.backward()\n        optim.step() # etc\n'

In [47]:
from torchrl.data.replay_buffers import LazyMemmapStorage, ReplayBuffer

# Keep a reference to the TemporaryDirectory object itself. Taking only `.name`
# from an unreferenced TemporaryDirectory lets the object be finalized right
# away, which removes the directory that was just created; anything written to
# that path afterwards would then never be cleaned up. Holding the object keeps
# the directory alive until buffer_scratch_tmpdir.cleanup() is called or the
# object is released.
buffer_scratch_tmpdir = tempfile.TemporaryDirectory()
buffer_scratch_dir = buffer_scratch_tmpdir.name

# Replay buffer backed by memory-mapped storage located in the scratch
# directory, with room for at most 1000 elements.
buffer = ReplayBuffer(
    storage=LazyMemmapStorage(max_size=1000, scratch_dir=buffer_scratch_dir)
)

# Populate buffer via add() (single element) or extend() (multiple elements).

In [48]:
# Write the whole collected batch into the buffer; the return value is kept as `indices`.
indices = buffer.extend(data)

In [49]:
# Display the buffer; per the output, its storage now holds the 200 frames as memory-mapped tensors.
buffer

ReplayBuffer(
    storage=LazyMemmapStorage(
        data=TensorDict(
            fields={
                action: MemoryMappedTensor(shape=torch.Size([200, 2]), device=cpu, dtype=torch.int64, is_shared=True),
                collector: TensorDict(
                    fields={
                        traj_ids: MemoryMappedTensor(shape=torch.Size([200]), device=cpu, dtype=torch.int64, is_shared=True)},
                    batch_size=torch.Size([200]),
                    device=cpu,
                    is_shared=False),
                done: MemoryMappedTensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.bool, is_shared=True),
                next: TensorDict(
                    fields={
                        done: MemoryMappedTensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.bool, is_shared=True),
                        observation: MemoryMappedTensor(shape=torch.Size([200, 4]), device=cpu, dtype=torch.float32, is_shared=True),
                        reward: Memory

In [50]:
# Sanity check: the buffer has the same number of elements as what we got from the collector.
assert len(buffer) == collector.frames_per_batch

In [51]:
# Sampling: draw a batch of 30 elements from the buffer and display it.

sample = buffer.sample(batch_size=30)
sample

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([30, 2]), device=cpu, dtype=torch.int64, is_shared=False),
        collector: TensorDict(
            fields={
                traj_ids: Tensor(shape=torch.Size([30]), device=cpu, dtype=torch.int64, is_shared=False)},
            batch_size=torch.Size([30]),
            device=cpu,
            is_shared=False),
        done: Tensor(shape=torch.Size([30, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([30, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([30, 4]), device=cpu, dtype=torch.float32, is_shared=False),
                reward: Tensor(shape=torch.Size([30, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([30, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=to

In [52]:
# Logging (torchrl.record): create a CSV logger for the experiment "my_experiment".

from torchrl.record import CSVLogger

logger = CSVLogger(exp_name="my_experiment")

In [53]:
# Log values such as the reward, a loss value or the time elapsed.
# Here: a scalar named "my_scalar" with value 0.4.

logger.log_scalar("my_scalar", 0.4)

In [54]:
# Recording videos: rebuild the environment so that it also returns rendered frames.

from torchrl.envs import GymEnv

# With from_pixels=True and pixels_only=False the rollout contains both a
# "pixels" entry and the regular "observation" entry (see the printed output).
# NOTE: `env` is rebound again here.
env = GymEnv("CartPole-v1", from_pixels=True, pixels_only=False)
print(env.rollout(max_steps=3))

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([3, 2]), device=cpu, dtype=torch.int64, is_shared=False),
        done: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([3, 4]), device=cpu, dtype=torch.float32, is_shared=False),
                pixels: Tensor(shape=torch.Size([3, 400, 600, 3]), device=cpu, dtype=torch.uint8, is_shared=False),
                reward: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.float32, is_shared=False),
                terminated: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                truncated: Tensor(shape=torch.Size([3, 1]), device=cpu, dtype=torch.bool, is_shared=False)},
            batch_size=torch.Size([3]),
            device=None,
            

In [55]:
from torchrl.envs import TransformedEnv
from torchrl.record import VideoRecorder

# The recorder is given the logger and the tag "my_video", then attached to
# the pixel environment through TransformedEnv.
recorder = VideoRecorder(logger, tag="my_video")
record_env = TransformedEnv(env, recorder)

In [None]:
# Roll out the recording environment for 3 steps, then dump what the recorder gathered.
# NOTE: this rebinds `rollout`, which previously held the Pendulum trajectory.
rollout = record_env.rollout(max_steps=3)
recorder.dump() # The video format can be changed in CSVLogger
