In [116]:
from collections import defaultdict
from typing import Optional

import numpy as np
import torch
import tqdm
from tensordict.nn import TensorDictModule
from tensordict.tensordict import TensorDict, TensorDictBase
from torch import nn

from torchrl.data import BoundedTensorSpec, CompositeSpec, UnboundedContinuousTensorSpec
from torchrl.envs import (
    CatTensors,
    EnvBase,
    Transform,
    TransformedEnv,
    UnsqueezeTransform,
)
from torchrl.envs.transforms.transforms import _apply_to_composite
from torchrl.envs.utils import check_env_specs, step_mdp

DEFAULT_X = np.pi
DEFAULT_Y = 1.0

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [117]:
thetas_count = 6
pose_count = 6
error_done_threshold = 1e-3
weights = torch.Tensor([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [118]:
import numpy as np
import random
import torch
from pytorch3d import transforms
import math
from linguamechanica.kinematics import DifferentiableOpenChainMechanism
from linguamechanica.kinematics import UrdfRobotLibrary

urdf_robot = UrdfRobotLibrary.dobot_cr5()
open_chain = urdf_robot.extract_open_chains(0.3)[-1]

def force_parameters_within_bounds(thetas):
    bigger_than_pi = thetas > math.pi
    thetas[bigger_than_pi] = thetas[bigger_than_pi] - (2.0 * math.pi)
    less_than_minus_pi = thetas < -math.pi
    thetas[less_than_minus_pi] = thetas[less_than_minus_pi] + (2.0 * math.pi)
    return thetas

In [119]:
def compute_reward(thetas, target_pose, weights, error_done_threshold):
    if len(thetas.shape) == 1:
        thetas = thetas.unsqueeze(0)
    if len(target_pose.shape) == 1:
        target_pose = target_pose.unsqueeze(0)
    error_pose = open_chain.compute_error_pose(
        thetas, target_pose
    )
    pose_error = DifferentiableOpenChainMechanism.compute_weighted_error(
        error_pose, weights
    )
    done = pose_error < error_done_threshold
    reward = - pose_error
    return reward, done

In [199]:
def _step(tensordict):
    thetas = tensordict["thetas"]
    theta_deltas = tensordict["action"]
    max_theta_deltas = tensordict["params", "max_theta_deltas"]
    if len(thetas.shape) == 1:
        max_theta_deltas = max_theta_deltas.unsqueeze(1)
    theta_deltas = theta_deltas.clamp(-max_theta_deltas, max_theta_deltas)
    new_thetas = thetas - theta_deltas
    target_pose = tensordict["target_pose"]
    #TODO: I have no idea if this is a good idea or not
    new_thetas = force_parameters_within_bounds(new_thetas)
    reward, done = compute_reward(new_thetas, target_pose, weights, error_done_threshold)
    out = TensorDict(
        {
            "next": {
                "thetas": new_thetas,
                "target_pose": target_pose,
                "params": tensordict["params"],
                "reward": reward,
                "done": done,
            }
        },
        tensordict.shape,
    )
    return out

In [200]:
def uniformly_sample_parameters_within_constraints(open_chain, batch_size):
    samples = []
    for sample_idx in range(batch_size):
        coordinates = []
        for i in range(len(open_chain.joint_limits)):
            # TODO: check if unconstrained works
            coordinates.append(
                random.uniform(
                    open_chain.joint_limits[i][0],
                    open_chain.joint_limits[i][1],
                )
            )
        samples.append(torch.Tensor(coordinates).unsqueeze(0))
    return torch.cat(samples, 0)


In [201]:
def generate_random_target_pose(target_thetas):
    if len(target_thetas.shape) == 1:
        target_thetas = target_thetas.unsqueeze(0)
    target_transformation = open_chain.forward_transformation(
        target_thetas
    )
    target_pose = transforms.se3_log_map(
        target_transformation.get_matrix()
    )
    if target_thetas.shape[0] == 1:
        target_thetas = target_thetas.squeeze(0)
    return target_pose

In [202]:
def _reset(self, tensordict):
    if tensordict is None or tensordict.is_empty():
        # if no tensordict is passed, we generate a single set of hyperparameters
        # Otherwise, we assume that the input tensordict contains all the relevant
        # parameters to get started.
        tensordict = self.gen_params(batch_size=self.batch_size)
    batch_size = 1 if len(tensordict.shape) == 0 else tensordict.shape[0]
    thetas = uniformly_sample_parameters_within_constraints(open_chain, batch_size).to(device=self.device)
    if batch_size == 1:
        thetas = thetas.squeeze(0)
    thetas = force_parameters_within_bounds(thetas)
    #TODO: randommize this better
    target_thetas = thetas + torch.randn(thetas.shape)
    target_thetas = force_parameters_within_bounds(target_thetas)
    target_pose =  generate_random_target_pose(target_thetas)
    #TODO: finish this
    target_pose = (
        torch.rand([*tensordict.shape, pose_count], generator=self.rng, device=self.device)
    )
    out = TensorDict(
        {
            "thetas": thetas,
            "target_pose": target_pose,
            "params": tensordict["params"],
        },
        batch_size=tensordict.shape,
    )
    return out

In [203]:
def _make_spec(self, td_params):
    # Under the hood, this will populate self.output_spec["observation"]
    self.observation_spec = CompositeSpec(
        thetas=BoundedTensorSpec(
            minimum=-torch.ones(thetas_count) * torch.pi,
            maximum= torch.ones(thetas_count) * torch.pi,
            shape=(thetas_count),
            dtype=torch.float32,
        ),
        #TODO: bounds are wrong
        target_pose=BoundedTensorSpec(
            minimum=-torch.ones(thetas_count) * torch.pi,
            maximum= torch.ones(thetas_count) * torch.pi,
            shape=(pose_count),
            dtype=torch.float32,
        ),
        # we need to add the "params" to the observation specs, as we want
        # to pass it at each step during a rollout
        params=make_composite_from_td(td_params["params"]),
        shape=(),
    )
    # since the environment is stateless, we expect the previous output as input.
    # For this, EnvBase expects some state_spec to be available
    self.state_spec = self.observation_spec.clone()
    # action-spec will be automatically wrapped in input_spec when
    # `self.action_spec = spec` will be called supported
    self.action_spec = BoundedTensorSpec(
        minimum=-td_params["params", "max_theta_deltas"],
        maximum=td_params["params", "max_theta_deltas"],
        shape=(thetas_count,),
        dtype=torch.float32,
    )
    self.reward_spec = UnboundedContinuousTensorSpec(shape=(*td_params.shape, 1))


def make_composite_from_td(td):
    # custom funtion to convert a tensordict in a similar spec structure
    # of unbounded values.
    composite = CompositeSpec(
        {
            key: make_composite_from_td(tensor)
            if isinstance(tensor, TensorDictBase)
            else UnboundedContinuousTensorSpec(
                dtype=tensor.dtype, device=tensor.device, shape=tensor.shape
            )
            for key, tensor in td.items()
        },
        shape=td.shape,
    )
    return composite

In [204]:
def _set_seed(self, seed: Optional[int]):
    rng = torch.manual_seed(seed)
    self.rng = rng

In [205]:
def gen_params(g=10.0, batch_size=None) -> TensorDictBase:
    """Returns a tensordict containing the physical parameters such as gravitational force and torque or speed limits."""
    if batch_size is None:
        batch_size = []
    td = TensorDict(
        {
            "params": TensorDict(
                {
                    "max_theta_deltas": 0.1,
                },
                [],
            )
        },
        [],
    )
    if batch_size:
        td = td.expand(batch_size).contiguous()
    return td

In [206]:
class PendulumEnv(EnvBase):
    metadata = {
        "render_modes": ["human", "rgb_array"],
        "render_fps": 30,
    }
    batch_locked = False

    def __init__(self, td_params=None, seed=None, device="cpu"):
        if td_params is None:
            td_params = self.gen_params()

        super().__init__(device=device, batch_size=[])
        self._make_spec(td_params)
        if seed is None:
            seed = torch.empty((), dtype=torch.int64).random_().item()
        self.set_seed(seed)

    # Helpers: _make_step and gen_params
    gen_params = staticmethod(gen_params)
    _make_spec = _make_spec

    # Mandatory methods: _step, _reset and _set_seed
    _reset = _reset
    _step = staticmethod(_step)
    _set_seed = _set_seed

In [None]:
env = PendulumEnv()
check_env_specs(env)

We can have a look at our specs to have a visual representation of the environment
signature:




In [None]:
print("observation_spec:", env.observation_spec)
print("state_spec:", env.state_spec)
print("reward_spec:", env.reward_spec)

We can execute a couple of commands too to check that the output structure
matches what is expected.



In [184]:
td = env.reset()
print("reset tensordict", td)

forward_transformation torch.Size([6, 6]) torch.Size([1, 6, 1])
reset tensordict TensorDict(
    fields={
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        params: TensorDict(
            fields={
                max_theta_deltas: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.float32, is_shared=False)},
            batch_size=torch.Size([]),
            device=cpu,
            is_shared=False),
        target_pose: Tensor(shape=torch.Size([6]), device=cpu, dtype=torch.float32, is_shared=False),
        thetas: Tensor(shape=torch.Size([6]), device=cpu, dtype=torch.float32, is_shared=False)},
    batch_size=torch.Size([]),
    device=cpu,
    is_shared=False)


We can run the :func:`env.rand_step` to generate
an action randomly from the ``action_spec`` domain. A tensordict containing
the hyperparams and the current state **must** be passed since our
environment is stateless. In stateful contexts, ``env.rand_step()`` works
perfectly too.




In [197]:
td = env.rand_step(td)
print("random step tensordict", td)

stuff
torch.Size([10, 6]) torch.Size([10, 6]) tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
compute_error_pose torch.Size([10, 6]) torch.Size([10, 6])
forward_transformation torch.Size([6, 6]) torch.Size([10, 6, 1])
random step tensordict TensorDict(
    fields={
        action: Tensor(shape=torch.Size([10, 6]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                params: TensorDict(
                    fields={
                        max_theta_deltas: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False)},
                    batch_size=torch.Size([10]),
                    device=None,
                    is_shared=False),
                reward: Ten

## Transforming an environment

Writing environment transforms for stateless simulators is slightly more
complicated than for stateful ones: transforming an output entry that needs
to be read at the following iteration requires to apply the inverse transform
before calling :func:`env.step` at the next step.
This is an ideal scenario to showcase all the features of torchrl's
transforms!

For instance, in the following transformed environment we unsqueeze the entries
``["th", "thdot"]`` to be able to stack them along the last
dimension. We also pass them as ``in_keys_inv`` to squeeze them back to their
original shape once they are passed as input in the next iteration.




In [186]:
transformed_env = TransformedEnv(
    env,
    # Unsqueezes the observations that we will concatenate
    UnsqueezeTransform(
        unsqueeze_dim=-1,
        in_keys=["thetas"],
        in_keys_inv=["thetas"],
    ),
)

### Writing custom transforms

TorchRL's transforms may not cover all the operations one wants to execute
after an environment has been executed.
Writing a transform does not require much effort. As for the environment
design, there are two steps in writing a transform:

- Getting the dynamics right (forward and inverse);
- Adapting the environment specs.

A transform can be used in two settings: on its own, it can be used as a
:class:`torch.nn.Module`. It can also be used appended to a
:class:`~torchrl.envs.TransformedEnv`. The structure of the class allows to
customize the behaviour in the different contexts.

A :class:`~torchrl.envs.Transform` skeleton can be summarized as follows:

```
class Transform(nn.Module):
    def forward(self, tensordict):
    def _apply_transform(self, tensordict):
    def _step(self, tensordict):
    def _call(self, tensordict):
    def inv(self, tensordict):
    def _inv_apply_transform(self, tensordict):
```
There are three entry points (:func:`forward`, :func:`_step` and :func:`inv`)
which all receive :class:`tensordict.TensorDict` instances. The first two
will eventually go through the keys indicated by :obj:`Transform.in_keys`
and call :func:`Transform._apply_transform` to each of these. The results will
be written in the entries pointed by :obj:`Transform.out_keys` if provided
(if not the ``in_keys`` will be updated with the transformed values).
If inverse transforms need to be executed, a similar data flow will be
executed but with the :func:`Transform.inv` and
:func:`Transform._inv_apply_transform` methods and across the ``in_keys_inv``
and ``out_keys_inv`` list of keys.
The following figure summarized this flow for environments and replay
buffers.

.. figure:: /_static/img/transforms.png

   Transform API

In some cases, a transform will not work on a subset of keys in a unitary
manner, but will execute some operation on the parent environment or
work with the entire input tensordict.
In those cases, the :func:`_call` and :func:`forward` methods should be
re-written, and the :func:`_apply_transform` method can be skipped.

Let us code new transforms that will compute the ``sine`` and ``cosine``
values of the position angle, as these values are more useful to us to learn
a policy than the raw angle value:



In [187]:
class SinTransform(Transform):
    def _apply_transform(self, obs: torch.Tensor) -> None:
        return obs.sin()

    # _apply_to_composite will execute the observation spec transform across all
    # in_keys/out_keys pairs and write the result in the observation_spec which
    # is of type ``Composite``
    @_apply_to_composite
    def transform_observation_spec(self, observation_spec):
        return BoundedTensorSpec(
            minimum=-1,
            maximum=1,
            shape=observation_spec.shape,
            dtype=observation_spec.dtype,
            device=observation_spec.device,
        )


class CosTransform(Transform):
    def _apply_transform(self, obs: torch.Tensor) -> None:
        return obs.cos()

    # _apply_to_composite will execute the observation spec transform across all
    # in_keys/out_keys pairs and write the result in the observation_spec which
    # is of type ``Composite``
    @_apply_to_composite
    def transform_observation_spec(self, observation_spec):
        return BoundedTensorSpec(
            minimum=-1,
            maximum=1,
            shape=observation_spec.shape,
            dtype=observation_spec.dtype,
            device=observation_spec.device,
        )


t_sin = SinTransform(in_keys=["th"], out_keys=["sin"])
t_cos = CosTransform(in_keys=["th"], out_keys=["cos"])
env.append_transform(t_sin)
env.append_transform(t_cos)

AttributeError: 'PendulumEnv' object has no attribute 'append_transform'

In [None]:
class OnManifodError(Transform):
    def _apply_transform(self, obs: torch.Tensor) -> None:
        return obs.sin()

    # _apply_to_composite will execute the observation spec transform across all
    # in_keys/out_keys pairs and write the result in the observation_spec which
    # is of type ``Composite``
    @_apply_to_composite
    def transform_observation_spec(self, observation_spec):
        return BoundedTensorSpec(
            minimum=-1,
            maximum=1,
            shape=observation_spec.shape,
            dtype=observation_spec.dtype,
            device=observation_spec.device,
        )


class CosTransform(Transform):
    def _apply_transform(self, obs: torch.Tensor) -> None:
        return obs.cos()

    # _apply_to_composite will execute the observation spec transform across all
    # in_keys/out_keys pairs and write the result in the observation_spec which
    # is of type ``Composite``
    @_apply_to_composite
    def transform_observation_spec(self, observation_spec):
        return BoundedTensorSpec(
            minimum=-1,
            maximum=1,
            shape=observation_spec.shape,
            dtype=observation_spec.dtype,
            device=observation_spec.device,
        )


t_sin = SinTransform(in_keys=["th"], out_keys=["sin"])
t_cos = CosTransform(in_keys=["th"], out_keys=["cos"])
env.append_transform(t_sin)
env.append_transform(t_cos)

Concatenates the observations onto an "observation" entry.
del_keys=False ensures that we keep these values for the next
iteration.



In [188]:
cat_transform = CatTensors(
    in_keys=["sin", "cos", "thdot"], dim=-1, out_key="observation", del_keys=False
)
transformed_env.append_transform(cat_transform)

Once more, let us check that our env specs match what is received:



In [189]:
check_env_specs(transformed_env)

ValueError: CatTensor got a list of keys that does not match the keys in observation_spec. Make sure the environment has an observation_spec attribute that includes all the specs needed for CatTensor.

## Executing a rollout

Executing a rollout is a succession of simple steps:

* reset the environment
* while some condition is not met:

  * compute an action given a policy
  * execute a step given this action
  * collect the data
  * make a MDP step

* gather the data and return

These operations have been convinently wrapped in the :func:`EnvBase.rollout`
method, from which we provide a simplified version here below.



In [190]:
def simple_rollout(steps=100):
    # preallocate:
    data = TensorDict({}, [steps])
    # reset
    _data = env.reset()
    for i in range(steps):
        _data["action"] = env.action_spec.rand()
        _data = env.step(_data)
        data[i] = _data
        _data = step_mdp(_data, keep_other=True)
    return data


print("data from rollout:", simple_rollout(100))

forward_transformation torch.Size([6, 6]) torch.Size([1, 6, 1])
stuff
torch.Size([6]) torch.Size([6]) tensor(0.1000)


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

## Batching computations

The last unexplored end of our tutorial is the ability that we have to
batch computations in TorchRL. Because our environment does not
make any assumptions regarding the input data shape, we can seamlessly
execute it over batches of data. Even better: for non-batch-locked
environments such as our Pendulum, we can change the batch size on the fly
without recreating the env.
To do this, we just generate parameters with the desired shape.




In [191]:
batch_size = 10  # number of environments to be executed in batch
td = env.reset(env.gen_params(batch_size=[batch_size]))
print("reset (batch size of 10)", td)
td = env.rand_step(td)
print("rand step (batch size of 10)", td)

forward_transformation torch.Size([6, 6]) torch.Size([10, 6, 1])
reset (batch size of 10) TensorDict(
    fields={
        done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        params: TensorDict(
            fields={
                max_theta_deltas: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False)},
            batch_size=torch.Size([10]),
            device=None,
            is_shared=False),
        target_pose: Tensor(shape=torch.Size([10, 6]), device=cpu, dtype=torch.float32, is_shared=False),
        thetas: Tensor(shape=torch.Size([10, 6]), device=cpu, dtype=torch.float32, is_shared=False)},
    batch_size=torch.Size([10]),
    device=None,
    is_shared=False)
stuff
torch.Size([10, 6]) torch.Size([10, 6]) tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
compute_error_pose torch.Size([10, 6]) torch.Size([10, 6])
forward_transformation torch.Size([6, 6]) torc

executing a rollout with a batch of data requires us to reset the env
out of the rollout function, since we need to define the batch_size
dynamically and this is not supported by :func:`EnvBase.rollout`:




In [192]:
rollout = env.rollout(
    3,
    auto_reset=False,  # we're executing the reset out of the ``rollout`` call
    tensordict=env.reset(env.gen_params(batch_size=[batch_size])),
)
print("rollout of len 3 (batch size of 10):", rollout)

forward_transformation torch.Size([6, 6]) torch.Size([10, 6, 1])
stuff
torch.Size([10, 6]) torch.Size([10, 6]) tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
compute_error_pose torch.Size([10, 6]) torch.Size([10, 6])
forward_transformation torch.Size([6, 6]) torch.Size([10, 6, 1])
stuff
torch.Size([10, 6]) torch.Size([10, 6]) tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
compute_error_pose torch.Size([10, 6]) torch.Size([10, 6])
forward_transformation torch.Size([6, 6]) torch.Size([10, 6, 1])
stuff
torch.Size([10, 6]) torch.Size([10, 6]) tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
compute_error_pose torch.Size([10, 6]) torch.Size([10, 6])
forward_transformation torch.Size([6, 6]) torch.Size([10, 6, 1])
rollout of len 3 (batch size of 10): TensorDict(
    fields={
        action: Tensor(shape=torch.Size([10, 3, 6]), device=cpu, dtype=t

## Training a simple policy

In this example, we will train a simple policy using the reward as a
differentiable objective (i.e. a negative loss).
We will take advantage of the fact that our dynamic system is fully
differentiable to backpropagate through the trajectory return and adjust the
weights of our policy to maximise this value directly. Of course, in many
settings many of the assumptions we make do not hold, such as
differentiability of the system and full access to the underlying mechanics.

Still, this is a very simple example that showcases how a training loop can
be coded with a custom environment in TorchRL.

Let us first write the policy network:




In [194]:
torch.manual_seed(0)
env.set_seed(0)

net = nn.Sequential(
    nn.LazyLinear(64),
    nn.Tanh(),
    nn.LazyLinear(64),
    nn.Tanh(),
    nn.LazyLinear(64),
    nn.Tanh(),
    nn.LazyLinear(thetas_count),
)
policy = TensorDictModule(
    net,
    in_keys=["observation"],
    out_keys=["action"],
)

and our optimizer:




In [195]:
optim = torch.optim.Adam(policy.parameters(), lr=2e-3)

### Training loop

We will successively:

* generate a trajectory
* sum the rewards
* backpropagate through the graph defined by these operations
* clip the gradient norm and make an optimization step
* repeat

At the end of the training loop, we should have a final reward close to 0
which demonstrates that the pendulum is upward and still as desired.




In [196]:
batch_size = 32
pbar = tqdm.tqdm(range(20_000 // batch_size))
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, 20_000)
logs = defaultdict(list)

for _ in pbar:
    init_td = env.reset(env.gen_params(batch_size=[batch_size]))
    rollout = env.rollout(100, policy, tensordict=init_td, auto_reset=False)
    traj_return = rollout["next", "reward"].mean()
    (-traj_return).backward()
    gn = torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
    optim.step()
    optim.zero_grad()
    pbar.set_description(
        f"reward: {traj_return: 4.4f}, "
        f"last reward: {rollout[..., -1]['next', 'reward'].mean(): 4.4f}, gradient norm: {gn: 4.4}"
    )
    logs["return"].append(traj_return.item())
    logs["last_reward"].append(rollout[..., -1]["next", "reward"].mean().item())
    scheduler.step()


def plot():
    import matplotlib
    from matplotlib import pyplot as plt

    is_ipython = "inline" in matplotlib.get_backend()
    if is_ipython:
        from IPython import display

    with plt.ion():
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.plot(logs["return"])
        plt.title("returns")
        plt.xlabel("iteration")
        plt.subplot(1, 2, 2)
        plt.plot(logs["last_reward"])
        plt.title("last reward")
        plt.xlabel("iteration")
        if is_ipython:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        plt.show()


plot()

  0%|                                                                                           | 0/625 [00:00<?, ?it/s]


forward_transformation torch.Size([6, 6]) torch.Size([32, 6, 1])


RuntimeError: TensorDictModule failed with operation
    Sequential(
      (0): LazyLinear(in_features=0, out_features=64, bias=True)
      (1): Tanh()
      (2): LazyLinear(in_features=0, out_features=64, bias=True)
      (3): Tanh()
      (4): LazyLinear(in_features=0, out_features=64, bias=True)
      (5): Tanh()
      (6): LazyLinear(in_features=0, out_features=6, bias=True)
    )
    in_keys=['observation']
    out_keys=['action'].

## Conclusion

In this tutorial, we have learned how to code a stateless environment from
scratch. We touched the subjects of:

* the four essential components that need to be taken care of when coding
  an environment (:func:`step`, :func:`reset", seeding and building specs).
  We saw how these methods and classes interact with the
  :class:`tensordict.TensorDict` class;
* how to test that an environment is properly coded using
  :func:`~torchrl.envs.utils.check_env_specs`;
* How to append transforms in the context of stateless environments and how
  to write custom transformations;
* How to train a policy on a fully differentiable simulator.


