In [1]:
import torch

torch.manual_seed(0)

import time

from torchrl.envs import GymEnv, StepCounter, TransformedEnv

env = TransformedEnv(GymEnv("CartPole-v1"), StepCounter())
env.set_seed(0)

from tensordict.nn import TensorDictModule as Mod, TensorDictSequential as Seq

In [2]:
# Dessign the Policy

from torchrl.modules import EGreedyModule, MLP, QValueModule

value_mlp = MLP(out_features=env.action_spec.shape[-1], num_cells=[64, 64])
value_net = Mod(value_mlp, in_keys=["observation"], out_keys=["action_value"])
policy = Seq(value_net, QValueModule(spec=env.action_spec))

exploration_module = EGreedyModule(
    env.action_spec, annealing_num_steps=100_000, eps_init=0.5, eps_end=0.01
)

policy_explore = Seq(policy, exploration_module)

In [3]:
from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyTensorStorage, ReplayBuffer

init_rand_steps = 5000
frames_per_batch = 100
optim_steps = 10
collector = SyncDataCollector(
    env,
    policy_explore,
    frames_per_batch=frames_per_batch,
    total_frames=-1,
    init_random_frames=init_rand_steps,
)
rb = ReplayBuffer(storage=LazyTensorStorage(100_000))

from torch.optim import Adam


In [4]:
from torchrl.objectives import DQNLoss, SoftUpdate

loss = DQNLoss(value_network=policy, action_space=env.action_spec, delay_value=True)
optim = Adam(loss.parameters(), lr=0.02)
updater = SoftUpdate(loss, eps=0.99)

In [6]:
# train until reaching 200 steps in the environment

total_count = 0
total_episodes = 0
t0 = time.time()

for i, data in enumerate(collector):
    rb.extend(data)
    max_length = rb[:]["next", "step_count"].max()
    if len(rb) > init_rand_steps:
        for _ in range(optim_steps):
            sample = rb.sample(128)
            loss_vals = loss(sample)
            loss_vals["loss"].backward()
            optim.step()
            optim.zero_grad()

            exploration_module.step(data.numel())

            updater.step()

            if i % 10 == 0:
                torchrl_logger.info(f"Max num steps: {max_length}, rb length {len(rb)}")
            
            total_count += data.numel()
            total_episodes += data["next", "done"].sum()
    if max_length > 250:
        break

t1 = time.time()

torchrl_logger.info(
    f"solved after {total_count} steps, {total_episodes} episodes and in {t1-t0}s."
)

2025-09-26 12:00:23,215 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,218 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,221 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,223 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,226 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,230 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,233 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,236 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,238 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,241 [torchrl][INFO] Max num steps: 100, rb length 5100
2025-09-26 12:00:23,840 [torchrl][INFO] Max num steps: 133, rb length 6100
2025-09-26 12:00:23,843 [torchrl][INFO] Max num steps: 133, rb length 6100
2025-09-26 12:00:23,846 [torchrl][INFO] Max num steps: 133, rb length 6100
2025-09-26 12:00:23,848 [