# Bug report on Recorder

In [None]:


recorder = Recorder(
    record_interval=100,  # log every 100 optimization steps
    record_frames=1000,  # maximum number of frames in the record
    frame_skip=1,
    policy_exploration=actor_explore,
    environment=env,
    exploration_type=ExplorationType.MODE,
    log_keys=[("next", "reward")],
    out_keys={("next", "reward"): "rewards"},
    log_pbar=True,
)


# Play with torchRL

In [244]:
from torchrl.data import BinaryDiscreteTensorSpec, DiscreteTensorSpec, OneHotDiscreteTensorSpec

In [237]:
binary = BinaryDiscreteTensorSpec(n=3, shape=(2,3))

In [246]:
OneHotDiscreteTensorSpec(n=9, shape=(2, 9)).rand()

tensor([[False, False, False, False, False, False, False, False,  True],
        [False,  True, False, False, False, False, False, False, False]])

In [238]:
binary.rand()

tensor([[0, 0, 1],
        [1, 0, 0]])

In [52]:
binary.dtype

torch.int64

In [234]:
discrete = DiscreteTensorSpec(n=3, shape=(2,))

In [235]:
for _ in range(10):
    print(discrete.rand())

tensor([0, 1])
tensor([1, 2])
tensor([0, 0])
tensor([1, 2])
tensor([0, 2])
tensor([1, 1])
tensor([1, 2])
tensor([2, 0])
tensor([2, 2])
tensor([0, 1])


In [55]:
discrete.dtype

torch.int64

In [56]:
from torchrl.envs.libs.gym import GymEnv
env = GymEnv("CliffWalking-v0")

In [57]:
env.done_spec

DiscreteTensorSpec(
    shape=torch.Size([1]),
    space=DiscreteBox(n=2),
    device=cpu,
    dtype=torch.bool,
    domain=discrete)

In [58]:
env.observation_spec["observation"]

OneHotDiscreteTensorSpec(
    shape=torch.Size([48]),
    space=DiscreteBox(n=48),
    device=cpu,
    dtype=torch.int64,
    domain=discrete)

In [59]:
env.observation_spec.keys()

_CompositeSpecKeysView(keys=['observation'])

In [60]:
env.action_spec

OneHotDiscreteTensorSpec(
    shape=torch.Size([4]),
    space=DiscreteBox(n=4),
    device=cpu,
    dtype=torch.int64,
    domain=discrete)

In [61]:
import torch
from mcts import AlphaZeroMCTS
from unittest.mock import MagicMock
from mcts import TensorDictTree, MCTSNode, AlphaZeroMCTS, PUCTPolicy

In [64]:
nnet = MagicMock(return_value={"action_values": torch.Tensor([1 / 4 for _ in range(4)])})
tree = TensorDictTree("done", nnet)
alpha_env = AlphaZeroMCTS(
    env,
    nnet,
    tree,
)


In [65]:
alpha_env.reset()

TensorDict(
    fields={
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([48]), device=cpu, dtype=torch.int64, is_shared=False)},
    batch_size=torch.Size([]),
    device=cpu,
    is_shared=False)

In [12]:
path = env.rollout(3)

In [14]:
path[0]["action"]

tensor([0, 0, 1, 0])

TD Lambda 

In [5]:
import torch
from torch import nn
from tensordict import TensorDict
from tensordict.nn import TensorDictModule
from torchrl.objectives.value import TD0Estimator, TDLambdaEstimator

In [2]:

value_net = TensorDictModule(
    nn.Linear(3, 1), in_keys=["obs"], out_keys=["state_value"]
)

module = TD0Estimator(
    gamma=1,
    value_network=None,
)

obs, next_obs = torch.randn(2, 1, 10, 3)
reward = torch.randn(1, 10, 1)
done = torch.zeros(1, 10, 1, dtype=torch.bool)
terminated = torch.zeros(1, 10, 1, dtype=torch.bool)
tensordict = TensorDict(
    {
        "obs": obs, 
        "next": {
            "obs": next_obs, 
            "done": done, 
            "terminated": terminated, 
            "reward": reward
        }
    }, 
    [1, 10]
)
# module(tensordict)

In [3]:
# state_value = torch.Tensor([0]).to(torch.float32)
reward = torch.Tensor([[1], [1]]).to(torch.float32)
done = torch.Tensor([[0], [1]]).to(torch.bool)
next_value = torch.Tensor([[1], [1]]).to(torch.float32)

current_state = TensorDict(
    {
        # "state_value": state_value,
        "next": {
            "reward": reward,
            "done": done
        }
    },
    batch_size=(2,)
)

module = TD0Estimator(
    gamma=0.8,
    value_network=None,
)

module.value_estimate(current_state, next_value=next_value)

tensor([[1.8000],
        [1.0000]])

In [4]:
# lambda = 1 -> MC values
# lambda = 0 -> TD0
# Requirements: 1) next_value, 2) reward, 3) done 
# shape is B x T x F

In [65]:
estimator = TDLambdaEstimator(
    gamma=0.8,
    lmbda=0.9,
    value_network=None,
)

current_state_value = torch.Tensor([[0., 1., 2.0]])
next_state_value = torch.Tensor([[0.0, 1.0, 2.0]])
done = torch.Tensor([[0, 0, 1]]).to(torch.bool)
reward = torch.Tensor([[0.0, 0.0, 1.0]])

keys = estimator.tensor_keys

episode = TensorDict(
    {
        # keys.value: current_state_value,
        "next": {
            keys.value: next_state_value,
            keys.done: done,  
            keys.reward: reward,
        },
    }, 
    batch_size=current_state_value.shape,
)

res = estimator.value_estimate(episode)
print(res)

print("value target: " + str(res))

tensor([[0.5760, 0.8000, 1.0000]])
value target: tensor([[0.5760, 0.8000, 1.0000]])


In [66]:
a = ()
print(type(a))

<class 'tuple'>


# Playing with a toy problem

In [18]:
from torchrl.envs.libs.gym import GymEnv

In [19]:
env = GymEnv("CliffWalking-v0", render_mode="rgb_array")

In [190]:
print("action space", env.action_spec)

action space OneHotDiscreteTensorSpec(
    shape=torch.Size([4]),
    space=DiscreteBox(n=4),
    device=cpu,
    dtype=torch.int64,
    domain=discrete)


In [189]:
state = env.reset()
print("state td:", state)
print("observation: ", state["observation"].reshape(-1, 12))

state["action"] = torch.Tensor([0]).to(torch.int64)
next_state = env.step(state)
print("next state: ", next_state)
print("next state observation: ", next_state[("next", "observation")].reshape(-1, 12))

state = next_state["next"]
print("new state: ", state)
state["action"] = torch.Tensor([0, 1, 0, 0]).to(torch.int64)
next_state = env.step(state)
print("new next state: ", next_state)
print("new next observation: ", next_state[("next", "observation")].reshape(-1, 12))


state td: TensorDict(
    fields={
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([48]), device=cpu, dtype=torch.int64, is_shared=False)},
    batch_size=torch.Size([]),
    device=cpu,
    is_shared=False)
observation:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
next state:  TensorDict(
    fields={
        action: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, is_shared=False),
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([48]), device=cpu, dtype=torch.int64, is_shared=False),
                reward:

In [216]:
state = env.reset()
print(env.action_spec.rand().shape)

while True:
    print("state: ", state["observation"].reshape(-1, 12))
    action = int(input())
    if action == -1:
        break
    else:
        env_action = torch.zeros(4).to(torch.int64)
        env_action[action] = 1
        state["action"] = env_action
        state = env.step(state)["next"]
        print("reward: ", state["reward"])
        print("done: ", state["done"])

torch.Size([4])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 0


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 2


reward:  tensor([-1.])
done:  tensor([True])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])


 1


reward:  tensor([-1.])
done:  tensor([True])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])


 0


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 1


reward:  tensor([-1.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 2


reward:  tensor([-1.])
done:  tensor([True])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])


 2


reward:  tensor([-1.])
done:  tensor([True])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])


 3


reward:  tensor([-100.])
done:  tensor([False])
state:  tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


 -1


In [None]:
# rollout = env.rollout(max_steps=100000)
print(rollout)

In [52]:
num_states = env.observation_spec["observation"].shape[0]
num_actions = env.action_spec.shape[0]

In [139]:
import torch

state_idx = torch.Tensor([1, 3]).to(torch.int64)
state_input = torch.nn.functional.one_hot(state_idx, num_states)
print(state_input.size())

class CastTo(torch.nn.Module):
    def __init__(self, dtype):
        super(CastTo, self).__init__()
        self.dtype = dtype
    def forward(self, input):
        return input.to(self.dtype)

to_action = torch.nn.Linear(num_states, num_actions)
to_float32 = CastTo(torch.float32)

qvalue = torch.nn.Sequential(to_float32, to_action)

print(qvalue(state_input))

torch.Size([2, 48])
tensor([[-0.2153,  0.0463,  0.1416, -0.0828],
        [-0.1546, -0.0262,  0.1625, -0.0649]], grad_fn=<AddmmBackward0>)


In [150]:
from tensordict.nn import TensorDictModule, TensorDictSequential
from torchrl.modules import QValueModule

qvalue_network = TensorDictModule(qvalue, in_keys=["observation"], out_keys=["action_value"])
qvalue_module = QValueModule(action_space="one_hot")

actor = TensorDictSequential(qvalue_network, qvalue_module)

In [173]:
res = qvalue_network(env.rollout(max_steps=10))
print(res)
print(res["action"])

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([10, 4]), device=cpu, dtype=torch.int64, is_shared=False),
        action_value: Tensor(shape=torch.Size([10, 4]), device=cpu, dtype=torch.float32, is_shared=False),
        done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),
                observation: Tensor(shape=torch.Size([10, 48]), device=cpu, dtype=torch.int64, is_shared=False),
                reward: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False)},
            batch_size=torch.Size([10]),
            device=cpu,
            is_shared=False),
        observation: Tensor(shape=torch.Size([10, 48]), device=cpu, dtype=torch.int64, is_shared=False)},
    batch_size=torch.Size([10]),
    device=cpu,
    is_shared=False)
tensor([[0, 0, 0, 1],
      

In [152]:
from torchrl.envs import ParallelEnv

In [153]:
penv = ParallelEnv(2, lambda: GymEnv("CliffWalking-v0"))

In [154]:
res = penv.rollout(max_steps=10, policy=actor)

In [175]:
for i in range(10):
    print(res[i]["observation"].reshape(-1, 12))
    print(res[i]["action"])

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([0, 0, 0, 1])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([1, 0, 0, 0])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([1, 0, 0, 0])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([0, 1, 0, 0])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [176]:
env.action_spec.rand()

tensor([1, 0, 0, 0])

In [182]:
torch.max(torch.Tensor([1, 2, 3, 4]), dim=-1 , keepdim=False)

torch.return_types.max(
values=tensor(4.),
indices=tensor(3))

# understanding different modules

In [184]:
import torch
from tensordict import TensorDict
from torch import nn
from torchrl.data import OneHotDiscreteTensorSpec
from torchrl.modules.tensordict_module.actors import QValueActor

td = TensorDict({'observation': torch.randn(5, 4)}, [5])
# with a regular nn.Module
module = nn.Linear(4, 3)
action_spec = OneHotDiscreteTensorSpec(3)
qvalue_actor = QValueActor(module=module, spec=action_spec)
td = qvalue_actor(td)
print(td)

TensorDict(
    fields={
        action: Tensor(shape=torch.Size([5, 3]), device=cpu, dtype=torch.int64, is_shared=False),
        action_value: Tensor(shape=torch.Size([5, 3]), device=cpu, dtype=torch.float32, is_shared=False),
        chosen_action_value: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.float32, is_shared=False),
        observation: Tensor(shape=torch.Size([5, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
    batch_size=torch.Size([5]),
    device=None,
    is_shared=False)


In [185]:
print(td["action"])

tensor([[1, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 0, 1]])


In [186]:
print(td["action_value"])

tensor([[ 1.4383,  0.4109, -0.6570],
        [ 0.3302,  0.5247, -0.9097],
        [ 0.6031, -0.4730,  0.3791],
        [-0.2069, -0.0556,  0.0531],
        [-0.4058, -0.2364, -0.0673]], grad_fn=<AddmmBackward0>)


In [187]:
print(td["chosen_action_value"])

tensor([[ 1.4383],
        [ 0.5247],
        [ 0.6031],
        [ 0.0531],
        [-0.0673]], grad_fn=<SumBackward1>)


In [None]:
# QValueModule: action_value -> action
# QValueActor: obseration -> module -> action_value -> action