In [1]:
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs import StepCounter
from torchrl.collectors import MultiSyncDataCollector
from torchrl.envs import TransformedEnv
from tensordict.nn import TensorDictModule
from torch import nn
env_maker1 = lambda: TransformedEnv(GymEnv("Pendulum-v1", device="cpu"), StepCounter(max_steps=50))
env_maker2 = lambda: TransformedEnv(GymEnv("Pendulum-v1", device="cpu"), StepCounter(max_steps=50))

policy = TensorDictModule(nn.Linear(3, 1), in_keys=["observation"], out_keys=["action"])
collector = MultiSyncDataCollector(
     create_env_fn=[env_maker1, env_maker2],
     policy=policy,
     total_frames=2000,
     max_frames_per_traj=50,
     frames_per_batch=200,
     init_random_frames=-1,
     reset_at_each_iter=False,
     devices="cpu",
     storing_devices="cpu",
 )

Process Process-1:
Traceback (most recent call last):
  File "/home/kukjin/.conda/envs/darl2/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/kukjin/.conda/envs/darl2/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/kukjin/.conda/envs/darl2/lib/python3.10/site-packages/torchrl/collectors/collectors.py", line 2014, in _main_async_collector
    inner_collector = SyncDataCollector(
  File "/home/kukjin/.conda/envs/darl2/lib/python3.10/site-packages/torchrl/collectors/collectors.py", line 558, in __init__
    raise ValueError(
ValueError: A 'truncated' key is already present in the environment and the 'max_frames_per_traj' argument may conflict with a 'StepCounter' that has already been set. Possible solutions: Set max_frames_per_traj to 0 or remove the StepCounter limit from the environment transforms.


EOFError: 

In [None]:
for i, data in enumerate(collector):
     if i == 2:
         print(data)
         break

In [9]:
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs import StepCounter
from torchrl.collectors import MultiSyncDataCollector
from torchrl.envs import TransformedEnv
from tensordict.nn import TensorDictModule
import torch
from torch import nn
from omegaconf import OmegaConf
from modules.networks.blocks import get_encoder_1d, get_encoder_2d, get_decoder
from modules.networks.mlp import ResidualMLP
from modules.utils import get_activation
from torch.distributions import Normal, Categorical

cfg_path = "/home/kukjin/kukjin/Projects/MultiEnvRL/DARL_transformer/configs/ppo_trainer.yaml"
nn_cfg_path = "/home/kukjin/kukjin/Projects/MultiEnvRL/DARL_transformer/configs/nn/nn.yaml"
ppo_cfg_path = "/home/kukjin/kukjin/Projects/MultiEnvRL/DARL_transformer/configs/ppo/ppo.yaml"
ccnn_cfg_img_path = "/home/kukjin/kukjin/Projects/MultiEnvRL/DARL_transformer/configs/ccnn_img/ccnn_img.yaml"
ccnn_cfg_seq_path = "/home/kukjin/kukjin/Projects/MultiEnvRL/DARL_transformer/configs/ccnn_seq/ccnn_seq.yaml"

cfg = OmegaConf.load(cfg_path)
nn_cfg = OmegaConf.load(nn_cfg_path)
ppo_cfg = OmegaConf.load(ppo_cfg_path)
ccnn_seq_cfg = OmegaConf.load(ccnn_cfg_img_path)
ccnn_img_cfg = OmegaConf.load(ccnn_cfg_seq_path)

cfg.nn = nn_cfg
cfg.ppo = ppo_cfg
cfg.ccnn_seq = ccnn_seq_cfg
cfg.ccnn_img = ccnn_img_cfg

print(cfg)

def bmm_input(b_weight, b_input):
    batch_size, feature_dim = b_input.shape
    bmm = torch.einsum('nfh, nf -> nh', b_weight, b_input) / feature_dim
    return bmm


def bmm_output(b_weight, b_input):
    batch_size, output_dim, shared_output_dim = b_weight.shape
    batch_size, shared_output_dim = b_input.shape
    # [batch_size, 6, 32], [batch_size, 32]
    bmm = torch.einsum('noh, nh -> no', b_weight, b_input)  / shared_output_dim
    return bmm

class Actor(nn.Module):
    def __init__(self, cfg: OmegaConf):
        super().__init__()
        activation_name = cfg.nn.actor_critic.activation
        self.act_func = get_activation(activation_name)()
        self.obs_encoder_1d = get_encoder_1d(cfg)
        self.policy_prob_decoder = get_decoder(cfg)
        self.input_to_hidden = cfg.nn.actor_critic.input_to_hidden
        self.hidden_to_output = cfg.nn.actor_critic.hidden_to_output
        self.use_mlp = cfg.nn.actor_critic.use_mlp
        if self.use_mlp:
            self.res_mlp = ResidualMLP(cfg)
    
    def forward(self, is_continuous, output_dim, x):
        print(f"is_continuous: {is_continuous}")
        print(f"output_dim: {output_dim}")
        
        if len(x.shape) == 1:
            x = x.unsqueeze(0)
            is_continuous = is_continuous.unsqueeze(0)
            output_dim = output_dim.unsqueeze(0)
            
        # x: [batch_size, feature_dim]
        # ? Encoding
        h = self.get_hidden_from_1d_bmm(x)
        # h: [batch_size, d_model]
        h = self.act_func(h) 
        # ? Shared MLP
        if self.use_mlp:
            h = self.act_func(self.res_mlp(h))
        # ? Decoding
        dist, _ = self.get_dist_with_bmm(is_continuous, output_dim, h)
        return dist.sample()
    
    def get_hidden_from_1d_bmm(self, x):
        h = self.obs_encoder_1d(x)
        h = bmm_input(h, x)
        return h
    
    def get_dist_with_bmm(self, is_continuous, output_dim, h):
        out_dim = output_dim[0]
        if is_continuous[0]:
            a_mean_weights = self.policy_mean_decoder(out_dim, h)
            a_logstd_weights = self.policy_logstd_decoder(out_dim, h)
            # a_mean_weights, a_logstd_weights: [batch_size, act_dim, d_model]
            a_mu = bmm_output(a_mean_weights, h) # out: [batch_size, act_dim]
            a_logstd = bmm_output(a_logstd_weights, h) # out: [batch_size, act_dim]
            a_logstd = torch.tanh(a_logstd)
            a_logstd = self.LOG_STD_MIN + 0.5 * (self.LOG_STD_MAX - self.LOG_STD_MIN) * (a_logstd + 1)
            actor_std = a_logstd.exp()
            dist = Normal(a_mu, actor_std)
            return dist, a_mu
        else:
            # generate policy weights
            a_probs_weight = self.policy_prob_decoder(out_dim, h)
            self.policy_prob_weights = a_probs_weight
            logits = bmm_output(a_probs_weight, h) # out: [batch_size, act_dim]
            # get categorical distribution
            dist = Categorical(logits=logits)
            return dist, logits
        
actor = Actor(cfg)



{'defaults': [{'ppo': 'ppo'}, {'nn': 'nn'}, {'ccnn_img': 'ccnn_img'}, {'ccnn_seq': 'ccnn_seq'}, '_self_'], 'hydra': {'run': {'dir': 'outputs/${now:%Y-%m-%d/%H-%M-%S}'}}, 'experiment': {'env_ids': ['CartPole-v1'], 'seed': 42, 'max_episode_steps': 1000, 'num_rollout_steps': 128, 'num_envs': 64, 'total_timesteps': 10000000, 'save_ckpt': False, 'num_checkpoints': 20, 'print_interval': 100, 'stop_after_epochs': 500, 'capture_video': False, 'device': 0, 'cuda': True, 'torch_deterministic': True, 'resume': False, 'resume_update_idx': 0, 'resume_dir': 'None'}, 'evaluation': {'eval_seed': 3142, 'every': 8, 'num_eval': 5, 'num_test_envs': 5}, 'wandb': {'mode': 'online', 'project': 'DomainAgnosticRL', 'entity': None, 'name': None, 'group': None, 'tags': None, 'notes': None}, 'paths': {'dir': 'outputs/${now:%Y-%m-%d/%H-%M-%S}', 'log': 'outputs/${now:%Y-%m-%d/%H-%M-%S}/runs', 'video': 'outputs/${now:%Y-%m-%d/%H-%M-%S}/videos', 'checkpoints': 'outputs/${now:%Y-%m-%d/%H-%M-%S}/checkpoints', 'src': 'o

In [10]:
from torchrl.envs.libs.gym import GymEnv
env = GymEnv("CartPole-v1", device="cpu")
td = env.reset()
td

TensorDict(
    fields={
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        is_continuous: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([4]), device=cpu, dtype=torch.float32, is_shared=False),
        output_dim: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, is_shared=False)},
    batch_size=torch.Size([]),
    device=cpu,
    is_shared=False)

In [11]:
from collections import defaultdict

import matplotlib.pyplot as plt
import torch
from tensordict.nn import TensorDictModule
from tensordict.nn.distributions import NormalParamExtractor
from torch import nn

from torchrl.collectors import SyncDataCollector
from torchrl.data.replay_buffers import ReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.envs import (
    Compose,
    DoubleToFloat,
    ObservationNorm,
    StepCounter,
    TransformedEnv,
)
from torchrl.envs.libs.gym import GymEnv
from torchrl.envs.utils import check_env_specs, ExplorationType, set_exploration_type
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value import GAE
from tqdm import tqdm

In [12]:
policy = TensorDictModule(actor, in_keys=["is_continuous", "output_dim", "observation"], out_keys=["action"])

In [13]:
policy(td)

is_continuous: False
output_dim: tensor([2])


TensorDict(
    fields={
        action: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, is_shared=False),
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        is_continuous: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([4]), device=cpu, dtype=torch.float32, is_shared=False),
        output_dim: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, is_shared=False)},
    batch_size=torch.Size([]),
    device=cpu,
    is_shared=False)

In [14]:
td['action']

tensor([0])

In [15]:
td['action'] = policy(td)


is_continuous: False
output_dim: tensor([2])


In [16]:
collector = SyncDataCollector(
    env,
    policy,
    frames_per_batch=32,
    total_frames=200,
    split_trajs=False,
    device='cpu',
)

debug 1
TensorDict(
    fields={
        done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),
        is_continuous: Tensor(shape=torch.Size([]), device=cpu, dtype=torch.bool, is_shared=False),
        observation: Tensor(shape=torch.Size([4]), device=cpu, dtype=torch.float32, is_shared=False),
        output_dim: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.int64, is_shared=False)},
    batch_size=torch.Size([]),
    device=cpu,
    is_shared=False)
is_continuous: None
output_dim: None




KeyError: "Some tensors that are necessary for the module call may not have not been found in the input tensordict: the following inputs are None: {'output_dim', 'is_continuous'}."

In [28]:
env_maker1 = lambda: TransformedEnv(GymEnv("Pendulum-v1", device="cpu"))
env_maker2 = lambda: TransformedEnv(GymEnv("HalfCheetah-v4", device="cpu"))
# env_maker2 = lambda: TransformedEnv(GymEnv("Pendulum-v1", device="cpu"))

policy = TensorDictModule(actor, in_keys=["is_continuous", "output_dim", "observation"], out_keys=["action"])
# policy = TensorDictModule(nn.Linear(3, 1), in_keys=["observation"], out_keys=["action"])

collector = MultiSyncDataCollector(
    create_env_fn=[env_maker1, env_maker2],
    policy=None,
    total_frames=100,
    max_frames_per_traj=50,
    frames_per_batch=64,
    init_random_frames=-1,
    reset_at_each_iter=False,
    devices="cpu",
    storing_devices="cpu",
)

for i, data in enumerate(collector):
    print(data)
    if i == 2:
        break



ValueError: env must be provided to _get_policy_and_device if policy is None