diff --git a/ding/example/bcq.py b/ding/example/bcq.py index 4bd1385c3f..d0114d120e 100755 --- a/ding/example/bcq.py +++ b/ding/example/bcq.py @@ -15,7 +15,7 @@ def main(): # If you don't have offline data, you need to prepare if first and set the data_path in config - # For demostration, we also can train a RL policy (e.g. SAC) and collect some data + # For demonstration, we also can train a RL policy (e.g. SAC) and collect some data logging.getLogger().setLevel(logging.INFO) cfg = compile_config(main_config, create_cfg=create_config, auto=True) ding_init(cfg) diff --git a/ding/policy/a2c.py b/ding/policy/a2c.py index 6e05f4e712..2d2f116afc 100644 --- a/ding/policy/a2c.py +++ b/ding/policy/a2c.py @@ -1,11 +1,12 @@ -from typing import List, Dict, Any, Tuple, Union from collections import namedtuple +from typing import List, Dict, Any, Tuple + import torch +from ding.model import model_wrap from ding.rl_utils import a2c_data, a2c_error, get_gae_with_default_last_value, get_train_sample, \ - a2c_error_continuous + a2c_error_continuous from ding.torch_utils import Adam, to_device -from ding.model import model_wrap from ding.utils import POLICY_REGISTRY, split_data_generator from ding.utils.data import default_collate, default_decollate from .base_policy import Policy @@ -14,68 +15,95 @@ @POLICY_REGISTRY.register('a2c') class A2CPolicy(Policy): - r""" + """ Overview: - Policy class of A2C algorithm. + Policy class of A2C (Advantage Actor-Critic) algorithm, proposed in https://arxiv.org/abs/1602.01783. """ config = dict( - # (string) RL policy register name (refer to function "register_policy"). + # (str) Name of the registered RL policy (refer to the "register_policy" function). type='a2c', - # (bool) Whether to use cuda for network. + # (bool) Flag to enable CUDA for model computation. cuda=False, - # (bool) whether use on-policy training pipeline(behaviour policy and training policy are the same) - on_policy=True, # for a2c strictly on policy algorithm, this line should not be seen by users + # (bool) Flag for using on-policy training (training policy is the same as the behavior policy). + on_policy=True, + # (bool) Flag for enabling priority experience replay. Must be False when priority_IS_weight is False. priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Flag for using Importance Sampling weights to correct updates. Requires `priority` to be True. priority_IS_weight=False, - # (str) Which kind of action space used in PPOPolicy, ['discrete', 'continuous'] + # (str) Type of action space used in the policy, with valid options ['discrete', 'continuous']. action_space='discrete', + # learn_mode configuration learn=dict( - - # (int) for a2c, update_per_collect must be 1. - update_per_collect=1, # fixed value, this line should not be modified by users + # (int) Number of updates per data collection. A2C requires this to be set to 1. + update_per_collect=1, + # (int) Batch size for learning. batch_size=64, + # (float) Learning rate for optimizer. learning_rate=0.001, - # (List[float]) + # (Tuple[float, float]) Coefficients used for computing running averages of gradient and its square. betas=(0.9, 0.999), - # (float) + # (float) Term added to the denominator to improve numerical stability in optimizer. eps=1e-8, - # (float) + # (float) Maximum norm for gradients. 
grad_norm=0.5, - # ============================================================== - # The following configs is algorithm-specific - # ============================================================== - # (float) loss weight of the value network, the weight of policy network is set to 1 + # (float) Scaling factor for value network loss relative to policy network loss. value_weight=0.5, - # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 + # (float) Weight of entropy regularization in the loss function. entropy_weight=0.01, - # (bool) Whether to normalize advantage. Default to False. + # (bool) Flag to enable normalization of advantages. adv_norm=False, + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. ignore_done=False, ), + # collect_mode configuration collect=dict( - # (int) collect n_sample data, train model n_iteration times - # n_sample=80, + # (int) The length of rollout for data collection. unroll_len=1, - # ============================================================== - # The following configs is algorithm-specific - # ============================================================== - # (float) discount factor for future reward, defaults int [0, 1] + # (float) Discount factor for calculating future rewards, typically in the range [0, 1]. discount_factor=0.9, - # (float) the trade-off factor lambda to balance 1step td and mc + # (float) Trade-off parameter for balancing TD-error and Monte Carlo error in GAE. gae_lambda=0.95, ), + # eval_mode configuration (kept empty for compatibility purposes) eval=dict(), ) def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Returns the default model configuration used by the A2C algorithm. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): \ + Tuple containing the registered model name and model's import_names. + """ return 'vac', ['ding.model.template.vac'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init the optimizer, algorithm config, main and target models. + Initialize the learn mode of policy, including related attributes and modules. For A2C, it mainly \ + contains optimizer, algorithm-specific arguments such as value_weight, entropy_weight, adv_norm + and grad_norm, and main model. \ + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. 
note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ assert self._cfg.action_space in ["continuous", "discrete"] # Optimizer @@ -98,15 +126,32 @@ def _init_learn(self) -> None: self._learn_model = model_wrap(self._model, wrapper_name='base') self._learn_model.reset() - def _forward_learn(self, data: dict) -> Dict[str, Any]: - r""" + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as policy_loss, value_loss, entropy_loss. Arguments: - - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs','adv'] + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in the list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For A2C, each element in the list is a dict containing at least the following keys: \ + ['obs', 'action', 'adv', 'value', 'weight']. Returns: - - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss. + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not support \ + it. You can implement your own model rather than use the default model. For more information, please \ + raise an issue in GitHub repo, and we will continue to follow up. """ + # Data preprocessing operations, such as stack data, cpu to cuda device data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=False) if self._cuda: data = to_device(data, self._device) @@ -135,7 +180,6 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # ==================== # A2C-learning update # ==================== - self._optimizer.zero_grad() total_loss.backward() @@ -160,22 +204,44 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: } def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. + Returns: + - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring. + """ return { 'model': self._learn_model.state_dict(), 'optimizer': self._optimizer.state_dict(), } def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: + """ + Overview: + Load the state_dict variable into policy learn mode. + Arguments: + - state_dict (:obj:`Dict[str, Any]`): The dict of policy learn state saved before. 
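# A minimal, self-contained sketch (not the DI-engine implementation) of how the A2C loss
# terms described in the config and ``_forward_learn`` docstring above are typically combined:
# a policy-gradient term weighted by (optionally normalized) advantages, a value regression
# term scaled by `value_weight`, and an entropy bonus scaled by `entropy_weight`.
# All tensor names here are illustrative assumptions, not names from this file.
import torch
import torch.nn.functional as F


def a2c_loss_sketch(logit, action, adv, value, return_, value_weight=0.5, entropy_weight=0.01, adv_norm=False):
    if adv_norm:
        # Normalize advantages across the batch for more stable policy-gradient updates.
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)
    dist = torch.distributions.Categorical(logits=logit)
    log_prob = dist.log_prob(action)
    policy_loss = -(log_prob * adv.detach()).mean()
    value_loss = F.mse_loss(value, return_)
    entropy = dist.entropy().mean()
    # Total objective: policy loss plus weighted value loss minus weighted entropy bonus.
    return policy_loss + value_weight * value_loss - entropy_weight * entropy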
+ + .. tip:: + If you want to only load some parts of model, you can simply set the ``strict`` argument in \ + load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ + complicated operation. + """ self._learn_model.load_state_dict(state_dict['model']) self._optimizer.load_state_dict(state_dict['optimizer']) def _init_collect(self) -> None: - r""" - Overview: - Collect mode init method. Called by ``self.__init__``. - Init traj and unroll length, collect model. """ + Overview: + Initialize the collect mode of policy, including related attributes and modules. For A2C, it contains the \ + collect_model to balance the exploration and exploitation with ``reparam_sample`` or \ + ``multinomial_sample`` mechanism, and other algorithm-specific arguments such as gamma and gae_lambda. \ + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + """ assert self._cfg.action_space in ["continuous", "discrete"] self._unroll_len = self._cfg.collect.unroll_len @@ -189,17 +255,19 @@ def _init_collect(self) -> None: self._gamma = self._cfg.collect.discount_factor self._gae_lambda = self._cfg.collect.gae_lambda - def _forward_collect(self, data: dict) -> dict: - r""" + def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ Overview: - Forward function of collect mode. + Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \ + that the policy gets some necessary data (mainly observation) from the envs and then returns the output \ + data, such as the action to interact with the envs. Arguments: - - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ - values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. Returns: - - output (:obj:`Dict[int, Any]`): Dict type data, including at least inferred action according to input obs. - ReturnsKeys - - necessary: ``action`` + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ + other necessary data for learn mode defined in ``self._process_transition`` method. The key of the \ + dict is the same as the input data, i.e. environment id. """ data_id = list(data.keys()) data = default_collate(list(data.values())) @@ -213,51 +281,68 @@ def _forward_collect(self, data: dict) -> dict: output = default_decollate(output) return {i: d for i, d in zip(data_id, output)} - def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: - r""" + def _process_transition(self, obs: Any, policy_output: Dict[str, torch.Tensor], + timestep: namedtuple) -> Dict[str, torch.Tensor]: + """ Overview: - Generate dict type transition data from inputs. + Process and pack one timestep transition data into a dict, which can be directly used for training and \ + saved in replay buffer. For A2C, it contains obs, next_obs, action, value, reward, done. 
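# A short illustrative sketch (not DI-engine's ``get_gae_with_default_last_value``) of the
# Generalized Advantage Estimation recursion that the `discount_factor` (gamma) and
# `gae_lambda` collect settings above parameterize:
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),  adv_t = delta_t + gamma * lambda * adv_{t+1}.
import torch


def gae_sketch(rewards, values, next_values, dones, gamma=0.9, gae_lambda=0.95):
    # rewards, values, next_values, dones: 1D tensors of length T (one trajectory).
    adv = torch.zeros_like(rewards)
    last_adv = 0.0
    for t in reversed(range(rewards.shape[0])):
        not_done = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_values[t] * not_done - values[t]
        last_adv = delta + gamma * gae_lambda * not_done * last_adv
        adv[t] = last_adv
    return adv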
Arguments: - - obs (:obj:`Any`): Env observation - - model_output (:obj:`dict`): Output of collect model, including at least ['action'] - - timestep (:obj:`namedtuple`): Output after env step, including at least ['obs', 'reward', 'done'] \ - (here 'obs' indicates obs after env step). + - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. + - policy_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network with the observation \ + as input. For A2C, it contains the action and the value of the state. + - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step method, \ + except all the elements have been transformed into tensor data. Usually, it contains the next obs, \ + reward, done, info, etc. Returns: - - transition (:obj:`dict`): Dict type transition data. + - transition (:obj:`Dict[str, torch.Tensor]`): The processed transition data of the current timestep. """ transition = { 'obs': obs, 'next_obs': timestep.obs, - 'action': model_output['action'], - 'value': model_output['value'], + 'action': policy_output['action'], + 'value': policy_output['value'], 'reward': timestep.reward, 'done': timestep.done, } return transition - def _get_train_sample(self, data: list) -> Union[None, List[Any]]: - r""" + def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ Overview: - Get the trajectory and the n step return data, then sample from the n_step return data + For a given trajectory (transitions, a list of transition) data, process it into a list of sample that \ + can be used for training directly. In A2C, a train sample is a processed transition. \ + This method is usually used in collectors to execute necessary \ + RL data preprocessing before training, which can help the learner amortize relevant time consumption. \ + In addition, you can also implement this method as an identity function and do the data processing \ + in ``self._forward_learn`` method. Arguments: - - data (:obj:`list`): The trajectory's buffer list + - transitions (:obj:`List[Dict[str, Any]`): The trajectory data (a list of transition), each element is \ + in the same format as the return value of ``self._process_transition`` method. Returns: - - samples (:obj:`dict`): The training samples generated + - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is similar in format \ + to input transitions, but may contain more data for training, such as advantages. """ - data = get_gae_with_default_last_value( - data, - data[-1]['done'], + transitions = get_gae_with_default_last_value( + transitions, + transitions[-1]['done'], gamma=self._gamma, gae_lambda=self._gae_lambda, cuda=self._cuda, ) - return get_train_sample(data, self._unroll_len) + return get_train_sample(transitions, self._unroll_len) def _init_eval(self) -> None: - r""" + """ Overview: - Evaluate mode init method. Called by ``self.__init__``. - Init eval model with argmax strategy. + Initialize the eval mode of policy, including related attributes and modules. For A2C, it contains the \ + eval model to greedily select action with ``argmax_sample`` mechanism (For discrete action space) and \ + ``deterministic_sample`` mechanism (For continuous action space). \ + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. 
note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. """ assert self._cfg.action_space in ["continuous", "discrete"] self._action_space = self._cfg.action_space @@ -267,17 +352,24 @@ def _init_eval(self) -> None: self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') self._eval_model.reset() - def _forward_eval(self, data: dict) -> dict: - r""" + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ Overview: - Forward function of eval mode, similar to ``self._forward_collect``. + Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. Arguments: - - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ - values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. Returns: - - output (:obj:`Dict[int, Any]`): The dict of predicting action for the interaction with env. - ReturnsKeys - - necessary: ``action`` + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e., environment id. + + .. note:: + The input value can be ``torch.Tensor`` or dict/list combinations, current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not \ + support it. You can implement your own model rather than use the default model. For more information, \ + please raise an issue in GitHub repo, and we will continue to follow up. """ data_id = list(data.keys()) data = default_collate(list(data.values())) @@ -292,4 +384,11 @@ def _forward_eval(self, data: dict) -> dict: return {i: d for i, d in zip(data_id, output)} def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ return super()._monitor_vars_learn() + ['policy_loss', 'value_loss', 'entropy_loss', 'adv_abs_max', 'grad_norm'] diff --git a/ding/policy/acer.py b/ding/policy/acer.py index 7ac4db7753..319b2fe814 100644 --- a/ding/policy/acer.py +++ b/ding/policy/acer.py @@ -47,11 +47,11 @@ class ACERPolicy(Policy): config = dict( type='acer', cuda=False, - # (bool) whether use on-policy training pipeline(behaviour policy and training policy are the same) + # (bool) whether to use on-policy training pipeline (behaviour policy and training policy are the same) # here we follow ppo serial pipeline, the original is False on_policy=False, priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Whether to use Importance Sampling Weight to correct biased update. If True, priority must be True. 
priority_IS_weight=False, learn=dict( # (str) the type of gradient clip method @@ -295,7 +295,7 @@ def _reshape_data( Update values and rewards with the weight Arguments: - output (:obj:`Dict[int, Any]`): Dict type data, output of learn_model forward. \ - Values are torch.Tensor or np.ndarray or dict/list combinations,keys are value, logit. + Values are torch.Tensor or np.ndarray or dict/list combinations, keys are value, logit. - data (:obj:`Dict[int, Any]`): Dict type data, input of policy._forward_learn \ Values are torch.Tensor or np.ndarray or dict/list combinations. Keys includes at \ least ['logit', 'action', 'reward', 'done',] @@ -378,7 +378,7 @@ def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Dict[str, Any]]: action, values are torch.Tensor or np.ndarray or dict/list combinations,keys \ are env_id indicated by integer. Returns: - - output (:obj:`Dict[int, Dict[str,Any]]`): Dict of predicting policy_output(logit, action) for each env. + - output (:obj:`Dict[int, Dict[str, Any]]`): Dict of predicting policy_output(logit, action) for each env. ReturnsKeys - necessary: ``logit``, ``action`` """ @@ -479,7 +479,7 @@ def _monitor_vars_learn(self) -> List[str]: Returns: - model_info (:obj:`Tuple[str, List[str]]`): model name and mode import_names .. note:: - The user can define and use customized network model but must obey the same interface definition indicated \ - by import_names path. For IMPALA, ``ding.model.interface.IMPALA`` + The user can define and use a customized network model but must obey the same interface definition \ + indicated by import_names path. For IMPALA, ``ding.model.interface.IMPALA`` """ return ['actor_loss', 'bc_loss', 'policy_loss', 'critic_loss', 'entropy_loss', 'kl_div'] diff --git a/ding/policy/base_policy.py b/ding/policy/base_policy.py index 3ff99c7b43..7e843f8429 100644 --- a/ding/policy/base_policy.py +++ b/ding/policy/base_policy.py @@ -196,14 +196,14 @@ def hook(*ignore): def _create_model(self, cfg: EasyDict, model: Optional[torch.nn.Module] = None) -> torch.nn.Module: """ Overview: - Create or validate the neural network model according to input configures and model. If the input model is \ - None, then the model will be created according to ``default_model`` method and ``cfg.model`` field. \ - Otherwise, the model will be verified as an instance of ``torch.nn.Module`` and set to the ``model`` \ - instance created by outside caller. + Create or validate the neural network model according to the input configuration and model. \ + If the input model is None, then the model will be created according to ``default_model`` \ + method and ``cfg.model`` field. Otherwise, the model will be verified as an instance of \ + ``torch.nn.Module`` and set to the ``model`` instance created by outside caller. Arguments: - cfg (:obj:`EasyDict`): The final merged config used to initialize policy. - model (:obj:`torch.nn.Module`): The neural network model used to initialize policy. User can refer to \ - the default model defined in corresponding policy to customize its own model. + the default model defined in the corresponding policy to customize its own model. Returns: - model (:obj:`torch.nn.Module`): The created neural network model. The different modes of policy will \ add distinct wrappers and plugins to the model, which is used to train, collect and evaluate. @@ -272,7 +272,7 @@ def _init_eval(self) -> None: Overview: Initialize the eval mode of policy, including related attributes and modules. 
This method will be \ called in ``__init__`` method if ``eval`` field is in ``enable_field``. Almost different policies have \ - its own eval mode, so this method must be overrided in subclass. + its own eval mode, so this method must be override in subclass. .. note:: For the member variables that need to be saved and loaded, please refer to the ``_state_dict_eval`` \ @@ -289,7 +289,7 @@ def learn_mode(self) -> 'Policy.learn_function': # noqa """ Overview: Return the interfaces of learn mode of policy, which is used to train the model. Here we use namedtuple \ - to define immutable interfaces and restrict the usage of policy in different mode. Moreover, derived \ + to define immutable interfaces and restrict the usage of policy in different modes. Moreover, derived \ subclass can override the interfaces to customize its own learn mode. Returns: - interfaces (:obj:`Policy.learn_function`): The interfaces of learn mode of policy, it is a namedtuple \ @@ -316,7 +316,7 @@ def collect_mode(self) -> 'Policy.collect_function': # noqa """ Overview: Return the interfaces of collect mode of policy, which is used to train the model. Here we use namedtuple \ - to define immutable interfaces and restrict the usage of policy in different mode. Moreover, derived \ + to define immutable interfaces and restrict the usage of policy in different modes. Moreover, derived \ subclass can override the interfaces to customize its own collect mode. Returns: - interfaces (:obj:`Policy.collect_function`): The interfaces of collect mode of policy, it is a \ @@ -370,7 +370,7 @@ def _set_attribute(self, name: str, value: Any) -> None: Overview: In order to control the access of the policy attributes, we expose different modes to outside rather than \ directly use the policy instance. And we also provide a method to set the attribute of the policy in \ - different modes. And the new attribute will named as ``_{name}``. + different modes. And the new attribute will name as ``_{name}``. Arguments: - name (:obj:`str`): The name of the attribute. - value (:obj:`Any`): The value of the attribute. @@ -416,7 +416,7 @@ def sync_gradients(self, model: torch.nn.Module) -> None: - model (:obj:`torch.nn.Module`): The model to synchronize gradients. .. note:: - This method is only used in multi-gpu training, and it shoule be called after ``backward`` method and \ + This method is only used in multi-gpu training, and it should be called after ``backward`` method and \ before ``step`` method. The user can also use ``bp_update_sync`` config to control whether to synchronize \ gradients allreduce and optimizer updates. 
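# A hedged sketch of the gradient synchronization pattern that ``sync_gradients`` refers to:
# in multi-GPU training, gradients are all-reduced after ``backward`` and before the optimizer
# ``step``. This is plain torch.distributed usage for illustration, not the DI-engine internals.
import torch
import torch.distributed as dist


def sync_gradients_sketch(model: torch.nn.Module) -> None:
    world_size = dist.get_world_size()
    for param in model.parameters():
        if param.grad is not None:
            # Sum gradients across all processes, then average them.
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data.div_(world_size)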
""" diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py index 9a8388b00f..2f0643e612 100755 --- a/ding/policy/bcq.py +++ b/ding/policy/bcq.py @@ -1,15 +1,14 @@ -from typing import List, Dict, Any, Tuple, Union -from collections import namedtuple import copy -import numpy as np +from collections import namedtuple +from typing import List, Dict, Any, Tuple + import torch -import torch.nn as nn import torch.nn.functional as F -from ding.torch_utils import Adam, to_device -from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, get_nstep_return_data from ding.model import model_wrap from ding.policy import Policy +from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, get_nstep_return_data +from ding.torch_utils import Adam, to_device from ding.utils import POLICY_REGISTRY from ding.utils.data import default_collate, default_decollate from .common_utils import default_preprocess_learn @@ -17,94 +16,103 @@ @POLICY_REGISTRY.register('bcq') class BCQPolicy(Policy): + """ + Overview: + Policy class of BCQ (Batch-Constrained deep Q-learning) algorithm, proposed in \ + https://arxiv.org/abs/1812.02900. + """ + config = dict( + # (str) Name of the registered RL policy (refer to the "register_policy" function). type='bcq', - # (bool) Whether to use cuda for network. + # (bool) Indicates if CUDA should be used for network operations. cuda=False, - # (bool type) priority: Determine whether to use priority in buffer sample. - # Default False in SAC. + # (bool) Determines whether priority sampling is used in the replay buffer. Default is False. priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) If True, Importance Sampling Weight is used to correct updates. Requires 'priority' to be True. priority_IS_weight=False, - # (int) Number of training samples(randomly collected) in replay buffer when training starts. - # Default 10000 in SAC. + # (int) Number of random samples in replay buffer before training begins. Default is 10000. random_collect_size=10000, + # (int) The number of steps for calculating target q_value. nstep=1, model=dict( - # (List) Hidden list for actor network head. + # (List[int]) Sizes of the hidden layers in the actor network. actor_head_hidden_size=[400, 300], - - # (List) Hidden list for critic network head. + # (List[int]) Sizes of the hidden layers in the critic network. critic_head_hidden_size=[400, 300], - # Max perturbation hyper-parameter for BCQ + # (float) Maximum perturbation for BCQ. Controls exploration in action space. phi=0.05, ), learn=dict( - - # How many updates(iterations) to train after collector's one collection. - # Bigger "update_per_collect" means bigger off-policy. - # collect data -> update policy-> collect data -> ... + # (int) Number of policy updates per data collection step. Higher values indicate more off-policy training. update_per_collect=1, - # (int) Minibatch size for gradient descent. + # (int) Batch size for each gradient descent step. batch_size=100, - - # (float type) learning_rate_q: Learning rate for soft q network. - # Default to 3e-4. - # Please set to 1e-3, when model.value_network is True. + # (float) Learning rate for the Q-network. Set to 1e-3 if `model.value_network` is True. learning_rate_q=3e-4, - # (float type) learning_rate_policy: Learning rate for policy network. - # Default to 3e-4. - # Please set to 1e-3, when model.value_network is True. + # (float) Learning rate for the policy network. 
Set to 1e-3 if `model.value_network` is True. learning_rate_policy=3e-4, - # (float type) learning_rate_vae: Learning rate for vae network. - # `learning_rate_value` should be initialized, when model.vae_network is True. - # Please set to 3e-4, when model.vae_network is True. + # (float) Learning rate for the VAE network. Initialize if `model.vae_network` is True. learning_rate_vae=3e-4, - # (bool) Whether ignore done(usually for max step termination env. e.g. pendulum) - # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. - # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. - # However, interaction with HalfCheetah always gets done with done is False, - # Since we inplace done==True with done==False to keep - # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), - # when the episode step is greater than max episode step. + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. ignore_done=False, - - # (float type) target_theta: Used for soft update of the target network, - # aka. Interpolation factor in polyak averaging for target networks. - # Default to 0.005. + # (float) Polyak averaging coefficient for the target network update. Typically small. target_theta=0.005, - # (float) discount factor for the discounted sum of rewards, aka. gamma. + # (float) Discount factor for future rewards, often denoted as gamma. discount_factor=0.99, + # (float) Lambda for TD(lambda) learning. Weighs the trade-off between bias and variance. lmbda=0.75, - - # (float) Weight uniform initialization range in the last output layer + # (float) Range for uniform weight initialization in the output layer. init_w=3e-3, ), collect=dict( - # (int) Cut trajectories into pieces with length "unroll_len". + # (int) Length of trajectory segments for unrolling. Set to higher for longer dependencies. unroll_len=1, ), eval=dict(), other=dict( replay_buffer=dict( - # (int type) replay_buffer_size: Max size of replay buffer. + # (int) Maximum size of the replay buffer. replay_buffer_size=1000000, - # (int type) max_use: Max use times of one data in the buffer. - # Data will be removed once used for too many times. - # Default to infinite. - # max_use=256, ), ), ) def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Returns the default model configuration used by the BCQ algorithm. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): \ + Tuple containing the registered model name and model's import_names. + """ return 'bcq', ['ding.model.template.bcq'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init q, value and policy's optimizers, algorithm config, main and target models. 
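# A minimal sketch of the soft (Polyak) target-network update that the `target_theta`
# setting above controls: target <- (1 - theta) * target + theta * online.
# DI-engine wraps this behavior in its target model wrapper; this standalone version is
# for illustration only.
import torch


def soft_update_sketch(target: torch.nn.Module, online: torch.nn.Module, theta: float = 0.005) -> None:
    with torch.no_grad():
        for t_param, o_param in zip(target.parameters(), online.parameters()):
            t_param.data.mul_(1.0 - theta).add_(theta * o_param.data)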
+ Initialize the learn mode of policy, including related attributes and modules. For BCQ, it mainly \ + contains optimizer, algorithm-specific arguments such as gamma, main and target model. \ + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ # Init self._priority = self._cfg.priority @@ -140,12 +148,35 @@ def _init_learn(self) -> None: self._learn_model = model_wrap(self._model, wrapper_name='base') self._learn_model.reset() self._target_model.reset() - self._forward_learn_cnt = 0 - def _forward_learn(self, data: dict) -> Dict[str, Any]: + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Overview: + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as policy_loss, value_loss, entropy_loss. + Arguments: + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For BCQ, each element in list is a dict containing at least the following keys: \ + ['obs', 'action', 'adv', 'value', 'weight']. + Returns: + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement your own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. 
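# An illustrative sketch, under the standard BCQ formulation rather than copied from this file,
# of how the `lmbda` weight and the 10 repeated candidate actions per next state are typically
# turned into a target value: a soft-clipped double-Q estimate per candidate, then a max over
# candidates, then the usual TD target.
import torch


def bcq_target_sketch(q1, q2, reward, done, lmbda=0.75, gamma=0.99, num_candidates=10):
    # q1, q2: (batch * num_candidates,) Q-values of candidate actions from the two critics.
    soft_min = lmbda * torch.min(q1, q2) + (1.0 - lmbda) * torch.max(q1, q2)
    # Pick the best candidate action for each next state (candidates are grouped consecutively).
    best_next_value = soft_min.reshape(-1, num_candidates).max(dim=1).values
    return reward + gamma * (1.0 - done) * best_next_value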
+ """ loss_dict = {} - + # Data preprocessing operations, such as stack data, cpu to cuda device data = default_preprocess_learn( data, use_priority=self._priority, @@ -183,7 +214,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # train_critic q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] - with torch.no_grad(): + with (torch.no_grad()): next_obs_rep = torch.repeat_interleave(next_obs, 10, 0) z = torch.randn((next_obs_rep.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) vae_action = self._model.vae.decode_with_obs(z, next_obs_rep)['reconstruction_action'] @@ -230,12 +261,25 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: } def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ return [ 'td_error', 'target_q_value', 'critic_loss', 'twin_critic_loss', 'actor_loss', 'recons_loss', 'kld_loss', 'vae_loss' ] def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. + Returns: + - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring. + """ ret = { 'model': self._learn_model.state_dict(), 'target_model': self._target_model.state_dict(), @@ -245,11 +289,38 @@ def _state_dict_learn(self) -> Dict[str, Any]: } return ret - def _init_eval(self): + def _init_eval(self) -> None: + """ + Overview: + Initialize the eval mode of policy, including related attributes and modules. + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. + """ self._eval_model = model_wrap(self._model, wrapper_name='base') self._eval_model.reset() - def _forward_eval(self, data: dict) -> Dict[str, Any]: + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. + Arguments: + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. + Returns: + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e., environment id. + + .. note:: + The input value can be ``torch.Tensor`` or dict/list combinations, current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not \ + support it. You can implement your own model rather than use the default model. For more information, \ + please raise an issue in GitHub repo, and we will continue to follow up. 
+ """ data_id = list(data.keys()) data = default_collate(list(data.values())) if self._cuda: @@ -264,26 +335,28 @@ def _forward_eval(self, data: dict) -> Dict[str, Any]: return {i: d for i, d in zip(data_id, output)} def _init_collect(self) -> None: + """ + Overview: + Initialize the collect mode of policy, including related attributes and modules. For BCQ, it contains the \ + collect_model to balance the exploration and exploitation with ``eps_greedy_sample`` \ + mechanism, and other algorithm-specific arguments such as gamma and nstep. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + """ self._unroll_len = self._cfg.collect.unroll_len - self._gamma = self._cfg.discount_factor # necessary for parallel - self._nstep = self._cfg.nstep # necessary for parallel + self._gamma = self._cfg.discount_factor + self._nstep = self._cfg.nstep self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') self._collect_model.reset() + def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + pass + def _forward_collect(self, data: dict, **kwargs) -> dict: pass def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: pass - - def _get_train_sample(self, data: list) -> Union[None, List[Any]]: - r""" - Overview: - Get the trajectory and the n step return data, then sample from the n_step return data - Arguments: - - data (:obj:`list`): The trajectory's cache - Returns: - - samples (:obj:`dict`): The training samples generated - """ - data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) - return get_train_sample(data, self._unroll_len) diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 2e253370b8..6f62c59795 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -68,7 +68,7 @@ class DDPGPolicy(Policy): on_policy=False, # (bool) Whether to enable priority experience sample. priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Whether to use Importance Sampling Weight to correct biased update. If True, priority must be True. priority_IS_weight=False, # (int) Number of training samples(randomly collected) in replay buffer when training starts. # Default 25000 in DDPG/TD3. @@ -411,7 +411,7 @@ def _forward_collect(self, data: Dict[int, Any], **kwargs) -> Dict[int, Any]: Returns: - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ other necessary data for learn mode defined in ``self._process_transition`` method. The key of the \ - dict is the same as the input data, i.e. environment id. + dict is the same as the input data, i.e., environment id. .. note:: The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index d1f6fdbb49..8e0944f270 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -95,7 +95,7 @@ class DQNPolicy(Policy): priority_IS_weight=False, # (float) Discount factor(gamma) for returns. discount_factor=0.97, - # (int) The number of step for calculating target q_value. + # (int) The number of steps for calculating target q_value. 
nstep=1, model=dict( # (list(int)) Sequence of ``hidden_size`` of subsequent conv layers and the final dense layer. @@ -111,31 +111,31 @@ class DQNPolicy(Policy): batch_size=64, # (float) The step size of gradient descent. learning_rate=0.001, - # (int) Frequence of target network update. + # (int) Frequency of target network update. # Only one of [target_update_freq, target_theta] should be set. target_update_freq=100, - # (float) : Used for soft update of the target network. + # (float) Used for soft update of the target network. # aka. Interpolation factor in EMA update for target network. # Only one of [target_update_freq, target_theta] should be set. target_theta=0.005, - # (bool) Whether ignore done(usually for max step termination env). - # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. - # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. - # However, interaction with HalfCheetah always gets done with done is False, - # Since we inplace done==True with done==False to keep - # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), - # when the episode step is greater than max episode step. + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. ignore_done=False, ), # collect_mode config collect=dict( # (int) How many training samples collected in one collection procedure. - # Only one of [n_sample, n_episode] shoule be set. + # Only one of [n_sample, n_episode] should be set. n_sample=8, # (int) Split episodes or trajectories into pieces with length `unroll_len`. unroll_len=1, ), - eval=dict(), # for compability + eval=dict(), # for compatibility # other config other=dict( # Epsilon greedy with decay. @@ -165,7 +165,7 @@ def default_model(self) -> Tuple[str, List[str]]: - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names. .. note:: - The user can define and use customized network model but must obey the same inferface definition indicated \ + The user can define and use customized network model but must obey the same interface definition indicated \ by import_names path. For example about DQN, its registered name is ``dqn`` and the import_names is \ ``ding.model.template.q_learning``. """ @@ -242,7 +242,7 @@ def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: .. note:: The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ For the data type that not supported, the main reason is that the corresponding model does not support it. \ - You can implement you own model rather than use the default model. For more information, please raise an \ + You can implement your own model rather than use the default model. For more information, please raise an \ issue in GitHub repo and we will continue to follow up. .. 
note:: @@ -398,7 +398,7 @@ def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, For a given trajectory (transitions, a list of transition) data, process it into a list of sample that \ can be used for training directly. In DQN with nstep TD, a train sample is a processed transition. \ This method is usually used in collectors to execute necessary \ - RL data preprocessing before training, which can help learner amortize revelant time consumption. \ + RL data preprocessing before training, which can help learner amortize relevant time consumption. \ In addition, you can also implement this method as an identity function and do the data processing \ in ``self._forward_learn`` method. Arguments: diff --git a/ding/policy/fqf.py b/ding/policy/fqf.py index f1ba86fd91..fae697f85d 100644 --- a/ding/policy/fqf.py +++ b/ding/policy/fqf.py @@ -1,22 +1,34 @@ -from typing import List, Dict, Any, Tuple, Union import copy +from typing import List, Dict, Any, Tuple + import torch -from ding.torch_utils import Adam, RMSprop, to_device -from ding.rl_utils import fqf_nstep_td_data, fqf_nstep_td_error, fqf_calculate_fraction_loss, \ - get_train_sample, get_nstep_return_data from ding.model import model_wrap +from ding.rl_utils import fqf_nstep_td_data, fqf_nstep_td_error, fqf_calculate_fraction_loss +from ding.torch_utils import Adam, RMSprop, to_device from ding.utils import POLICY_REGISTRY -from ding.utils.data import default_collate, default_decollate -from .dqn import DQNPolicy from .common_utils import default_preprocess_learn +from .dqn import DQNPolicy + + +def compute_grad_norm(model): + """ + Overview: + Compute grad norm of a network's parameters. + Arguments: + - model (:obj:`nn.Module`): The network to compute grad norm. + Returns: + - grad_norm (:obj:`torch.Tensor`): The grad norm of the network's parameters. + """ + return torch.norm(torch.stack([torch.norm(p.grad.detach(), 2.0) for p in model.parameters()]), 2.0) @POLICY_REGISTRY.register('fqf') class FQFPolicy(DQNPolicy): - r""" + """ Overview: - Policy class of FQF algorithm. + Policy class of FQF (Fully Parameterized Quantile Function) algorithm, proposed in + https://arxiv.org/pdf/1911.02140.pdf. Config: == ==================== ======== ============== ======================================== ======================= @@ -46,70 +58,100 @@ class FQFPolicy(DQNPolicy): """ config = dict( - # (str) RL policy register name (refer to function "POLICY_REGISTRY"). + # (str) Name of the RL policy registered in "POLICY_REGISTRY" function. type='fqf', - # (bool) Whether to use cuda for network. + # (bool) Flag to enable/disable CUDA for network computation. cuda=False, - # (bool) Whether the RL algorithm is on-policy or off-policy. + # (bool) Indicator of the RL algorithm's policy type (True for on-policy algorithms). on_policy=False, - # (bool) Whether use priority(priority sample, IS weight, update priority) + # (bool) Toggle for using prioritized experience replay (priority sampling and updating). priority=False, - # (float) Reward's future discount factor, aka. gamma. + # (float) Discount factor (gamma) for calculating the future reward. discount_factor=0.97, - # (int) N-step reward for target q_value estimation + # (int) Number of steps to consider for calculating n-step returns. nstep=1, learn=dict( - - # How many updates(iterations) to train after collector's one collection. - # Bigger "update_per_collect" means bigger off-policy. - # collect data -> update policy-> collect data -> ... 
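# A small illustrative sketch (not DI-engine's ``get_nstep_return_data``) of the n-step target
# that the `nstep` settings above refer to:
#   R_t = r_t + gamma * r_{t+1} + ... + gamma^{n-1} * r_{t+n-1} + gamma^n * V_target(s_{t+n}).
import torch


def nstep_target_sketch(rewards, bootstrap_value, gamma=0.97):
    # rewards: (nstep,) rewards following the current transition; bootstrap_value: target
    # network estimate at step t + nstep.
    nstep = rewards.shape[0]
    discounts = gamma ** torch.arange(nstep, dtype=rewards.dtype)
    return torch.sum(discounts * rewards) + (gamma ** nstep) * bootstrap_value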
+ # (int) Number of training iterations per data collection from the environment. update_per_collect=3, + # (int) Size of minibatch for each update. batch_size=64, + # (float) Fractional learning rate for the fraction proposal network. learning_rate_fraction=2.5e-9, + # (float) Learning rate for the quantile regression network. learning_rate_quantile=0.00005, # ============================================================== - # The following configs are algorithm-specific + # Algorithm-specific configurations # ============================================================== - # (int) Frequence of target network update. + # (int) Frequency of target network updates. target_update_freq=100, - # (float) Threshold of Huber loss. In the FQF paper, this is denoted by kappa. Default to 1.0. + # (float) Huber loss threshold (kappa in the FQF paper). kappa=1.0, - # (float) Coefficient of entropy_loss. + # (float) Coefficient for the entropy loss term. ent_coef=0, - # (bool) Whether ignore done(usually for max step termination env) + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. ignore_done=False, ), - # collect_mode config collect=dict( - # (int) Only one of [n_sample, n_step, n_episode] shoule be set + # (int) Specify one of [n_sample, n_step, n_episode] for data collection. # n_sample=8, - # (int) Cut trajectories into pieces with length "unroll_len". + # (int) Length of trajectory segments for processing. unroll_len=1, ), eval=dict(), - # other config other=dict( - # Epsilon greedy with decay. + # Epsilon-greedy strategy with a decay mechanism. eps=dict( - # (str) Decay type. Support ['exp', 'linear']. + # (str) Type of decay mechanism ['exp' for exponential, 'linear']. type='exp', + # (float) Initial value of epsilon in epsilon-greedy exploration. start=0.95, + # (float) Final value of epsilon after decay. end=0.1, - # (int) Decay length(env step) + # (int) Number of environment steps over which epsilon is decayed. decay=10000, ), - replay_buffer=dict(replay_buffer_size=10000, ) + replay_buffer=dict( + # (int) Size of the replay buffer. + replay_buffer_size=10000, + ), ), ) def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Returns the default model configuration used by the FQF algorithm. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): \ + Tuple containing the registered model name and model's import_names. + """ return 'fqf', ['ding.model.template.q_learning'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init the optimizer, algorithm config, main and target models. + Initialize the learn mode of policy, including related attributes and modules. 
For FQF, it mainly \ + contains optimizer, algorithm-specific arguments such as gamma, nstep, kappa ent_coef, main and \ + target model. This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ self._priority = self._cfg.priority # Optimizer @@ -143,15 +185,32 @@ def _init_learn(self) -> None: self._learn_model.reset() self._target_model.reset() - def _forward_learn(self, data: dict) -> Dict[str, Any]: - r""" + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as policy_loss, value_loss, entropy_loss. Arguments: - - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs'] + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For FQF, each element in list is a dict containing at least the following keys: \ + ['obs', 'action', 'reward', 'next_obs']. Returns: - - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss. + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement your own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. 
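# A hedged sketch of the quantile Huber loss that the `kappa` threshold above parameterizes
# (the rho^kappa_tau loss from the QR-DQN/FQF papers). DI-engine computes this inside
# ``fqf_nstep_td_error``; this standalone version is simplified for illustration.
import torch


def quantile_huber_loss_sketch(td_error, tau, kappa=1.0):
    # td_error: (batch, num_quantiles) TD errors; tau: (num_quantiles,) quantile fractions.
    abs_err = td_error.abs()
    huber = torch.where(abs_err <= kappa, 0.5 * td_error ** 2, kappa * (abs_err - 0.5 * kappa))
    # Asymmetric quantile weighting |tau - 1{td_error < 0}|.
    weight = torch.abs(tau.unsqueeze(0) - (td_error.detach() < 0).float())
    return (weight * huber / kappa).mean()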
""" + # Data preprocessing operations, such as stack data, cpu to cuda device data = default_preprocess_learn( data, use_priority=self._priority, ignore_done=self._cfg.learn.ignore_done, use_nstep=True ) @@ -182,19 +241,12 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: data['weight'] ) value_gamma = data.get('value_gamma') - entropy_loss = -self._ent_coef * entropies.mean() - fraction_loss = fqf_calculate_fraction_loss(q_tau_i.detach(), q_value, quantiles, data['action']) + entropy_loss - quantile_loss, td_error_per_sample = fqf_nstep_td_error( data_n, self._gamma, nstep=self._nstep, kappa=self._kappa, value_gamma=value_gamma ) - # compute grad norm of a network's parameters - def compute_grad_norm(model): - return torch.norm(torch.stack([torch.norm(p.grad.detach(), 2.0) for p in model.parameters()]), 2.0) - # ==================== # fraction_proposal network update # ==================== @@ -240,12 +292,25 @@ def compute_grad_norm(model): } def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ return [ 'cur_lr_fraction_loss', 'cur_lr_quantile_loss', 'logit', 'fraction_loss', 'quantile_loss', 'total_norm_quantiles_proposal', 'total_norm_Q', 'total_norm_fqf_fc', 'total_norm_encoder' ] def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. + Returns: + - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring. + """ return { 'model': self._learn_model.state_dict(), 'target_model': self._target_model.state_dict(), @@ -254,6 +319,17 @@ def _state_dict_learn(self) -> Dict[str, Any]: } def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: + """ + Overview: + Load the state_dict variable into policy learn mode. + Arguments: + - state_dict (:obj:`Dict[str, Any]`): The dict of policy learn state saved before. + + .. tip:: + If you want to only load some parts of model, you can simply set the ``strict`` argument in \ + load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ + complicated operation. + """ self._learn_model.load_state_dict(state_dict['model']) self._target_model.load_state_dict(state_dict['target_model']) self._fraction_loss_optimizer.load_state_dict(state_dict['optimizer_fraction_loss']) diff --git a/ding/policy/ibc.py b/ding/policy/ibc.py index b39e14f53a..887edf298d 100644 --- a/ding/policy/ibc.py +++ b/ding/policy/ibc.py @@ -20,39 +20,89 @@ class IBCPolicy(BehaviourCloningPolicy): r""" Overview: - Implicit Behavior Cloning - https://arxiv.org/abs/2109.00137.pdf + Policy class of IBC (Implicit Behavior Cloning), proposed in https://arxiv.org/abs/2109.00137.pdf. .. note:: - The code is adapted from the pytorch version of IBC https://github.com/kevinzakka/ibc, - which only supports the derivative-free optimization (dfo) variants. - This implementation moves a step forward and supports all variants of energy-based model - mentioned in the paper (dfo, autoregressive dfo, and mcmc). + The code is adapted from the pytorch version of IBC https://github.com/kevinzakka/ibc, which only supports the \ + derivative-free optimization (dfo) variants. 
This implementation moves a step forward and supports all \ + variants of the energy-based model mentioned in the paper (dfo, autoregressive dfo, and mcmc). """ config = dict( + # (str) The policy type. 'ibc' refers to Implicit Behavior Cloning. type='ibc', + # (bool) Whether to use CUDA for training. False means CPU will be used. cuda=False, + # (bool) If True, the policy will operate on-policy. Here it's False, indicating off-policy. on_policy=False, + # (bool) Whether the action space is continuous. True for continuous action space. continuous=True, - model=dict(stochastic_optim=dict(type='mcmc', )), + # (dict) Configuration for the model, including stochastic optimization settings. + model=dict( + # (dict) Configuration for the stochastic optimization, specifying the type of optimizer. + stochastic_optim=dict( + # (str) The type of stochastic optimizer. 'mcmc' refers to Markov Chain Monte Carlo methods. + type='mcmc', + ), + ), + # (dict) Configuration for the learning process. learn=dict( + # (int) The number of training epochs. train_epoch=30, + # (int) The size of batches used during training. batch_size=256, + # (dict) Configuration for the optimizer used during training. optim=dict( + # (float) The learning rate for the optimizer. learning_rate=1e-5, + # (float) The weight decay regularization term for the optimizer. weight_decay=0.0, + # (float) The beta1 hyperparameter for the AdamW optimizer. beta1=0.9, + # (float) The beta2 hyperparameter for the AdamW optimizer. beta2=0.999, ), ), - eval=dict(evaluator=dict(eval_freq=10000, )), + # (dict) Configuration for the evaluation process. + eval=dict( + # (dict) Configuration for the evaluator. + evaluator=dict( + # (int) The frequency of evaluations during training, in terms of the number of training steps. + eval_freq=10000, + ), + ), ) def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Returns the default model configuration used by the IBC algorithm. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): \ + Tuple containing the registered model name and model's import_names. + """ return 'ebm', ['ding.model.template.ebm'] - def _init_learn(self): + def _init_learn(self) -> None: + """ + Overview: + Initialize the learn mode of policy, including related attributes and modules. For IBC, it mainly \ + contains optimizer and main model. \ + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some special member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. + """ self._timer = EasyTimer(cuda=self._cfg.cuda) self._sync_timer = EasyTimer(cuda=self._cfg.cuda) optim_cfg = self._cfg.learn.optim @@ -67,7 +117,31 @@ def _init_learn(self): self._learn_model = model_wrap(self._model, 'base') self._learn_model.reset() - def _forward_learn(self, data): + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Overview: + Policy forward function of learn mode (training policy and updating parameters).
Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as policy_loss, value_loss, entropy_loss. + Arguments: + - data (:obj:`List[Dict[str, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in the list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or their dict/list \ + combinations. In the ``_forward_learn`` method, data often needs to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For IBC, each element in the list is a dict containing at least the following keys: \ + ['obs', 'action']. + Returns: + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicates the training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not support it. \ + You can implement your own model rather than use the default model. For more information, please raise an \ + issue in the GitHub repo and we will continue to follow up. + """ with self._timer: data = default_collate(data) if self._cuda: @@ -81,7 +155,7 @@ def _forward_learn(self, data): obs, action = data['obs'], data['action'] # When action/observation space is 1, the action/observation dimension will # be squeezed in the first place, therefore unsqueeze there to make the data - # compatiable with the ibc pipeline. + # compatible with the ibc pipeline. if len(obs.shape) == 1: obs = obs.unsqueeze(-1) if len(action.shape) == 1: @@ -136,17 +210,51 @@ def _forward_learn(self, data): **loss_dict, } - def _monitor_vars_learn(self): + def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ if isinstance(self._stochastic_optimizer, MCMC): return ['total_loss', 'ebm_loss', 'grad_penalty', 'total_time', 'sync_time'] else: return ['total_loss', 'ebm_loss', 'total_time', 'sync_time'] - def _init_eval(self): + def _init_eval(self) -> None: + """ + Overview: + Initialize the eval mode of policy, including related attributes and modules. + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some special member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. + """ self._eval_model = model_wrap(self._model, wrapper_name='base') self._eval_model.reset() - def _forward_eval(self, data: dict) -> dict: + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Policy forward function of eval mode (evaluating policy performance by interacting with envs).
Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. + Arguments: + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is the environment id and the value is the corresponding data of the env. + Returns: + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e., the environment id. + + .. note:: + The input value can be ``torch.Tensor`` or dict/list combinations, and the current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not \ + support it. You can implement your own model rather than use the default model. For more information, \ + please raise an issue in the GitHub repo, and we will continue to follow up. + """ tensor_input = isinstance(data, torch.Tensor) if not tensor_input: data_id = list(data.keys()) @@ -168,6 +276,13 @@ def _forward_eval(self, data: dict) -> dict: return {i: d for i, d in zip(data_id, output)} def set_statistic(self, statistics: EasyDict) -> None: + """ + Overview: + Set the statistics of the environment, including the action space and the observation space. + Arguments: + - statistics (:obj:`EasyDict`): The statistics of the environment. For IBC, it contains at least the \ + following keys: ['action_bounds']. + """ self._stochastic_optimizer.set_action_bounds(statistics.action_bounds) # =================================================================== #
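For reference, the ``_forward_eval`` docstring above describes an env-id-keyed I/O convention: the eval-mode policy receives a dict mapping environment ids to observations and returns a dict with the same keys mapping to outputs such as the action. The snippet below is a minimal, self-contained sketch of that collate/decollate pattern; ``DummyEvalPolicy`` is a hypothetical stand-in, not DI-engine code, and it only illustrates the convention rather than the actual IBC inference path.

from typing import Any, Dict

import torch


class DummyEvalPolicy:
    """Toy stand-in (not DI-engine code) mimicking the env-id-keyed I/O of ``_forward_eval``."""

    def __init__(self, action_dim: int = 2) -> None:
        self.action_dim = action_dim

    def forward(self, data: Dict[int, torch.Tensor]) -> Dict[int, Dict[str, Any]]:
        # Collate: stack the per-env observations into one batch, remembering the env ids.
        data_id = list(data.keys())
        obs = torch.stack([data[i] for i in data_id])
        # Placeholder "inference"; a real policy would run its model (e.g. the EBM) here.
        action = torch.zeros(obs.shape[0], self.action_dim)
        # Decollate: return one output dict per environment id, with the same keys as the input.
        return {i: {'action': a} for i, a in zip(data_id, action)}


if __name__ == '__main__':
    policy = DummyEvalPolicy()
    out = policy.forward({0: torch.randn(4), 3: torch.randn(4)})
    assert set(out.keys()) == {0, 3} and out[0]['action'].shape == (2, )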