diff --git a/ding/example/bcq.py b/ding/example/bcq.py index 4bd1385c3f..d0114d120e 100755 --- a/ding/example/bcq.py +++ b/ding/example/bcq.py @@ -15,7 +15,7 @@ def main(): # If you don't have offline data, you need to prepare if first and set the data_path in config - # For demostration, we also can train a RL policy (e.g. SAC) and collect some data + # For demonstration, we also can train a RL policy (e.g. SAC) and collect some data logging.getLogger().setLevel(logging.INFO) cfg = compile_config(main_config, create_cfg=create_config, auto=True) ding_init(cfg) diff --git a/ding/policy/a2c.py b/ding/policy/a2c.py index 6e05f4e712..2d2f116afc 100644 --- a/ding/policy/a2c.py +++ b/ding/policy/a2c.py @@ -1,11 +1,12 @@ -from typing import List, Dict, Any, Tuple, Union from collections import namedtuple +from typing import List, Dict, Any, Tuple + import torch +from ding.model import model_wrap from ding.rl_utils import a2c_data, a2c_error, get_gae_with_default_last_value, get_train_sample, \ - a2c_error_continuous + a2c_error_continuous from ding.torch_utils import Adam, to_device -from ding.model import model_wrap from ding.utils import POLICY_REGISTRY, split_data_generator from ding.utils.data import default_collate, default_decollate from .base_policy import Policy @@ -14,68 +15,95 @@ @POLICY_REGISTRY.register('a2c') class A2CPolicy(Policy): - r""" + """ Overview: - Policy class of A2C algorithm. + Policy class of A2C (Advantage Actor-Critic) algorithm, proposed in https://arxiv.org/abs/1602.01783. """ config = dict( - # (string) RL policy register name (refer to function "register_policy"). + # (str) Name of the registered RL policy (refer to the "register_policy" function). type='a2c', - # (bool) Whether to use cuda for network. + # (bool) Flag to enable CUDA for model computation. cuda=False, - # (bool) whether use on-policy training pipeline(behaviour policy and training policy are the same) - on_policy=True, # for a2c strictly on policy algorithm, this line should not be seen by users + # (bool) Flag for using on-policy training (training policy is the same as the behavior policy). + on_policy=True, + # (bool) Flag for enabling priority experience replay. Must be False when priority_IS_weight is False. priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Flag for using Importance Sampling weights to correct updates. Requires `priority` to be True. priority_IS_weight=False, - # (str) Which kind of action space used in PPOPolicy, ['discrete', 'continuous'] + # (str) Type of action space used in the policy, with valid options ['discrete', 'continuous']. action_space='discrete', + # learn_mode configuration learn=dict( - - # (int) for a2c, update_per_collect must be 1. - update_per_collect=1, # fixed value, this line should not be modified by users + # (int) Number of updates per data collection. A2C requires this to be set to 1. + update_per_collect=1, + # (int) Batch size for learning. batch_size=64, + # (float) Learning rate for optimizer. learning_rate=0.001, - # (List[float]) + # (Tuple[float, float]) Coefficients used for computing running averages of gradient and its square. betas=(0.9, 0.999), - # (float) + # (float) Term added to the denominator to improve numerical stability in optimizer. eps=1e-8, - # (float) + # (float) Maximum norm for gradients. 
grad_norm=0.5, - # ============================================================== - # The following configs is algorithm-specific - # ============================================================== - # (float) loss weight of the value network, the weight of policy network is set to 1 + # (float) Scaling factor for value network loss relative to policy network loss. value_weight=0.5, - # (float) loss weight of the entropy regularization, the weight of policy network is set to 1 + # (float) Weight of entropy regularization in the loss function. entropy_weight=0.01, - # (bool) Whether to normalize advantage. Default to False. + # (bool) Flag to enable normalization of advantages. adv_norm=False, + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. ignore_done=False, ), + # collect_mode configuration collect=dict( - # (int) collect n_sample data, train model n_iteration times - # n_sample=80, + # (int) The length of rollout for data collection. unroll_len=1, - # ============================================================== - # The following configs is algorithm-specific - # ============================================================== - # (float) discount factor for future reward, defaults int [0, 1] + # (float) Discount factor for calculating future rewards, typically in the range [0, 1]. discount_factor=0.9, - # (float) the trade-off factor lambda to balance 1step td and mc + # (float) Trade-off parameter for balancing TD-error and Monte Carlo error in GAE. gae_lambda=0.95, ), + # eval_mode configuration (kept empty for compatibility purposes) eval=dict(), ) def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Returns the default model configuration used by the A2C algorithm. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): \ + Tuple containing the registered model name and model's import_names. + """ return 'vac', ['ding.model.template.vac'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init the optimizer, algorithm config, main and target models. + Initialize the learn mode of policy, including related attributes and modules. For A2C, it mainly \ + contains optimizer, algorithm-specific arguments such as value_weight, entropy_weight, adv_norm + and grad_norm, and main model. \ + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. 
note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ assert self._cfg.action_space in ["continuous", "discrete"] # Optimizer @@ -98,15 +126,32 @@ def _init_learn(self) -> None: self._learn_model = model_wrap(self._model, wrapper_name='base') self._learn_model.reset() - def _forward_learn(self, data: dict) -> Dict[str, Any]: - r""" + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as policy_loss, value_loss, entropy_loss. Arguments: - - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs','adv'] + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in the list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For A2C, each element in the list is a dict containing at least the following keys: \ + ['obs', 'action', 'adv', 'value', 'weight']. Returns: - - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss. + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not support \ + it. You can implement your own model rather than use the default model. For more information, please \ + raise an issue in GitHub repo, and we will continue to follow up. """ + # Data preprocessing operations, such as stack data, cpu to cuda device data = default_preprocess_learn(data, ignore_done=self._cfg.learn.ignore_done, use_nstep=False) if self._cuda: data = to_device(data, self._device) @@ -135,7 +180,6 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # ==================== # A2C-learning update # ==================== - self._optimizer.zero_grad() total_loss.backward() @@ -160,22 +204,44 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: } def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. + Returns: + - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring. + """ return { 'model': self._learn_model.state_dict(), 'optimizer': self._optimizer.state_dict(), } def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: + """ + Overview: + Load the state_dict variable into policy learn mode. + Arguments: + - state_dict (:obj:`Dict[str, Any]`): The dict of policy learn state saved before. 
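# A minimal, self-contained sketch (not the DI-engine implementation) of how the A2C loss
# terms described in the config and ``_forward_learn`` docstring above are typically combined:
# a policy-gradient term weighted by (optionally normalized) advantages, a value regression
# term scaled by `value_weight`, and an entropy bonus scaled by `entropy_weight`.
# All tensor names here are illustrative assumptions, not names from this file.
import torch
import torch.nn.functional as F


def a2c_loss_sketch(logit, action, adv, value, return_, value_weight=0.5, entropy_weight=0.01, adv_norm=False):
    if adv_norm:
        # Normalize advantages across the batch for more stable policy-gradient updates.
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)
    dist = torch.distributions.Categorical(logits=logit)
    log_prob = dist.log_prob(action)
    policy_loss = -(log_prob * adv.detach()).mean()
    value_loss = F.mse_loss(value, return_)
    entropy = dist.entropy().mean()
    # Total objective: policy loss plus weighted value loss minus weighted entropy bonus.
    return policy_loss + value_weight * value_loss - entropy_weight * entropy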
+ + .. tip:: + If you want to only load some parts of model, you can simply set the ``strict`` argument in \ + load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ + complicated operation. + """ self._learn_model.load_state_dict(state_dict['model']) self._optimizer.load_state_dict(state_dict['optimizer']) def _init_collect(self) -> None: - r""" - Overview: - Collect mode init method. Called by ``self.__init__``. - Init traj and unroll length, collect model. """ + Overview: + Initialize the collect mode of policy, including related attributes and modules. For A2C, it contains the \ + collect_model to balance the exploration and exploitation with ``reparam_sample`` or \ + ``multinomial_sample`` mechanism, and other algorithm-specific arguments such as gamma and gae_lambda. \ + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + """ assert self._cfg.action_space in ["continuous", "discrete"] self._unroll_len = self._cfg.collect.unroll_len @@ -189,17 +255,19 @@ def _init_collect(self) -> None: self._gamma = self._cfg.collect.discount_factor self._gae_lambda = self._cfg.collect.gae_lambda - def _forward_collect(self, data: dict) -> dict: - r""" + def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ Overview: - Forward function of collect mode. + Policy forward function of collect mode (collecting training data by interacting with envs). Forward means \ + that the policy gets some necessary data (mainly observation) from the envs and then returns the output \ + data, such as the action to interact with the envs. Arguments: - - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ - values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. Returns: - - output (:obj:`Dict[int, Any]`): Dict type data, including at least inferred action according to input obs. - ReturnsKeys - - necessary: ``action`` + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ + other necessary data for learn mode defined in ``self._process_transition`` method. The key of the \ + dict is the same as the input data, i.e. environment id. """ data_id = list(data.keys()) data = default_collate(list(data.values())) @@ -213,51 +281,68 @@ def _forward_collect(self, data: dict) -> dict: output = default_decollate(output) return {i: d for i, d in zip(data_id, output)} - def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: - r""" + def _process_transition(self, obs: Any, policy_output: Dict[str, torch.Tensor], + timestep: namedtuple) -> Dict[str, torch.Tensor]: + """ Overview: - Generate dict type transition data from inputs. + Process and pack one timestep transition data into a dict, which can be directly used for training and \ + saved in replay buffer. For A2C, it contains obs, next_obs, action, value, reward, done. 
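# A short illustrative sketch (not DI-engine's ``get_gae_with_default_last_value``) of the
# Generalized Advantage Estimation recursion that the `discount_factor` (gamma) and
# `gae_lambda` collect settings above parameterize:
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),  adv_t = delta_t + gamma * lambda * adv_{t+1}.
import torch


def gae_sketch(rewards, values, next_values, dones, gamma=0.9, gae_lambda=0.95):
    # rewards, values, next_values, dones: 1D tensors of length T (one trajectory).
    adv = torch.zeros_like(rewards)
    last_adv = 0.0
    for t in reversed(range(rewards.shape[0])):
        not_done = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_values[t] * not_done - values[t]
        last_adv = delta + gamma * gae_lambda * not_done * last_adv
        adv[t] = last_adv
    return adv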
Arguments: - - obs (:obj:`Any`): Env observation - - model_output (:obj:`dict`): Output of collect model, including at least ['action'] - - timestep (:obj:`namedtuple`): Output after env step, including at least ['obs', 'reward', 'done'] \ - (here 'obs' indicates obs after env step). + - obs (:obj:`torch.Tensor`): The env observation of current timestep, such as stacked 2D image in Atari. + - policy_output (:obj:`Dict[str, torch.Tensor]`): The output of the policy network with the observation \ + as input. For A2C, it contains the action and the value of the state. + - timestep (:obj:`namedtuple`): The execution result namedtuple returned by the environment step method, \ + except all the elements have been transformed into tensor data. Usually, it contains the next obs, \ + reward, done, info, etc. Returns: - - transition (:obj:`dict`): Dict type transition data. + - transition (:obj:`Dict[str, torch.Tensor]`): The processed transition data of the current timestep. """ transition = { 'obs': obs, 'next_obs': timestep.obs, - 'action': model_output['action'], - 'value': model_output['value'], + 'action': policy_output['action'], + 'value': policy_output['value'], 'reward': timestep.reward, 'done': timestep.done, } return transition - def _get_train_sample(self, data: list) -> Union[None, List[Any]]: - r""" + def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ Overview: - Get the trajectory and the n step return data, then sample from the n_step return data + For a given trajectory (transitions, a list of transition) data, process it into a list of sample that \ + can be used for training directly. In A2C, a train sample is a processed transition. \ + This method is usually used in collectors to execute necessary \ + RL data preprocessing before training, which can help the learner amortize relevant time consumption. \ + In addition, you can also implement this method as an identity function and do the data processing \ + in ``self._forward_learn`` method. Arguments: - - data (:obj:`list`): The trajectory's buffer list + - transitions (:obj:`List[Dict[str, Any]`): The trajectory data (a list of transition), each element is \ + in the same format as the return value of ``self._process_transition`` method. Returns: - - samples (:obj:`dict`): The training samples generated + - samples (:obj:`List[Dict[str, Any]]`): The processed train samples, each element is similar in format \ + to input transitions, but may contain more data for training, such as advantages. """ - data = get_gae_with_default_last_value( - data, - data[-1]['done'], + transitions = get_gae_with_default_last_value( + transitions, + transitions[-1]['done'], gamma=self._gamma, gae_lambda=self._gae_lambda, cuda=self._cuda, ) - return get_train_sample(data, self._unroll_len) + return get_train_sample(transitions, self._unroll_len) def _init_eval(self) -> None: - r""" + """ Overview: - Evaluate mode init method. Called by ``self.__init__``. - Init eval model with argmax strategy. + Initialize the eval mode of policy, including related attributes and modules. For A2C, it contains the \ + eval model to greedily select action with ``argmax_sample`` mechanism (For discrete action space) and \ + ``deterministic_sample`` mechanism (For continuous action space). \ + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. 
note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. """ assert self._cfg.action_space in ["continuous", "discrete"] self._action_space = self._cfg.action_space @@ -267,17 +352,24 @@ def _init_eval(self) -> None: self._eval_model = model_wrap(self._model, wrapper_name='argmax_sample') self._eval_model.reset() - def _forward_eval(self, data: dict) -> dict: - r""" + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ Overview: - Forward function of eval mode, similar to ``self._forward_collect``. + Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. Arguments: - - data (:obj:`Dict[str, Any]`): Dict type data, stacked env data for predicting policy_output(action), \ - values are torch.Tensor or np.ndarray or dict/list combinations, keys are env_id indicated by integer. + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. Returns: - - output (:obj:`Dict[int, Any]`): The dict of predicting action for the interaction with env. - ReturnsKeys - - necessary: ``action`` + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e., environment id. + + .. note:: + The input value can be ``torch.Tensor`` or dict/list combinations, current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not \ + support it. You can implement your own model rather than use the default model. For more information, \ + please raise an issue in GitHub repo, and we will continue to follow up. """ data_id = list(data.keys()) data = default_collate(list(data.values())) @@ -292,4 +384,11 @@ def _forward_eval(self, data: dict) -> dict: return {i: d for i, d in zip(data_id, output)} def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ return super()._monitor_vars_learn() + ['policy_loss', 'value_loss', 'entropy_loss', 'adv_abs_max', 'grad_norm'] diff --git a/ding/policy/acer.py b/ding/policy/acer.py index 7ac4db7753..319b2fe814 100644 --- a/ding/policy/acer.py +++ b/ding/policy/acer.py @@ -47,11 +47,11 @@ class ACERPolicy(Policy): config = dict( type='acer', cuda=False, - # (bool) whether use on-policy training pipeline(behaviour policy and training policy are the same) + # (bool) whether to use on-policy training pipeline (behaviour policy and training policy are the same) # here we follow ppo serial pipeline, the original is False on_policy=False, priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Whether to use Importance Sampling Weight to correct biased update. If True, priority must be True. 
priority_IS_weight=False, learn=dict( # (str) the type of gradient clip method @@ -295,7 +295,7 @@ def _reshape_data( Update values and rewards with the weight Arguments: - output (:obj:`Dict[int, Any]`): Dict type data, output of learn_model forward. \ - Values are torch.Tensor or np.ndarray or dict/list combinations,keys are value, logit. + Values are torch.Tensor or np.ndarray or dict/list combinations, keys are value, logit. - data (:obj:`Dict[int, Any]`): Dict type data, input of policy._forward_learn \ Values are torch.Tensor or np.ndarray or dict/list combinations. Keys includes at \ least ['logit', 'action', 'reward', 'done',] @@ -378,7 +378,7 @@ def _forward_collect(self, data: Dict[int, Any]) -> Dict[int, Dict[str, Any]]: action, values are torch.Tensor or np.ndarray or dict/list combinations,keys \ are env_id indicated by integer. Returns: - - output (:obj:`Dict[int, Dict[str,Any]]`): Dict of predicting policy_output(logit, action) for each env. + - output (:obj:`Dict[int, Dict[str, Any]]`): Dict of predicting policy_output(logit, action) for each env. ReturnsKeys - necessary: ``logit``, ``action`` """ @@ -479,7 +479,7 @@ def _monitor_vars_learn(self) -> List[str]: Returns: - model_info (:obj:`Tuple[str, List[str]]`): model name and mode import_names .. note:: - The user can define and use customized network model but must obey the same interface definition indicated \ - by import_names path. For IMPALA, ``ding.model.interface.IMPALA`` + The user can define and use a customized network model but must obey the same interface definition \ + indicated by import_names path. For IMPALA, ``ding.model.interface.IMPALA`` """ return ['actor_loss', 'bc_loss', 'policy_loss', 'critic_loss', 'entropy_loss', 'kl_div'] diff --git a/ding/policy/base_policy.py b/ding/policy/base_policy.py index 3ff99c7b43..7e843f8429 100644 --- a/ding/policy/base_policy.py +++ b/ding/policy/base_policy.py @@ -196,14 +196,14 @@ def hook(*ignore): def _create_model(self, cfg: EasyDict, model: Optional[torch.nn.Module] = None) -> torch.nn.Module: """ Overview: - Create or validate the neural network model according to input configures and model. If the input model is \ - None, then the model will be created according to ``default_model`` method and ``cfg.model`` field. \ - Otherwise, the model will be verified as an instance of ``torch.nn.Module`` and set to the ``model`` \ - instance created by outside caller. + Create or validate the neural network model according to the input configuration and model. \ + If the input model is None, then the model will be created according to ``default_model`` \ + method and ``cfg.model`` field. Otherwise, the model will be verified as an instance of \ + ``torch.nn.Module`` and set to the ``model`` instance created by outside caller. Arguments: - cfg (:obj:`EasyDict`): The final merged config used to initialize policy. - model (:obj:`torch.nn.Module`): The neural network model used to initialize policy. User can refer to \ - the default model defined in corresponding policy to customize its own model. + the default model defined in the corresponding policy to customize its own model. Returns: - model (:obj:`torch.nn.Module`): The created neural network model. The different modes of policy will \ add distinct wrappers and plugins to the model, which is used to train, collect and evaluate. @@ -272,7 +272,7 @@ def _init_eval(self) -> None: Overview: Initialize the eval mode of policy, including related attributes and modules. 
This method will be \ called in ``__init__`` method if ``eval`` field is in ``enable_field``. Almost different policies have \ - its own eval mode, so this method must be overrided in subclass. + its own eval mode, so this method must be override in subclass. .. note:: For the member variables that need to be saved and loaded, please refer to the ``_state_dict_eval`` \ @@ -289,7 +289,7 @@ def learn_mode(self) -> 'Policy.learn_function': # noqa """ Overview: Return the interfaces of learn mode of policy, which is used to train the model. Here we use namedtuple \ - to define immutable interfaces and restrict the usage of policy in different mode. Moreover, derived \ + to define immutable interfaces and restrict the usage of policy in different modes. Moreover, derived \ subclass can override the interfaces to customize its own learn mode. Returns: - interfaces (:obj:`Policy.learn_function`): The interfaces of learn mode of policy, it is a namedtuple \ @@ -316,7 +316,7 @@ def collect_mode(self) -> 'Policy.collect_function': # noqa """ Overview: Return the interfaces of collect mode of policy, which is used to train the model. Here we use namedtuple \ - to define immutable interfaces and restrict the usage of policy in different mode. Moreover, derived \ + to define immutable interfaces and restrict the usage of policy in different modes. Moreover, derived \ subclass can override the interfaces to customize its own collect mode. Returns: - interfaces (:obj:`Policy.collect_function`): The interfaces of collect mode of policy, it is a \ @@ -370,7 +370,7 @@ def _set_attribute(self, name: str, value: Any) -> None: Overview: In order to control the access of the policy attributes, we expose different modes to outside rather than \ directly use the policy instance. And we also provide a method to set the attribute of the policy in \ - different modes. And the new attribute will named as ``_{name}``. + different modes. And the new attribute will name as ``_{name}``. Arguments: - name (:obj:`str`): The name of the attribute. - value (:obj:`Any`): The value of the attribute. @@ -416,7 +416,7 @@ def sync_gradients(self, model: torch.nn.Module) -> None: - model (:obj:`torch.nn.Module`): The model to synchronize gradients. .. note:: - This method is only used in multi-gpu training, and it shoule be called after ``backward`` method and \ + This method is only used in multi-gpu training, and it should be called after ``backward`` method and \ before ``step`` method. The user can also use ``bp_update_sync`` config to control whether to synchronize \ gradients allreduce and optimizer updates. 
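# A hedged sketch of the gradient synchronization pattern that ``sync_gradients`` refers to:
# in multi-GPU training, gradients are all-reduced after ``backward`` and before the optimizer
# ``step``. This is plain torch.distributed usage for illustration, not the DI-engine internals.
import torch
import torch.distributed as dist


def sync_gradients_sketch(model: torch.nn.Module) -> None:
    world_size = dist.get_world_size()
    for param in model.parameters():
        if param.grad is not None:
            # Sum gradients across all processes, then average them.
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data.div_(world_size)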
""" diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py index 9a8388b00f..2f0643e612 100755 --- a/ding/policy/bcq.py +++ b/ding/policy/bcq.py @@ -1,15 +1,14 @@ -from typing import List, Dict, Any, Tuple, Union -from collections import namedtuple import copy -import numpy as np +from collections import namedtuple +from typing import List, Dict, Any, Tuple + import torch -import torch.nn as nn import torch.nn.functional as F -from ding.torch_utils import Adam, to_device -from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, get_nstep_return_data from ding.model import model_wrap from ding.policy import Policy +from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, get_nstep_return_data +from ding.torch_utils import Adam, to_device from ding.utils import POLICY_REGISTRY from ding.utils.data import default_collate, default_decollate from .common_utils import default_preprocess_learn @@ -17,94 +16,103 @@ @POLICY_REGISTRY.register('bcq') class BCQPolicy(Policy): + """ + Overview: + Policy class of BCQ (Batch-Constrained deep Q-learning) algorithm, proposed in \ + https://arxiv.org/abs/1812.02900. + """ + config = dict( + # (str) Name of the registered RL policy (refer to the "register_policy" function). type='bcq', - # (bool) Whether to use cuda for network. + # (bool) Indicates if CUDA should be used for network operations. cuda=False, - # (bool type) priority: Determine whether to use priority in buffer sample. - # Default False in SAC. + # (bool) Determines whether priority sampling is used in the replay buffer. Default is False. priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) If True, Importance Sampling Weight is used to correct updates. Requires 'priority' to be True. priority_IS_weight=False, - # (int) Number of training samples(randomly collected) in replay buffer when training starts. - # Default 10000 in SAC. + # (int) Number of random samples in replay buffer before training begins. Default is 10000. random_collect_size=10000, + # (int) The number of steps for calculating target q_value. nstep=1, model=dict( - # (List) Hidden list for actor network head. + # (List[int]) Sizes of the hidden layers in the actor network. actor_head_hidden_size=[400, 300], - - # (List) Hidden list for critic network head. + # (List[int]) Sizes of the hidden layers in the critic network. critic_head_hidden_size=[400, 300], - # Max perturbation hyper-parameter for BCQ + # (float) Maximum perturbation for BCQ. Controls exploration in action space. phi=0.05, ), learn=dict( - - # How many updates(iterations) to train after collector's one collection. - # Bigger "update_per_collect" means bigger off-policy. - # collect data -> update policy-> collect data -> ... + # (int) Number of policy updates per data collection step. Higher values indicate more off-policy training. update_per_collect=1, - # (int) Minibatch size for gradient descent. + # (int) Batch size for each gradient descent step. batch_size=100, - - # (float type) learning_rate_q: Learning rate for soft q network. - # Default to 3e-4. - # Please set to 1e-3, when model.value_network is True. + # (float) Learning rate for the Q-network. Set to 1e-3 if `model.value_network` is True. learning_rate_q=3e-4, - # (float type) learning_rate_policy: Learning rate for policy network. - # Default to 3e-4. - # Please set to 1e-3, when model.value_network is True. + # (float) Learning rate for the policy network. 
Set to 1e-3 if `model.value_network` is True. learning_rate_policy=3e-4, - # (float type) learning_rate_vae: Learning rate for vae network. - # `learning_rate_value` should be initialized, when model.vae_network is True. - # Please set to 3e-4, when model.vae_network is True. + # (float) Learning rate for the VAE network. Initialize if `model.vae_network` is True. learning_rate_vae=3e-4, - # (bool) Whether ignore done(usually for max step termination env. e.g. pendulum) - # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. - # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. - # However, interaction with HalfCheetah always gets done with done is False, - # Since we inplace done==True with done==False to keep - # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), - # when the episode step is greater than max episode step. + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. ignore_done=False, - - # (float type) target_theta: Used for soft update of the target network, - # aka. Interpolation factor in polyak averaging for target networks. - # Default to 0.005. + # (float) Polyak averaging coefficient for the target network update. Typically small. target_theta=0.005, - # (float) discount factor for the discounted sum of rewards, aka. gamma. + # (float) Discount factor for future rewards, often denoted as gamma. discount_factor=0.99, + # (float) Lambda for TD(lambda) learning. Weighs the trade-off between bias and variance. lmbda=0.75, - - # (float) Weight uniform initialization range in the last output layer + # (float) Range for uniform weight initialization in the output layer. init_w=3e-3, ), collect=dict( - # (int) Cut trajectories into pieces with length "unroll_len". + # (int) Length of trajectory segments for unrolling. Set to higher for longer dependencies. unroll_len=1, ), eval=dict(), other=dict( replay_buffer=dict( - # (int type) replay_buffer_size: Max size of replay buffer. + # (int) Maximum size of the replay buffer. replay_buffer_size=1000000, - # (int type) max_use: Max use times of one data in the buffer. - # Data will be removed once used for too many times. - # Default to infinite. - # max_use=256, ), ), ) def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Returns the default model configuration used by the BCQ algorithm. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): \ + Tuple containing the registered model name and model's import_names. + """ return 'bcq', ['ding.model.template.bcq'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init q, value and policy's optimizers, algorithm config, main and target models. 
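# A minimal sketch of the soft (Polyak) target-network update that the `target_theta`
# setting above controls: target <- (1 - theta) * target + theta * online.
# DI-engine wraps this behavior in its target model wrapper; this standalone version is
# for illustration only.
import torch


def soft_update_sketch(target: torch.nn.Module, online: torch.nn.Module, theta: float = 0.005) -> None:
    with torch.no_grad():
        for t_param, o_param in zip(target.parameters(), online.parameters()):
            t_param.data.mul_(1.0 - theta).add_(theta * o_param.data)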
+ Initialize the learn mode of policy, including related attributes and modules. For BCQ, it mainly \ + contains optimizer, algorithm-specific arguments such as gamma, main and target model. \ + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ # Init self._priority = self._cfg.priority @@ -140,12 +148,35 @@ def _init_learn(self) -> None: self._learn_model = model_wrap(self._model, wrapper_name='base') self._learn_model.reset() self._target_model.reset() - self._forward_learn_cnt = 0 - def _forward_learn(self, data: dict) -> Dict[str, Any]: + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Overview: + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as policy_loss, value_loss, entropy_loss. + Arguments: + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For BCQ, each element in list is a dict containing at least the following keys: \ + ['obs', 'action', 'adv', 'value', 'weight']. + Returns: + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement your own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. 
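# An illustrative sketch, under the standard BCQ formulation rather than copied from this file,
# of how the `lmbda` weight and the 10 repeated candidate actions per next state are typically
# turned into a target value: a soft-clipped double-Q estimate per candidate, then a max over
# candidates, then the usual TD target.
import torch


def bcq_target_sketch(q1, q2, reward, done, lmbda=0.75, gamma=0.99, num_candidates=10):
    # q1, q2: (batch * num_candidates,) Q-values of candidate actions from the two critics.
    soft_min = lmbda * torch.min(q1, q2) + (1.0 - lmbda) * torch.max(q1, q2)
    # Pick the best candidate action for each next state (candidates are grouped consecutively).
    best_next_value = soft_min.reshape(-1, num_candidates).max(dim=1).values
    return reward + gamma * (1.0 - done) * best_next_value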
+ """ loss_dict = {} - + # Data preprocessing operations, such as stack data, cpu to cuda device data = default_preprocess_learn( data, use_priority=self._priority, @@ -183,7 +214,7 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: # train_critic q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] - with torch.no_grad(): + with (torch.no_grad()): next_obs_rep = torch.repeat_interleave(next_obs, 10, 0) z = torch.randn((next_obs_rep.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) vae_action = self._model.vae.decode_with_obs(z, next_obs_rep)['reconstruction_action'] @@ -230,12 +261,25 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: } def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ return [ 'td_error', 'target_q_value', 'critic_loss', 'twin_critic_loss', 'actor_loss', 'recons_loss', 'kld_loss', 'vae_loss' ] def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. + Returns: + - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring. + """ ret = { 'model': self._learn_model.state_dict(), 'target_model': self._target_model.state_dict(), @@ -245,11 +289,38 @@ def _state_dict_learn(self) -> Dict[str, Any]: } return ret - def _init_eval(self): + def _init_eval(self) -> None: + """ + Overview: + Initialize the eval mode of policy, including related attributes and modules. + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. + """ self._eval_model = model_wrap(self._model, wrapper_name='base') self._eval_model.reset() - def _forward_eval(self, data: dict) -> Dict[str, Any]: + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Policy forward function of eval mode (evaluation policy performance by interacting with envs). Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. + Arguments: + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is environment id and the value is the corresponding data of the env. + Returns: + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e., environment id. + + .. note:: + The input value can be ``torch.Tensor`` or dict/list combinations, current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not \ + support it. You can implement your own model rather than use the default model. For more information, \ + please raise an issue in GitHub repo, and we will continue to follow up. 
+ """ data_id = list(data.keys()) data = default_collate(list(data.values())) if self._cuda: @@ -264,26 +335,28 @@ def _forward_eval(self, data: dict) -> Dict[str, Any]: return {i: d for i, d in zip(data_id, output)} def _init_collect(self) -> None: + """ + Overview: + Initialize the collect mode of policy, including related attributes and modules. For BCQ, it contains the \ + collect_model to balance the exploration and exploitation with ``eps_greedy_sample`` \ + mechanism, and other algorithm-specific arguments such as gamma and nstep. + This method will be called in ``__init__`` method if ``collect`` field is in ``enable_field``. + + .. note:: + If you want to set some spacial member variables in ``_init_collect`` method, you'd better name them \ + with prefix ``_collect_`` to avoid conflict with other modes, such as ``self._collect_attr1``. + """ self._unroll_len = self._cfg.collect.unroll_len - self._gamma = self._cfg.discount_factor # necessary for parallel - self._nstep = self._cfg.nstep # necessary for parallel + self._gamma = self._cfg.discount_factor + self._nstep = self._cfg.nstep self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') self._collect_model.reset() + def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + pass + def _forward_collect(self, data: dict, **kwargs) -> dict: pass def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: pass - - def _get_train_sample(self, data: list) -> Union[None, List[Any]]: - r""" - Overview: - Get the trajectory and the n step return data, then sample from the n_step return data - Arguments: - - data (:obj:`list`): The trajectory's cache - Returns: - - samples (:obj:`dict`): The training samples generated - """ - data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) - return get_train_sample(data, self._unroll_len) diff --git a/ding/policy/ddpg.py b/ding/policy/ddpg.py index 2e253370b8..6f62c59795 100644 --- a/ding/policy/ddpg.py +++ b/ding/policy/ddpg.py @@ -68,7 +68,7 @@ class DDPGPolicy(Policy): on_policy=False, # (bool) Whether to enable priority experience sample. priority=False, - # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + # (bool) Whether to use Importance Sampling Weight to correct biased update. If True, priority must be True. priority_IS_weight=False, # (int) Number of training samples(randomly collected) in replay buffer when training starts. # Default 25000 in DDPG/TD3. @@ -411,7 +411,7 @@ def _forward_collect(self, data: Dict[int, Any], **kwargs) -> Dict[int, Any]: Returns: - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action and \ other necessary data for learn mode defined in ``self._process_transition`` method. The key of the \ - dict is the same as the input data, i.e. environment id. + dict is the same as the input data, i.e., environment id. .. note:: The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ diff --git a/ding/policy/dqn.py b/ding/policy/dqn.py index d1f6fdbb49..8e0944f270 100644 --- a/ding/policy/dqn.py +++ b/ding/policy/dqn.py @@ -95,7 +95,7 @@ class DQNPolicy(Policy): priority_IS_weight=False, # (float) Discount factor(gamma) for returns. discount_factor=0.97, - # (int) The number of step for calculating target q_value. + # (int) The number of steps for calculating target q_value. 
nstep=1, model=dict( # (list(int)) Sequence of ``hidden_size`` of subsequent conv layers and the final dense layer. @@ -111,31 +111,31 @@ class DQNPolicy(Policy): batch_size=64, # (float) The step size of gradient descent. learning_rate=0.001, - # (int) Frequence of target network update. + # (int) Frequency of target network update. # Only one of [target_update_freq, target_theta] should be set. target_update_freq=100, - # (float) : Used for soft update of the target network. + # (float) Used for soft update of the target network. # aka. Interpolation factor in EMA update for target network. # Only one of [target_update_freq, target_theta] should be set. target_theta=0.005, - # (bool) Whether ignore done(usually for max step termination env). - # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. - # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. - # However, interaction with HalfCheetah always gets done with done is False, - # Since we inplace done==True with done==False to keep - # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), - # when the episode step is greater than max episode step. + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. ignore_done=False, ), # collect_mode config collect=dict( # (int) How many training samples collected in one collection procedure. - # Only one of [n_sample, n_episode] shoule be set. + # Only one of [n_sample, n_episode] should be set. n_sample=8, # (int) Split episodes or trajectories into pieces with length `unroll_len`. unroll_len=1, ), - eval=dict(), # for compability + eval=dict(), # for compatibility # other config other=dict( # Epsilon greedy with decay. @@ -165,7 +165,7 @@ def default_model(self) -> Tuple[str, List[str]]: - model_info (:obj:`Tuple[str, List[str]]`): The registered model name and model's import_names. .. note:: - The user can define and use customized network model but must obey the same inferface definition indicated \ + The user can define and use customized network model but must obey the same interface definition indicated \ by import_names path. For example about DQN, its registered name is ``dqn`` and the import_names is \ ``ding.model.template.q_learning``. """ @@ -242,7 +242,7 @@ def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: .. note:: The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ For the data type that not supported, the main reason is that the corresponding model does not support it. \ - You can implement you own model rather than use the default model. For more information, please raise an \ + You can implement your own model rather than use the default model. For more information, please raise an \ issue in GitHub repo and we will continue to follow up. .. 
note:: @@ -398,7 +398,7 @@ def _get_train_sample(self, transitions: List[Dict[str, Any]]) -> List[Dict[str, For a given trajectory (transitions, a list of transition) data, process it into a list of sample that \ can be used for training directly. In DQN with nstep TD, a train sample is a processed transition. \ This method is usually used in collectors to execute necessary \ - RL data preprocessing before training, which can help learner amortize revelant time consumption. \ + RL data preprocessing before training, which can help learner amortize relevant time consumption. \ In addition, you can also implement this method as an identity function and do the data processing \ in ``self._forward_learn`` method. Arguments: diff --git a/ding/policy/fqf.py b/ding/policy/fqf.py index f1ba86fd91..fae697f85d 100644 --- a/ding/policy/fqf.py +++ b/ding/policy/fqf.py @@ -1,22 +1,34 @@ -from typing import List, Dict, Any, Tuple, Union import copy +from typing import List, Dict, Any, Tuple + import torch -from ding.torch_utils import Adam, RMSprop, to_device -from ding.rl_utils import fqf_nstep_td_data, fqf_nstep_td_error, fqf_calculate_fraction_loss, \ - get_train_sample, get_nstep_return_data from ding.model import model_wrap +from ding.rl_utils import fqf_nstep_td_data, fqf_nstep_td_error, fqf_calculate_fraction_loss +from ding.torch_utils import Adam, RMSprop, to_device from ding.utils import POLICY_REGISTRY -from ding.utils.data import default_collate, default_decollate -from .dqn import DQNPolicy from .common_utils import default_preprocess_learn +from .dqn import DQNPolicy + + +def compute_grad_norm(model): + """ + Overview: + Compute grad norm of a network's parameters. + Arguments: + - model (:obj:`nn.Module`): The network to compute grad norm. + Returns: + - grad_norm (:obj:`torch.Tensor`): The grad norm of the network's parameters. + """ + return torch.norm(torch.stack([torch.norm(p.grad.detach(), 2.0) for p in model.parameters()]), 2.0) @POLICY_REGISTRY.register('fqf') class FQFPolicy(DQNPolicy): - r""" + """ Overview: - Policy class of FQF algorithm. + Policy class of FQF (Fully Parameterized Quantile Function) algorithm, proposed in + https://arxiv.org/pdf/1911.02140.pdf. Config: == ==================== ======== ============== ======================================== ======================= @@ -46,70 +58,100 @@ class FQFPolicy(DQNPolicy): """ config = dict( - # (str) RL policy register name (refer to function "POLICY_REGISTRY"). + # (str) Name of the RL policy registered in "POLICY_REGISTRY" function. type='fqf', - # (bool) Whether to use cuda for network. + # (bool) Flag to enable/disable CUDA for network computation. cuda=False, - # (bool) Whether the RL algorithm is on-policy or off-policy. + # (bool) Indicator of the RL algorithm's policy type (True for on-policy algorithms). on_policy=False, - # (bool) Whether use priority(priority sample, IS weight, update priority) + # (bool) Toggle for using prioritized experience replay (priority sampling and updating). priority=False, - # (float) Reward's future discount factor, aka. gamma. + # (float) Discount factor (gamma) for calculating the future reward. discount_factor=0.97, - # (int) N-step reward for target q_value estimation + # (int) Number of steps to consider for calculating n-step returns. nstep=1, learn=dict( - - # How many updates(iterations) to train after collector's one collection. - # Bigger "update_per_collect" means bigger off-policy. - # collect data -> update policy-> collect data -> ... 
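# A small illustrative sketch (not DI-engine's ``get_nstep_return_data``) of the n-step target
# that the `nstep` settings above refer to:
#   R_t = r_t + gamma * r_{t+1} + ... + gamma^{n-1} * r_{t+n-1} + gamma^n * V_target(s_{t+n}).
import torch


def nstep_target_sketch(rewards, bootstrap_value, gamma=0.97):
    # rewards: (nstep,) rewards following the current transition; bootstrap_value: target
    # network estimate at step t + nstep.
    nstep = rewards.shape[0]
    discounts = gamma ** torch.arange(nstep, dtype=rewards.dtype)
    return torch.sum(discounts * rewards) + (gamma ** nstep) * bootstrap_value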
+ # (int) Number of training iterations per data collection from the environment. update_per_collect=3, + # (int) Size of minibatch for each update. batch_size=64, + # (float) Fractional learning rate for the fraction proposal network. learning_rate_fraction=2.5e-9, + # (float) Learning rate for the quantile regression network. learning_rate_quantile=0.00005, # ============================================================== - # The following configs are algorithm-specific + # Algorithm-specific configurations # ============================================================== - # (int) Frequence of target network update. + # (int) Frequency of target network updates. target_update_freq=100, - # (float) Threshold of Huber loss. In the FQF paper, this is denoted by kappa. Default to 1.0. + # (float) Huber loss threshold (kappa in the FQF paper). kappa=1.0, - # (float) Coefficient of entropy_loss. + # (float) Coefficient for the entropy loss term. ent_coef=0, - # (bool) Whether ignore done(usually for max step termination env) + # (bool) If set to True, the 'done' signals that indicate the end of an episode due to environment time + # limits are disregarded. By default, this is set to False. This setting is particularly useful for tasks + # that have a predetermined episode length, such as HalfCheetah and various other MuJoCo environments, + # where the maximum length is capped at 1000 steps. When enabled, any 'done' signal triggered by reaching + # the maximum episode steps will be overridden to 'False'. This ensures the accurate calculation of the + # Temporal Difference (TD) error, using the formula `gamma * (1 - done) * next_v + reward`, + # even when the episode surpasses the predefined step limit. ignore_done=False, ), - # collect_mode config collect=dict( - # (int) Only one of [n_sample, n_step, n_episode] shoule be set + # (int) Specify one of [n_sample, n_step, n_episode] for data collection. # n_sample=8, - # (int) Cut trajectories into pieces with length "unroll_len". + # (int) Length of trajectory segments for processing. unroll_len=1, ), eval=dict(), - # other config other=dict( - # Epsilon greedy with decay. + # Epsilon-greedy strategy with a decay mechanism. eps=dict( - # (str) Decay type. Support ['exp', 'linear']. + # (str) Type of decay mechanism ['exp' for exponential, 'linear']. type='exp', + # (float) Initial value of epsilon in epsilon-greedy exploration. start=0.95, + # (float) Final value of epsilon after decay. end=0.1, - # (int) Decay length(env step) + # (int) Number of environment steps over which epsilon is decayed. decay=10000, ), - replay_buffer=dict(replay_buffer_size=10000, ) + replay_buffer=dict( + # (int) Size of the replay buffer. + replay_buffer_size=10000, + ), ), ) def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Returns the default model configuration used by the FQF algorithm. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): \ + Tuple containing the registered model name and model's import_names. + """ return 'fqf', ['ding.model.template.q_learning'] def _init_learn(self) -> None: - r""" + """ Overview: - Learn mode init method. Called by ``self.__init__``. - Init the optimizer, algorithm config, main and target models. + Initialize the learn mode of policy, including related attributes and modules. 
For FQF, it mainly \ + contains optimizer, algorithm-specific arguments such as gamma, nstep, kappa ent_coef, main and \ + target model. This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some spacial member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. """ self._priority = self._cfg.priority # Optimizer @@ -143,15 +185,32 @@ def _init_learn(self) -> None: self._learn_model.reset() self._target_model.reset() - def _forward_learn(self, data: dict) -> Dict[str, Any]: - r""" + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ Overview: - Forward and backward function of learn mode. + Policy forward function of learn mode (training policy and updating parameters). Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as policy_loss, value_loss, entropy_loss. Arguments: - - data (:obj:`dict`): Dict type data, including at least ['obs', 'action', 'reward', 'next_obs'] + - data (:obj:`List[Dict[int, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or there dict/list \ + combinations. In the ``_forward_learn`` method, data often need to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For FQF, each element in list is a dict containing at least the following keys: \ + ['obs', 'action', 'reward', 'next_obs']. Returns: - - info_dict (:obj:`Dict[str, Any]`): Including current lr and loss. + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicated training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that not supported, the main reason is that the corresponding model does not support it. \ + You can implement your own model rather than use the default model. For more information, please raise an \ + issue in GitHub repo and we will continue to follow up. 
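# A hedged sketch of the quantile Huber loss that the `kappa` threshold above parameterizes
# (the rho^kappa_tau loss from the QR-DQN/FQF papers). DI-engine computes this inside
# ``fqf_nstep_td_error``; this standalone version is simplified for illustration.
import torch


def quantile_huber_loss_sketch(td_error, tau, kappa=1.0):
    # td_error: (batch, num_quantiles) TD errors; tau: (num_quantiles,) quantile fractions.
    abs_err = td_error.abs()
    huber = torch.where(abs_err <= kappa, 0.5 * td_error ** 2, kappa * (abs_err - 0.5 * kappa))
    # Asymmetric quantile weighting |tau - 1{td_error < 0}|.
    weight = torch.abs(tau.unsqueeze(0) - (td_error.detach() < 0).float())
    return (weight * huber / kappa).mean()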
""" + # Data preprocessing operations, such as stack data, cpu to cuda device data = default_preprocess_learn( data, use_priority=self._priority, ignore_done=self._cfg.learn.ignore_done, use_nstep=True ) @@ -182,19 +241,12 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: data['weight'] ) value_gamma = data.get('value_gamma') - entropy_loss = -self._ent_coef * entropies.mean() - fraction_loss = fqf_calculate_fraction_loss(q_tau_i.detach(), q_value, quantiles, data['action']) + entropy_loss - quantile_loss, td_error_per_sample = fqf_nstep_td_error( data_n, self._gamma, nstep=self._nstep, kappa=self._kappa, value_gamma=value_gamma ) - # compute grad norm of a network's parameters - def compute_grad_norm(model): - return torch.norm(torch.stack([torch.norm(p.grad.detach(), 2.0) for p in model.parameters()]), 2.0) - # ==================== # fraction_proposal network update # ==================== @@ -240,12 +292,25 @@ def compute_grad_norm(model): } def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ return [ 'cur_lr_fraction_loss', 'cur_lr_quantile_loss', 'logit', 'fraction_loss', 'quantile_loss', 'total_norm_quantiles_proposal', 'total_norm_Q', 'total_norm_fqf_fc', 'total_norm_encoder' ] def _state_dict_learn(self) -> Dict[str, Any]: + """ + Overview: + Return the state_dict of learn mode, usually including model and optimizer. + Returns: + - state_dict (:obj:`Dict[str, Any]`): The dict of current policy learn state, for saving and restoring. + """ return { 'model': self._learn_model.state_dict(), 'target_model': self._target_model.state_dict(), @@ -254,6 +319,17 @@ def _state_dict_learn(self) -> Dict[str, Any]: } def _load_state_dict_learn(self, state_dict: Dict[str, Any]) -> None: + """ + Overview: + Load the state_dict variable into policy learn mode. + Arguments: + - state_dict (:obj:`Dict[str, Any]`): The dict of policy learn state saved before. + + .. tip:: + If you want to only load some parts of model, you can simply set the ``strict`` argument in \ + load_state_dict to ``False``, or refer to ``ding.torch_utils.checkpoint_helper`` for more \ + complicated operation. + """ self._learn_model.load_state_dict(state_dict['model']) self._target_model.load_state_dict(state_dict['target_model']) self._fraction_loss_optimizer.load_state_dict(state_dict['optimizer_fraction_loss']) diff --git a/ding/policy/ibc.py b/ding/policy/ibc.py index b39e14f53a..887edf298d 100644 --- a/ding/policy/ibc.py +++ b/ding/policy/ibc.py @@ -20,39 +20,89 @@ class IBCPolicy(BehaviourCloningPolicy): r""" Overview: - Implicit Behavior Cloning - https://arxiv.org/abs/2109.00137.pdf + Policy class of IBC (Implicit Behavior Cloning), proposed in https://arxiv.org/abs/2109.00137.pdf. .. note:: - The code is adapted from the pytorch version of IBC https://github.com/kevinzakka/ibc, - which only supports the derivative-free optimization (dfo) variants. - This implementation moves a step forward and supports all variants of energy-based model - mentioned in the paper (dfo, autoregressive dfo, and mcmc). + The code is adapted from the pytorch version of IBC https://github.com/kevinzakka/ibc, which only supports the \ + derivative-free optimization (dfo) variants. 
This implementation moves a step forward and supports all \ + variants of the energy-based model mentioned in the paper (dfo, autoregressive dfo, and mcmc). """ config = dict( + # (str) The policy type. 'ibc' refers to Implicit Behavior Cloning. type='ibc', + # (bool) Whether to use CUDA for training. False means CPU will be used. cuda=False, + # (bool) If True, the policy will operate on-policy. Here it's False, indicating off-policy. on_policy=False, + # (bool) Whether the action space is continuous. True for continuous action space. continuous=True, - model=dict(stochastic_optim=dict(type='mcmc', )), + # (dict) Configuration for the model, including stochastic optimization settings. + model=dict( + # (dict) Configuration for the stochastic optimization, specifying the type of optimizer. + stochastic_optim=dict( + # (str) The type of stochastic optimizer. 'mcmc' refers to Markov Chain Monte Carlo methods. + type='mcmc', + ), + ), + # (dict) Configuration for the learning process. learn=dict( + # (int) The number of training epochs. train_epoch=30, + # (int) The size of batches used during training. batch_size=256, + # (dict) Configuration for the optimizer used during training. optim=dict( + # (float) The learning rate for the optimizer. learning_rate=1e-5, + # (float) The weight decay regularization term for the optimizer. weight_decay=0.0, + # (float) The beta1 hyperparameter for the AdamW optimizer. beta1=0.9, + # (float) The beta2 hyperparameter for the AdamW optimizer. beta2=0.999, ), ), - eval=dict(evaluator=dict(eval_freq=10000, )), + # (dict) Configuration for the evaluation process. + eval=dict( + # (dict) Configuration for the evaluator. + evaluator=dict( + # (int) The frequency of evaluations during training, in terms of the number of training steps. + eval_freq=10000, + ), + ), ) def default_model(self) -> Tuple[str, List[str]]: + """ + Overview: + Returns the default model configuration used by the IBC algorithm. ``__init__`` method will \ + automatically call this method to get the default model setting and create model. + + Returns: + - model_info (:obj:`Tuple[str, List[str]]`): \ + Tuple containing the registered model name and model's import_names. + """ return 'ebm', ['ding.model.template.ebm'] - def _init_learn(self): + def _init_learn(self) -> None: + """ + Overview: + Initialize the learn mode of policy, including related attributes and modules. For IBC, it mainly \ + contains optimizer and main model. \ + This method will be called in ``__init__`` method if ``learn`` field is in ``enable_field``. + + .. note:: + For the member variables that need to be saved and loaded, please refer to the ``_state_dict_learn`` \ + and ``_load_state_dict_learn`` methods. + + .. note:: + For the member variables that need to be monitored, please refer to the ``_monitor_vars_learn`` method. + + .. note:: + If you want to set some special member variables in ``_init_learn`` method, you'd better name them \ + with prefix ``_learn_`` to avoid conflict with other modes, such as ``self._learn_attr1``. + """ self._timer = EasyTimer(cuda=self._cfg.cuda) self._sync_timer = EasyTimer(cuda=self._cfg.cuda) optim_cfg = self._cfg.learn.optim @@ -67,7 +117,31 @@ def _init_learn(self): self._learn_model = model_wrap(self._model, 'base') self._learn_model.reset() - def _forward_learn(self, data): + def _forward_learn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Overview: + Policy forward function of learn mode (training policy and updating parameters).
Forward means \ + that the policy inputs some training batch data from the replay buffer and then returns the output \ + result, including various training information such as policy_loss, value_loss, entropy_loss. + Arguments: + - data (:obj:`List[Dict[str, Any]]`): The input data used for policy forward, including a batch of \ + training samples. For each element in the list, the key of the dict is the name of data items and the \ + value is the corresponding data. Usually, the value is torch.Tensor or np.ndarray or their dict/list \ + combinations. In the ``_forward_learn`` method, data often needs to first be stacked in the batch \ + dimension by some utility functions such as ``default_preprocess_learn``. \ + For IBC, each element in the list is a dict containing at least the following keys: \ + ['obs', 'action']. + Returns: + - info_dict (:obj:`Dict[str, Any]`): The information dict that indicates the training result, which will be \ + recorded in text log and tensorboard, values must be python scalar or a list of scalars. For the \ + detailed definition of the dict, refer to the code of ``_monitor_vars_learn`` method. + + .. note:: + The input value can be torch.Tensor or dict/list combinations and current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not support it. \ + You can implement your own model rather than use the default model. For more information, please raise an \ + issue in the GitHub repo and we will continue to follow up. + """ with self._timer: data = default_collate(data) if self._cuda: @@ -81,7 +155,7 @@ def _forward_learn(self, data): obs, action = data['obs'], data['action'] # When action/observation space is 1, the action/observation dimension will # be squeezed in the first place, therefore unsqueeze there to make the data - # compatiable with the ibc pipeline. + # compatible with the ibc pipeline. if len(obs.shape) == 1: obs = obs.unsqueeze(-1) if len(action.shape) == 1: @@ -136,17 +210,51 @@ def _forward_learn(self, data): **loss_dict, } - def _monitor_vars_learn(self): + def _monitor_vars_learn(self) -> List[str]: + """ + Overview: + Return the necessary keys for logging the return dict of ``self._forward_learn``. The logger module, such \ + as text logger, tensorboard logger, will use these keys to save the corresponding data. + Returns: + - necessary_keys (:obj:`List[str]`): The list of the necessary keys to be logged. + """ if isinstance(self._stochastic_optimizer, MCMC): return ['total_loss', 'ebm_loss', 'grad_penalty', 'total_time', 'sync_time'] else: return ['total_loss', 'ebm_loss', 'total_time', 'sync_time'] - def _init_eval(self): + def _init_eval(self) -> None: + """ + Overview: + Initialize the eval mode of policy, including related attributes and modules. + This method will be called in ``__init__`` method if ``eval`` field is in ``enable_field``. + + .. note:: + If you want to set some special member variables in ``_init_eval`` method, you'd better name them \ + with prefix ``_eval_`` to avoid conflict with other modes, such as ``self._eval_attr1``. + """ self._eval_model = model_wrap(self._model, wrapper_name='base') self._eval_model.reset() - def _forward_eval(self, data: dict) -> dict: + def _forward_eval(self, data: Dict[int, Any]) -> Dict[int, Any]: + """ + Overview: + Policy forward function of eval mode (evaluating policy performance by interacting with envs).
Forward \ + means that the policy gets some necessary data (mainly observation) from the envs and then returns the \ + action to interact with the envs. + Arguments: + - data (:obj:`Dict[int, Any]`): The input data used for policy forward, including at least the obs. The \ + key of the dict is the environment id and the value is the corresponding data of the env. + Returns: + - output (:obj:`Dict[int, Any]`): The output data of policy forward, including at least the action. The \ + key of the dict is the same as the input data, i.e., the environment id. + + .. note:: + The input value can be ``torch.Tensor`` or dict/list combinations, and the current policy supports all of them. \ + For the data type that is not supported, the main reason is that the corresponding model does not \ + support it. You can implement your own model rather than use the default model. For more information, \ + please raise an issue in the GitHub repo, and we will continue to follow up. + """ tensor_input = isinstance(data, torch.Tensor) if not tensor_input: data_id = list(data.keys()) @@ -168,6 +276,13 @@ def _forward_eval(self, data: dict) -> dict: return {i: d for i, d in zip(data_id, output)} def set_statistic(self, statistics: EasyDict) -> None: + """ + Overview: + Set the statistics of the environment, including the action space and the observation space. + Arguments: + - statistics (:obj:`EasyDict`): The statistics of the environment. For IBC, it contains at least the \ + following keys: ['action_bounds']. + """ self._stochastic_optimizer.set_action_bounds(statistics.action_bounds) # =================================================================== #
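For reference, the ``_forward_eval`` docstring above describes an env-id-keyed I/O convention: the eval-mode policy receives a dict mapping environment ids to observations and returns a dict with the same keys mapping to outputs such as the action. The snippet below is a minimal, self-contained sketch of that collate/decollate pattern; ``DummyEvalPolicy`` is a hypothetical stand-in, not DI-engine code, and it only illustrates the convention rather than the actual IBC inference path.

from typing import Any, Dict

import torch


class DummyEvalPolicy:
    """Toy stand-in (not DI-engine code) mimicking the env-id-keyed I/O of ``_forward_eval``."""

    def __init__(self, action_dim: int = 2) -> None:
        self.action_dim = action_dim

    def forward(self, data: Dict[int, torch.Tensor]) -> Dict[int, Dict[str, Any]]:
        # Collate: stack the per-env observations into one batch, remembering the env ids.
        data_id = list(data.keys())
        obs = torch.stack([data[i] for i in data_id])
        # Placeholder "inference"; a real policy would run its model (e.g. the EBM) here.
        action = torch.zeros(obs.shape[0], self.action_dim)
        # Decollate: return one output dict per environment id, with the same keys as the input.
        return {i: {'action': a} for i, a in zip(data_id, action)}


if __name__ == '__main__':
    policy = DummyEvalPolicy()
    out = policy.forward({0: torch.randn(4), 3: torch.randn(4)})
    assert set(out.keys()) == {0, 3} and out[0]['action'].shape == (2, )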