From 54cecb70cc0b34f5d14720ce3d9f37438a0fe3b9 Mon Sep 17 00:00:00 2001
From: Vivek Miglani <vivekm@fb.com>
Date: Mon, 2 Mar 2020 10:58:10 -0800
Subject: [PATCH 1/3] Fixing docs

---
 captum/attr/_core/deep_lift.py                |  91 ++---
 captum/attr/_core/feature_ablation.py         |  36 +-
 captum/attr/_core/gradient_shap.py            |  93 ++---
 .../attr/_core/guided_backprop_deconvnet.py   |  62 ++--
 captum/attr/_core/guided_grad_cam.py          | 264 +++++++-------
 captum/attr/_core/input_x_gradient.py         |  10 +-
 captum/attr/_core/layer/grad_cam.py           | 260 +++++++-------
 captum/attr/_core/layer/internal_influence.py | 260 +++++++-------
 captum/attr/_core/layer/layer_activation.py   | 114 ++++---
 captum/attr/_core/layer/layer_conductance.py  | 323 +++++++++---------
 captum/attr/_core/layer/layer_deep_lift.py    |  92 ++---
 .../_core/layer/layer_feature_ablation.py     |  23 +-
 .../attr/_core/layer/layer_gradient_shap.py   |  70 ++--
 .../layer/layer_gradient_x_activation.py      | 167 ++++-----
 .../attr/_core/neuron/neuron_conductance.py   | 295 ++++++++--------
 captum/attr/_core/neuron/neuron_deep_lift.py  |  91 ++---
 .../_core/neuron/neuron_feature_ablation.py   | 217 ++++++------
 captum/attr/_core/neuron/neuron_gradient.py   | 137 ++++----
 .../attr/_core/neuron/neuron_gradient_shap.py |  97 +++---
 .../neuron_guided_backprop_deconvnet.py       |  60 ++--
 .../neuron/neuron_integrated_gradients.py     | 260 +++++++-------
 captum/attr/_core/noise_tunnel.py             |  46 +--
 captum/attr/_core/occlusion.py                |  38 ++-
 captum/attr/_core/saliency.py                 |  15 +-
 captum/attr/_utils/attribution.py             |  14 +-
 25 files changed, 1591 insertions(+), 1544 deletions(-)

diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py
index e97036ade4..31f276d3b6 100644
--- a/captum/attr/_core/deep_lift.py
+++ b/captum/attr/_core/deep_lift.py
@@ -68,6 +68,37 @@ def is_output_cloned(output_fn, input_grad_fn) -> bool:
 
 
 class DeepLift(GradientAttribution):
+    r"""
+    Implements DeepLIFT algorithm based on the following paper:
+    Learning Important Features Through Propagating Activation Differences,
+    Avanti Shrikumar, et al.
+    https://arxiv.org/abs/1704.02685
+
+    and the gradient formulation proposed in:
+    Towards better understanding of gradient-based attribution methods for
+    deep neural networks, Marco Ancona, et al.
+    https://openreview.net/pdf?id=Sy21R9JAW
+
+    This implementation supports only Rescale rule. RevealCancel rule will
+    be supported in later releases.
+    In addition to that, in order to keep the implementation cleaner, DeepLIFT
+    for internal neurons and layers extends current implementation and is
+    implemented separately in LayerDeepLift and NeuronDeepLift.
+    Although DeepLIFT's (Rescale Rule) attribution quality is comparable with
+    Integrated Gradients, it runs significantly faster than Integrated
+    Gradients and is preferred for large datasets.
+
+    Currently we only support a limited number of non-linear activations
+    but the plan is to expand the list in the future.
+
+    Note: Currently we cannot access the building blocks
+    of PyTorch's built-in LSTM, RNNs and GRUs such as Tanh and Sigmoid.
+    Nonetheless, it is possible to build custom LSTMs, RNNs and GRUs
+    with performance similar to built-in ones using TorchScript.
+    More details on how to build custom RNNs can be found here:
+    https://pytorch.org/blog/optimizing-cuda-rnn-with-torchscript/
+    """
+
     def __init__(self, model: Module) -> None:
         r"""
         Args:
@@ -116,35 +147,6 @@ def attribute(  # type: ignore
         TensorOrTupleOfTensorsGeneric, Tuple[TensorOrTupleOfTensorsGeneric, Tensor]
     ]:
         r""""
-        Implements DeepLIFT algorithm based on the following paper:
-        Learning Important Features Through Propagating Activation Differences,
-        Avanti Shrikumar, et. al.
-        https://arxiv.org/abs/1704.02685
-
-        and the gradient formulation proposed in:
-        Towards better understanding of gradient-based attribution methods for
-        deep neural networks,  Marco Ancona, et.al.
-        https://openreview.net/pdf?id=Sy21R9JAW
-
-        This implementation supports only Rescale rule. RevealCancel rule will
-        be supported in later releases.
-        In addition to that, in order to keep the implementation cleaner, DeepLIFT
-        for internal neurons and layers extends current implementation and is
-        implemented separately in LayerDeepLift and NeuronDeepLift.
-        Although DeepLIFT's(Rescale Rule) attribution quality is comparable with
-        Integrated Gradients, it runs significantly faster than Integrated
-        Gradients and is preferred for large datasets.
-
-        Currently we only support a limited number of non-linear activations
-        but the plan is to expand the list in the future.
-
-        Note: As we know, currently we cannot access the building blocks,
-        of PyTorch's built-in LSTM, RNNs and GRUs such as Tanh and Sigmoid.
-        Nonetheless, it is possible to build custom LSTMs, RNNS and GRUs
-        with performance similar to built-in ones using TorchScript.
-        More details on how to build custom RNNs can be found here:
-        https://pytorch.org/blog/optimizing-cuda-rnn-with-torchscript/
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which
@@ -520,6 +522,23 @@ def has_convergence_delta(self) -> bool:
 
 
 class DeepLiftShap(DeepLift):
+    r"""
+    Extends DeepLift algorithm and approximates SHAP values using DeepLift.
+    For each input sample it computes DeepLift attribution with respect to
+    each baseline and averages resulting attributions.
+    More details about the algorithm can be found here:
+
+    http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf
+
+    Note that the explanation model:
+        1. Assumes that input features are independent of one another
+        2. Is linear, meaning that the explanations are modeled through
+            the additive composition of feature effects.
+    Although it assumes a linear model for each explanation, the overall
+    model across multiple explanations can be complex and non-linear.
+
+    """
+
     def __init__(self, model: Module) -> None:
         r"""
         Args:
@@ -573,20 +592,6 @@ def attribute(  # type: ignore
         TensorOrTupleOfTensorsGeneric, Tuple[TensorOrTupleOfTensorsGeneric, Tensor]
     ]:
         r"""
-        Extends DeepLift algorithm and approximates SHAP values using Deeplift.
-        For each input sample it computes DeepLift attribution with respect to
-        each baseline and averages resulting attributions.
-        More details about the algorithm can be found here:
-
-        http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf
-
-        Note that the explanation model:
-            1. Assumes that input features are independent of one another
-            2. Is linear, meaning that the explanations are modeled through
-               the additive composition of feature effects.
-        Although, it assumes a linear model for each explanation, the overall
-        model across multiple explanations can be complex and non-linear.
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which
diff --git a/captum/attr/_core/feature_ablation.py b/captum/attr/_core/feature_ablation.py
index ee0e016422..bda61daf0d 100644
--- a/captum/attr/_core/feature_ablation.py
+++ b/captum/attr/_core/feature_ablation.py
@@ -22,6 +22,25 @@
 
 
 class FeatureAblation(PerturbationAttribution):
+    r"""
+    A perturbation based approach to computing attribution, involving
+    replacing each input feature with a given baseline / reference, and
+    computing the difference in output. By default, each scalar value within
+    each input tensor is taken as a feature and replaced independently. Passing
+    a feature mask allows grouping features to be ablated together. This can
+    be used in cases such as images, where an entire segment or region
+    can be ablated, measuring the importance of the segment (feature group).
+    Each input scalar in the group will be given the same attribution value
+    equal to the change in target as a result of ablating the entire feature
+    group.
+
+    The forward function can either return a scalar per example, or a single
+    scalar for the full batch. If a single scalar is returned for the batch,
+    `perturbations_per_eval` must be 1, and the returned attributions will have
+    first dimension 1, corresponding to feature importance across all
+    examples in the batch.
+    """
+
     def __init__(self, forward_func: Callable) -> None:
         r"""
         Args:
@@ -43,23 +62,6 @@ def attribute(
         **kwargs: Any
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        A perturbation based approach to computing attribution, involving
-        replacing each input feature with a given baseline / reference, and
-        computing the difference in output. By default, each scalar value within
-        each input tensor is taken as a feature and replaced independently. Passing
-        a feature mask, allows grouping features to be ablated together. This can
-        be used in cases such as images, where an entire segment or region
-        can be ablated, measuring the importance of the segment (feature group).
-        Each input scalar in the group will be given the same attribution value
-        equal to the change in target as a result of ablating the entire feature
-        group.
-
-        The forward function can either return a scalar per example, or a single
-        scalar for the full batch. If a single scalar is returned for the batch,
-        `perturbations_per_eval` must be 1, and the returned attributions will have
-        first dimension 1, corresponding to feature importance across all
-        examples in the batch.
-
         Args:
 
                 inputs (tensor or tuple of tensors):  Input for which ablation
diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py
index 748e903b8a..35538be92a 100644
--- a/captum/attr/_core/gradient_shap.py
+++ b/captum/attr/_core/gradient_shap.py
@@ -24,6 +24,40 @@
 
 
 class GradientShap(GradientAttribution):
+    r"""
+    Implements gradient SHAP based on the implementation from SHAP's primary
+    author. For reference, please view:
+
+    https://github.com/slundberg/shap\
+    #deep-learning-example-with-gradientexplainer-tensorflowkeraspytorch-models
+
+    A Unified Approach to Interpreting Model Predictions
+    http://papers.nips.cc/paper/\
+    7062-a-unified-approach-to-interpreting-model-predictions
+
+    GradientShap approximates SHAP values by computing the expectations of
+    gradients by randomly sampling from the distribution of baselines/references.
+    It adds white noise to each input sample `n_samples` times, selects a
+    random baseline from baselines' distribution and a random point along the
+    path between the baseline and the input, and computes the gradient of outputs
+    with respect to those selected random points. The final SHAP values represent
+    the expected values of gradients * (inputs - baselines).
+
+    GradientShap makes an assumption that the input features are independent
+    and that the explanation model is linear, meaning that the explanations
+    are modeled through the additive composition of feature effects.
+    Under those assumptions, SHAP value can be approximated as the expectation
+    of gradients that are computed for randomly generated `n_samples` input
+    samples after adding gaussian noise `n_samples` times to each input for
+    different baselines/references.
+
+    In some sense it can be viewed as an approximation of integrated gradients
+    by computing the expectations of gradients for different baselines.
+
+    Current implementation uses Smoothgrad from `NoiseTunnel` in order to
+    randomly draw samples from the distribution of baselines, add noise to input
+    samples and compute the expectation (smoothgrad).
+    """
     def __init__(self, forward_func: Callable) -> None:
         r"""
         Args:
@@ -79,39 +113,6 @@ def attribute(
         TensorOrTupleOfTensorsGeneric, Tuple[TensorOrTupleOfTensorsGeneric, Tensor]
     ]:
         r"""
-        Implements gradient SHAP based on the implementation from SHAP's primary
-        author. For reference, please, view:
-
-        https://github.com/slundberg/shap\
-        #deep-learning-example-with-gradientexplainer-tensorflowkeraspytorch-models
-
-        A Unified Approach to Interpreting Model Predictions
-        http://papers.nips.cc/paper\
-        7062-a-unified-approach-to-interpreting-model-predictions
-
-        GradientShap approximates SHAP values by computing the expectations of
-        gradients by randomly sampling from the distribution of baselines/references.
-        It adds white noise to each input sample `n_samples` times, selects a
-        random baseline from baselines' distribution and a random point along the
-        path between the baseline and the input, and computes the gradient of outputs
-        with respect to those selected random points. The final SHAP values represent
-        the expected values of gradients * (inputs - baselines).
-
-        GradientShap makes an assumption that the input features are independent
-        and that the explanation model is linear, meaning that the explanations
-        are modeled through the additive composition of feature effects.
-        Under those assumptions, SHAP value can be approximated as the expectation
-        of gradients that are computed for randomly generated `n_samples` input
-        samples after adding gaussian noise `n_samples` times to each input for
-        different baselines/references.
-
-        In some sense it can be viewed as an approximation of integrated gradients
-        by computing the expectations of gradients for different baselines.
-
-        Current implementation uses Smoothgrad from `NoiseTunnel` in order to
-        randomly draw samples from the distribution of baselines, add noise to input
-        samples and compute the expectation (smoothgrad).
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which SHAP attribution
@@ -222,19 +223,19 @@ def attribute(
                         The deltas are ordered by each input example and `n_samples`
                         noisy samples generated for it.
 
-            Examples::
-
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> net = ImageClassifier()
-                >>> gradient_shap = GradientShap(net)
-                >>> input = torch.randn(3, 3, 32, 32, requires_grad=True)
-                >>> # choosing baselines randomly
-                >>> baselines = torch.randn(20, 3, 32, 32)
-                >>> # Computes gradient shap for the input
-                >>> # Attribution size matches input size: 3x3x32x32
-                >>> attribution = gradient_shap.attribute(input, baselines,
-                                                                 target=5)
+        Examples::
+
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> net = ImageClassifier()
+            >>> gradient_shap = GradientShap(net)
+            >>> input = torch.randn(3, 3, 32, 32, requires_grad=True)
+            >>> # choosing baselines randomly
+            >>> baselines = torch.randn(20, 3, 32, 32)
+            >>> # Computes gradient shap for the input
+            >>> # Attribution size matches input size: 3x3x32x32
+            >>> attribution = gradient_shap.attribute(input, baselines,
+                                                                target=5)
 
         """
         # since `baselines` is a distribution, we can generate it using a function
diff --git a/captum/attr/_core/guided_backprop_deconvnet.py b/captum/attr/_core/guided_backprop_deconvnet.py
index 8be1bf1907..3755830235 100644
--- a/captum/attr/_core/guided_backprop_deconvnet.py
+++ b/captum/attr/_core/guided_backprop_deconvnet.py
@@ -94,6 +94,21 @@ def _remove_hooks(self):
 
 
 class GuidedBackprop(ModifiedReluGradientAttribution):
+    r"""
+    Computes attribution using guided backpropagation. Guided backpropagation
+    computes the gradient of the target output with respect to the input,
+    but gradients of ReLU functions are overridden so that only
+    non-negative gradients are backpropagated.
+
+    More details regarding the guided backpropagation algorithm can be found
+    in the original paper here:
+    https://arxiv.org/abs/1412.6806
+
+    Warning: Ensure that all ReLU operations in the forward function of the
+    given model are performed using a module (nn.ReLU).
+    If nn.functional.relu is used, gradients are not overridden appropriately.
+    """
+
     def __init__(self, model: Module):
         r"""
         Args:
@@ -111,19 +126,6 @@ def attribute(
         additional_forward_args: Any = None,
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        Computes attribution using guided backpropagation. Guided backpropagation
-        computes the gradient of the target output with respect to the input,
-        but gradients of ReLU functions are overridden so that only
-        non-negative gradients are backpropagated.
-
-        More details regarding the guided backpropagation algorithm can be found
-        in the original paper here:
-        https://arxiv.org/abs/1412.6806
-
-        Warning: Ensure that all ReLU operations in the forward function of the
-        given model are performed using a module (nn.module.ReLU).
-        If nn.functional.ReLU is used, gradients are not overridden appropriately.
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which
@@ -197,6 +199,24 @@ def attribute(
 
 
 class Deconvolution(ModifiedReluGradientAttribution):
+    r"""
+    Computes attribution using deconvolution. Deconvolution
+    computes the gradient of the target output with respect to the input,
+    but gradients of ReLU functions are overridden so that the gradient
+    of the ReLU input is simply computed taking ReLU of the output gradient,
+    essentially only propagating non-negative gradients (without
+    dependence on the sign of the ReLU input).
+
+    More details regarding the deconvolution algorithm can be found
+    in these papers:
+    https://arxiv.org/abs/1311.2901
+    https://link.springer.com/chapter/10.1007/978-3-319-46466-4_8
+
+    Warning: Ensure that all ReLU operations in the forward function of the
+    given model are performed using a module (nn.ReLU).
+    If nn.functional.relu is used, gradients are not overridden appropriately.
+    """
+
     def __init__(self, model: Module):
         r"""
         Args:
@@ -212,22 +232,6 @@ def attribute(
         additional_forward_args: Any = None,
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        Computes attribution using deconvolution. Deconvolution
-        computes the gradient of the target output with respect to the input,
-        but gradients of ReLU functions are overridden so that the gradient
-        of the ReLU input is simply computed taking ReLU of the output gradient,
-        essentially only propagating non-negative gradients (without
-        dependence on the sign of the ReLU input).
-
-        More details regarding the deconvolution algorithm can be found
-        in these papers:
-        https://arxiv.org/abs/1311.2901
-        https://link.springer.com/chapter/10.1007/978-3-319-46466-4_8
-
-        Warning: Ensure that all ReLU operations in the forward function of the
-        given model are performed using a module (nn.module.ReLU).
-        If nn.functional.ReLU is used, gradients are not overridden appropriately.
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which
diff --git a/captum/attr/_core/guided_grad_cam.py b/captum/attr/_core/guided_grad_cam.py
index c2122fb9ab..d4aeaa8647 100644
--- a/captum/attr/_core/guided_grad_cam.py
+++ b/captum/attr/_core/guided_grad_cam.py
@@ -15,6 +15,37 @@
 
 
 class GuidedGradCam(GradientAttribution):
+    r"""
+    Computes element-wise product of guided backpropagation attributions
+    with upsampled (non-negative) GradCAM attributions.
+    GradCAM attributions are computed with respect to the layer
+    provided in the constructor, and attributions
+    are upsampled to match the input size. GradCAM is designed for
+    convolutional neural networks, and is usually applied to the last
+    convolutional layer.
+
+    Note that if multiple input tensors are provided, attributions for
+    each input tensor are computed by upsampling the GradCAM
+    attributions to match that input's dimensions. If interpolation is
+    not possible for the input tensor dimensions and interpolation mode,
+    then an empty tensor is returned in the attributions for the
+    corresponding position of that input tensor. This can occur if the
+    input tensor does not have the same number of dimensions as the chosen
+    layer's output or is not 3D, 4D or 5D.
+
+    Note that attributions are only meaningful for input tensors
+    which are spatially aligned with the chosen layer, e.g. an input
+    image tensor for a convolutional layer.
+
+    More details regarding GuidedGradCAM can be found in the original
+    GradCAM paper here:
+    https://arxiv.org/pdf/1610.02391.pdf
+
+    Warning: Ensure that all ReLU operations in the forward function of the
+    given model are performed using a module (nn.ReLU).
+    If nn.functional.relu is used, gradients are not overridden appropriately.
+    """
+
     def __init__(
         self, model: Module, layer: Module, device_ids: Union[None, List[int]] = None
     ) -> None:
@@ -44,137 +75,108 @@ def attribute(
         attribute_to_layer_input: bool = False,
     ) -> TensorOrTupleOfTensorsGeneric:
         r"""
-            Computes element-wise product of guided backpropagation attributions
-            with upsampled (non-negative) GradCAM attributions.
-            GradCAM attributions are computed with respect to the layer
-            provided in the constructor, and attributions
-            are upsampled to match the input size. GradCAM is designed for
-            convolutional neural networks, and is usually applied to the last
-            convolutional layer.
-
-            Note that if multiple input tensors are provided, attributions for
-            each input tensor are computed by upsampling the GradCAM
-            attributions to match that input's dimensions. If interpolation is
-            not possible for the input tensor dimensions and interpolation mode,
-            then an empty tensor is returned in the attributions for the
-            corresponding position of that input tensor. This can occur if the
-            input tensor does not have the same number of dimensions as the chosen
-            layer's output or is not either 3D, 4D or 5D.
-
-            Note that attributions are only meaningful for input tensors
-            which are spatially alligned with the chosen layer, e.g. an input
-            image tensor for a convolutional layer.
-
-            More details regarding GuidedGradCAM can be found in the original
-            GradCAM paper here:
-            https://arxiv.org/pdf/1610.02391.pdf
-
-            Warning: Ensure that all ReLU operations in the forward function of the
-            given model are performed using a module (nn.module.ReLU).
-            If nn.functional.ReLU is used, gradients are not overridden appropriately.
-
-            Args:
-
-                inputs (tensor or tuple of tensors):  Input for which attributions
-                            are computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                target (int, tuple, tensor or list, optional):  Output indices for
-                            which gradients are computed (for classification cases,
-                            this is usually the target class).
-                            If the network returns a scalar value per example,
-                            no target index is necessary.
-                            For general 2D outputs, targets can be either:
-
-                            - a single integer or a tensor containing a single
-                                integer, which is applied to all input examples
-
-                            - a list of integers or a 1D tensor, with length matching
-                                the number of examples in inputs (dim 0). Each integer
-                                is applied as the target for the corresponding example.
-
-                            For outputs with > 2 dimensions, targets can be either:
-
-                            - A single tuple, which contains #output_dims - 1
-                                elements. This target index is applied to all examples.
-
-                            - A list of tuples with length equal to the number of
-                                examples in inputs (dim 0), and each tuple containing
-                                #output_dims - 1 elements. Each tuple is applied as the
-                                target for the corresponding example.
-
-                            Default: None
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                interpolate_mode (str, optional): Method for interpolation, which
-                            must be a valid input interpolation mode for
-                            torch.nn.functional. These methods are
-                            "nearest", "area", "linear" (3D-only), "bilinear"
-                            (4D-only), "bicubic" (4D-only), "trilinear" (5D-only)
-                            based on the number of dimensions of the chosen layer
-                            output (which must also match the number of
-                            dimensions for the input tensor). Note that
-                            the original GradCAM paper uses "bilinear"
-                            interpolation, but we default to "nearest" for
-                            applicability to any of 3D, 4D or 5D tensors.
-                            Default: "nearest"
-                attribute_to_layer_input (bool, optional): Indicates whether to
-                            compute the attribution with respect to the layer input
-                            or output in `LayerGradCam`.
-                            If `attribute_to_layer_input` is set to True
-                            then the attributions will be computed with respect to
-                            layer inputs, otherwise it will be computed with respect
-                            to layer outputs.
-                            Note that currently it is assumed that either the input
-                            or the output of internal layer, depending on whether we
-                            attribute to the input or output, is a single tensor.
-                            Support for multiple tensors will be added later.
-                            Default: False
-
-            Returns:
-                *tensor* of **attributions**:
-                - **attributions** (*tensor*):
-                        Element-wise product of (upsampled) GradCAM
-                        and Guided Backprop attributions.
-                        If a single tensor is provided as inputs, a single tensor is
-                        returned. If a tuple is provided for inputs, a tuple of
-                        corresponding sized tensors is returned.
-                        Attributions will be the same size as the provided inputs,
-                        with each value providing the attribution of the
-                        corresponding input index.
-                        If the GradCAM attributions cannot be upsampled to the shape
-                        of a given input tensor, None is returned in the corresponding
-                        index position.
-
-
-            Examples::
-
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains an attribute conv4, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx50x8x8.
-                >>> # It is the last convolution layer, which is the recommended
-                >>> # use case for GuidedGradCAM.
-                >>> net = ImageClassifier()
-                >>> guided_gc = GuidedGradCam(net, net.conv4)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # Computes guided GradCAM attributions for class 3.
-                >>> # attribution size matches input size, Nx3x32x32
-                >>> attribution = guided_gc.attribute(input, 3)
+        Args:
+
+            inputs (tensor or tuple of tensors):  Input for which attributions
+                        are computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            target (int, tuple, tensor or list, optional):  Output indices for
+                        which gradients are computed (for classification cases,
+                        this is usually the target class).
+                        If the network returns a scalar value per example,
+                        no target index is necessary.
+                        For general 2D outputs, targets can be either:
+
+                        - a single integer or a tensor containing a single
+                            integer, which is applied to all input examples
+
+                        - a list of integers or a 1D tensor, with length matching
+                            the number of examples in inputs (dim 0). Each integer
+                            is applied as the target for the corresponding example.
+
+                        For outputs with > 2 dimensions, targets can be either:
+
+                        - A single tuple, which contains #output_dims - 1
+                            elements. This target index is applied to all examples.
+
+                        - A list of tuples with length equal to the number of
+                            examples in inputs (dim 0), and each tuple containing
+                            #output_dims - 1 elements. Each tuple is applied as the
+                            target for the corresponding example.
+
+                        Default: None
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            interpolate_mode (str, optional): Method for interpolation, which
+                        must be a valid input interpolation mode for
+                        torch.nn.functional.interpolate. These methods are
+                        "nearest", "area", "linear" (3D-only), "bilinear"
+                        (4D-only), "bicubic" (4D-only), "trilinear" (5D-only)
+                        based on the number of dimensions of the chosen layer
+                        output (which must also match the number of
+                        dimensions for the input tensor). Note that
+                        the original GradCAM paper uses "bilinear"
+                        interpolation, but we default to "nearest" for
+                        applicability to any of 3D, 4D or 5D tensors.
+                        Default: "nearest"
+            attribute_to_layer_input (bool, optional): Indicates whether to
+                        compute the attribution with respect to the layer input
+                        or output in `LayerGradCam`.
+                        If `attribute_to_layer_input` is set to True
+                        then the attributions will be computed with respect to
+                        layer inputs, otherwise it will be computed with respect
+                        to layer outputs.
+                        Note that currently it is assumed that either the input
+                        or the output of internal layer, depending on whether we
+                        attribute to the input or output, is a single tensor.
+                        Support for multiple tensors will be added later.
+                        Default: False
+
+        Returns:
+            *tensor* of **attributions**:
+            - **attributions** (*tensor*):
+                    Element-wise product of (upsampled) GradCAM
+                    and Guided Backprop attributions.
+                    If a single tensor is provided as inputs, a single tensor is
+                    returned. If a tuple is provided for inputs, a tuple of
+                    corresponding sized tensors is returned.
+                    Attributions will be the same size as the provided inputs,
+                    with each value providing the attribution of the
+                    corresponding input index.
+                    If the GradCAM attributions cannot be upsampled to the shape
+                    of a given input tensor, None is returned in the corresponding
+                    index position.
+
+
+        Examples::
+
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains an attribute conv4, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx50x8x8.
+            >>> # It is the last convolution layer, which is the recommended
+            >>> # use case for GuidedGradCAM.
+            >>> net = ImageClassifier()
+            >>> guided_gc = GuidedGradCam(net, net.conv4)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # Computes guided GradCAM attributions for class 3.
+            >>> # attribution size matches input size, Nx3x32x32
+            >>> attribution = guided_gc.attribute(input, 3)
         """
         is_inputs_tuple = _is_tuple(inputs)
         inputs = _format_input(inputs)
diff --git a/captum/attr/_core/input_x_gradient.py b/captum/attr/_core/input_x_gradient.py
index 30ca1fdd19..5f363b411c 100644
--- a/captum/attr/_core/input_x_gradient.py
+++ b/captum/attr/_core/input_x_gradient.py
@@ -8,6 +8,12 @@
 
 
 class InputXGradient(GradientAttribution):
+    r"""
+    A baseline approach for computing the attribution. It multiplies input with
+    the gradient with respect to input.
+    https://arxiv.org/abs/1611.07270
+    """
+
     def __init__(self, forward_func: Callable) -> None:
         r"""
         Args:
@@ -24,10 +30,6 @@ def attribute(
         additional_forward_args: Any = None,
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        A baseline approach for computing the attribution. It multiplies input with
-        the gradient with respect to input.
-        https://arxiv.org/abs/1611.07270
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which
diff --git a/captum/attr/_core/layer/grad_cam.py b/captum/attr/_core/layer/grad_cam.py
index 4ed1a99cb1..3bdb93fdcc 100644
--- a/captum/attr/_core/layer/grad_cam.py
+++ b/captum/attr/_core/layer/grad_cam.py
@@ -20,6 +20,39 @@
 
 
 class LayerGradCam(LayerAttribution, GradientAttribution):
+    r"""
+    Computes GradCAM attribution for chosen layer. GradCAM is designed for
+    convolutional neural networks, and is usually applied to the last
+    convolutional layer.
+
+    GradCAM computes the gradients of the target output with respect to
+    the given layer, averages for each output channel (dimension 1 of
+    output), and multiplies the average gradient for each channel by the
+    layer activations. The results are summed over all channels.
+
+    Note that in the original GradCAM algorithm described in the paper,
+    ReLU is applied to the output, returning only non-negative attributions.
+    For providing more flexibility to the user, we choose to not perform the
+    ReLU internally by default and return the sign information. To match the
+    original GradCAM algorithm, it is necessary to pass the parameter
+    relu_attributions=True to apply ReLU on the final
+    attributions or alternatively only visualize the positive attributions.
+
+    Note: this procedure sums over the second dimension (# of channels),
+    so the output of GradCAM attributions will have a second
+    dimension of 1, but all other dimensions will match that of the layer
+    output.
+
+    GradCAM attributions are generally upsampled and can be viewed as a
+    mask to the input, since a convolutional layer output generally
+    matches the input image spatially. This upsampling can be performed
+    using LayerAttribution.interpolate, as shown in the example below.
+
+    More details regarding the GradCAM method can be found in the
+    original paper here:
+    https://arxiv.org/pdf/1610.02391.pdf
+    """
+
     def __init__(
         self,
         forward_func: Callable,
@@ -53,135 +86,104 @@ def attribute(
         relu_attributions: bool = False,
     ) -> Union[Tensor, Tuple[Tensor, ...]]:
         r"""
-            Computes GradCAM attribution for chosen layer. GradCAM is designed for
-            convolutional neural networks, and is usually applied to the last
-            convolutional layer.
-
-            GradCAM computes the gradients of the target output with respect to
-            the given layer, averages for each output channel (dimension 2 of
-            output), and multiplies the average gradient for each channel by the
-            layer activations. The results are summed over all channels.
-
-            Note that in the original GradCAM algorithm described in the paper,
-            ReLU is applied to the output, returning only non-negative attributions.
-            For providing more flexibility to the user, we choose to not perform the
-            ReLU internally by default and return the sign information. To match the
-            original GradCAM algorithm, it is necessary to pass the parameter
-            relu_attributions=True to apply ReLU on the final
-            attributions or alternatively only visualize the positive attributions.
-
-            Note: this procedure sums over the second dimension (# of channels),
-            so the output of GradCAM attributions will have a second
-            dimension of 1, but all other dimensions will match that of the layer
-            output.
-
-            GradCAM attributions are generally upsampled and can be viewed as a
-            mask to the input, since a convolutional layer output generally
-            matches the input image spatially. This upsampling can be performed
-            using LayerAttribution.interpolate, as shown in the example below.
-
-            More details regarding the GradCAM method can be found in the
-            original paper here:
-            https://arxiv.org/pdf/1610.02391.pdf
-
-            Args:
-
-                inputs (tensor or tuple of tensors):  Input for which attributions
-                            are computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                target (int, tuple, tensor or list, optional):  Output indices for
-                            which gradients are computed (for classification cases,
-                            this is usually the target class).
-                            If the network returns a scalar value per example,
-                            no target index is necessary.
-                            For general 2D outputs, targets can be either:
-
-                            - a single integer or a tensor containing a single
-                                integer, which is applied to all input examples
-
-                            - a list of integers or a 1D tensor, with length matching
-                                the number of examples in inputs (dim 0). Each integer
-                                is applied as the target for the corresponding example.
-
-                            For outputs with > 2 dimensions, targets can be either:
-
-                            - A single tuple, which contains #output_dims - 1
-                                elements. This target index is applied to all examples.
-
-                            - A list of tuples with length equal to the number of
-                                examples in inputs (dim 0), and each tuple containing
-                                #output_dims - 1 elements. Each tuple is applied as the
-                                target for the corresponding example.
-
-                            Default: None
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                attribute_to_layer_input (bool, optional): Indicates whether to
-                            compute the attributions with respect to the layer input
-                            or output. If `attribute_to_layer_input` is set to True
-                            then the attributions will be computed with respect to the
-                            layer input, otherwise it will be computed with respect
-                            to layer output.
-                            Note that currently it is assumed that either the input
-                            or the outputs of internal layers, depending on whether we
-                            attribute to the input or output, are single tensors.
-                            Support for multiple tensors will be added later.
-                            Default: False
-                relu_attributions (bool, optional): Indicates whether to
-                            apply a ReLU operation on the final attribution,
-                            returning only non-negative attributions. Setting this
-                            flag to True matches the original GradCAM algorithm,
-                            otherwise, by default, both positive and negative
-                            attributions are returned.
-                            Default: False
-
-            Returns:
-                *tensor* or tuple of *tensors* of **attributions**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Attributions based on GradCAM method.
-                            Attributions will be the same size as the
-                            output of the given layer, except for dimension 2,
-                            which will be 1 due to summing over channels.
-                            Attributions are returned in a tuple based on whether
-                            the layer inputs / outputs are contained in a tuple
-                            from a forward hook. For standard modules, inputs of
-                            a single tensor are usually wrapped in a tuple, while
-                            outputs of a single tensor are not.
-            Examples::
-
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains a layer conv4, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx50x8x8.
-                >>> # It is the last convolution layer, which is the recommended
-                >>> # use case for GradCAM.
-                >>> net = ImageClassifier()
-                >>> layer_gc = LayerGradCam(net, net.conv4)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # Computes layer GradCAM for class 3.
-                >>> # attribution size matches layer output except for dimension
-                >>> # 1, so dimensions of attr would be Nx1x8x8.
-                >>> attr = layer_gc.attribute(input, 3)
-                >>> # GradCAM attributions are often upsampled and viewed as a
-                >>> # mask to the input, since the convolutional layer output
-                >>> # spatially matches the original input image.
-                >>> # This can be done with LayerAttribution's interpolate method.
-                >>> upsampled_attr = LayerAttribution.interpolate(attr, (32, 32))
+        Args:
+
+            inputs (tensor or tuple of tensors):  Input for which attributions
+                        are computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            target (int, tuple, tensor or list, optional):  Output indices for
+                        which gradients are computed (for classification cases,
+                        this is usually the target class).
+                        If the network returns a scalar value per example,
+                        no target index is necessary.
+                        For general 2D outputs, targets can be either:
+
+                        - a single integer or a tensor containing a single
+                            integer, which is applied to all input examples
+
+                        - a list of integers or a 1D tensor, with length matching
+                            the number of examples in inputs (dim 0). Each integer
+                            is applied as the target for the corresponding example.
+
+                        For outputs with > 2 dimensions, targets can be either:
+
+                        - A single tuple, which contains #output_dims - 1
+                            elements. This target index is applied to all examples.
+
+                        - A list of tuples with length equal to the number of
+                            examples in inputs (dim 0), and each tuple containing
+                            #output_dims - 1 elements. Each tuple is applied as the
+                            target for the corresponding example.
+
+                        Default: None
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            attribute_to_layer_input (bool, optional): Indicates whether to
+                        compute the attributions with respect to the layer input
+                        or output. If `attribute_to_layer_input` is set to True
+                        then the attributions will be computed with respect to the
+                        layer input, otherwise it will be computed with respect
+                        to layer output.
+                        Note that currently it is assumed that either the input
+                        or the outputs of internal layers, depending on whether we
+                        attribute to the input or output, are single tensors.
+                        Support for multiple tensors will be added later.
+                        Default: False
+            relu_attributions (bool, optional): Indicates whether to
+                        apply a ReLU operation on the final attribution,
+                        returning only non-negative attributions. Setting this
+                        flag to True matches the original GradCAM algorithm,
+                        otherwise, by default, both positive and negative
+                        attributions are returned.
+                        Default: False
+
+        Returns:
+            *tensor* or tuple of *tensors* of **attributions**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Attributions based on GradCAM method.
+                        Attributions will be the same size as the
+                        output of the given layer, except for dimension 1,
+                        which will be 1 due to summing over channels.
+                        Attributions are returned in a tuple based on whether
+                        the layer inputs / outputs are contained in a tuple
+                        from a forward hook. For standard modules, inputs of
+                        a single tensor are usually wrapped in a tuple, while
+                        outputs of a single tensor are not.
+        Examples::
+
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains a layer conv4, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx50x8x8.
+            >>> # It is the last convolution layer, which is the recommended
+            >>> # use case for GradCAM.
+            >>> net = ImageClassifier()
+            >>> layer_gc = LayerGradCam(net, net.conv4)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # Computes layer GradCAM for class 3.
+            >>> # attribution size matches layer output except for dimension
+            >>> # 1, so dimensions of attr would be Nx1x8x8.
+            >>> attr = layer_gc.attribute(input, 3)
+            >>> # GradCAM attributions are often upsampled and viewed as a
+            >>> # mask to the input, since the convolutional layer output
+            >>> # spatially matches the original input image.
+            >>> # This can be done with LayerAttribution's interpolate method.
+            >>> upsampled_attr = LayerAttribution.interpolate(attr, (32, 32))
         """
         inputs = _format_input(inputs)
         additional_forward_args = _format_additional_forward_args(
diff --git a/captum/attr/_core/layer/internal_influence.py b/captum/attr/_core/layer/internal_influence.py
index 374b37373e..b5727b1e51 100644
--- a/captum/attr/_core/layer/internal_influence.py
+++ b/captum/attr/_core/layer/internal_influence.py
@@ -22,6 +22,19 @@
 
 
 class InternalInfluence(LayerAttribution, GradientAttribution):
+    r"""
+    Computes internal influence by approximating the integral of gradients
+    for a particular layer along the path from a baseline input to the
+    given input.
+    If no baseline is provided, the default baseline is the zero tensor.
+    More details on this approach can be found here:
+    https://arxiv.org/pdf/1802.03788.pdf
+
+    Note that this method is similar to applying integrated gradients and
+    taking the layer as input, integrating the gradient of the layer with
+    respect to the output.
+    """
+
     def __init__(
         self,
         forward_func: Callable,
@@ -60,150 +73,139 @@ def attribute(
         attribute_to_layer_input: bool = False,
     ) -> Union[Tensor, Tuple[Tensor, ...]]:
         r"""
-            Computes internal influence by approximating the integral of gradients
-            for a particular layer along the path from a baseline input to the
-            given input.
-            If no baseline is provided, the default baseline is the zero tensor.
-            More details on this approach can be found here:
-            https://arxiv.org/pdf/1802.03788.pdf
-
-            Note that this method is similar to applying integrated gradients and
-            taking the layer as input, integrating the gradient of the layer with
-            respect to the output.
-
-            Args:
+        Args:
 
-                inputs (tensor or tuple of tensors):  Input for which internal
-                            influence is computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                baselines scalar, tensor, tuple of scalars or tensors, optional):
-                            Baselines define a starting point from which integral
-                            is computed and can be provided as:
+            inputs (tensor or tuple of tensors):  Input for which internal
+                        influence is computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            baselines (scalar, tensor, tuple of scalars or tensors, optional):
+                        Baselines define a starting point from which integral
+                        is computed and can be provided as:
 
-                            - a single tensor, if inputs is a single tensor, with
-                                exactly the same dimensions as inputs or the first
-                                dimension is one and the remaining dimensions match
-                                with inputs.
+                        - a single tensor, if inputs is a single tensor, with
+                            exactly the same dimensions as inputs or the first
+                            dimension is one and the remaining dimensions match
+                            with inputs.
 
-                            - a single scalar, if inputs is a single tensor, which will
-                                be broadcasted for each input value in input tensor.
+                        - a single scalar, if inputs is a single tensor, which will
+                            be broadcasted for each input value in input tensor.
 
-                            - a tuple of tensors or scalars, the baseline corresponding
-                                to each tensor in the inputs' tuple can be:
-                                - either a tensor with matching dimensions to
-                                    corresponding tensor in the inputs' tuple
-                                    or the first dimension is one and the remaining
-                                    dimensions match with the corresponding
-                                    input tensor.
-                                - or a scalar, corresponding to a tensor in the
-                                    inputs' tuple. This scalar value is broadcasted
-                                    for corresponding input tensor.
+                        - a tuple of tensors or scalars, the baseline corresponding
+                            to each tensor in the inputs' tuple can be:
+                            - either a tensor with matching dimensions to
+                                corresponding tensor in the inputs' tuple
+                                or the first dimension is one and the remaining
+                                dimensions match with the corresponding
+                                input tensor.
+                            - or a scalar, corresponding to a tensor in the
+                                inputs' tuple. This scalar value is broadcasted
+                                for corresponding input tensor.
 
-                            In the cases when `baselines` is not provided, we internally
-                            use zero scalar corresponding to each input tensor.
+                        In the cases when `baselines` is not provided, we internally
+                        use zero scalar corresponding to each input tensor.
 
-                            Default: None
-                target (int, tuple, tensor or list, optional):  Output indices for
-                            which gradients are computed (for classification cases,
-                            this is usually the target class).
-                            If the network returns a scalar value per example,
-                            no target index is necessary.
-                            For general 2D outputs, targets can be either:
+                        Default: None
+            target (int, tuple, tensor or list, optional):  Output indices for
+                        which gradients are computed (for classification cases,
+                        this is usually the target class).
+                        If the network returns a scalar value per example,
+                        no target index is necessary.
+                        For general 2D outputs, targets can be either:
 
-                            - a single integer or a tensor containing a single
-                                integer, which is applied to all input examples
+                        - a single integer or a tensor containing a single
+                            integer, which is applied to all input examples
 
-                            - a list of integers or a 1D tensor, with length matching
-                                the number of examples in inputs (dim 0). Each integer
-                                is applied as the target for the corresponding example.
+                        - a list of integers or a 1D tensor, with length matching
+                            the number of examples in inputs (dim 0). Each integer
+                            is applied as the target for the corresponding example.
 
-                            For outputs with > 2 dimensions, targets can be either:
+                        For outputs with > 2 dimensions, targets can be either:
 
-                            - A single tuple, which contains #output_dims - 1
-                                elements. This target index is applied to all examples.
+                        - A single tuple, which contains #output_dims - 1
+                            elements. This target index is applied to all examples.
 
-                            - A list of tuples with length equal to the number of
-                                examples in inputs (dim 0), and each tuple containing
-                                #output_dims - 1 elements. Each tuple is applied as the
-                                target for the corresponding example.
+                        - A list of tuples with length equal to the number of
+                            examples in inputs (dim 0), and each tuple containing
+                            #output_dims - 1 elements. Each tuple is applied as the
+                            target for the corresponding example.
 
-                            Default: None
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            For a tensor, the first dimension of the tensor must
-                            correspond to the number of examples. It will be
-                            repeated for each of `n_steps` along the integrated
-                            path. For all other types, the given argument is used
-                            for all forward evaluations.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                n_steps (int, optional): The number of steps used by the approximation
-                            method. Default: 50.
-                method (string, optional): Method for approximating the integral,
-                            one of `riemann_right`, `riemann_left`, `riemann_middle`,
-                            `riemann_trapezoid` or `gausslegendre`.
-                            Default: `gausslegendre` if no method is provided.
-                internal_batch_size (int, optional): Divides total #steps * #examples
-                            data points into chunks of size internal_batch_size,
-                            which are computed (forward / backward passes)
-                            sequentially.
-                            For DataParallel models, each batch is split among the
-                            available devices, so evaluations on each available
-                            device contain internal_batch_size / num_devices examples.
-                            If internal_batch_size is None, then all evaluations
-                            are processed in one batch.
-                            Default: None
-                attribute_to_layer_input (bool, optional): Indicates whether to
-                            compute the attribution with respect to the layer input
-                            or output. If `attribute_to_layer_input` is set to True
-                            then the attributions will be computed with respect to
-                            layer inputs, otherwise it will be computed with respect
-                            to layer outputs.
-                            Note that currently it is assumed that either the input
-                            or the output of internal layer, depending on whether we
-                            attribute to the input or output, is a single tensor.
-                            Support for multiple tensors will be added later.
-                            Default: False
+                        Default: None
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        For a tensor, the first dimension of the tensor must
+                        correspond to the number of examples. It will be
+                        repeated for each of `n_steps` along the integrated
+                        path. For all other types, the given argument is used
+                        for all forward evaluations.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            n_steps (int, optional): The number of steps used by the approximation
+                        method. Default: 50.
+            method (string, optional): Method for approximating the integral,
+                        one of `riemann_right`, `riemann_left`, `riemann_middle`,
+                        `riemann_trapezoid` or `gausslegendre`.
+                        Default: `gausslegendre` if no method is provided.
+            internal_batch_size (int, optional): Divides total #steps * #examples
+                        data points into chunks of size internal_batch_size,
+                        which are computed (forward / backward passes)
+                        sequentially.
+                        For DataParallel models, each batch is split among the
+                        available devices, so evaluations on each available
+                        device contain internal_batch_size / num_devices examples.
+                        If internal_batch_size is None, then all evaluations
+                        are processed in one batch.
+                        Default: None
+            attribute_to_layer_input (bool, optional): Indicates whether to
+                        compute the attribution with respect to the layer input
+                        or output. If `attribute_to_layer_input` is set to True
+                        then the attributions will be computed with respect to
+                        layer inputs, otherwise it will be computed with respect
+                        to layer outputs.
+                        Note that currently it is assumed that either the input
+                        or the output of internal layer, depending on whether we
+                        attribute to the input or output, is a single tensor.
+                        Support for multiple tensors will be added later.
+                        Default: False
 
-            Returns:
-                *tensor* or tuple of *tensors* of **attributions**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Internal influence of each neuron in given
-                            layer output. Attributions will always be the same size
-                            as the output or input of the given layer depending on
-                            whether `attribute_to_layer_input` is set to `False` or
-                            `True`respectively.
-                            Attributions are returned in a tuple based on whether
-                            the layer inputs / outputs are contained in a tuple
-                            from a forward hook. For standard modules, inputs of
-                            a single tensor are usually wrapped in a tuple, while
-                            outputs of a single tensor are not.
+        Returns:
+            *tensor* or tuple of *tensors* of **attributions**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Internal influence of each neuron in given
+                        layer output. Attributions will always be the same size
+                        as the output or input of the given layer depending on
+                        whether `attribute_to_layer_input` is set to `False` or
+                        `True` respectively.
+                        Attributions are returned in a tuple based on whether
+                        the layer inputs / outputs are contained in a tuple
+                        from a forward hook. For standard modules, inputs of
+                        a single tensor are usually wrapped in a tuple, while
+                        outputs of a single tensor are not.
 
-            Examples::
+        Examples::
 
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains an attribute conv1, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx12x32x32.
-                >>> net = ImageClassifier()
-                >>> layer_int_inf = InternalInfluence(net, net.conv1)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # Computes layer internal influence.
-                >>> # attribution size matches layer output, Nx12x32x32
-                >>> attribution = layer_int_inf.attribute(input)
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains an attribute conv1, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx12x32x32.
+            >>> net = ImageClassifier()
+            >>> layer_int_inf = InternalInfluence(net, net.conv1)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # Computes layer internal influence.
+            >>> # attribution size matches layer output, Nx12x32x32
+            >>> attribution = layer_int_inf.attribute(input)
         """
         inputs, baselines = _format_input_baseline(inputs, baselines)
         _validate_input(inputs, baselines, n_steps, method)
diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py
index 30729dc0a1..95c78fe3a4 100644
--- a/captum/attr/_core/layer/layer_activation.py
+++ b/captum/attr/_core/layer/layer_activation.py
@@ -11,6 +11,10 @@
 
 
 class LayerActivation(LayerAttribution):
+    r"""
+    Computes activation of selected layer for given input.
+    """
+
     def __init__(
         self,
         forward_func: Callable,
@@ -43,68 +47,66 @@ def attribute(
         attribute_to_layer_input: bool = False,
     ) -> Union[Tensor, Tuple[Tensor, ...]]:
         r"""
-            Computes activation of selected layer for given input.
-
-            Args:
+        Args:
 
-                inputs (tensor or tuple of tensors):  Input for which layer
-                            activation is computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                attribute_to_layer_input (bool, optional): Indicates whether to
-                            compute the attribution with respect to the layer input
-                            or output. If `attribute_to_layer_input` is set to True
-                            then the attributions will be computed with respect to
-                            layer input, otherwise it will be computed with respect
-                            to layer output.
-                            Note that currently it is assumed that either the input
-                            or the output of internal layer, depending on whether we
-                            attribute to the input or output, is a single tensor.
-                            Support for multiple tensors will be added later.
-                            Default: False
+            inputs (tensor or tuple of tensors):  Input for which layer
+                        activation is computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            attribute_to_layer_input (bool, optional): Indicates whether to
+                        compute the attribution with respect to the layer input
+                        or output. If `attribute_to_layer_input` is set to True
+                        then the attributions will be computed with respect to
+                        layer input, otherwise it will be computed with respect
+                        to layer output.
+                        Note that currently it is assumed that either the input
+                        or the output of internal layer, depending on whether we
+                        attribute to the input or output, is a single tensor.
+                        Support for multiple tensors will be added later.
+                        Default: False
 
-            Returns:
-                *tensor* or tuple of *tensors* of **attributions**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Activation of each neuron in given layer output.
-                            Attributions will always be the same size as the
-                            output of the given layer.
-                            Attributions are returned in a tuple based on whether
-                            the layer inputs / outputs are contained in a tuple
-                            from a forward hook. For standard modules, inputs of
-                            a single tensor are usually wrapped in a tuple, while
-                            outputs of a single tensor are not.
+        Returns:
+            *tensor* or tuple of *tensors* of **attributions**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Activation of each neuron in given layer output.
+                        Attributions will always be the same size as the
+                        output of the given layer.
+                        Attributions are returned in a tuple based on whether
+                        the layer inputs / outputs are contained in a tuple
+                        from a forward hook. For standard modules, inputs of
+                        a single tensor are usually wrapped in a tuple, while
+                        outputs of a single tensor are not.
 
 
 
-            Examples::
+        Examples::
 
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains an attribute conv1, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx12x32x32.
-                >>> net = ImageClassifier()
-                >>> layer_act = LayerActivation(net, net.conv1)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # Computes layer activation.
-                >>> # attribution is layer output, with size Nx12x32x32
-                >>> attribution = layer_cond.attribute(input)
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains an attribute conv1, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx12x32x32.
+            >>> net = ImageClassifier()
+            >>> layer_act = LayerActivation(net, net.conv1)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # Computes layer activation.
+            >>> # attribution is layer output, with size Nx12x32x32
+            >>> attribution = layer_act.attribute(input)
         """
         with torch.no_grad():
             layer_eval, is_layer_tuple = _forward_layer_eval(
diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py
index 40a3c74360..b85cf620f2 100644
--- a/captum/attr/_core/layer/layer_conductance.py
+++ b/captum/attr/_core/layer/layer_conductance.py
@@ -21,6 +21,20 @@
 
 
 class LayerConductance(LayerAttribution, GradientAttribution):
+    r"""
+    Computes conductance with respect to the given layer. The
+    returned output is in the shape of the layer's output, showing the total
+    conductance of each hidden layer neuron.
+
+    The details of the approach can be found here:
+    https://arxiv.org/abs/1805.12233
+    https://arxiv.org/pdf/1807.09946.pdf
+
+    Note that this provides the total conductance of each neuron in the
+    layer's output. To obtain the breakdown of a neuron's conductance by input
+    features, utilize NeuronConductance instead, and provide the target
+    neuron index.
+    """
     def __init__(
         self,
         forward_func: Callable,
@@ -98,167 +112,154 @@ def attribute(
         Tensor, Tuple[Tensor, ...], Tuple[Union[Tensor, Tuple[Tensor, ...]], Tensor],
     ]:
         r"""
-            Computes conductance with respect to the given layer. The
-            returned output is in the shape of the layer's output, showing the total
-            conductance of each hidden layer neuron.
-
-            The details of the approach can be found here:
-            https://arxiv.org/abs/1805.12233
-            https://arxiv.org/pdf/1807.09946.pdf
-
-            Note that this provides the total conductance of each neuron in the
-            layer's output. To obtain the breakdown of a neuron's conductance by input
-            features, utilize NeuronConductance instead, and provide the target
-            neuron index.
-
-            Args:
-
-                inputs (tensor or tuple of tensors):  Input for which layer
-                            conductance is computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                baselines (scalar, tensor, tuple of scalars or tensors, optional):
-                            Baselines define the starting point from which integral
-                            is computed and can be provided as:
-
-                            - a single tensor, if inputs is a single tensor, with
-                                exactly the same dimensions as inputs or the first
-                                dimension is one and the remaining dimensions match
-                                with inputs.
-
-                            - a single scalar, if inputs is a single tensor, which will
-                                be broadcasted for each input value in input tensor.
-
-                            - a tuple of tensors or scalars, the baseline corresponding
-                                to each tensor in the inputs' tuple can be:
-                                - either a tensor with matching dimensions to
-                                    corresponding tensor in the inputs' tuple
-                                    or the first dimension is one and the remaining
-                                    dimensions match with the corresponding
-                                    input tensor.
-                                - or a scalar, corresponding to a tensor in the
-                                    inputs' tuple. This scalar value is broadcasted
-                                    for corresponding input tensor.
-
-                            In the cases when `baselines` is not provided, we internally
-                            use zero scalar corresponding to each input tensor.
-
-                            Default: None
-                target (int, tuple, tensor or list, optional):  Output indices for
-                            which gradients are computed (for classification cases,
-                            this is usually the target class).
-                            If the network returns a scalar value per example,
-                            no target index is necessary.
-                            For general 2D outputs, targets can be either:
-
-                            - a single integer or a tensor containing a single
-                                integer, which is applied to all input examples
-
-                            - a list of integers or a 1D tensor, with length matching
-                                the number of examples in inputs (dim 0). Each integer
-                                is applied as the target for the corresponding example.
-
-                            For outputs with > 2 dimensions, targets can be either:
-
-                            - A single tuple, which contains #output_dims - 1
-                                elements. This target index is applied to all examples.
-
-                            - A list of tuples with length equal to the number of
-                                examples in inputs (dim 0), and each tuple containing
-                                #output_dims - 1 elements. Each tuple is applied as the
-                                target for the corresponding example.
-
-                            Default: None
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            For a tensor, the first dimension of the tensor must
-                            correspond to the number of examples. It will be repeated
-                            for each of `n_steps` along the integrated path.
-                            For all other types, the given argument is used for
-                            all forward evaluations.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                n_steps (int, optional): The number of steps used by the approximation
-                            method. Default: 50.
-                method (string, optional): Method for approximating the integral,
-                            one of `riemann_right`, `riemann_left`, `riemann_middle`,
-                            `riemann_trapezoid` or `gausslegendre`.
-                            Default: `gausslegendre` if no method is provided.
-                internal_batch_size (int, optional): Divides total #steps * #examples
-                            data points into chunks of size internal_batch_size,
-                            which are computed (forward / backward passes)
-                            sequentially.
-                            For DataParallel models, each batch is split among the
-                            available devices, so evaluations on each available
-                            device contain internal_batch_size / num_devices examples.
-                            If internal_batch_size is None, then all evaluations are
-                            processed in one batch.
-                            Default: None
-                return_convergence_delta (bool, optional): Indicates whether to return
-                            convergence delta or not. If `return_convergence_delta`
-                            is set to True convergence delta will be returned in
-                            a tuple following attributions.
-                            Default: False
-                attribute_to_layer_input (bool, optional): Indicates whether to
-                            compute the attribution with respect to the layer input
-                            or output. If `attribute_to_layer_input` is set to True
-                            then the attributions will be computed with respect to
-                            layer inputs, otherwise it will be computed with respect
-                            to layer outputs.
-                            Note that currently it is assumed that either the input
-                            or the output of internal layer, depending on whether we
-                            attribute to the input or output, is a single tensor.
-                            Support for multiple tensors will be added later.
-                            Default: False
-
-            Returns:
-                **attributions** or 2-element tuple of **attributions**, **delta**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Conductance of each neuron in given layer input or
-                            output. Attributions will always be the same size as
-                            the input or output of the given layer, depending on
-                            whether we attribute to the inputs or outputs
-                            of the layer which is decided by the input flag
-                            `attribute_to_layer_input`.
-                            Attributions are returned in a tuple based on whether
-                            the layer inputs / outputs are contained in a tuple
-                            from a forward hook. For standard modules, inputs of
-                            a single tensor are usually wrapped in a tuple, while
-                            outputs of a single tensor are not.
-                - **delta** (*tensor*, returned if return_convergence_delta=True):
-                            The difference between the total
-                            approximated and true conductance.
-                            This is computed using the property that the total sum of
-                            forward_func(inputs) - forward_func(baselines) must equal
-                            the total sum of the attributions.
-                            Delta is calculated per example, meaning that the number of
-                            elements in returned delta tensor is equal to the number of
-                            of examples in inputs.
-
-            Examples::
-
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains an attribute conv1, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx12x32x32.
-                >>> net = ImageClassifier()
-                >>> layer_cond = LayerConductance(net, net.conv1)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # Computes layer conductance for class 3.
-                >>> # attribution size matches layer output, Nx12x32x32
-                >>> attribution = layer_cond.attribute(input, target=3)
+        Args:
+
+            inputs (tensor or tuple of tensors):  Input for which layer
+                        conductance is computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            baselines (scalar, tensor, tuple of scalars or tensors, optional):
+                        Baselines define the starting point from which integral
+                        is computed and can be provided as:
+
+                        - a single tensor, if inputs is a single tensor, with
+                            exactly the same dimensions as inputs or the first
+                            dimension is one and the remaining dimensions match
+                            with inputs.
+
+                        - a single scalar, if inputs is a single tensor, which will
+                            be broadcasted for each input value in input tensor.
+
+                        - a tuple of tensors or scalars, the baseline corresponding
+                            to each tensor in the inputs' tuple can be:
+                            - either a tensor with matching dimensions to
+                                corresponding tensor in the inputs' tuple
+                                or the first dimension is one and the remaining
+                                dimensions match with the corresponding
+                                input tensor.
+                            - or a scalar, corresponding to a tensor in the
+                                inputs' tuple. This scalar value is broadcasted
+                                for corresponding input tensor.
+
+                        In the cases when `baselines` is not provided, we internally
+                        use zero scalar corresponding to each input tensor.
+
+                        Default: None
+            target (int, tuple, tensor or list, optional):  Output indices for
+                        which gradients are computed (for classification cases,
+                        this is usually the target class).
+                        If the network returns a scalar value per example,
+                        no target index is necessary.
+                        For general 2D outputs, targets can be either:
+
+                        - a single integer or a tensor containing a single
+                            integer, which is applied to all input examples
+
+                        - a list of integers or a 1D tensor, with length matching
+                            the number of examples in inputs (dim 0). Each integer
+                            is applied as the target for the corresponding example.
+
+                        For outputs with > 2 dimensions, targets can be either:
+
+                        - A single tuple, which contains #output_dims - 1
+                            elements. This target index is applied to all examples.
+
+                        - A list of tuples with length equal to the number of
+                            examples in inputs (dim 0), and each tuple containing
+                            #output_dims - 1 elements. Each tuple is applied as the
+                            target for the corresponding example.
+
+                        Default: None
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        For a tensor, the first dimension of the tensor must
+                        correspond to the number of examples. It will be repeated
+                        for each of `n_steps` along the integrated path.
+                        For all other types, the given argument is used for
+                        all forward evaluations.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            n_steps (int, optional): The number of steps used by the approximation
+                        method. Default: 50.
+            method (string, optional): Method for approximating the integral,
+                        one of `riemann_right`, `riemann_left`, `riemann_middle`,
+                        `riemann_trapezoid` or `gausslegendre`.
+                        Default: `gausslegendre` if no method is provided.
+            internal_batch_size (int, optional): Divides total #steps * #examples
+                        data points into chunks of size internal_batch_size,
+                        which are computed (forward / backward passes)
+                        sequentially.
+                        For DataParallel models, each batch is split among the
+                        available devices, so evaluations on each available
+                        device contain internal_batch_size / num_devices examples.
+                        If internal_batch_size is None, then all evaluations are
+                        processed in one batch.
+                        Default: None
+            return_convergence_delta (bool, optional): Indicates whether to return
+                        convergence delta or not. If `return_convergence_delta`
+                        is set to True convergence delta will be returned in
+                        a tuple following attributions.
+                        Default: False
+            attribute_to_layer_input (bool, optional): Indicates whether to
+                        compute the attribution with respect to the layer input
+                        or output. If `attribute_to_layer_input` is set to True
+                        then the attributions will be computed with respect to
+                        layer inputs, otherwise it will be computed with respect
+                        to layer outputs.
+                        Note that currently it is assumed that either the input
+                        or the output of internal layer, depending on whether we
+                        attribute to the input or output, is a single tensor.
+                        Support for multiple tensors will be added later.
+                        Default: False
+
+        Returns:
+            **attributions** or 2-element tuple of **attributions**, **delta**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Conductance of each neuron in given layer input or
+                        output. Attributions will always be the same size as
+                        the input or output of the given layer, depending on
+                        whether we attribute to the inputs or outputs
+                        of the layer which is decided by the input flag
+                        `attribute_to_layer_input`.
+                        Attributions are returned in a tuple based on whether
+                        the layer inputs / outputs are contained in a tuple
+                        from a forward hook. For standard modules, inputs of
+                        a single tensor are usually wrapped in a tuple, while
+                        outputs of a single tensor are not.
+            - **delta** (*tensor*, returned if return_convergence_delta=True):
+                        The difference between the total
+                        approximated and true conductance.
+                        This is computed using the property that the total sum of
+                        forward_func(inputs) - forward_func(baselines) must equal
+                        the total sum of the attributions.
+                        Delta is calculated per example, meaning that the number of
+                        elements in returned delta tensor is equal to the number of
+                        examples in inputs.
+
+        Examples::
+
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains an attribute conv1, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx12x32x32.
+            >>> net = ImageClassifier()
+            >>> layer_cond = LayerConductance(net, net.conv1)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # Computes layer conductance for class 3.
+            >>> # attribution size matches layer output, Nx12x32x32
+            >>> attribution = layer_cond.attribute(input, target=3)
         """
         inputs, baselines = _format_input_baseline(inputs, baselines)
         _validate_input(inputs, baselines, n_steps, method)
diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py
index 7cf9c839ca..c13b4610b3 100644
--- a/captum/attr/_core/layer/layer_deep_lift.py
+++ b/captum/attr/_core/layer/layer_deep_lift.py
@@ -43,6 +43,34 @@
 
 
 class LayerDeepLift(LayerAttribution, DeepLift):
+    r"""
+    Implements DeepLIFT algorithm for the layer based on the following paper:
+    Learning Important Features Through Propagating Activation Differences,
+    Avanti Shrikumar, et al.
+    https://arxiv.org/abs/1704.02685
+
+    and the gradient formulation proposed in:
+    Towards better understanding of gradient-based attribution methods for
+    deep neural networks, Marco Ancona, et al.
+    https://openreview.net/pdf?id=Sy21R9JAW
+
+    This implementation supports only Rescale rule. RevealCancel rule will
+    be supported in later releases.
+    Although DeepLIFT's (Rescale Rule) attribution quality is comparable with
+    Integrated Gradients, it runs significantly faster than Integrated
+    Gradients and is preferred for large datasets.
+
+    Currently we only support a limited number of non-linear activations
+    but the plan is to expand the list in the future.
+
+    Note: As we know, currently we cannot access the building blocks
+    of PyTorch's built-in LSTM, RNNs and GRUs such as Tanh and Sigmoid.
+    Nonetheless, it is possible to build custom LSTMs, RNNs and GRUs
+    with performance similar to built-in ones using TorchScript.
+    More details on how to build custom RNNs can be found here:
+    https://pytorch.org/blog/optimizing-cuda-rnn-with-torchscript/
+    """
+
     def __init__(self, model: Module, layer: Module):
         r"""
         Args:
@@ -99,32 +127,6 @@ def attribute(
         Tensor, Tuple[Tensor, ...], Tuple[Union[Tensor, Tuple[Tensor, ...]], Tensor],
     ]:
         r""""
-        Implements DeepLIFT algorithm for the layer based on the following paper:
-        Learning Important Features Through Propagating Activation Differences,
-        Avanti Shrikumar, et. al.
-        https://arxiv.org/abs/1704.02685
-
-        and the gradient formulation proposed in:
-        Towards better understanding of gradient-based attribution methods for
-        deep neural networks,  Marco Ancona, et.al.
-        https://openreview.net/pdf?id=Sy21R9JAW
-
-        This implementation supports only Rescale rule. RevealCancel rule will
-        be supported in later releases.
-        Although DeepLIFT's(Rescale Rule) attribution quality is comparable with
-        Integrated Gradients, it runs significantly faster than Integrated
-        Gradients and is preferred for large datasets.
-
-        Currently we only support a limited number of non-linear activations
-        but the plan is to expand the list in the future.
-
-        Note: As we know, currently we cannot access the building blocks,
-        of PyTorch's built-in LSTM, RNNs and GRUs such as Tanh and Sigmoid.
-        Nonetheless, it is possible to build custom LSTMs, RNNS and GRUs
-        with performance similar to built-in ones using TorchScript.
-        More details on how to build custom RNNs can be found here:
-        https://pytorch.org/blog/optimizing-cuda-rnn-with-torchscript/
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which layer
@@ -343,6 +345,26 @@ def chunk_output_fn(out: TensorOrTupleOfTensorsGeneric,) -> Sequence:
 
 
 class LayerDeepLiftShap(LayerDeepLift, DeepLiftShap):
+    r"""
+    Extends LayerDeepLift and DeepLiftShap algorithms and approximates SHAP
+    values for given input `layer`.
+    For each input sample - baseline pair it computes DeepLift attributions
+    with respect to inputs or outputs of given `layer` and averages
+    resulting attributions across baselines. Whether to compute the attributions
+    with respect to the inputs or outputs of the layer is defined by the
+    input flag `attribute_to_layer_input`.
+    More details about the algorithm can be found here:
+
+    http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf
+
+    Note that the explanation model:
+        1. Assumes that input features are independent of one another
+        2. Is linear, meaning that the explanations are modeled through
+            the additive composition of feature effects.
+    Although it assumes a linear model for each explanation, the overall
+    model across multiple explanations can be complex and non-linear.
+    """
+
     def __init__(self, model: Module, layer: Module) -> None:
         r"""
         Args:
@@ -404,24 +426,6 @@ def attribute(
         Tensor, Tuple[Tensor, ...], Tuple[Union[Tensor, Tuple[Tensor, ...]], Tensor],
     ]:
         r"""
-        Extends LayerDeepLift and DeepLiftShap algorithms and approximates SHAP
-        values for given input `layer`.
-        For each input sample - baseline pair it computes DeepLift attributions
-        with respect to inputs or outputs of given `layer` averages
-        resulting attributions across baselines. Whether to compute the attributions
-        with respect to the inputs or outputs of the layer is defined by the
-        input flag `attribute_to_layer_input`.
-        More details about the algorithm can be found here:
-
-        http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf
-
-        Note that the explanation model:
-            1. Assumes that input features are independent of one another
-            2. Is linear, meaning that the explanations are modeled through
-               the additive composition of feature effects.
-        Although, it assumes a linear model for each explanation, the overall
-        model across multiple explanations can be complex and non-linear.
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which layer
diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py
index 2fc95d41d8..e79088ee7f 100644
--- a/captum/attr/_core/layer/layer_feature_ablation.py
+++ b/captum/attr/_core/layer/layer_feature_ablation.py
@@ -20,6 +20,18 @@
 
 
 class LayerFeatureAblation(LayerAttribution, PerturbationAttribution):
+    r"""
+    A perturbation based approach to computing layer attribution, involving
+    replacing values in the input / output of a layer with a given baseline /
+    reference, and computing the difference in output. By default, each
+    neuron (scalar input / output value) within the layer is replaced
+    independently.
+    Passing a layer mask allows grouping neurons to be
+    ablated together.
+    Each neuron in the group will be given the same attribution value
+    equal to the change in target as a result of ablating the entire neuron
+    group.
+    """
     def __init__(
         self,
         forward_func: Callable,
@@ -59,17 +71,6 @@ def attribute(
         perturbations_per_eval: int = 1,
     ) -> Union[Tensor, Tuple[Tensor, ...]]:
         r"""
-            A perturbation based approach to computing layer attribution, involving
-            replacing values in the input / output of a layer with a given baseline /
-            reference, and computing the difference in output. By default, each
-            neuron (scalar input / output value) within the layer is replaced
-            independently.
-            Passing a layer mask allows grouping neurons to be
-            ablated together.
-            Each neuron in the group will be given the same attribution value
-            equal to the change in target as a result of ablating the entire neuron
-            group.
-
             Args:
 
                 inputs (tensor or tuple of tensors):  Input for which layer
diff --git a/captum/attr/_core/layer/layer_gradient_shap.py b/captum/attr/_core/layer/layer_gradient_shap.py
index 1037a3066e..4b5daa8ef1 100644
--- a/captum/attr/_core/layer/layer_gradient_shap.py
+++ b/captum/attr/_core/layer/layer_gradient_shap.py
@@ -28,6 +28,42 @@
 
 
 class LayerGradientShap(LayerAttribution, GradientAttribution):
+    r"""
+    Implements gradient SHAP for layer based on the implementation from SHAP's
+    primary author. For reference, please view:
+
+    https://github.com/slundberg/shap\
+    #deep-learning-example-with-gradientexplainer-tensorflowkeraspytorch-models
+
+    A Unified Approach to Interpreting Model Predictions
+    http://papers.nips.cc/paper/\
+    7062-a-unified-approach-to-interpreting-model-predictions
+
+    GradientShap approximates SHAP values by computing the expectations of
+    gradients by randomly sampling from the distribution of baselines/references.
+    It adds white noise to each input sample `n_samples` times, selects a
+    random baseline from baselines' distribution and a random point along the
+    path between the baseline and the input, and computes the gradient of
+    outputs with respect to selected random points in chosen `layer`.
+    The final SHAP values represent the expected values of
+    `gradients * (layer_attr_inputs - layer_attr_baselines)`.
+
+    GradientShap makes an assumption that the input features are independent
+    and that the explanation model is linear, meaning that the explanations
+    are modeled through the additive composition of feature effects.
+    Under those assumptions, SHAP value can be approximated as the expectation
+    of gradients that are computed for randomly generated `n_samples` input
+    samples after adding gaussian noise `n_samples` times to each input for
+    different baselines/references.
+
+    In some sense it can be viewed as an approximation of integrated gradients
+    by computing the expectations of gradients for different baselines.
+
+    Current implementation uses Smoothgrad from `NoiseTunnel` in order to
+    randomly draw samples from the distribution of baselines, add noise to input
+    samples and compute the expectation (smoothgrad).
+    """
+
     def __init__(
         self,
         forward_func: Callable,
@@ -97,40 +133,6 @@ def attribute(
         Tensor, Tuple[Tensor, ...], Tuple[Union[Tensor, Tuple[Tensor, ...]], Tensor],
     ]:
         r"""
-        Implements gradient SHAP for layer based on the implementation from SHAP's
-        primary author. For reference, please, view:
-
-        https://github.com/slundberg/shap\
-        #deep-learning-example-with-gradientexplainer-tensorflowkeraspytorch-models
-
-        A Unified Approach to Interpreting Model Predictions
-        http://papers.nips.cc/paper\
-        7062-a-unified-approach-to-interpreting-model-predictions
-
-        GradientShap approximates SHAP values by computing the expectations of
-        gradients by randomly sampling from the distribution of baselines/references.
-        It adds white noise to each input sample `n_samples` times, selects a
-        random baseline from baselines' distribution and a random point along the
-        path between the baseline and the input, and computes the gradient of
-        outputs with respect to selected random points in chosen `layer`.
-        The final SHAP values represent the expected values of
-        `gradients * (layer_attr_inputs - layer_attr_baselines)`.
-
-        GradientShap makes an assumption that the input features are independent
-        and that the explanation model is linear, meaning that the explanations
-        are modeled through the additive composition of feature effects.
-        Under those assumptions, SHAP value can be approximated as the expectation
-        of gradients that are computed for randomly generated `n_samples` input
-        samples after adding gaussian noise `n_samples` times to each input for
-        different baselines/references.
-
-        In some sense it can be viewed as an approximation of integrated gradients
-        by computing the expectations of gradients for different baselines.
-
-        Current implementation uses Smoothgrad from `NoiseTunnel` in order to
-        randomly draw samples from the distribution of baselines, add noise to input
-        samples and compute the expectation (smoothgrad).
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input which are used to compute
diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py
index b63c12681a..e597a1edde 100644
--- a/captum/attr/_core/layer/layer_gradient_x_activation.py
+++ b/captum/attr/_core/layer/layer_gradient_x_activation.py
@@ -19,6 +19,10 @@
 
 
 class LayerGradientXActivation(LayerAttribution, GradientAttribution):
+    r"""
+    Computes element-wise product of gradient and activation for selected
+    layer on given inputs.
+    """
     def __init__(
         self,
         forward_func: Callable,
@@ -53,89 +57,86 @@ def attribute(
         attribute_to_layer_input: bool = False,
     ) -> Union[Tensor, Tuple[Tensor, ...]]:
         r"""
-            Computes element-wise product of gradient and activation for selected
-            layer on given inputs.
-
-            Args:
-
-                inputs (tensor or tuple of tensors):  Input for which attributions
-                            are computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                target (int, tuple, tensor or list, optional):  Output indices for
-                            which gradients are computed (for classification cases,
-                            this is usually the target class).
-                            If the network returns a scalar value per example,
-                            no target index is necessary.
-                            For general 2D outputs, targets can be either:
-
-                            - a single integer or a tensor containing a single
-                                integer, which is applied to all input examples
-
-                            - a list of integers or a 1D tensor, with length matching
-                                the number of examples in inputs (dim 0). Each integer
-                                is applied as the target for the corresponding example.
-
-                            For outputs with > 2 dimensions, targets can be either:
-
-                            - A single tuple, which contains #output_dims - 1
-                                elements. This target index is applied to all examples.
-
-                            - A list of tuples with length equal to the number of
-                                examples in inputs (dim 0), and each tuple containing
-                                #output_dims - 1 elements. Each tuple is applied as the
-                                target for the corresponding example.
-
-                            Default: None
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                attribute_to_layer_input (bool, optional): Indicates whether to
-                            compute the attribution with respect to the layer input
-                            or output. If `attribute_to_layer_input` is set to True
-                            then the attributions will be computed with respect to
-                            layer input, otherwise it will be computed with respect
-                            to layer output.
-                            Default: False
-
-            Returns:
-                *tensor* or tuple of *tensors* of **attributions**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Product of gradient and activation for each
-                            neuron in given layer output.
-                            Attributions will always be the same size as the
-                            output of the given layer.
-                            Attributions are returned in a tuple based on whether
-                            the layer inputs / outputs are contained in a tuple
-                            from a forward hook. For standard modules, inputs of
-                            a single tensor are usually wrapped in a tuple, while
-                            outputs of a single tensor are not.
-
-            Examples::
-
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains an attribute conv1, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx12x32x32.
-                >>> net = ImageClassifier()
-                >>> layer_ga = LayerGradientXActivation(net, net.conv1)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # Computes layer activation x gradient for class 3.
-                >>> # attribution size matches layer output, Nx12x32x32
-                >>> attribution = layer_ga.attribute(input, 3)
+        Args:
+
+            inputs (tensor or tuple of tensors):  Input for which attributions
+                        are computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            target (int, tuple, tensor or list, optional):  Output indices for
+                        which gradients are computed (for classification cases,
+                        this is usually the target class).
+                        If the network returns a scalar value per example,
+                        no target index is necessary.
+                        For general 2D outputs, targets can be either:
+
+                        - a single integer or a tensor containing a single
+                            integer, which is applied to all input examples
+
+                        - a list of integers or a 1D tensor, with length matching
+                            the number of examples in inputs (dim 0). Each integer
+                            is applied as the target for the corresponding example.
+
+                        For outputs with > 2 dimensions, targets can be either:
+
+                        - A single tuple, which contains #output_dims - 1
+                            elements. This target index is applied to all examples.
+
+                        - A list of tuples with length equal to the number of
+                            examples in inputs (dim 0), and each tuple containing
+                            #output_dims - 1 elements. Each tuple is applied as the
+                            target for the corresponding example.
+
+                        Default: None
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            attribute_to_layer_input (bool, optional): Indicates whether to
+                        compute the attribution with respect to the layer input
+                        or output. If `attribute_to_layer_input` is set to True
+                        then the attributions will be computed with respect to
+                        layer input, otherwise it will be computed with respect
+                        to layer output.
+                        Default: False
+
+        Returns:
+            *tensor* or tuple of *tensors* of **attributions**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Product of gradient and activation for each
+                        neuron in given layer input or output.
+                        Attributions always match the size of the layer input
+                        or output, as selected by `attribute_to_layer_input`.
+                        Attributions are returned in a tuple based on whether
+                        the layer inputs / outputs are contained in a tuple
+                        from a forward hook. For standard modules, inputs of
+                        a single tensor are usually wrapped in a tuple, while
+                        outputs of a single tensor are not.
+
+        Examples::
+
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains an attribute conv1, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx12x32x32.
+            >>> net = ImageClassifier()
+            >>> layer_ga = LayerGradientXActivation(net, net.conv1)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # Computes layer activation x gradient for class 3.
+            >>> # attribution size matches layer output, Nx12x32x32
+            >>> attribution = layer_ga.attribute(input, 3)
         """
         inputs = _format_input(inputs)
         additional_forward_args = _format_additional_forward_args(
diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py
index 1a3af7076a..af7435ae85 100644
--- a/captum/attr/_core/neuron/neuron_conductance.py
+++ b/captum/attr/_core/neuron/neuron_conductance.py
@@ -21,6 +21,13 @@
 
 
 class NeuronConductance(NeuronAttribution, GradientAttribution):
+    r"""
+    Computes conductance with respect to particular hidden neuron. The
+    returned output is in the shape of the input, showing the attribution
+    / conductance of each input feature to the selected hidden layer neuron.
+    The details of the approach can be found here:
+    https://arxiv.org/abs/1805.12233
+    """
     def __init__(
         self,
         forward_func: Callable,
@@ -69,154 +76,148 @@ def attribute(
         attribute_to_neuron_input: bool = False,
     ) -> TensorOrTupleOfTensorsGeneric:
         r"""
-            Computes conductance with respect to particular hidden neuron. The
-            returned output is in the shape of the input, showing the attribution
-            / conductance of each input feature to the selected hidden layer neuron.
-            The details of the approach can be found here:
-            https://arxiv.org/abs/1805.12233
-
-            Args:
-
-                inputs (tensor or tuple of tensors):  Input for which neuron
-                            conductance is computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                neuron_index (int or tuple): Index of neuron in output of given
-                              layer for which attribution is desired. Length of
-                              this tuple must be one less than the number of
-                              dimensions in the output of the given layer (since
-                              dimension 0 corresponds to number of examples).
-                              An integer may be provided instead of a tuple of
-                              length 1.
-                baselines (scalar, tensor, tuple of scalars or tensors, optional):
-                            Baselines define the starting point from which integral
-                            is computed and can be provided as:
-
-                            - a single tensor, if inputs is a single tensor, with
-                                exactly the same dimensions as inputs or the first
-                                dimension is one and the remaining dimensions match
-                                with inputs.
-
-                            - a single scalar, if inputs is a single tensor, which will
-                                be broadcasted for each input value in input tensor.
-
-                            - a tuple of tensors or scalars, the baseline corresponding
-                                to each tensor in the inputs' tuple can be:
-                                - either a tensor with matching dimensions to
-                                    corresponding tensor in the inputs' tuple
-                                    or the first dimension is one and the remaining
-                                    dimensions match with the corresponding
-                                    input tensor.
-                                - or a scalar, corresponding to a tensor in the
-                                    inputs' tuple. This scalar value is broadcasted
-                                    for corresponding input tensor.
-
-                            In the cases when `baselines` is not provided, we internally
-                            use zero scalar corresponding to each input tensor.
-
-                            Default: None
-                target (int, tuple, tensor or list, optional):  Output indices for
-                            which gradients are computed (for classification cases,
-                            this is usually the target class).
-                            If the network returns a scalar value per example,
-                            no target index is necessary.
-                            For general 2D outputs, targets can be either:
-
-                            - a single integer or a tensor containing a single
-                                integer, which is applied to all input examples
-
-                            - a list of integers or a 1D tensor, with length matching
-                                the number of examples in inputs (dim 0). Each integer
-                                is applied as the target for the corresponding example.
-
-                            For outputs with > 2 dimensions, targets can be either:
-
-                            - A single tuple, which contains #output_dims - 1
-                                elements. This target index is applied to all examples.
-
-                            - A list of tuples with length equal to the number of
-                                examples in inputs (dim 0), and each tuple containing
-                                #output_dims - 1 elements. Each tuple is applied as the
-                                target for the corresponding example.
-
-                            Default: None
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            For a tensor, the first dimension of the tensor must
-                            correspond to the number of examples. It will be
-                            repeated for each of `n_steps` along the integrated
-                            path. For all other types, the given argument is used
-                            for all forward evaluations.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                n_steps (int, optional): The number of steps used by the approximation
-                            method. Default: 50.
-                method (string, optional): Method for approximating the integral,
-                            one of `riemann_right`, `riemann_left`, `riemann_middle`,
-                            `riemann_trapezoid` or `gausslegendre`.
-                            Default: `gausslegendre` if no method is provided.
-                internal_batch_size (int, optional): Divides total #steps * #examples
-                            data points into chunks of size internal_batch_size,
-                            which are computed (forward / backward passes)
-                            sequentially.
-                            For DataParallel models, each batch is split among the
-                            available devices, so evaluations on each available
-                            device contain internal_batch_size / num_devices examples.
-                            If internal_batch_size is None, then all evaluations are
-                            processed in one batch.
-                            Default: None
-                attribute_to_neuron_input (bool, optional): Indicates whether to
-                            compute the attributions with respect to the neuron input
-                            or output. If `attribute_to_neuron_input` is set to True
-                            then the attributions will be computed with respect to
-                            neuron's inputs, otherwise it will be computed with respect
-                            to neuron's outputs.
-                            Note that currently it is assumed that either the input
-                            or the output of internal neuron, depending on whether we
-                            attribute to the input or output, is a single tensor.
-                            Support for multiple tensors will be added later.
-                            Default: False
-
-            Returns:
-                *tensor* or tuple of *tensors* of **attributions**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Conductance for
-                            particular neuron with respect to each input feature.
-                            Attributions will always be the same size as the provided
-                            inputs, with each value providing the attribution of the
-                            corresponding input index.
-                            If a single tensor is provided as inputs, a single tensor is
-                            returned. If a tuple is provided for inputs, a tuple of
-                            corresponding sized tensors is returned.
-
-            Examples::
+        Args:
 
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains an attribute conv1, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx12x32x32.
-                >>> net = ImageClassifier()
-                >>> neuron_cond = NeuronConductance(net, net.conv1)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # To compute neuron attribution, we need to provide the neuron
-                >>> # index for which attribution is desired. Since the layer output
-                >>> # is Nx12x32x32, we need a tuple in the form (0..11,0..31,0..31)
-                >>> # which indexes a particular neuron in the layer output.
-                >>> # Computes neuron conductance for neuron with
-                >>> # index (4,1,2).
-                >>> attribution = neuron_cond.attribute(input, (4,1,2))
+            inputs (tensor or tuple of tensors):  Input for which neuron
+                        conductance is computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            neuron_index (int or tuple): Index of neuron in output of given
+                            layer for which attribution is desired. Length of
+                            this tuple must be one less than the number of
+                            dimensions in the output of the given layer (since
+                            dimension 0 corresponds to number of examples).
+                            An integer may be provided instead of a tuple of
+                            length 1.
+            baselines (scalar, tensor, tuple of scalars or tensors, optional):
+                        Baselines define the starting point from which integral
+                        is computed and can be provided as:
+
+                        - a single tensor, if inputs is a single tensor, with
+                            exactly the same dimensions as inputs or the first
+                            dimension is one and the remaining dimensions match
+                            with inputs.
+
+                        - a single scalar, if inputs is a single tensor, which will
+                            be broadcasted for each input value in input tensor.
+
+                        - a tuple of tensors or scalars, the baseline corresponding
+                            to each tensor in the inputs' tuple can be:
+                            - either a tensor with matching dimensions to
+                                corresponding tensor in the inputs' tuple
+                                or the first dimension is one and the remaining
+                                dimensions match with the corresponding
+                                input tensor.
+                            - or a scalar, corresponding to a tensor in the
+                                inputs' tuple. This scalar value is broadcasted
+                                for corresponding input tensor.
+
+                        In the cases when `baselines` is not provided, we internally
+                        use zero scalar corresponding to each input tensor.
+
+                        Default: None
+            target (int, tuple, tensor or list, optional):  Output indices for
+                        which gradients are computed (for classification cases,
+                        this is usually the target class).
+                        If the network returns a scalar value per example,
+                        no target index is necessary.
+                        For general 2D outputs, targets can be either:
+
+                        - a single integer or a tensor containing a single
+                            integer, which is applied to all input examples
+
+                        - a list of integers or a 1D tensor, with length matching
+                            the number of examples in inputs (dim 0). Each integer
+                            is applied as the target for the corresponding example.
+
+                        For outputs with > 2 dimensions, targets can be either:
+
+                        - A single tuple, which contains #output_dims - 1
+                            elements. This target index is applied to all examples.
+
+                        - A list of tuples with length equal to the number of
+                            examples in inputs (dim 0), and each tuple containing
+                            #output_dims - 1 elements. Each tuple is applied as the
+                            target for the corresponding example.
+
+                        Default: None
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        For a tensor, the first dimension of the tensor must
+                        correspond to the number of examples. It will be
+                        repeated for each of `n_steps` along the integrated
+                        path. For all other types, the given argument is used
+                        for all forward evaluations.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            n_steps (int, optional): The number of steps used by the approximation
+                        method. Default: 50.
+            method (string, optional): Method for approximating the integral,
+                        one of `riemann_right`, `riemann_left`, `riemann_middle`,
+                        `riemann_trapezoid` or `gausslegendre`.
+                        Default: `gausslegendre` if no method is provided.
+            internal_batch_size (int, optional): Divides total #steps * #examples
+                        data points into chunks of size internal_batch_size,
+                        which are computed (forward / backward passes)
+                        sequentially.
+                        For DataParallel models, each batch is split among the
+                        available devices, so evaluations on each available
+                        device contain internal_batch_size / num_devices examples.
+                        If internal_batch_size is None, then all evaluations are
+                        processed in one batch.
+                        Default: None
+            attribute_to_neuron_input (bool, optional): Indicates whether to
+                        compute the attributions with respect to the neuron input
+                        or output. If `attribute_to_neuron_input` is set to True
+                        then the attributions will be computed with respect to
+                        neuron's inputs, otherwise it will be computed with respect
+                        to neuron's outputs.
+                        Note that currently it is assumed that either the input
+                        or the output of internal neuron, depending on whether we
+                        attribute to the input or output, is a single tensor.
+                        Support for multiple tensors will be added later.
+                        Default: False
+
+        Returns:
+            *tensor* or tuple of *tensors* of **attributions**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Conductance for
+                        particular neuron with respect to each input feature.
+                        Attributions will always be the same size as the provided
+                        inputs, with each value providing the attribution of the
+                        corresponding input index.
+                        If a single tensor is provided as inputs, a single tensor is
+                        returned. If a tuple is provided for inputs, a tuple of
+                        corresponding sized tensors is returned.
+
+        Examples::
+
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains an attribute conv1, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx12x32x32.
+            >>> net = ImageClassifier()
+            >>> neuron_cond = NeuronConductance(net, net.conv1)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # To compute neuron attribution, we need to provide the neuron
+            >>> # index for which attribution is desired. Since the layer output
+            >>> # is Nx12x32x32, we need a tuple in the form (0..11,0..31,0..31)
+            >>> # which indexes a particular neuron in the layer output.
+            >>> # Computes neuron conductance for neuron with
+            >>> # index (4,1,2).
+            >>> attribution = neuron_cond.attribute(input, (4,1,2))
         """
         is_inputs_tuple = _is_tuple(inputs)
 
diff --git a/captum/attr/_core/neuron/neuron_deep_lift.py b/captum/attr/_core/neuron/neuron_deep_lift.py
index d0f5a8af36..9df695bc72 100644
--- a/captum/attr/_core/neuron/neuron_deep_lift.py
+++ b/captum/attr/_core/neuron/neuron_deep_lift.py
@@ -11,6 +11,34 @@
 
 
 class NeuronDeepLift(NeuronAttribution, GradientAttribution):
+    r"""
+    Implements DeepLIFT algorithm for the neuron based on the following paper:
+    Learning Important Features Through Propagating Activation Differences,
+    Avanti Shrikumar, et al.
+    https://arxiv.org/abs/1704.02685
+
+    and the gradient formulation proposed in:
+    Towards better understanding of gradient-based attribution methods for
+    deep neural networks, Marco Ancona, et al.
+    https://openreview.net/pdf?id=Sy21R9JAW
+
+    This implementation supports only Rescale rule. RevealCancel rule will
+    be supported in later releases.
+    Although DeepLIFT's (Rescale Rule) attribution quality is comparable with
+    Integrated Gradients, it runs significantly faster than Integrated
+    Gradients and is preferred for large datasets.
+
+    Currently we only support a limited number of non-linear activations
+    but the plan is to expand the list in the future.
+
+    Note: As we know, currently we cannot access the building blocks
+    of PyTorch's built-in LSTM, RNNs and GRUs such as Tanh and Sigmoid.
+    Nonetheless, it is possible to build custom LSTMs, RNNs and GRUs
+    with performance similar to built-in ones using TorchScript.
+    More details on how to build custom RNNs can be found here:
+    https://pytorch.org/blog/optimizing-cuda-rnn-with-torchscript/
+    """
+
     def __init__(self, model: Module, layer: Module) -> None:
         r"""
         Args:
@@ -37,32 +65,6 @@ def attribute(
         custom_attribution_func: Union[None, Callable[..., Tuple[Tensor, ...]]] = None,
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        Implements DeepLIFT algorithm for the neuron based on the following paper:
-        Learning Important Features Through Propagating Activation Differences,
-        Avanti Shrikumar, et. al.
-        https://arxiv.org/abs/1704.02685
-
-        and the gradient formulation proposed in:
-        Towards better understanding of gradient-based attribution methods for
-        deep neural networks,  Marco Ancona, et.al.
-        https://openreview.net/pdf?id=Sy21R9JAW
-
-        This implementation supports only Rescale rule. RevealCancel rule will
-        be supported in later releases.
-        Although DeepLIFT's(Rescale Rule) attribution quality is comparable with
-        Integrated Gradients, it runs significantly faster than Integrated
-        Gradients and is preferred for large datasets.
-
-        Currently we only support a limited number of non-linear activations
-        but the plan is to expand the list in the future.
-
-        Note: As we know, currently we cannot access the building blocks,
-        of PyTorch's built-in LSTM, RNNs and GRUs such as Tanh and Sigmoid.
-        Nonetheless, it is possible to build custom LSTMs, RNNS and GRUs
-        with performance similar to built-in ones using TorchScript.
-        More details on how to build custom RNNs can be found here:
-        https://pytorch.org/blog/optimizing-cuda-rnn-with-torchscript/
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which layer
@@ -190,6 +192,26 @@ def attribute(
 
 
 class NeuronDeepLiftShap(NeuronAttribution, GradientAttribution):
+    r"""
+    Extends NeuronAttribution and uses LayerDeepLiftShap algorithms and
+    approximates SHAP values for given input `layer` and `neuron_index`.
+    For each input sample - baseline pair it computes DeepLift attributions
+    with respect to inputs or outputs of given `layer` and `neuron_index` and
+    averages resulting attributions across baselines. Whether to compute the
+    attributions with respect to the inputs or outputs of the layer is defined
+    by the input flag `attribute_to_layer_input`.
+    More details about the algorithm can be found here:
+
+    http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf
+
+    Note that the explanation model:
+        1. Assumes that input features are independent of one another
+        2. Is linear, meaning that the explanations are modeled through
+            the additive composition of feature effects.
+    Although it assumes a linear model for each explanation, the overall
+    model across multiple explanations can be complex and non-linear.
+    """
+
     def __init__(self, model: Module, layer: Module) -> None:
         r"""
         Args:
@@ -217,23 +239,6 @@ def attribute(
         custom_attribution_func: Union[None, Callable[..., Tuple[Tensor, ...]]] = None,
     ) -> TensorOrTupleOfTensorsGeneric:
         r"""
-        Extends NeuronAttribution and uses LayerDeepLiftShap algorithms and
-        approximates SHAP values for given input `layer` and `neuron_index`.
-        For each input sample - baseline pair it computes DeepLift attributions
-        with respect to inputs or outputs of given `layer` and `neuron_index`
-        averages resulting attributions across baselines. Whether to compute the
-        attributions with respect to the inputs or outputs of the layer is defined
-        by the input flag `attribute_to_layer_input`.
-        More details about the algorithm can be found here:
-
-        http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf
-
-        Note that the explanation model:
-            1. Assumes that input features are independent of one another
-            2. Is linear, meaning that the explanations are modeled through
-               the additive composition of feature effects.
-        Although, it assumes a linear model for each explanation, the overall
-        model across multiple explanations can be complex and non-linear.
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which layer
diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py
index 5216f5a5b1..b57b4b2658 100644
--- a/captum/attr/_core/neuron/neuron_feature_ablation.py
+++ b/captum/attr/_core/neuron/neuron_feature_ablation.py
@@ -13,6 +13,20 @@
 
 
 class NeuronFeatureAblation(NeuronAttribution, PerturbationAttribution):
+    r"""
+    A perturbation based approach to computing neuron attribution,
+    involving replacing each input feature with a given baseline /
+    reference, and computing the difference in the neuron's input / output.
+    By default, each scalar value within
+    each input tensor is taken as a feature and replaced independently. Passing
+    a feature mask allows grouping features to be ablated together. This can
+    be used in cases such as images, where an entire segment or region
+    can be ablated, measuring the importance of the segment (feature group).
+    Each input scalar in the group will be given the same attribution value
+    equal to the change in target as a result of ablating the entire feature
+    group.
+    """
+
     def __init__(
         self,
         forward_func: Callable,
@@ -51,117 +65,104 @@ def attribute(
         perturbations_per_eval: int = 1,
     ) -> TensorOrTupleOfTensorsGeneric:
         r"""
-            A perturbation based approach to computing neuron attribution,
-            involving replacing each input feature with a given baseline /
-            reference, and computing the difference in the neuron's input / output.
-            By default, each scalar value within
-            each input tensor is taken as a feature and replaced independently. Passing
-            a feature mask, allows grouping features to be ablated together. This can
-            be used in cases such as images, where an entire segment or region
-            can be ablated, measuring the importance of the segment (feature group).
-            Each input scalar in the group will be given the same attribution value
-            equal to the change in target as a result of ablating the entire feature
-            group.
-
-
-            Args:
+        Args:
 
-                inputs (tensor or tuple of tensors):  Input for which neuron
-                            attributions are computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                neuron_index (int or tuple): Index of neuron in output of given
-                              layer for which attribution is desired. The length of
-                              this tuple must be one less than the number of
-                              dimensions in the output of the given layer (since
-                              dimension 0 corresponds to number of examples).
-                              An integer may be provided instead of a tuple of
-                              length 1.
-                baselines (scalar, tensor, tuple of scalars or tensors, optional):
-                            Baselines define reference value which replaces each
-                            feature when ablated.
-                            Baselines can be provided as:
-                            - a single tensor, if inputs is a single tensor, with
+            inputs (tensor or tuple of tensors):  Input for which neuron
+                        attributions are computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            neuron_index (int or tuple): Index of neuron in output of given
+                            layer for which attribution is desired. The length of
+                            this tuple must be one less than the number of
+                            dimensions in the output of the given layer (since
+                            dimension 0 corresponds to number of examples).
+                            An integer may be provided instead of a tuple of
+                            length 1.
+            baselines (scalar, tensor, tuple of scalars or tensors, optional):
+                        Baselines define reference value which replaces each
+                        feature when ablated.
+                        Baselines can be provided as:
+                        - a single tensor, if inputs is a single tensor, with
+                            exactly the same dimensions as inputs or
+                            broadcastable to match the dimensions of inputs
+                        - a single scalar, if inputs is a single tensor, which will
+                            be broadcasted for each input value in input tensor.
+                        - a tuple of tensors or scalars, the baseline corresponding
+                            to each tensor in the inputs' tuple can be:
+                            - either a tensor with
                                 exactly the same dimensions as inputs or
                                 broadcastable to match the dimensions of inputs
-                            - a single scalar, if inputs is a single tensor, which will
-                                be broadcasted for each input value in input tensor.
-                            - a tuple of tensors or scalars, the baseline corresponding
-                                to each tensor in the inputs' tuple can be:
-                                - either a tensor with
-                                    exactly the same dimensions as inputs or
-                                    broadcastable to match the dimensions of inputs
-                                - or a scalar, corresponding to a tensor in the
-                                    inputs' tuple. This scalar value is broadcasted
-                                    for corresponding input tensor.
-                            In the cases when `baselines` is not provided, we internally
-                            use zero scalar corresponding to each input tensor.
-                            Default: None
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                feature_mask (tensor or tuple of tensors, optional):
-                            feature_mask defines a mask for the input, grouping
-                            features which should be ablated together. feature_mask
-                            should contain the same number of tensors as inputs.
-                            Each tensor should
-                            be the same size as the corresponding input or
-                            broadcastable to match the input tensor. Each tensor
-                            should contain integers in the range 0 to num_features
-                            - 1, and indices corresponding to the same feature should
-                            have the same value.
-                            Note that features within each input tensor are ablated
-                            independently (not across tensors).
-                            If None, then a feature mask is constructed which assigns
-                            each scalar within a tensor as a separate feature, which
-                            is ablated independently.
-                            Default: None
-                attribute_to_neuron_input (bool, optional): Indicates whether to
-                            compute the attributions with respect to the neuron input
-                            or output. If `attribute_to_neuron_input` is set to True
-                            then the attributions will be computed with respect to
-                            neuron's inputs, otherwise it will be computed with respect
-                            to neuron's outputs.
-                            Note that currently it is assumed that either the input
-                            or the output of internal neurons, depending on whether we
-                            attribute to the input or output, is a single tensor.
-                            Support for multiple tensors will be added later.
-                            Default: False
-                perturbations_per_eval (int, optional): Allows ablation of multiple
-                            features to be processed simultaneously in one call to
-                            forward_fn.
-                            Each forward pass will contain a maximum of
-                            perturbations_per_eval * #examples samples.
-                            For DataParallel models, each batch is split among the
-                            available devices, so evaluations on each available
-                            device contain at most
-                            (perturbations_per_eval * #examples) / num_devices
-                            samples.
-                            Default: 1
-
-            Returns:
-                *tensor* or tuple of *tensors* of **attributions**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Attributions of particular neuron with respect to each input
-                            feature. Attributions will always be the same size as the
-                            provided inputs, with each value providing the attribution
-                            of the corresponding input index.
-                            If a single tensor is provided as inputs, a single tensor is
-                            returned. If a tuple is provided for inputs, a tuple of
-                            corresponding sized tensors is returned.
+                            - or a scalar, corresponding to a tensor in the
+                                inputs' tuple. This scalar value is broadcasted
+                                for corresponding input tensor.
+                        In the cases when `baselines` is not provided, we internally
+                        use zero scalar corresponding to each input tensor.
+                        Default: None
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            feature_mask (tensor or tuple of tensors, optional):
+                        feature_mask defines a mask for the input, grouping
+                        features which should be ablated together. feature_mask
+                        should contain the same number of tensors as inputs.
+                        Each tensor should
+                        be the same size as the corresponding input or
+                        broadcastable to match the input tensor. Each tensor
+                        should contain integers in the range 0 to num_features
+                        - 1, and indices corresponding to the same feature should
+                        have the same value.
+                        Note that features within each input tensor are ablated
+                        independently (not across tensors).
+                        If None, then a feature mask is constructed which assigns
+                        each scalar within a tensor as a separate feature, which
+                        is ablated independently.
+                        Default: None
+            attribute_to_neuron_input (bool, optional): Indicates whether to
+                        compute the attributions with respect to the neuron input
+                        or output. If `attribute_to_neuron_input` is set to True
+                        then the attributions will be computed with respect to
+                        neuron's inputs, otherwise it will be computed with respect
+                        to neuron's outputs.
+                        Note that currently it is assumed that either the input
+                        or the output of internal neurons, depending on whether we
+                        attribute to the input or output, is a single tensor.
+                        Support for multiple tensors will be added later.
+                        Default: False
+            perturbations_per_eval (int, optional): Allows ablation of multiple
+                        features to be processed simultaneously in one call to
+                        forward_fn.
+                        Each forward pass will contain a maximum of
+                        perturbations_per_eval * #examples samples.
+                        For DataParallel models, each batch is split among the
+                        available devices, so evaluations on each available
+                        device contain at most
+                        (perturbations_per_eval * #examples) / num_devices
+                        samples.
+                        Default: 1
+
+        Returns:
+            *tensor* or tuple of *tensors* of **attributions**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Attributions of particular neuron with respect to each input
+                        feature. Attributions will always be the same size as the
+                        provided inputs, with each value providing the attribution
+                        of the corresponding input index.
+                        If a single tensor is provided as inputs, a single tensor is
+                        returned. If a tuple is provided for inputs, a tuple of
+                        corresponding sized tensors is returned.
 
         Examples::
 
diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py
index 786e47f75c..ffdc0a33f4 100644
--- a/captum/attr/_core/neuron/neuron_gradient.py
+++ b/captum/attr/_core/neuron/neuron_gradient.py
@@ -18,6 +18,10 @@
 
 
 class NeuronGradient(NeuronAttribution, GradientAttribution):
+    r"""
+    Computes the gradient of the output of a particular neuron with
+    respect to the inputs of the network.
+    """
     def __init__(
         self,
         forward_func: Callable,
@@ -55,78 +59,75 @@ def attribute(
         attribute_to_neuron_input: bool = False,
     ) -> TensorOrTupleOfTensorsGeneric:
         r"""
-            Computes the gradient of the output of a particular neuron with
-            respect to the inputs of the network.
-
-            Args:
+        Args:
 
-                inputs (tensor or tuple of tensors):  Input for which neuron
-                            gradients are computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                neuron_index (int or tuple): Index of neuron in output of given
-                              layer for which attribution is desired. Length of
-                              this tuple must be one less than the number of
-                              dimensions in the output of the given layer (since
-                              dimension 0 corresponds to number of examples).
-                              An integer may be provided instead of a tuple of
-                              length 1.
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                attribute_to_neuron_input (bool, optional): Indicates whether to
-                            compute the attributions with respect to the neuron input
-                            or output. If `attribute_to_neuron_input` is set to True
-                            then the attributions will be computed with respect to
-                            neuron's inputs, otherwise it will be computed with respect
-                            to neuron's outputs.
-                            Note that currently it is assumed that either the input
-                            or the output of internal neurons, depending on whether we
-                            attribute to the input or output, is a single tensor.
-                            Support for multiple tensors will be added later.
-                            Default: False
+            inputs (tensor or tuple of tensors):  Input for which neuron
+                        gradients are computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            neuron_index (int or tuple): Index of neuron in output of given
+                            layer for which attribution is desired. Length of
+                            this tuple must be one less than the number of
+                            dimensions in the output of the given layer (since
+                            dimension 0 corresponds to number of examples).
+                            An integer may be provided instead of a tuple of
+                            length 1.
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            attribute_to_neuron_input (bool, optional): Indicates whether to
+                        compute the attributions with respect to the neuron input
+                        or output. If `attribute_to_neuron_input` is set to True
+                        then the attributions will be computed with respect to
+                        neuron's inputs, otherwise it will be computed with respect
+                        to neuron's outputs.
+                        Note that currently it is assumed that either the input
+                        or the output of internal neurons, depending on whether we
+                        attribute to the input or output, is a single tensor.
+                        Support for multiple tensors will be added later.
+                        Default: False
 
-            Returns:
-                *tensor* or tuple of *tensors* of **attributions**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Gradients of particular neuron with respect to each input
-                            feature. Attributions will always be the same size as the
-                            provided inputs, with each value providing the attribution
-                            of the corresponding input index.
-                            If a single tensor is provided as inputs, a single tensor is
-                            returned. If a tuple is provided for inputs, a tuple of
-                            corresponding sized tensors is returned.
+        Returns:
+            *tensor* or tuple of *tensors* of **attributions**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Gradients of particular neuron with respect to each input
+                        feature. Attributions will always be the same size as the
+                        provided inputs, with each value providing the attribution
+                        of the corresponding input index.
+                        If a single tensor is provided as inputs, a single tensor is
+                        returned. If a tuple is provided for inputs, a tuple of
+                        corresponding sized tensors is returned.
 
-            Examples::
+        Examples::
 
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains an attribute conv1, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx12x32x32.
-                >>> net = ImageClassifier()
-                >>> neuron_ig = NeuronGradient(net, net.conv1)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # To compute neuron attribution, we need to provide the neuron
-                >>> # index for which attribution is desired. Since the layer output
-                >>> # is Nx12x32x32, we need a tuple in the form (0..11,0..31,0..31)
-                >>> # which indexes a particular neuron in the layer output.
-                >>> # For this example, we choose the index (4,1,2).
-                >>> # Computes neuron gradient for neuron with
-                >>> # index (4,1,2).
-                >>> attribution = neuron_ig.attribute(input, (4,1,2))
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains an attribute conv1, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx12x32x32.
+            >>> net = ImageClassifier()
+            >>> neuron_ig = NeuronGradient(net, net.conv1)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # To compute neuron attribution, we need to provide the neuron
+            >>> # index for which attribution is desired. Since the layer output
+            >>> # is Nx12x32x32, we need a tuple in the form (0..11,0..31,0..31)
+            >>> # which indexes a particular neuron in the layer output.
+            >>> # For this example, we choose the index (4,1,2).
+            >>> # Computes neuron gradient for neuron with
+            >>> # index (4,1,2).
+            >>> attribution = neuron_ig.attribute(input, (4,1,2))
         """
         is_inputs_tuple = _is_tuple(inputs)
         inputs = _format_input(inputs)
diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py
index 1468e49008..9b47eb8924 100644
--- a/captum/attr/_core/neuron/neuron_gradient_shap.py
+++ b/captum/attr/_core/neuron/neuron_gradient_shap.py
@@ -9,6 +9,41 @@
 
 
 class NeuronGradientShap(NeuronAttribution, GradientAttribution):
+    r"""
+    Implements gradient SHAP for a neuron in a hidden layer based on the
+    implementation from SHAP's primary author. For reference, please view:
+
+    https://github.com/slundberg/shap\
+    #deep-learning-example-with-gradientexplainer-tensorflowkeraspytorch-models
+
+    A Unified Approach to Interpreting Model Predictions
+    http://papers.nips.cc/paper/\
+    7062-a-unified-approach-to-interpreting-model-predictions
+
+    GradientShap approximates SHAP values by computing the expectations of
+    gradients by randomly sampling from the distribution of baselines/references.
+    It adds white noise to each input sample `n_samples` times, selects a
+    random baseline from baselines' distribution and a random point along the
+    path between the baseline and the input, and computes the gradient of the
+    neuron with index `neuron_index` with respect to those selected random
+    points. The final SHAP values represent the expected values of
+    `gradients * (inputs - baselines)`.
+
+    GradientShap makes an assumption that the input features are independent
+    and that the explanation model is linear, meaning that the explanations
+    are modeled through the additive composition of feature effects.
+    Under those assumptions, SHAP value can be approximated as the expectation
+    of gradients that are computed for randomly generated `n_samples` input
+    samples after adding gaussian noise `n_samples` times to each input for
+    different baselines/references.
+
+    In some sense it can be viewed as an approximation of integrated gradients
+    by computing the expectations of gradients for different baselines.
+
+    Current implementation uses Smoothgrad from `NoiseTunnel` in order to
+    randomly draw samples from the distribution of baselines, add noise to input
+    samples and compute the expectation (smoothgrad).
+    """
     def __init__(
         self,
         forward_func: Callable,
@@ -50,40 +85,6 @@ def attribute(
         attribute_to_neuron_input: bool = False,
     ) -> TensorOrTupleOfTensorsGeneric:
         r"""
-        Implements gradient SHAP for a neuron in a hidden layer based on the
-        implementation from SHAP's primary author. For reference, please, view:
-
-        https://github.com/slundberg/shap\
-        #deep-learning-example-with-gradientexplainer-tensorflowkeraspytorch-models
-
-        A Unified Approach to Interpreting Model Predictions
-        http://papers.nips.cc/paper\
-        7062-a-unified-approach-to-interpreting-model-predictions
-
-        GradientShap approximates SHAP values by computing the expectations of
-        gradients by randomly sampling from the distribution of baselines/references.
-        It adds white noise to each input sample `n_samples` times, selects a
-        random baseline from baselines' distribution and a random point along the
-        path between the baseline and the input, and computes the gradient of the
-        neuron with index `neuron_index` with respect to those selected random
-        points. The final SHAP values represent the expected values of
-        `gradients * (inputs - baselines)`.
-
-        GradientShap makes an assumption that the input features are independent
-        and that the explanation model is linear, meaning that the explanations
-        are modeled through the additive composition of feature effects.
-        Under those assumptions, SHAP value can be approximated as the expectation
-        of gradients that are computed for randomly generated `n_samples` input
-        samples after adding gaussian noise `n_samples` times to each input for
-        different baselines/references.
-
-        In some sense it can be viewed as an approximation of integrated gradients
-        by computing the expectations of gradients for different baselines.
-
-        Current implementation uses Smoothgrad from `NoiseTunnel` in order to
-        randomly draw samples from the distribution of baselines, add noise to input
-        samples and compute the expectation (smoothgrad).
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which SHAP attribution
@@ -172,20 +173,20 @@ def attribute(
                         returned. If a tuple is provided for inputs, a tuple of
                         corresponding sized tensors is returned.
 
-            Examples::
-
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> net = ImageClassifier()
-                >>> neuron_grad_shap = NeuronGradientShap(net, net.linear2)
-                >>> input = torch.randn(3, 3, 32, 32, requires_grad=True)
-                >>> # choosing baselines randomly
-                >>> baselines = torch.randn(20, 3, 32, 32)
-                >>> # Computes gradient SHAP of first neuron in linear2 layer
-                >>> # with respect to the input's of the network.
-                >>> # Attribution size matches input size: 3x3x32x32
-                >>> attribution = neuron_grad_shap.attribute(input, neuron_ind=0
-                                                             baselines)
+        Examples::
+
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> net = ImageClassifier()
+            >>> neuron_grad_shap = NeuronGradientShap(net, net.linear2)
+            >>> input = torch.randn(3, 3, 32, 32, requires_grad=True)
+            >>> # choosing baselines randomly
+            >>> baselines = torch.randn(20, 3, 32, 32)
+            >>> # Computes gradient SHAP of first neuron in linear2 layer
+            >>> # with respect to the inputs of the network.
+            >>> # Attribution size matches input size: 3x3x32x32
+            >>> attribution = neuron_grad_shap.attribute(input, neuron_index=0,
+            ...                                          baselines=baselines)
 
         """
         gs = GradientShap(self.forward_func)
diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py
index 166c91f90e..62430fa636 100644
--- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py
+++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py
@@ -10,6 +10,23 @@
 
 
 class NeuronDeconvolution(NeuronAttribution, GradientAttribution):
+    """        
+    Computes attribution of the given neuron using deconvolution.
+    Deconvolution computes the gradient of the target output with
+    respect to the input, but gradients of ReLU functions are overriden so
+    that the gradient of the ReLU input is simply computed taking ReLU of
+    the output gradient, essentially only propagating non-negative gradients
+    (without dependence on the sign of the ReLU input).
+
+    More details regarding the deconvolution algorithm can be found
+    in these papers:
+    https://arxiv.org/abs/1311.2901
+    https://link.springer.com/chapter/10.1007/978-3-319-46466-4_8
+
+    Warning: Ensure that all ReLU operations in the forward function of the
+    given model are performed using a module (nn.module.ReLU).
+    If nn.functional.ReLU is used, gradients are not overriden appropriately.
+    """
     def __init__(
         self, model: Module, layer: Module, device_ids: Union[None, List[int]] = None
     ) -> None:
@@ -44,22 +61,6 @@ def attribute(
         attribute_to_neuron_input: bool = False,
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        Computes attribution of the given neuron using deconvolution.
-        Deconvolution computes the gradient of the target output with
-        respect to the input, but gradients of ReLU functions are overriden so
-        that the gradient of the ReLU input is simply computed taking ReLU of
-        the output gradient, essentially only propagating non-negative gradients
-        (without dependence on the sign of the ReLU input).
-
-        More details regarding the deconvolution algorithm can be found
-        in these papers:
-        https://arxiv.org/abs/1311.2901
-        https://link.springer.com/chapter/10.1007/978-3-319-46466-4_8
-
-        Warning: Ensure that all ReLU operations in the forward function of the
-        given model are performed using a module (nn.module.ReLU).
-        If nn.functional.ReLU is used, gradients are not overriden appropriately.
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which
@@ -137,6 +138,20 @@ def attribute(
 
 
 class NeuronGuidedBackprop(NeuronAttribution, GradientAttribution):
+    r"""
+    Computes attribution of the given neuron using guided backpropagation.
+    Guided backpropagation computes the gradient of the target neuron
+    with respect to the input, but gradients of ReLU functions are overridden
+    so that only non-negative gradients are backpropagated.
+
+    More details regarding the guided backpropagation algorithm can be found
+    in the original paper here:
+    https://arxiv.org/abs/1412.6806
+
+    Warning: Ensure that all ReLU operations in the forward function of the
+    given model are performed using a module (nn.module.ReLU).
+    If nn.functional.ReLU is used, gradients are not overriden appropriately.
+    """
     def __init__(
         self, model: Module, layer: Module, device_ids: Union[None, List[int]] = None
     ) -> None:
@@ -168,19 +183,6 @@ def attribute(
         attribute_to_neuron_input: bool = False,
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        Computes attribution of the given neuron using guided backpropagation.
-        Guided backpropagation computes the gradient of the target neuron
-        with respect to the input, but gradients of ReLU functions are overriden
-        so that only non-negative gradients are backpropagated.
-
-        More details regarding the guided backpropagation algorithm can be found
-        in the original paper here:
-        https://arxiv.org/abs/1412.6806
-
-        Warning: Ensure that all ReLU operations in the forward function of the
-        given model are performed using a module (nn.module.ReLU).
-        If nn.functional.ReLU is used, gradients are not overriden appropriately.
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which
diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py
index 5f83510204..ad421fe1b1 100644
--- a/captum/attr/_core/neuron/neuron_integrated_gradients.py
+++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py
@@ -11,6 +11,17 @@
 
 
 class NeuronIntegratedGradients(NeuronAttribution, GradientAttribution):
+    r"""
+    Approximates the integral of gradients for a particular neuron
+    along the path from a baseline input to the given input.
+    If no baseline is provided, the default baseline is the zero tensor.
+    More details regarding the integrated gradient method can be found in the
+    original paper here:
+    https://arxiv.org/abs/1703.01365
+
+    Note that this method is equivalent to applying integrated gradients
+    where the output is the output of the identified neuron.
+    """
     def __init__(
         self,
         forward_func: Callable,
@@ -52,136 +63,125 @@ def attribute(
         attribute_to_neuron_input: bool = False,
     ) -> TensorOrTupleOfTensorsGeneric:
         r"""
-            Approximates the integral of gradients for a particular neuron
-            along the path from a baseline input to the given input.
-            If no baseline is provided, the default baseline is the zero tensor.
-            More details regarding the integrated gradient method can be found in the
-            original paper here:
-            https://arxiv.org/abs/1703.01365
-
-            Note that this method is equivalent to applying integrated gradients
-            where the output is the output of the identified neuron.
-
-
-            Args:
-
-                inputs (tensor or tuple of tensors):  Input for which neuron integrated
-                            gradients are computed. If forward_func takes a single
-                            tensor as input, a single input tensor should be provided.
-                            If forward_func takes multiple tensors as input, a tuple
-                            of the input tensors should be provided. It is assumed
-                            that for all given input tensors, dimension 0 corresponds
-                            to the number of examples, and if multiple input tensors
-                            are provided, the examples must be aligned appropriately.
-                neuron_index (int or tuple): Index of neuron in output of given
-                              layer for which attribution is desired. Length of
-                              this tuple must be one less than the number of
-                              dimensions in the output of the given layer (since
-                              dimension 0 corresponds to number of examples).
-                              An integer may be provided instead of a tuple of
-                              length 1.
-                baselines (scalar, tensor, tuple of scalars or tensors, optional):
-                            Baselines define the starting point from which integral
-                            is computed.
-                            Baselines can be provided as:
-
-                            - a single tensor, if inputs is a single tensor, with
-                                exactly the same dimensions as inputs or the first
-                                dimension is one and the remaining dimensions match
-                                with inputs.
-
-                            - a single scalar, if inputs is a single tensor, which will
-                                be broadcasted for each input value in input tensor.
-
-                            - a tuple of tensors or scalars, the baseline corresponding
-                                to each tensor in the inputs' tuple can be:
-                                - either a tensor with matching dimensions to
-                                    corresponding tensor in the inputs' tuple
-                                    or the first dimension is one and the remaining
-                                    dimensions match with the corresponding
-                                    input tensor.
-                                - or a scalar, corresponding to a tensor in the
-                                    inputs' tuple. This scalar value is broadcasted
-                                    for corresponding input tensor.
-
-                            In the cases when `baselines` is not provided, we internally
-                            use zero scalar corresponding to each input tensor.
-
-                            Default: None
-                additional_forward_args (any, optional): If the forward function
-                            requires additional arguments other than the inputs for
-                            which attributions should not be computed, this argument
-                            can be provided. It must be either a single additional
-                            argument of a Tensor or arbitrary (non-tuple) type or a
-                            tuple containing multiple additional arguments including
-                            tensors or any arbitrary python types. These arguments
-                            are provided to forward_func in order following the
-                            arguments in inputs.
-                            For a tensor, the first dimension of the tensor must
-                            correspond to the number of examples. It will be
-                            repeated for each of `n_steps` along the integrated
-                            path. For all other types, the given argument is used
-                            for all forward evaluations.
-                            Note that attributions are not computed with respect
-                            to these arguments.
-                            Default: None
-                n_steps (int, optional): The number of steps used by the approximation
-                            method. Default: 50.
-                method (string, optional): Method for approximating the integral,
-                            one of `riemann_right`, `riemann_left`, `riemann_middle`,
-                            `riemann_trapezoid` or `gausslegendre`.
-                            Default: `gausslegendre` if no method is provided.
-                internal_batch_size (int, optional): Divides total #steps * #examples
-                            data points into chunks of size internal_batch_size,
-                            which are computed (forward / backward passes)
-                            sequentially.
-                            For DataParallel models, each batch is split among the
-                            available devices, so evaluations on each available
-                            device contain internal_batch_size / num_devices examples.
-                            If internal_batch_size is None, then all evaluations are
-                            processed in one batch.
-                            Default: None
-                attribute_to_neuron_input (bool, optional): Indicates whether to
-                            compute the attributions with respect to the neuron input
-                            or output. If `attribute_to_neuron_input` is set to True
-                            then the attributions will be computed with respect to
-                            neuron's inputs, otherwise it will be computed with respect
-                            to neuron's outputs.
-                            Note that currently it is assumed that either the input
-                            or the output of internal neuron, depending on whether we
-                            attribute to the input or output, is a single tensor.
-                            Support for multiple tensors will be added later.
-                            Default: False
-
-            Returns:
-                *tensor* or tuple of *tensors* of **attributions**:
-                - **attributions** (*tensor* or tuple of *tensors*):
-                            Integrated gradients for particular neuron with
-                            respect to each input feature.
-                            Attributions will always be the same size as the provided
-                            inputs, with each value providing the attribution of the
-                            corresponding input index.
-                            If a single tensor is provided as inputs, a single tensor is
-                            returned. If a tuple is provided for inputs, a tuple of
-                            corresponding sized tensors is returned.
-
-            Examples::
-
-                >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
-                >>> # and returns an Nx10 tensor of class probabilities.
-                >>> # It contains an attribute conv1, which is an instance of nn.conv2d,
-                >>> # and the output of this layer has dimensions Nx12x32x32.
-                >>> net = ImageClassifier()
-                >>> neuron_ig = NeuronIntegratedGradients(net, net.conv1)
-                >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
-                >>> # To compute neuron attribution, we need to provide the neuron
-                >>> # index for which attribution is desired. Since the layer output
-                >>> # is Nx12x32x32, we need a tuple in the form (0..11,0..31,0..31)
-                >>> # which indexes a particular neuron in the layer output.
-                >>> # For this example, we choose the index (4,1,2).
-                >>> # Computes neuron integrated gradients for neuron with
-                >>> # index (4,1,2).
-                >>> attribution = neuron_ig.attribute(input, (4,1,2))
+        Args:
+
+            inputs (tensor or tuple of tensors):  Input for which neuron integrated
+                        gradients are computed. If forward_func takes a single
+                        tensor as input, a single input tensor should be provided.
+                        If forward_func takes multiple tensors as input, a tuple
+                        of the input tensors should be provided. It is assumed
+                        that for all given input tensors, dimension 0 corresponds
+                        to the number of examples, and if multiple input tensors
+                        are provided, the examples must be aligned appropriately.
+            neuron_index (int or tuple): Index of neuron in output of given
+                            layer for which attribution is desired. Length of
+                            this tuple must be one less than the number of
+                            dimensions in the output of the given layer (since
+                            dimension 0 corresponds to number of examples).
+                            An integer may be provided instead of a tuple of
+                            length 1.
+            baselines (scalar, tensor, tuple of scalars or tensors, optional):
+                        Baselines define the starting point from which integral
+                        is computed.
+                        Baselines can be provided as:
+
+                        - a single tensor, if inputs is a single tensor, with
+                            exactly the same dimensions as inputs or the first
+                            dimension is one and the remaining dimensions match
+                            with inputs.
+
+                        - a single scalar, if inputs is a single tensor, which will
+                            be broadcasted for each input value in input tensor.
+
+                        - a tuple of tensors or scalars, the baseline corresponding
+                            to each tensor in the inputs' tuple can be:
+                            - either a tensor with matching dimensions to
+                                corresponding tensor in the inputs' tuple
+                                or the first dimension is one and the remaining
+                                dimensions match with the corresponding
+                                input tensor.
+                            - or a scalar, corresponding to a tensor in the
+                                inputs' tuple. This scalar value is broadcasted
+                                for corresponding input tensor.
+
+                        In the cases when `baselines` is not provided, we internally
+                        use zero scalar corresponding to each input tensor.
+
+                        Default: None
+            additional_forward_args (any, optional): If the forward function
+                        requires additional arguments other than the inputs for
+                        which attributions should not be computed, this argument
+                        can be provided. It must be either a single additional
+                        argument of a Tensor or arbitrary (non-tuple) type or a
+                        tuple containing multiple additional arguments including
+                        tensors or any arbitrary python types. These arguments
+                        are provided to forward_func in order following the
+                        arguments in inputs.
+                        For a tensor, the first dimension of the tensor must
+                        correspond to the number of examples. It will be
+                        repeated for each of `n_steps` along the integrated
+                        path. For all other types, the given argument is used
+                        for all forward evaluations.
+                        Note that attributions are not computed with respect
+                        to these arguments.
+                        Default: None
+            n_steps (int, optional): The number of steps used by the approximation
+                        method. Default: 50.
+            method (string, optional): Method for approximating the integral,
+                        one of `riemann_right`, `riemann_left`, `riemann_middle`,
+                        `riemann_trapezoid` or `gausslegendre`.
+                        Default: `gausslegendre` if no method is provided.
+            internal_batch_size (int, optional): Divides total #steps * #examples
+                        data points into chunks of size internal_batch_size,
+                        which are computed (forward / backward passes)
+                        sequentially.
+                        For DataParallel models, each batch is split among the
+                        available devices, so evaluations on each available
+                        device contain internal_batch_size / num_devices examples.
+                        If internal_batch_size is None, then all evaluations are
+                        processed in one batch.
+                        Default: None
+            attribute_to_neuron_input (bool, optional): Indicates whether to
+                        compute the attributions with respect to the neuron input
+                        or output. If `attribute_to_neuron_input` is set to True
+                        then the attributions will be computed with respect to
+                        neuron's inputs, otherwise it will be computed with respect
+                        to neuron's outputs.
+                        Note that currently it is assumed that either the input
+                        or the output of internal neuron, depending on whether we
+                        attribute to the input or output, is a single tensor.
+                        Support for multiple tensors will be added later.
+                        Default: False
+
+        Returns:
+            *tensor* or tuple of *tensors* of **attributions**:
+            - **attributions** (*tensor* or tuple of *tensors*):
+                        Integrated gradients for particular neuron with
+                        respect to each input feature.
+                        Attributions will always be the same size as the provided
+                        inputs, with each value providing the attribution of the
+                        corresponding input index.
+                        If a single tensor is provided as inputs, a single tensor is
+                        returned. If a tuple is provided for inputs, a tuple of
+                        corresponding sized tensors is returned.
+
+        Examples::
+
+            >>> # ImageClassifier takes a single input tensor of images Nx3x32x32,
+            >>> # and returns an Nx10 tensor of class probabilities.
+            >>> # It contains an attribute conv1, which is an instance of nn.Conv2d,
+            >>> # and the output of this layer has dimensions Nx12x32x32.
+            >>> net = ImageClassifier()
+            >>> neuron_ig = NeuronIntegratedGradients(net, net.conv1)
+            >>> input = torch.randn(2, 3, 32, 32, requires_grad=True)
+            >>> # To compute neuron attribution, we need to provide the neuron
+            >>> # index for which attribution is desired. Since the layer output
+            >>> # is Nx12x32x32, we need a tuple in the form (0..11,0..31,0..31)
+            >>> # which indexes a particular neuron in the layer output.
+            >>> # For this example, we choose the index (4,1,2).
+            >>> # Computes neuron integrated gradients for neuron with
+            >>> # index (4,1,2).
+            >>> attribution = neuron_ig.attribute(input, (4,1,2))
         """
         ig = IntegratedGradients(self.forward_func)
         ig.gradient_func = construct_neuron_grad_fn(
diff --git a/captum/attr/_core/noise_tunnel.py b/captum/attr/_core/noise_tunnel.py
index 391a7c32e1..47162227d4 100644
--- a/captum/attr/_core/noise_tunnel.py
+++ b/captum/attr/_core/noise_tunnel.py
@@ -33,6 +33,30 @@ class NoiseTunnelType(Enum):
 
 
 class NoiseTunnel(Attribution):
+    r"""
+    Adds gaussian noise to each input in the batch `n_samples` times
+    and applies the given attribution algorithm to each of the samples.
+    The attributions of the samples are combined based on the given noise
+    tunnel type (nt_type):
+    If nt_type is `smoothgrad`, the mean of the sampled attributions is
+    returned. This approximates smoothing the given attribution method
+    with a Gaussian Kernel.
+    If nt_type is `smoothgrad_sq`, the mean of the squared sample attributions
+    is returned.
+    If nt_type is `vargrad`, the variance of the sample attributions is
+    returned.
+
+    More details about adding noise can be found in the following papers:
+        https://arxiv.org/abs/1810.03292
+        https://arxiv.org/abs/1810.03307
+        https://arxiv.org/abs/1706.03825
+        https://arxiv.org/pdf/1806.10758
+    This method currently also supports input batches of multiple examples;
+    however, it can be computationally expensive depending on the model,
+    the dimensionality of the data and execution environment.
+    It is assumed that the batch size is the first dimension of input tensors.
+    """
+
     def __init__(self, attribution_method: Attribution) -> None:
         r"""
         attribution_method (Attribution): An instance of any attribution algorithm
@@ -54,28 +78,6 @@ def attribute(
         **kwargs: Any
     ):
         r"""
-        Adds gaussian noise to each input in the batch `n_samples` times
-        and applies the given attribution algorithm to each of the samples.
-        The attributions of the samples are combined based on the given noise
-        tunnel type (nt_type):
-        If nt_type is `smoothgrad`, the mean of the sampled attributions is
-        returned. This approximates smoothing the given attribution method
-        with a Gaussian Kernel.
-        If nt_type is `smoothgrad_sq`, the mean of the squared sample attributions
-        is returned.
-        If nt_type is `vargrad`, the variance of the sample attributions is
-        returned.
-
-        More details about adding noise can be found in the following papers:
-            https://arxiv.org/abs/1810.03292
-            https://arxiv.org/abs/1810.03307
-            https://arxiv.org/abs/1706.03825
-            https://arxiv.org/pdf/1806.10758
-        This method currently also supports batches of multiple examples input,
-        however it can be computationally expensive depending on the model,
-        the dimensionality of the data and execution environment.
-        It is assumed that the batch size is the first dimension of input tensors.
-
         Args:
 
             inputs (tensor or tuple of tensors):  Input for which integrated
diff --git a/captum/attr/_core/occlusion.py b/captum/attr/_core/occlusion.py
index 86073d106c..28cb32248a 100644
--- a/captum/attr/_core/occlusion.py
+++ b/captum/attr/_core/occlusion.py
@@ -16,6 +16,26 @@
 
 
 class Occlusion(FeatureAblation):
+    r"""
+    A perturbation based approach to compute attribution, involving
+    replacing each contiguous rectangular region with a given baseline /
+    reference, and computing the difference in output. For features located
+    in multiple regions (hyperrectangles), the corresponding output differences
+    are averaged to compute the attribution for that feature.
+
+    The first patch is applied with the corner aligned with all indices 0,
+    and strides are applied until the entire dimension range is covered. Note
+    that this may cause the final patch applied in a direction to be cut-off
+    and thus smaller than the target occlusion shape.
+
+    More details regarding the occlusion (or grey-box / sliding window)
+    method can be found in the original paper and in the DeepExplain
+    implementation.
+    https://arxiv.org/abs/1311.2901
+    https://github.com/marcoancona/DeepExplain/blob/master/deepexplain\
+    /tensorflow/methods.py#L401
+    """
+
     def __init__(self, forward_func: Callable) -> None:
         r"""
         Args:
@@ -39,24 +59,6 @@ def attribute(  # type: ignore
         perturbations_per_eval: int = 1,
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        A perturbation based approach to compute attribution, involving
-        replacing each contiguous rectangular region with a given baseline /
-        reference, and computing the difference in output. For features located
-        in multiple regions (hyperrectangles), the corresponding output differences
-        are averaged to compute the attribution for that feature.
-
-        The first patch is applied with the corner aligned with all indices 0,
-        and strides are applied until the entire dimension range is covered. Note
-        that this may cause the final patch applied in a direction to be cut-off
-        and thus smaller than the target occlusion shape.
-
-        More details regarding the occlusion (or grey-box / sliding window)
-        method can be found in the original paper and in the DeepExplain
-        implementation.
-        https://arxiv.org/abs/1311.2901
-        https://github.com/marcoancona/DeepExplain/blob/master/deepexplain\
-        /tensorflow/methods.py#L401
-
         Args:
 
                 inputs (tensor or tuple of tensors):  Input for which occlusion
diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py
index eaf7049dba..18a7f07b3b 100644
--- a/captum/attr/_core/saliency.py
+++ b/captum/attr/_core/saliency.py
@@ -12,6 +12,14 @@
 
 
 class Saliency(GradientAttribution):
+    r"""
+    A baseline approach for computing input attribution. It returns
+    the gradients with respect to inputs. If `abs` is set to True, which is
+    the default, the absolute value of the gradients is returned.
+
+    More details about the approach can be found in the following paper:
+        https://arxiv.org/pdf/1312.6034.pdf
+    """
     def __init__(self, forward_func: Callable) -> None:
         r"""
         Args:
@@ -29,13 +37,6 @@ def attribute(
         additional_forward_args: Any = None,
     ) -> TensorOrTupleOfTensorsGeneric:
         r""""
-        A baseline approach for computing input attribution. It returns
-        the gradients with respect to inputs. If `abs` is set to True, which is
-        the default, the absolute value of the gradients is returned.
-
-        More details about the approach can be found in the following paper:
-            https://arxiv.org/pdf/1312.6034.pdf
-
         Args:
 
                 inputs (tensor or tuple of tensors):  Input for which integrated
diff --git a/captum/attr/_utils/attribution.py b/captum/attr/_utils/attribution.py
index 770c4234cf..e77c5f275f 100644
--- a/captum/attr/_utils/attribution.py
+++ b/captum/attr/_utils/attribution.py
@@ -409,13 +409,13 @@ def interpolate(
 
 class NeuronAttribution(InternalAttribution):
     r"""
-        Neuron attribution provides input attribution for a given neuron, quanitfying
-        the importance of each input feature in the activation of a particular neuron.
-        Calling attribute on a NeuronAttribution object requires also providing
-        the index of the neuron in the output of the given layer for which attributions
-        are required.
-        The output attribution of calling attribute on a NeuronAttribution object
-        always matches the size of the input.
+    Neuron attribution provides input attribution for a given neuron, quantifying
+    the importance of each input feature in the activation of a particular neuron.
+    Calling attribute on a NeuronAttribution object requires also providing
+    the index of the neuron in the output of the given layer for which attributions
+    are required.
+    The output attribution of calling attribute on a NeuronAttribution object
+    always matches the size of the input.
     """
 
     def __init__(

From c2bc506c5e1342a79d0d9c79b601dda983632249 Mon Sep 17 00:00:00 2001
From: Vivek Miglani <vivekm@fb.com>
Date: Mon, 2 Mar 2020 10:59:19 -0800
Subject: [PATCH 2/3] Fixes

---
 captum/attr/_core/gradient_shap.py                           | 1 +
 captum/attr/_core/input_x_gradient.py                        | 2 +-
 captum/attr/_core/layer/layer_activation.py                  | 2 +-
 captum/attr/_core/layer/layer_conductance.py                 | 1 +
 captum/attr/_core/layer/layer_deep_lift.py                   | 2 +-
 captum/attr/_core/layer/layer_feature_ablation.py            | 1 +
 captum/attr/_core/layer/layer_gradient_x_activation.py       | 1 +
 captum/attr/_core/neuron/neuron_conductance.py               | 1 +
 captum/attr/_core/neuron/neuron_feature_ablation.py          | 2 +-
 captum/attr/_core/neuron/neuron_gradient.py                  | 1 +
 captum/attr/_core/neuron/neuron_gradient_shap.py             | 1 +
 captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py | 4 +++-
 captum/attr/_core/neuron/neuron_integrated_gradients.py      | 1 +
 captum/attr/_core/saliency.py                                | 1 +
 14 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/captum/attr/_core/gradient_shap.py b/captum/attr/_core/gradient_shap.py
index 35538be92a..b591308c70 100644
--- a/captum/attr/_core/gradient_shap.py
+++ b/captum/attr/_core/gradient_shap.py
@@ -58,6 +58,7 @@ class GradientShap(GradientAttribution):
     randomly draw samples from the distribution of baselines, add noise to input
     samples and compute the expectation (smoothgrad).
     """
+
     def __init__(self, forward_func: Callable) -> None:
         r"""
         Args:
diff --git a/captum/attr/_core/input_x_gradient.py b/captum/attr/_core/input_x_gradient.py
index 5f363b411c..801740dafc 100644
--- a/captum/attr/_core/input_x_gradient.py
+++ b/captum/attr/_core/input_x_gradient.py
@@ -13,7 +13,7 @@ class InputXGradient(GradientAttribution):
     the gradient with respect to input.
     https://arxiv.org/abs/1611.07270
     """
-    
+
     def __init__(self, forward_func: Callable) -> None:
         r"""
         Args:
diff --git a/captum/attr/_core/layer/layer_activation.py b/captum/attr/_core/layer/layer_activation.py
index 95c78fe3a4..28ffc4de49 100644
--- a/captum/attr/_core/layer/layer_activation.py
+++ b/captum/attr/_core/layer/layer_activation.py
@@ -14,7 +14,7 @@ class LayerActivation(LayerAttribution):
     r"""
     Computes activation of selected layer for given input.
     """
-    
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/layer/layer_conductance.py b/captum/attr/_core/layer/layer_conductance.py
index b85cf620f2..baa7aced74 100644
--- a/captum/attr/_core/layer/layer_conductance.py
+++ b/captum/attr/_core/layer/layer_conductance.py
@@ -35,6 +35,7 @@ class LayerConductance(LayerAttribution, GradientAttribution):
     features, utilize NeuronConductance instead, and provide the target
     neuron index.
     """
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/layer/layer_deep_lift.py b/captum/attr/_core/layer/layer_deep_lift.py
index c13b4610b3..c617ceb106 100644
--- a/captum/attr/_core/layer/layer_deep_lift.py
+++ b/captum/attr/_core/layer/layer_deep_lift.py
@@ -364,7 +364,7 @@ class LayerDeepLiftShap(LayerDeepLift, DeepLiftShap):
     Although, it assumes a linear model for each explanation, the overall
     model across multiple explanations can be complex and non-linear.
     """
-    
+
     def __init__(self, model: Module, layer: Module) -> None:
         r"""
         Args:
diff --git a/captum/attr/_core/layer/layer_feature_ablation.py b/captum/attr/_core/layer/layer_feature_ablation.py
index e79088ee7f..96618aa785 100644
--- a/captum/attr/_core/layer/layer_feature_ablation.py
+++ b/captum/attr/_core/layer/layer_feature_ablation.py
@@ -32,6 +32,7 @@ class LayerFeatureAblation(LayerAttribution, PerturbationAttribution):
     equal to the change in target as a result of ablating the entire neuron
     group.
     """
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/layer/layer_gradient_x_activation.py b/captum/attr/_core/layer/layer_gradient_x_activation.py
index e597a1edde..ef51808a8a 100644
--- a/captum/attr/_core/layer/layer_gradient_x_activation.py
+++ b/captum/attr/_core/layer/layer_gradient_x_activation.py
@@ -23,6 +23,7 @@ class LayerGradientXActivation(LayerAttribution, GradientAttribution):
     Computes element-wise product of gradient and activation for selected
     layer on given inputs.
     """
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/neuron/neuron_conductance.py b/captum/attr/_core/neuron/neuron_conductance.py
index af7435ae85..e7aec753da 100644
--- a/captum/attr/_core/neuron/neuron_conductance.py
+++ b/captum/attr/_core/neuron/neuron_conductance.py
@@ -28,6 +28,7 @@ class NeuronConductance(NeuronAttribution, GradientAttribution):
     The details of the approach can be found here:
     https://arxiv.org/abs/1805.12233
     """
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/neuron/neuron_feature_ablation.py b/captum/attr/_core/neuron/neuron_feature_ablation.py
index b57b4b2658..a51718c2aa 100644
--- a/captum/attr/_core/neuron/neuron_feature_ablation.py
+++ b/captum/attr/_core/neuron/neuron_feature_ablation.py
@@ -26,7 +26,7 @@ class NeuronFeatureAblation(NeuronAttribution, PerturbationAttribution):
     equal to the change in target as a result of ablating the entire feature
     group.
     """
-    
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/neuron/neuron_gradient.py b/captum/attr/_core/neuron/neuron_gradient.py
index ffdc0a33f4..11bea817b3 100644
--- a/captum/attr/_core/neuron/neuron_gradient.py
+++ b/captum/attr/_core/neuron/neuron_gradient.py
@@ -22,6 +22,7 @@ class NeuronGradient(NeuronAttribution, GradientAttribution):
     Computes the gradient of the output of a particular neuron with
     respect to the inputs of the network.
     """
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/neuron/neuron_gradient_shap.py b/captum/attr/_core/neuron/neuron_gradient_shap.py
index 9b47eb8924..b3e6405c99 100644
--- a/captum/attr/_core/neuron/neuron_gradient_shap.py
+++ b/captum/attr/_core/neuron/neuron_gradient_shap.py
@@ -44,6 +44,7 @@ class NeuronGradientShap(NeuronAttribution, GradientAttribution):
     randomly draw samples from the distribution of baselines, add noise to input
     samples and compute the expectation (smoothgrad).
     """
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py
index 62430fa636..bfc554910b 100644
--- a/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py
+++ b/captum/attr/_core/neuron/neuron_guided_backprop_deconvnet.py
@@ -10,7 +10,7 @@
 
 
 class NeuronDeconvolution(NeuronAttribution, GradientAttribution):
-    """        
+    r"""
     Computes attribution of the given neuron using deconvolution.
     Deconvolution computes the gradient of the target output with
     respect to the input, but gradients of ReLU functions are overriden so
@@ -27,6 +27,7 @@ class NeuronDeconvolution(NeuronAttribution, GradientAttribution):
     given model are performed using a module (nn.module.ReLU).
     If nn.functional.ReLU is used, gradients are not overriden appropriately.
     """
+
     def __init__(
         self, model: Module, layer: Module, device_ids: Union[None, List[int]] = None
     ) -> None:
@@ -152,6 +153,7 @@ class NeuronGuidedBackprop(NeuronAttribution, GradientAttribution):
     given model are performed using a module (nn.module.ReLU).
     If nn.functional.ReLU is used, gradients are not overriden appropriately.
     """
+
     def __init__(
         self, model: Module, layer: Module, device_ids: Union[None, List[int]] = None
     ) -> None:
diff --git a/captum/attr/_core/neuron/neuron_integrated_gradients.py b/captum/attr/_core/neuron/neuron_integrated_gradients.py
index ad421fe1b1..8a04fc13d2 100644
--- a/captum/attr/_core/neuron/neuron_integrated_gradients.py
+++ b/captum/attr/_core/neuron/neuron_integrated_gradients.py
@@ -22,6 +22,7 @@ class NeuronIntegratedGradients(NeuronAttribution, GradientAttribution):
     Note that this method is equivalent to applying integrated gradients
     where the output is the output of the identified neuron.
     """
+
     def __init__(
         self,
         forward_func: Callable,
diff --git a/captum/attr/_core/saliency.py b/captum/attr/_core/saliency.py
index 18a7f07b3b..0b3a5d7845 100644
--- a/captum/attr/_core/saliency.py
+++ b/captum/attr/_core/saliency.py
@@ -20,6 +20,7 @@ class Saliency(GradientAttribution):
     More details about the approach can be found in the following paper:
         https://arxiv.org/pdf/1312.6034.pdf
     """
+
     def __init__(self, forward_func: Callable) -> None:
         r"""
         Args:

From 8e56c5c53d204843ad2958e51346c0b4c73741f1 Mon Sep 17 00:00:00 2001
From: Vivek Miglani <vivekm@fb.com>
Date: Mon, 2 Mar 2020 11:07:08 -0800
Subject: [PATCH 3/3] Remove stray blank line from DeepLiftShap docstring

---
 captum/attr/_core/deep_lift.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/captum/attr/_core/deep_lift.py b/captum/attr/_core/deep_lift.py
index 31f276d3b6..da51b91d9c 100644
--- a/captum/attr/_core/deep_lift.py
+++ b/captum/attr/_core/deep_lift.py
@@ -536,7 +536,6 @@ class DeepLiftShap(DeepLift):
             the additive composition of feature effects.
     Although, it assumes a linear model for each explanation, the overall
     model across multiple explanations can be complex and non-linear.
-
     """
 
     def __init__(self, model: Module) -> None: