diff --git a/captum/optim/__init__.py b/captum/optim/__init__.py
index 9177d5c62c..828ac03dd2 100644
--- a/captum/optim/__init__.py
+++ b/captum/optim/__init__.py
@@ -7,6 +7,7 @@
 from captum.optim._param.image.images import ImageTensor  # noqa: F401
 from captum.optim._utils import circuits, reducer  # noqa: F401
 from captum.optim._utils.image import atlas  # noqa: F401
+from captum.optim._utils.image import dataset  # noqa: F401
 from captum.optim._utils.image.common import (  # noqa: F401
     hue_to_rgb,
     make_grid_image,
@@ -28,6 +29,7 @@
     "reducer",
     "make_grid_image",
     "atlas",
+    "dataset",
     "hue_to_rgb",
     "nchannels_to_rgb",
     "save_tensor_as_image",
diff --git a/captum/optim/_core/optimization.py b/captum/optim/_core/optimization.py
index cd11db9e34..6ce3fb3e13 100644
--- a/captum/optim/_core/optimization.py
+++ b/captum/optim/_core/optimization.py
@@ -1,5 +1,3 @@
-"""captum.optim.optimization."""
-
 import warnings
 from typing import Callable, Iterable, Optional
 
@@ -31,10 +29,24 @@ class InputOptimization(Objective, Parameterized):
     """
     Core function that optimizes an input to maximize a target (aka objective).
     This is similar to gradient-based methods for adversarial examples, such
-    as FGSM. The code for this was based on the implementation by the authors of Lucid.
-    For more details, see the following:
-        https://github.com/tensorflow/lucid
-        https://distill.pub/2017/feature-visualization/
+    as :class:`FGSM <captum.robust.FGSM>`. The code for this was based on the
+    implementation by the authors of Lucid. For more details, see the following:
+
+      * https://github.com/tensorflow/lucid
+      * https://distill.pub/2017/feature-visualization/
+
+    Alias: ``captum.optim.InputOptimization``
+
+    Example::
+
+        >>> model = opt.models.googlenet(pretrained=True)
+        >>> loss_fn = opt.loss.LayerActivation(model.mixed4c)
+        >>> image = opt.images.NaturalImage(size=(224, 224))
+        >>> transform = opt.transforms.TransformationRobustness()
+        >>>
+        >>> obj = opt.InputOptimization(model, loss_fn, image, transform)
+        >>> history = obj.optimize(opt.optimization.n_steps(512))
+        >>> image().show(figsize=(10, 10)) # Display results
     """
 
     def __init__(
@@ -47,13 +59,32 @@ def __init__(
         r"""
         Args:
 
-            model (nn.Module, optional):  The reference to PyTorch model instance.
-            input_param (nn.Module, optional):  A module that generates an input,
-                        consumed by the model.
-            transform (nn.Module, optional):  A module that transforms or preprocesses
-                        the input before being passed to the model.
-            loss_function (callable): The loss function to minimize during optimization
-                        optimization.
+            model (nn.Module, optional): The reference to PyTorch model instance. Set
+                to ``None`` for no model instance.
+            loss_function (Callable): The :mod:`Loss <.loss>` objective instance to
+                minimize during optimization.
+            input_param (InputParameterization, optional): A module that generates an
+                input, consumed by the model. Example: An
+                :mod:`ImageParameterization <captum.optim.images>` instance.
+            transform (nn.Module, optional): A module that transforms or preprocesses
+                the input before being passed to the model. Set to
+                :class:`torch.nn.Identity` for no transforms.
+
+        Instance variables that can be used in the :func:`InputOptimization.optimize`
+        function, custom optimization functions, and StopCriteria functions:
+
+        Attributes:
+
+            model (torch.nn.Module): The model instance given when initializing
+                ``InputOptimization``. If ``model`` was set to ``None`` during
+                initialization, then an instance of :class:`torch.nn.Identity` will be
+                returned.
+            input_param (InputParameterization): The input parameterization
+                instance given when initializing ``InputOptimization``.
+            loss_function (Loss): The composable :mod:`Loss <.loss>` instance given
+                when initializing ``InputOptimization``.
+            transform (torch.nn.Module): The transform instance given when
+                initializing ``InputOptimization``.
         """
         self.model = model or nn.Identity()
         # Grab targets from loss_function
@@ -76,9 +107,9 @@ def loss(self) -> torch.Tensor:
         r"""Compute loss value for current iteration.
 
         Returns:
-            *tensor* representing **loss**:
-            - **loss** (*tensor*):
-                        Size of the tensor corresponds to the targets passed.
+            tensor representing **loss**:
+            - **loss** (torch.Tensor): Size of the tensor corresponds to the targets
+                passed.
         """
         input_t = self.input_param()
 
@@ -95,7 +126,9 @@ def loss(self) -> torch.Tensor:
         return loss_value
 
     def cleanup(self) -> None:
-        r"""Garbage collection, mainly removing hooks."""
+        r"""Garbage collection, mainly removing hooks.
+        This should only be run after optimize is finished running.
+        """
         self.hooks.remove_hooks()
 
     # Targets are managed by ModuleOutputHooks; we mainly just want a convenient setter
@@ -109,6 +142,11 @@ def targets(self, value: Iterable[nn.Module]) -> None:
         self.hooks = ModuleOutputsHook(value)
 
     def parameters(self) -> Iterable[nn.Parameter]:
+        """
+        Returns:
+            parameters (iterable of torch.nn.Parameter): An iterable of parameters in
+                the input parameterization.
+        """
         return self.input_param.parameters()
 
     def optimize(
@@ -122,18 +160,19 @@ def optimize(
 
         Args:
 
-            stop_criteria (StopCriteria, optional):  A function that is called
-                        every iteration and returns a bool that determines whether
-                        to stop the optimization.
-                        See captum.optim.typing.StopCriteria for details.
-            optimizer (Optimizer, optional):  An torch.optim.Optimizer used to
-                        optimize the input based on the loss function.
+            stop_criteria (StopCriteria, optional): A function that is called
+                every iteration and returns a bool that determines whether to stop the
+                optimization.
+                Default: :func:`n_steps(512) <.n_steps>`
+            optimizer (torch.optim.Optimizer, optional): A ``torch.optim.Optimizer``
+                instance to use for optimizing the input based on the loss function.
+                Default: :class:`torch.optim.Adam`
             loss_summarize_fn (Callable, optional): The function to use for summarizing
                 tensor outputs from loss functions.
-                Default: default_loss_summarize
-            lr: (float, optional): If no optimizer is given, then lr is used as the
+                Default: :func:`.default_loss_summarize`
+            lr (float, optional): If no optimizer is given, then lr is used as the
                 learning rate for the Adam optimizer.
-                Default: 0.025
+                Default: ``0.025``
 
         Returns:
             history (torch.Tensor): A stack of loss values per iteration. The size
@@ -163,13 +202,18 @@ def optimize(
 def n_steps(n: int, show_progress: bool = True) -> StopCriteria:
     """StopCriteria generator that uses number of steps as a stop criteria.
 
+    Example::
+
+        >>> stop_criteria = opt.optimization.n_steps(512, True)
+
     Args:
-        n (int):  Number of steps to run optimization.
-        show_progress (bool, optional):  Whether or not to show progress bar.
-            Default: True
+
+        n (int): Number of steps to run optimization.
+        show_progress (bool, optional): Whether or not to show progress bar.
+            Default: ``True``
 
     Returns:
-        *StopCriteria* callable
+        StopCriteria (Callable): A stop criteria function.
     """
 
     if show_progress:
diff --git a/captum/optim/_param/image/images.py b/captum/optim/_param/image/images.py
index fa313b38af..16e5f625e0 100644
--- a/captum/optim/_param/image/images.py
+++ b/captum/optim/_param/image/images.py
@@ -21,6 +21,29 @@
 
 
 class ImageTensor(torch.Tensor):
+    r"""
+    A subclass of :class:`torch.Tensor` that provides functions for easy loading,
+    saving, and displaying image tensors.
+
+    Alias: ``captum.optim.ImageTensor``
+
+    Example using file path or URL::
+
+        >>> image_tensor = opt.images.ImageTensor.load(<path/to/image_file>)
+        >>> image_tensor.export(filename="image_tensor.jpg")  # Save image(s)
+        >>> image_tensor.show()  # Displays image(s) via Matplotlib
+
+    Example using ``torch.Tensor``::
+
+        >>> image_tensor = torch.randn(1, 3, 224, 224)
+        >>> image_tensor = opt.images.ImageTensor(image_tensor)
+
+    Example using ``np.ndarray``::
+
+        >>> image_tensor = np.random.rand(1, 3, 224, 224)
+        >>> image_tensor = opt.images.ImageTensor(image_tensor)
+    """
+
     @staticmethod
     def __new__(
         cls: Type["ImageTensor"],
@@ -32,10 +55,10 @@ def __new__(
         Args:
 
             x (list or np.ndarray or torch.Tensor): A list, NumPy array, or PyTorch
-                tensor to create an `ImageTensor` from.
+                tensor to create an ``ImageTensor`` from.
 
         Returns:
-           x (ImageTensor): An `ImageTensor` instance.
+           x (ImageTensor): An ``ImageTensor`` instance.
         """
         if isinstance(x, torch.Tensor) and x.is_cuda:
             x.show = MethodType(cls.show, x)
@@ -45,17 +68,18 @@ def __new__(
             return super().__new__(cls, x, *args, **kwargs)
 
     @classmethod
-    def open(cls, path: str, scale: float = 255.0, mode: str = "RGB") -> "ImageTensor":
+    def load(cls, path: str, scale: float = 255.0, mode: str = "RGB") -> "ImageTensor":
         """
-        Load an image file from a URL or local filepath directly into an `ImageTensor`.
+        Load an image file from a URL or local filepath directly into an
+        ``ImageTensor``.
 
         Args:
 
             path (str): A URL or filepath to an image.
             scale (float, optional): The image scale to use.
-                Default: 255.0
+                Default: ``255.0``
             mode (str, optional): The image loading mode / colorspace to use.
-                Default: "RGB"
+                Default: ``"RGB"``
 
         Returns:
            x (ImageTensor): An `ImageTensor` instance.
@@ -69,6 +93,11 @@ def open(cls, path: str, scale: float = 255.0, mode: str = "RGB") -> "ImageTenso
         img_np = np.array(img.convert(mode)).astype(np.float32)
         return cls(img_np.transpose(2, 0, 1) / scale)
 
+    @classmethod
+    def open(cls, path: str, scale: float = 255.0, mode: str = "RGB") -> "ImageTensor":
+        r"""Alias for :func:`load`."""
+        return cls.load(path=path, scale=scale, mode=mode)
+
     def __repr__(self) -> str:
         prefix = "ImageTensor("
         indent = len(prefix)
@@ -104,24 +133,27 @@ def show(
         pad_value: float = 0.0,
     ) -> None:
         """
-        Display an `ImageTensor`.
+        Display image(s) in the ``ImageTensor`` instance using
+        :func:`captum.optim.show`.
 
         Args:
 
-            figsize (Tuple[int, int], optional): height & width to use
-                for displaying the `ImageTensor` figure.
-            scale (float, optional): Value to multiply the `ImageTensor` by so that
+            figsize (tuple of int, optional): The height & width to use for displaying
+                the ``ImageTensor`` figure, in the format of: (height, width).
+                Default: ``None``
+            scale (float, optional): Value to multiply the ``ImageTensor`` by so that
                 it's value range is [0-255] for display.
-                Default: 255.0
+                Default: ``255.0``
             images_per_row (int, optional): The number of images per row to use for the
-                grid image. Default is set to None for no grid image creation.
-                Default: None
+                grid image. Default is set to ``None`` for no grid image creation.
+                Default: ``None``
             padding (int, optional): The amount of padding between images in the grid
-                images. This parameter only has an effect if `nrow` is not None.
-                Default: 2
+                images. This parameter only has an effect if ``images_per_row`` is not
+                ``None``.
+                Default: ``2``
             pad_value (float, optional): The value to use for the padding. This
-                parameter only has an effect if `nrow` is not None.
-                Default: 0.0
+                parameter only has an effect if ``images_per_row`` is not ``None``.
+                Default: ``0.0``
         """
         show(
             self,
@@ -142,27 +174,29 @@ def export(
         pad_value: float = 0.0,
     ) -> None:
         """
-        Save an `ImageTensor` as an image file.
+        Save image(s) in the ``ImageTensor`` instance as an image file, using
+        :func:`captum.optim.save_tensor_as_image`.
 
         Args:
 
-            filename (str): The filename to use when saving the `ImageTensor` as an
+            filename (str): The filename to use when saving the ``ImageTensor`` as an
                 image file.
-            scale (float, optional): Value to multiply the `ImageTensor` by so that
+            scale (float, optional): Value to multiply the ``ImageTensor`` by so that
                 it's value range is [0-255] for saving.
-                Default: 255.0
+                Default: ``255.0``
             mode (str, optional): A PIL / Pillow supported colorspace. Default is
                 set to None for automatic RGB / RGBA detection and usage.
-                Default: None
+                Default: ``None``
             images_per_row (int, optional): The number of images per row to use for the
                 grid image. Default is set to None for no grid image creation.
-                Default: None
+                Default: ``None``
             padding (int, optional): The amount of padding between images in the grid
-                images. This parameter only has an effect if `nrow` is not None.
-                Default: 2
+                images. This parameter only has an effect if ``images_per_row`` is not
+                ``None``.
+                Default: ``2``
             pad_value (float, optional): The value to use for the padding. This
-                parameter only has an effect if `nrow` is not None.
-                Default: 0.0
+                parameter only has an effect if ``images_per_row`` is not ``None``.
+                Default: ``0.0``
         """
         save_tensor_as_image(
             self,
diff --git a/captum/optim/_utils/image/atlas.py b/captum/optim/_utils/image/atlas.py
index 5954a3a471..dd68bccc64 100644
--- a/captum/optim/_utils/image/atlas.py
+++ b/captum/optim/_utils/image/atlas.py
@@ -78,6 +78,7 @@ def calc_grid_indices(
     ]
 
     Args:
+
         xy_grid (torch.tensor): The xy coordinate grid activation samples, with a shape
             of: [n_points, 2].
         grid_size (Tuple[int, int]): The grid_size of grid cells to use. The grid_size
@@ -86,6 +87,7 @@ def calc_grid_indices(
             Default: (0.0, 1.0)
         y_extent (Tuple[float, float], optional): The y axis range to use.
             Default: (0.0, 1.0)
+
     Returns:
         indices (list of list of torch.Tensors): List of lists of grid indices
             stored inside tensors to use. Each 1D tensor of indices has a size of:
diff --git a/captum/optim/_utils/image/common.py b/captum/optim/_utils/image/common.py
index f1cdc5f477..77da453678 100644
--- a/captum/optim/_utils/image/common.py
+++ b/captum/optim/_utils/image/common.py
@@ -90,10 +90,10 @@ def show(
             grid image. Default is set to None for no grid image creation.
             Default: None
         padding (int, optional): The amount of padding between images in the grid
-            images. This parameter only has an effect if nrow is not None.
+            images. This parameter only has an effect if `images_per_row` is not None.
             Default: 2
         pad_value (float, optional): The value to use for the padding. This parameter
-            only has an effect if nrow is not None.
+            only has an effect if `images_per_row` is not None.
             Default: 0.0
     """
 
@@ -140,10 +140,10 @@ def save_tensor_as_image(
             grid image. Default is set to None for no grid image creation.
             Default: None
         padding (int, optional): The amount of padding between images in the grid
-            images. This parameter only has an effect if `nrow` is not None.
+            images. This parameter only has an effect if `images_per_row` is not None.
             Default: 2
         pad_value (float, optional): The value to use for the padding. This parameter
-            only has an effect if `nrow` is not None.
+            only has an effect if `images_per_row` is not None.
             Default: 0.0
     """
 
@@ -208,6 +208,7 @@ def _dot_cossim(
     a specified dimension.
 
     Args:
+
         x (torch.Tensor): The tensor that you wish to compute the cosine similarity
             for in relation to tensor y.
         y (torch.Tensor): The tensor that you wish to compute the cosine similarity
@@ -216,6 +217,7 @@ def _dot_cossim(
         dim (int, optional): The target dimension for computing cosine similarity.
         eps (float, optional): If cossim_pow is greater than zero, the desired
             epsilon value to use for cosine similarity calculations.
+
     Returns:
         tensor (torch.Tensor): Dot cosine similarity between x and y, along the
         specified dim.
@@ -241,13 +243,16 @@ def hue_to_rgb(
 ) -> torch.Tensor:
     """
     Create an RGB unit vector based on a hue of the input angle.
+
     Args:
+
         angle (float): The hue angle to create an RGB color for.
         device (torch.device, optional): The device to create the angle color tensor
             on.
             Default: torch.device("cpu")
         warp (bool, optional): Whether or not to make colors more distinguishable.
             Default: True
+
     Returns:
         color_vec (torch.Tensor): A color vector.
     """
@@ -293,6 +298,7 @@ def nchannels_to_rgb(
             Default: True
         eps (float, optional): An optional epsilon value.
             Default: 1e-4
+
     Returns:
         tensor (torch.Tensor): An NCHW RGB image tensor.
     """
@@ -326,6 +332,7 @@ def weights_to_heatmap_2d(
     no excitation or inhibition.
 
     Args:
+
         weight (torch.Tensor):  A 2d tensor to create the heatmap from.
         colors (list of str):  A list of 5 strings containing hex triplet
             (six digit), three-byte hexadecimal color values to use for coloring
diff --git a/captum/optim/_utils/image/dataset.py b/captum/optim/_utils/image/dataset.py
index c894173990..7f03129ac7 100644
--- a/captum/optim/_utils/image/dataset.py
+++ b/captum/optim/_utils/image/dataset.py
@@ -1,6 +1,7 @@
 from typing import cast
 
 import torch
+from packaging import version
 
 try:
     from tqdm.auto import tqdm
@@ -18,11 +19,11 @@ def image_cov(x: torch.Tensor) -> torch.Tensor:
 
     Args:
 
-        x (torch.Tensor):  One or more NCHW image tensors stacked across the batch
+        x (torch.Tensor): One or more NCHW image tensors stacked across the batch
             dimension.
 
     Returns:
-        *tensor* (torch.Tensor):  The average color channel covariance matrix for the
+        tensor (torch.Tensor): The average color channel covariance matrix
             for the input tensor, with a shape of: [n_channels, n_channels].
     """
 
@@ -41,18 +42,27 @@ def dataset_cov_matrix(
     """
     Calculate the covariance matrix for an image dataset.
 
+    Example::
+
+        >>> # Load image dataset
+        >>> dataset = torchvision.datasets.ImageFolder("<path/to/dataset>")
+        >>> dataset_loader = torch.utils.data.DataLoader(dataset)
+        >>> # Calculate dataset COV matrix
+        >>> cov_mtx = opt.dataset.dataset_cov_matrix(dataset_loader, True)
+        >>> print(cov_mtx)
+
     Args:
 
-        loader (torch.utils.data.DataLoader):  The reference to a PyTorch
+        loader (torch.utils.data.DataLoader): The reference to a PyTorch
             dataloader instance.
         show_progress (bool, optional): Whether or not to display a tqdm progress bar.
-            Default: False
-        device (torch.device, optional): The PyTorch device to use for for calculating
-            the cov matrix.
-            Default: torch.device("cpu")
+            Default: ``False``
+        device (torch.device, optional): The PyTorch device to use for calculating the
+            cov matrix.
+            Default: ``torch.device("cpu")``
 
     Returns:
-        *tensor*:  A covariance matrix for the specified dataset.
+        tensor (torch.Tensor): A covariance matrix for the specified dataset.
     """
 
     if show_progress:
@@ -73,6 +83,15 @@ def dataset_cov_matrix(
     return cov_mtx
 
 
+# Handle older versions of PyTorch
+# Defined outside of function in order to support JIT
+_torch_norm = (
+    torch.linalg.norm
+    if version.parse(torch.__version__) >= version.parse("1.7.0")
+    else torch.norm
+)
+
+
 def cov_matrix_to_klt(
     cov_mtx: torch.Tensor, normalize: bool = False, epsilon: float = 1e-10
 ) -> torch.Tensor:
@@ -81,22 +100,22 @@ def cov_matrix_to_klt(
 
     Args:
 
-        cov_mtx (tensor):  A 3 by 3 covariance matrix generated from a dataset.
-        normalize (bool):  Whether or not to normalize the resulting KLT matrix.
-            Default: False
-        epsilon (float):
+        cov_mtx (torch.Tensor): A 3 by 3 covariance matrix generated from a dataset.
+        normalize (bool): Whether or not to normalize the resulting KLT matrix.
+            Default: ``False``
+        epsilon (float, optional): A small epsilon value to use for numerical
+            stability.
+            Default: ``1e-10``
 
     Returns:
-        *tensor*:  A KLT matrix for the specified covariance matrix.
+        tensor (torch.Tensor): A KLT matrix for the specified covariance
+            matrix, divided by its largest column norm if ``normalize`` is set.
     """
 
-    # Handle older versions of PyTorch
-    torch_norm = torch.linalg.norm if torch.__version__ >= "1.9.0" else torch.norm
-
     U, S, V = torch.svd(cov_mtx)
     svd_sqrt = U @ torch.diag(torch.sqrt(S + epsilon))
     if normalize:
-        svd_sqrt / torch.max(torch_norm(svd_sqrt, dim=0))
+        svd_sqrt = svd_sqrt / torch.max(_torch_norm(svd_sqrt, dim=0))
     return svd_sqrt
 
 
@@ -107,25 +126,34 @@ def dataset_klt_matrix(
     device: torch.device = torch.device("cpu"),
 ) -> torch.Tensor:
     """
-    Calculate the color correlation matrix, also known as
-    a Karhunen-Loève transform (KLT) matrix, for a dataset.
-    The color correlation matrix can then used in color decorrelation
-    transforms for models trained on the dataset.
+    Calculate the color correlation matrix, also known as a Karhunen-Loève transform
+    (KLT) matrix, for a dataset. The color correlation matrix can then be used in color
+    decorrelation & recorrelation transforms like
+    :class:`captum.optim.transforms.ToRGB` for models trained on the dataset.
+
+    Example::
+
+        >>> # Load image dataset
+        >>> dataset = torchvision.datasets.ImageFolder("<path/to/dataset>")
+        >>> dataset_loader = torch.utils.data.DataLoader(dataset)
+        >>> # Calculate dataset KLT matrix
+        >>> klt_mtx = opt.dataset.dataset_klt_matrix(dataset_loader, True, True)
+        >>> print(klt_mtx)
 
     Args:
 
-        loader (torch.utils.data.DataLoader):  The reference to a PyTorch
+        loader (torch.utils.data.DataLoader): The reference to a PyTorch
             dataloader instance.
-        normalize (bool):  Whether or not to normalize the resulting KLT matrix.
-            Default: False
+        normalize (bool): Whether or not to normalize the resulting KLT matrix.
+            Default: ``False``
         show_progress (bool, optional): Whether or not to display a tqdm progress bar.
-            Default: False
-        device (torch.device, optional): The PyTorch device to use for for calculating
-            the cov matrix.
-            Default: torch.device("cpu")
+            Default: ``False``
+        device (torch.device, optional): The PyTorch device to use for calculating the
+            cov matrix.
+            Default: ``torch.device("cpu")``
 
     Returns:
-        *tensor*:  A KLT matrix for the specified dataset.
+        tensor (torch.Tensor): A KLT matrix for the specified dataset.
     """
 
     cov_mtx = dataset_cov_matrix(loader, show_progress=show_progress, device=device)
diff --git a/captum/optim/_utils/reducer.py b/captum/optim/_utils/reducer.py
index 2696d003d6..585d0157e0 100644
--- a/captum/optim/_utils/reducer.py
+++ b/captum/optim/_utils/reducer.py
@@ -22,6 +22,7 @@ class ChannelReducer:
     See here for more information: https://distill.pub/2018/building-blocks/
 
     Args:
+
         n_components (int, optional):  The number of channels to reduce the target
             dimension to.
         reduction_alg (str or callable, optional):  The desired dimensionality
@@ -71,11 +72,14 @@ def fit_transform(
     ) -> torch.Tensor:
         """
         Perform dimensionality reduction on an input tensor.
+
         Args:
+
             tensor (tensor):  A tensor to perform dimensionality reduction on.
             swap_2nd_and_last_dims (bool, optional): If true, input channels are
                 expected to be in the second dimension unless the input tensor has a
                 shape of CHW. Default is set to True.
+
         Returns:
             *tensor*:  A tensor with one of it's dimensions reduced.
         """
@@ -131,8 +135,10 @@ def posneg(x: torch.Tensor, dim: int = 0) -> torch.Tensor:
     NMF with regular NMF.
 
     Args:
+
         x (tensor):  A tensor to make positive.
         dim (int, optional):  The dimension to concatinate the two tensor halves at.
+
     Returns:
         tensor (torch.tensor):  A positive tensor for one-sided dimensionality
             reduction.
diff --git a/captum/optim/_utils/typing.py b/captum/optim/_utils/typing.py
index a0e3d6f1c0..10d37bd835 100755
--- a/captum/optim/_utils/typing.py
+++ b/captum/optim/_utils/typing.py
@@ -1,7 +1,8 @@
 import sys
 from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
-from torch import Tensor, __version__
+from torch import Tensor
+from torch import distributions
 from torch.nn import Module
 from torch.optim import Optimizer
 
@@ -33,16 +34,11 @@ def cleanup(self) -> None:
 LossFunction = Callable[[ModuleOutputMapping], Tensor]
 SingleTargetLossFunction = Callable[[Tensor], Tensor]
 
-if __version__ < "1.4.0":
-    NumSeqOrTensorOrProbDistType = Union[Sequence[int], Sequence[float], Tensor]
-else:
-    from torch import distributions
-
-    NumSeqOrTensorOrProbDistType = Union[
-        Sequence[int],
-        Sequence[float],
-        Tensor,
-        distributions.distribution.Distribution,
-    ]
+NumSeqOrTensorOrProbDistType = Union[
+    Sequence[int],
+    Sequence[float],
+    Tensor,
+    distributions.distribution.Distribution,
+]
 IntSeqOrIntType = Union[List[int], Tuple[int], Tuple[int, int], int]
 TupleOfTensorsOrTensorType = Union[Tuple[Tensor, ...], Tensor]
diff --git a/captum/optim/models/__init__.py b/captum/optim/models/__init__.py
index 0f809d5ef5..121fa09257 100755
--- a/captum/optim/models/__init__.py
+++ b/captum/optim/models/__init__.py
@@ -1,10 +1,12 @@
 from ._common import (  # noqa: F401
+    MaxPool2dRelaxed,
     RedirectedReluLayer,
     SkipLayer,
     collect_activations,
     get_model_layers,
     replace_layers,
     skip_layers,
+    Conv2dSame,
 )
 from ._image.inception5h_classes import INCEPTION5H_CLASSES  # noqa: F401
 from ._image.inception_v1 import InceptionV1, googlenet  # noqa: F401
@@ -17,6 +19,8 @@
 )
 
 __all__ = [
+    "Conv2dSame",
+    "MaxPool2dRelaxed",
     "RedirectedReluLayer",
     "SkipLayer",
     "collect_activations",
diff --git a/captum/optim/models/_common.py b/captum/optim/models/_common.py
index e65e281217..9fa9cda942 100644
--- a/captum/optim/models/_common.py
+++ b/captum/optim/models/_common.py
@@ -16,6 +16,9 @@ def get_model_layers(model: nn.Module) -> List[str]:
     Args:
 
         model (nn.Module): A PyTorch model or module instance to collect layers from.
+
+    Returns:
+        model_layers (list of str): A list of hookable layers in the model.
     """
     layers = []
 
@@ -68,6 +71,14 @@ class RedirectedReluLayer(nn.Module):
 
     @torch.jit.ignore
     def forward(self, input: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+
+            input (torch.Tensor): A tensor to pass through RedirectedReLU.
+
+        Returns:
+            x (torch.Tensor): The output of RedirectedReLU.
+        """
         return RedirectedReLU.apply(input)
 
 
@@ -82,16 +93,24 @@ def replace_layers(
     Replace all target layers with new layers inside the specified model,
     possibly with the same initialization variables.
 
+    Example::
+
+        >>> model = opt.models.googlenet(pretrained=True)
+        >>> # Replace MaxPool2d layers with their AvgPool2d equivalents
+        >>> opt.models.replace_layers(model, nn.MaxPool2d, nn.AvgPool2d, True)
+
     Args:
-        model: (nn.Module): A PyTorch model instance.
-        layer1: (Type[nn.Module]): The layer class that you want to transfer
+
+        model (nn.Module): A PyTorch model instance.
+        layer1 (Type[nn.Module]): The layer class that you want to transfer
             initialization variables from.
-        layer2: (Type[nn.Module]): The layer class to create with the variables
-            from layer1.
-        transfer_vars (bool, optional): Wether or not to try and copy
-            initialization variables from layer1 instances to the replacement
-            layer2 instances.
-        kwargs: (Any, optional): Any additional variables to use when creating
+        layer2 (Type[nn.Module]): The layer class to create with the variables
+            from ``layer1``.
+        transfer_vars (bool, optional): Whether or not to try and copy
+            initialization variables from ``layer1`` instances to the replacement
+            ``layer2`` instances.
+            Default: ``False``
+        kwargs (Any, optional): Any additional variables to use when creating
             the new layer.
     """
 
@@ -112,13 +131,16 @@ def _transfer_layer_vars(
     """
     Given a layer instance, create a new layer instance of another class
     with the same initialization variables as the original layer.
+
     Args:
-        layer1: (nn.Module): A layer instance that you want to transfer
+
+        layer1 (nn.Module): A layer instance that you want to transfer
             initialization variables from.
-        layer2: (nn.Module): The layer class to create with the variables
+        layer2 (nn.Module): The layer class to create with the variables
             from of layer1.
-        kwargs: (Any, optional): Any additional variables to use when creating
+        kwargs (Any, optional): Any additional variables to use when creating
             the new layer.
+
     Returns:
         layer2 instance (nn.Module): An instance of layer2 with the initialization
             variables that it shares with layer1, and any specified additional
@@ -144,8 +166,7 @@ def _transfer_layer_vars(
 class Conv2dSame(nn.Conv2d):
     """
     Tensorflow like 'SAME' convolution wrapper for 2D convolutions.
-    TODO: Replace with torch.nn.Conv2d when support for padding='same'
-    is in stable version
+    torch.nn.Conv2d with padding='same' can be used when the stride is equal to 1.
     """
 
     def __init__(
@@ -170,24 +191,25 @@ def __init__(
            kernel_size (int or tuple of int): The desired kernel size to use.
            stride (int or tuple of int, optional): The desired stride for the
                cross-correlation.
-               Default: 1
+               Default: ``1``
            padding (int or tuple of int, optional): This value is always set to 0.
-               Default: 0
+               Default: ``0``
            dilation (int or tuple of int, optional): The desired spacing between the
                kernel points.
-               Default: 1
+               Default: ``1``
            groups (int, optional): Number of blocked connections from input channels
-               to output channels. Both in_channels and out_channels must be divisable
+               to output channels. Both in_channels and out_channels must be divisible
                by groups.
-               Default: 1
+               Default: ``1``
            bias (bool, optional): Whether or not to apply a learnable bias to the
                output.
+               Default: ``True``
         """
         super().__init__(
             in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias
         )
 
-    def calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
+    def _calc_same_pad(self, i: int, k: int, s: int, d: int) -> int:
         """
         Calculate the required padding for a dimension.
 
@@ -207,15 +229,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Args:
 
-            x (torch.tensor): The input tensor to apply 2D convolution to.
+            x (torch.Tensor): The input tensor to apply 2D convolution to.
 
         Returns
             x (torch.Tensor): The input tensor after the 2D convolution was applied.
         """
         ih, iw = x.size()[-2:]
         kh, kw = self.weight.size()[-2:]
-        pad_h = self.calc_same_pad(i=ih, k=kh, s=self.stride[0], d=self.dilation[0])
-        pad_w = self.calc_same_pad(i=iw, k=kw, s=self.stride[1], d=self.dilation[1])
+        pad_h = self._calc_same_pad(i=ih, k=kh, s=self.stride[0], d=self.dilation[0])
+        pad_w = self._calc_same_pad(i=iw, k=kw, s=self.stride[1], d=self.dilation[1])
 
         if pad_h > 0 or pad_w > 0:
             x = F.pad(
@@ -240,6 +262,13 @@ def collect_activations(
     """
     Collect target activations for a model.
 
+    Example::
+
+        >>> model = opt.models.googlenet(pretrained=True)
+        >>> target = model.mixed4c  # Target layer
+        >>> activ_dict = opt.models.collect_activations(model, target)
+        >>> activations = activ_dict[target]  # Get activations from dict
+
     Args:
 
         model (nn.Module): A PyTorch model instance.
@@ -247,13 +276,13 @@ def collect_activations(
             given model.
         model_input (torch.Tensor or tuple of torch.Tensor, optional): Optionally
             provide an input tensor to use when collecting the target activations.
-            Default: torch.zeros(1, 3, 224, 224)
+            Default: ``torch.zeros(1, 3, 224, 224)``
 
     Returns:
-        activ_dict (ModuleOutputMapping): A dictionary of collected activations where
-            the keys are the target layers.
+        activ_dict (dict[nn.Module, torch.Tensor]): A dictionary of collected
+            activations where the keys are the target layers.
     """
-    if not isinstance(targets, list):
+    if not isinstance(targets, (list, tuple)):
         targets = [targets]
     targets = list(dict.fromkeys(targets))
     catch_activ = ActivationFetcher(model, targets)
@@ -267,32 +296,35 @@ class SkipLayer(torch.nn.Module):
     during the forward pass. Use cases include removing nonlinear activation layers
     like ReLU for circuits research.
 
-    This layer works almost exactly the same way that nn.Indentiy does, except it also
-    ignores any additional arguments passed to the forward function. Any layer replaced
-    by SkipLayer must have the same input and output shapes.
+    This layer works almost exactly the same way that :class:`torch.nn.Identity` does,
+    except it also ignores any additional arguments passed to the forward function.
+    Any layer replaced by SkipLayer must have the same input and output shapes.
 
     See nn.Identity for more details:
     https://pytorch.org/docs/stable/generated/torch.nn.Identity.html
-
-    Args:
-        args (Any): Any argument. Arguments will be safely ignored.
-        kwargs (Any) Any keyword argument. Arguments will be safely ignored.
     """
 
     def __init__(self, *args, **kwargs) -> None:
+        """
+        Args:
+
+            args (Any, optional): Any argument. Arguments will be safely ignored.
+            kwargs (Any, optional): Any keyword argument. Arguments will be safely
+                ignored.
+        """
         super().__init__()
 
-    def forward(
-        self, x: Union[torch.Tensor, Tuple[torch.Tensor]], *args, **kwargs
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor]]:
+    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         """
         Args:
-            x (torch.Tensor or tuple of torch.Tensor): The input tensor or tensors.
-            args (Any): Any argument. Arguments will be safely ignored.
-            kwargs (Any) Any keyword argument. Arguments will be safely ignored.
+
+            x (torch.Tensor): The input tensor.
+            args (Any, optional): Any argument. Arguments will be safely ignored.
+            kwargs (Any, optional): Any keyword argument. Arguments will be safely
+                ignored.
+
         Returns:
-            x (torch.Tensor or tuple of torch.Tensor): The unmodified input tensor or
-                tensors.
+            x (torch.Tensor): The unmodified input tensor.
         """
         return x
 
@@ -301,17 +333,17 @@ def skip_layers(
     model: nn.Module, layers: Union[List[Type[nn.Module]], Type[nn.Module]]
 ) -> None:
     """
-    This function is a wrapper function for
-    replace_layers and replaces the target layer
-    with layers that do nothing.
-    This is useful for removing the nonlinear ReLU
-    layers when creating expanded weights.
+    This function is a wrapper function for :func:`.replace_layers` and replaces the
+    target layer with layers that do nothing. This is useful for removing the nonlinear
+    ReLU layers when creating expanded weights.
+
     Args:
+
         model (nn.Module): A PyTorch model instance.
-        layers (nn.Module or list of nn.Module): The layer
-            class type to replace in the model.
+        layers (type of nn.Module or list of type of nn.Module): The layer class type to
+            replace in the model.
     """
-    if not hasattr(layers, "__iter__"):
+    if not isinstance(layers, (tuple, list)):
         layers = cast(Type[nn.Module], layers)
         replace_layers(model, layers, SkipLayer)
     else:
@@ -330,9 +362,10 @@ class MaxPool2dRelaxed(torch.nn.Module):
     attributions of spatial posititions can be estimated using the rate at which
     increasing the neuron affects the output classes.
 
-    This layer peforms a MaxPool2d operation on the input, while using an equivalent
-    AvgPool2d layer to compute the gradient. This means that the forward pass returns
-    nn.MaxPool2d(input) while the backward pass uses nn.AvgPool2d(input).
+    This layer performs a :class:`torch.nn.MaxPool2d` operation on the input, while
+    using an equivalent :class:`torch.nn.AvgPool2d` layer to compute the gradient.
+    This means that the forward pass returns ``nn.MaxPool2d(input)`` while the
+    backward pass uses ``nn.AvgPool2d(input)``.
 
     Carter, et al., "Activation Atlas", Distill, 2019.
     https://distill.pub/2019/activation-atlas/
@@ -348,24 +381,29 @@ class MaxPool2dRelaxed(torch.nn.Module):
 
     def __init__(
         self,
-        kernel_size: Union[int, Tuple[int, ...]],
-        stride: Optional[Union[int, Tuple[int, ...]]] = None,
-        padding: Union[int, Tuple[int, ...]] = 0,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: Optional[Union[int, Tuple[int, int]]] = None,
+        padding: Union[int, Tuple[int, int]] = 0,
         ceil_mode: bool = False,
     ) -> None:
         """
         Args:
 
-            kernel_size (int or tuple of int): The size of the window to perform max &
-            average pooling with.
+            kernel_size (int or tuple of int): The size of the window to perform max
+                and average pooling with. Either a single int to use for both the
+                height & width or a tuple of 2 integers in format of: (height, width).
             stride (int or tuple of int, optional): The stride window size to use.
-                Default: None
+                Either a single int to use for both the height & width or a tuple of 2
+                integers in format of: (height, width).
+                Default: ``None``
             padding (int or tuple of int): The amount of zero padding to add to both
-                sides in the nn.MaxPool2d & nn.AvgPool2d modules.
-                Default: 0
+                sides in the ``nn.MaxPool2d`` & ``nn.AvgPool2d`` modules. Either a
+                single int to use for both the height & width or a tuple of 2 integers
+                in format of: (height, width).
+                Default: ``0``
             ceil_mode (bool, optional): Whether to use ceil or floor for creating the
                 output shape.
-                Default: False
+                Default: ``False``
         """
         super().__init__()
         self.maxpool = torch.nn.MaxPool2d(
diff --git a/captum/optim/models/_image/inception_v1_places365.py b/captum/optim/models/_image/inception_v1_places365.py
index 5ebca2a9b5..62a6834e16 100644
--- a/captum/optim/models/_image/inception_v1_places365.py
+++ b/captum/optim/models/_image/inception_v1_places365.py
@@ -18,35 +18,44 @@ def googlenet_places365(
     **kwargs: Any,
 ) -> "InceptionV1Places365":
     r"""GoogLeNet (also known as Inception v1 & Inception 5h) model architecture from
-    `"Going Deeper with Convolutions" <http://arxiv.org/abs/1409.4842>`_.
+    `"Going Deeper with Convolutions" <https://arxiv.org/abs/1409.4842>`_.
 
     The pretrained GoogleNet model was trained using the MIT Places365 Standard
     dataset. See here for more information: https://arxiv.org/abs/1610.02055
 
+    Example::
+
+        >>> model = opt.models.googlenet_places365(pretrained=True)
+        >>> output = model(torch.zeros(1, 3, 224, 224))
+
     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on the MIT
-            Places365 Standard dataset.
-            Default: False
-        progress (bool, optional): If True, displays a progress bar of the download to
-            stderr
-            Default: True
-        model_path (str, optional): Optional path for InceptionV1 model file.
-            Default: None
-        replace_relus_with_redirectedrelu (bool, optional): If True, return pretrained
-            model with Redirected ReLU in place of ReLU layers.
-            Default: *True* when pretrained is True otherwise *False*
-        use_linear_modules_only (bool, optional): If True, return pretrained
+
+        pretrained (bool, optional): If ``True``, returns a model pre-trained on the
+            MIT Places365 Standard dataset.
+            Default: ``False``
+        progress (bool, optional): If ``True``, displays a progress bar of the
+            download to stderr.
+            Default: ``True``
+        model_path (str, optional): Optional path for the InceptionV1 model file.
+            Default: ``None``
+        replace_relus_with_redirectedrelu (bool, optional): If ``True``, return
+            pretrained model with :class:`.RedirectedReLU` in place of ReLU layers.
+            Default: ``True`` when ``pretrained`` is ``True``, otherwise ``False``
+        use_linear_modules_only (bool, optional): If ``True``, return pretrained
             model with all nonlinear layers replaced with linear equivalents.
-            Default: False
-        aux_logits (bool, optional): If True, adds two auxiliary branches that can
+            Default: ``False``
+        aux_logits (bool, optional): If ``True``, adds two auxiliary branches that can
             improve training.
-            Default: True
+            Default: ``True``
         out_features (int, optional): Number of output features in the model used for
-            training. Default: 365 when pretrained is True.
-            Default: 365
+            training.
+            Default: ``365``
         transform_input (bool, optional): If True, preprocesses the input according to
             the method with which it was trained on Places365.
-            Default: True
+            Default: ``True``
+
+    Returns:
+        model (InceptionV1Places365): An InceptionV1 Places365 model instance.
     """
 
     if pretrained:
@@ -95,19 +104,19 @@ def __init__(
 
             out_features (int, optional): Number of output features in the model used
                 for training.
-                Default: 365
-            aux_logits (bool, optional): If True, adds two auxiliary branches that can
-                improve training.
-                Default: True
-            transform_input (bool, optional): If True, preprocesses the input according
-                to the method with which it was trained on Places365.
-                Default: True
-            replace_relus_with_redirectedrelu (bool, optional): If True, return
-                pretrained model with Redirected ReLU in place of ReLU layers.
-                Default: False
-            use_linear_modules_only (bool, optional): If True, return pretrained model
-                with all nonlinear layers replaced with linear equivalents.
-                Default: False
+                Default: ``365``
+            aux_logits (bool, optional): If ``True``, adds two auxiliary branches that
+                can improve training.
+                Default: ``True``
+            transform_input (bool, optional): If ``True``, preprocesses the input
+                according to the method with which it was trained on Places365.
+                Default: ``True``
+            replace_relus_with_redirectedrelu (bool, optional): If ``True``, return
+                pretrained model with :class:`.RedirectedReLU` in place of ReLU layers.
+                Default: ``False``
+            use_linear_modules_only (bool, optional): If ``True``, return pretrained
+                model with all nonlinear layers replaced with linear equivalents.
+                Default: ``False``
         """
         super().__init__()
         self.aux_logits = aux_logits
@@ -281,20 +290,26 @@ def __init__(
         """
         Args:
 
-            in_channels (int, optional): The number of input channels to use for the
-                inception module.
-            c1x1 (int, optional):
-            c3x3reduce (int, optional):
-            c3x3 (int, optional):
-            c5x5reduce (int, optional):
-            c5x5 (int, optional):
-            pool_proj (int, optional):
+            in_channels (int): The number of input channels to use for the first
+                layers of the inception module branches.
+            c1x1 (int): The number of output channels to use for the first layer in
+                the c1x1 branch.
+            c3x3reduce (int): The number of output channels to use for the first layer
+                in the c3x3 branch.
+            c3x3 (int): The number of output channels to use for the second layer in
+                the c3x3 branch.
+            c5x5reduce (int): The number of output channels to use for the first layer
+                in the c5x5 branch.
+            c5x5 (int): The number of output channels to use for the second layer in
+                the c5x5 branch.
+            pool_proj (int): The number of output channels to use for the second layer
+                in the pool branch.
             activ (type of nn.Module, optional): The nn.Module class type to use for
                 activation layers.
-                Default: nn.ReLU
+                Default: :class:`torch.nn.ReLU`
             p_layer (type of nn.Module, optional): The nn.Module class type to use for
                 pooling layers.
-                Default: nn.MaxPool2d
+                Default: :class:`torch.nn.MaxPool2d`
         """
         super().__init__()
         self.conv_1x1 = nn.Conv2d(
@@ -388,13 +403,13 @@ def __init__(
 
             in_channels (int, optional): The number of input channels to use for the
                 auxiliary branch.
-                Default: 508
+                Default: ``508``
             out_features (int, optional): The number of output features to use for the
                 auxiliary branch.
-                Default: 1008
-            activ (type of nn.Module, optional): The nn.Module class type to use for
-                activation layers.
-                Default: nn.ReLU
+                Default: ``1008``
+            activ (type of nn.Module, optional): The ``nn.Module`` class type to use
+                for activation layers.
+                Default: :class:`torch.nn.ReLU`
         """
         super().__init__()
         self.avg_pool = nn.AdaptiveAvgPool2d((4, 4))
diff --git a/setup.py b/setup.py
index 48bc6f4057..cd930850f8 100755
--- a/setup.py
+++ b/setup.py
@@ -133,6 +133,7 @@ def get_package_files(root, subdirs):
             "Model Understanding",
             "Feature Importance",
             "Neuron Importance",
+            "Feature Visualization",
             "PyTorch",
         ],
         classifiers=[
@@ -147,7 +148,7 @@ def get_package_files(root, subdirs):
         long_description=long_description,
         long_description_content_type="text/markdown",
         python_requires=">=3.6",
-        install_requires=["matplotlib", "numpy", "torch>=1.6"],
+        install_requires=["matplotlib", "numpy", "packaging", "torch>=1.6"],
         packages=find_packages(exclude=("tests", "tests.*")),
         extras_require={
             "dev": DEV_REQUIRES,
diff --git a/tests/optim/core/test_optimization.py b/tests/optim/core/test_optimization.py
index 7f77cf4b4d..1cd3301a98 100644
--- a/tests/optim/core/test_optimization.py
+++ b/tests/optim/core/test_optimization.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import unittest
+from typing import List
 
 import captum.optim as opt
 import torch
@@ -9,6 +10,54 @@
 
 
 class TestInputOptimization(BaseTest):
+    def test_input_optimization_init(self) -> None:
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
+            raise unittest.SkipTest(
+                "Skipping InputOptimization init test due to insufficient Torch"
+                + " version."
+            )
+        model = BasicModel_ConvNet_Optim()
+        loss_fn = opt.loss.ChannelActivation(model.layer, 1)
+        transform = torch.nn.Identity()
+        image_param = opt.images.NaturalImage()
+        obj = opt.InputOptimization(
+            model, loss_function=loss_fn, input_param=image_param, transform=transform
+        )
+
+        self.assertEqual(model, obj.model)
+        self.assertEqual(image_param, obj.input_param)
+        self.assertEqual(transform, obj.transform)
+        self.assertEqual(loss_fn, obj.loss_function)
+        self.assertEqual(list(image_param.parameters()), list(obj.parameters()))
+
+    def test_input_optimization_custom_optimize(self) -> None:
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
+            raise unittest.SkipTest(
+                "Skipping InputOptimization custom optimze test due to insufficient"
+                + " Torch version."
+            )
+        model = BasicModel_ConvNet_Optim()
+        loss_fn = opt.loss.ChannelActivation(model.layer, 0)
+        obj = opt.InputOptimization(model, loss_function=loss_fn)
+
+        stop_criteria = opt.optimization.n_steps(512, show_progress=False)
+        optimizer = torch.optim.Adam(obj.parameters(), lr=0.02)
+
+        history: List[torch.Tensor] = []
+        step = 0
+        try:
+            while stop_criteria(step, obj, history, optimizer):
+                optimizer.zero_grad()
+                loss_value = -1.0 * obj.loss().mean()
+                history.append(loss_value.clone().detach())
+                loss_value.backward()
+                optimizer.step()
+                step += 1
+        finally:
+            obj.cleanup()
+        history = torch.stack(history)
+        self.assertIsInstance(history, torch.Tensor)
+
     def test_input_optimization(self) -> None:
         if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
diff --git a/tests/optim/models/test_googlenet_places365.py b/tests/optim/models/test_googlenet_places365.py
index d6e9cf321d..84f9291fb9 100644
--- a/tests/optim/models/test_googlenet_places365.py
+++ b/tests/optim/models/test_googlenet_places365.py
@@ -11,7 +11,7 @@
 
 class TestInceptionV1Places365(BaseTest):
     def test_load_inceptionv1_places365_with_redirected_relu(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping load pretrained InceptionV1 Places365 due to insufficient"
                 + " Torch version."
@@ -22,7 +22,7 @@ def test_load_inceptionv1_places365_with_redirected_relu(self) -> None:
         self.assertTrue(check_layer_in_model(model, RedirectedReluLayer))
 
     def test_load_inceptionv1_places365_no_redirected_relu(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping load pretrained InceptionV1 Places365 RedirectedRelu test"
                 + " due to insufficient Torch version."
@@ -34,7 +34,7 @@ def test_load_inceptionv1_places365_no_redirected_relu(self) -> None:
         self.assertTrue(check_layer_in_model(model, torch.nn.ReLU))
 
     def test_load_inceptionv1_places365_linear(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping load pretrained InceptionV1 Places365 linear test due to"
                 + " insufficient Torch version."
@@ -47,7 +47,7 @@ def test_load_inceptionv1_places365_linear(self) -> None:
         self.assertTrue(check_layer_in_model(model, torch.nn.AvgPool2d))
 
     def test_inceptionv1_places365_transform(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping InceptionV1 Places365 internal transform test due to"
                 + " insufficient Torch version."
@@ -62,7 +62,7 @@ def test_inceptionv1_places365_transform(self) -> None:
         assertTensorAlmostEqual(self, output, expected_output, 0)
 
     def test_inceptionv1_places365_transform_warning(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping InceptionV1 Places365 internal transform warning test due"
                 + " to insufficient Torch version."
@@ -75,7 +75,7 @@ def test_inceptionv1_places365_transform_warning(self) -> None:
             model._transform_input(x)
 
     def test_inceptionv1_places365_load_and_forward(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping basic pretrained InceptionV1 Places365 forward test due to"
                 + " insufficient Torch version."
@@ -86,7 +86,7 @@ def test_inceptionv1_places365_load_and_forward(self) -> None:
         self.assertEqual([list(o.shape) for o in outputs], [[1, 365]] * 3)
 
     def test_inceptionv1_places365_load_and_forward_diff_sizes(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping pretrained InceptionV1 Places365 forward with different"
                 + " sized inputs test due to insufficient Torch version."
@@ -102,7 +102,7 @@ def test_inceptionv1_places365_load_and_forward_diff_sizes(self) -> None:
         self.assertEqual([list(o.shape) for o in outputs2], [[1, 365]] * 3)
 
     def test_inceptionv1_places365_forward_no_aux(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping pretrained InceptionV1 Places365 with aux logits forward"
                 + " test due to insufficient Torch version."
@@ -113,7 +113,7 @@ def test_inceptionv1_places365_forward_no_aux(self) -> None:
         self.assertEqual(list(outputs.shape), [1, 365])
 
     def test_inceptionv1_places365_forward_cuda(self) -> None:
-        if torch.__version__ <= "1.6.0":
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
                 "Skipping pretrained InceptionV1 Places365 forward CUDA test due to"
                 + " insufficient Torch version."
diff --git a/tests/optim/param/test_images.py b/tests/optim/param/test_images.py
index 617d34a3a3..0ca59c1920 100644
--- a/tests/optim/param/test_images.py
+++ b/tests/optim/param/test_images.py
@@ -443,7 +443,7 @@ def test_simple_tensor_parameterization_with_grad(self) -> None:
         self.assertTrue(image_param.tensor.requires_grad)
 
     def test_simple_tensor_parameterization_jit_module_with_grad(self) -> None:
-        if torch.__version__ <= "1.8.0":
+        if version.parse(torch.__version__) <= version.parse("1.8.0"):
             raise unittest.SkipTest(
                 "Skipping SimpleTensorParameterization JIT module test due to"
                 + "  insufficient Torch version."
diff --git a/tests/optim/param/test_transforms.py b/tests/optim/param/test_transforms.py
index 385006a7ac..362fce9649 100644
--- a/tests/optim/param/test_transforms.py
+++ b/tests/optim/param/test_transforms.py
@@ -1335,7 +1335,7 @@ def test_ignore_alpha(self) -> None:
         assert rgb_tensor.size(1) == 3
 
     def test_ignore_alpha_jit_module(self) -> None:
-        if torch.__version__ <= "1.8.0":
+        if version.parse(torch.__version__) <= version.parse("1.8.0"):
             raise unittest.SkipTest(
                 "Skipping IgnoreAlpha JIT module test due to insufficient"
                 + " Torch version."