From aee4b43372e8108408e2456fde604af90c96e1b6 Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Fri, 29 Jul 2022 19:12:13 -0400
Subject: [PATCH 1/3] docstrings for approximate_gp and fully_bayesian

---
 botorch/models/approximate_gp.py | 37 ++++++++++++++++++-------------
 botorch/models/fully_bayesian.py | 38 +++++++++++++++++++++++++++-----
 2 files changed, 54 insertions(+), 21 deletions(-)

diff --git a/botorch/models/approximate_gp.py b/botorch/models/approximate_gp.py
index 9c8c9c26fd..b1b30ac3a6 100644
--- a/botorch/models/approximate_gp.py
+++ b/botorch/models/approximate_gp.py
@@ -146,6 +146,13 @@ def fantasize(self, X, sampler=MCSampler, observation_noise=True, *args, **kwarg
 
 
 class _SingleTaskVariationalGP(ApproximateGP):
+    """
+    Base class wrapper for a stochastic variational Gaussian Process (SVGP)
+    model [hensman2013svgp]_.
+
+    Uses pivoted Cholesky initialization for the inducing points.
+    """
+
     def __init__(
         self,
         train_X: Tensor,
@@ -159,10 +166,6 @@ def __init__(
         inducing_points: Optional[Union[Tensor, int]] = None,
     ) -> None:
         r"""
-        Base class wrapper for a stochastic variational Gaussian Process (SVGP)
-        model [hensman2013svgp]_. Uses pivoted cholesky initialization for the
-        inducing points.
-
         Args:
             train_X: Training inputs (due to the ability of the SVGP to sub-sample
                 this does not have to be all of the training inputs).
@@ -255,7 +258,7 @@ def forward(self, X) -> MultivariateNormal:
 
 class SingleTaskVariationalGP(ApproximateGPyTorchModel):
     r"""A single-task variational GP model following [hensman2013svgp]_ with pivoted
-    cholesky initialization following [chen2018dpp]_ and [burt2020svgp]_.
+    Cholesky initialization following [chen2018dpp]_ and [burt2020svgp]_.
 
     A single-task variational GP using relatively strong priors on the Kernel
     hyperparameters, which work best when covariates are normalized to the unit
@@ -269,11 +272,19 @@ class SingleTaskVariationalGP(ApproximateGPyTorchModel):
     Use this model if you have a lot of data or if your responses are
     non-Gaussian.
 
-    To train this model, you should use `gpytorch.mlls.VariationalELBO` and not the
-    exact marginal log likelihood. Example mll:
-
-    mll = VariationalELBO(model.likelihood, model.model, num_data=train_X.shape[-2])
-
+    To train this model, you should use gpytorch.mlls.VariationalELBO and not
+    the exact marginal log likelihood.
+
+    Example:
+        >>> import torch
+        >>> from botorch.models import SingleTaskVariationalGP
+        >>> from gpytorch.mlls import VariationalELBO
+        >>>
+        >>> train_X = torch.rand(20, 2)
+        >>> model = SingleTaskVariationalGP(train_X)
+        >>> mll = VariationalELBO(
+        >>>     model.likelihood, model.model, num_data=train_X.shape[-2]
+        >>> )
     """
 
     def __init__(
@@ -292,15 +303,11 @@ def __init__(
         input_transform: Optional[InputTransform] = None,
     ) -> None:
         r"""
-        A single task stochastic variational Gaussian process model (SVGP) as described
-        by [hensman2013svgp]_. We use pivoted cholesky initialization [burt2020svgp]_ to
-        initialize the inducing points of the model.
-
         Args:
             train_X: Training inputs (due to the ability of the SVGP to sub-sample
                 this does not have to be all of the training inputs).
             train_Y: Training targets (optional).
-            likelihood: Instance of a GPyYorch likelihood. If omitted, uses a
+            likelihood: Instance of a GPyTorch likelihood. If omitted, uses
                 either a `GaussianLikelihood` (if `num_outputs=1`) or a
                 `MultitaskGaussianLikelihood`(if `num_outputs>1`).
             num_outputs: Number of output responses per input (default: 1).
diff --git a/botorch/models/fully_bayesian.py b/botorch/models/fully_bayesian.py
index 620ea3a782..a3ed499d79 100644
--- a/botorch/models/fully_bayesian.py
+++ b/botorch/models/fully_bayesian.py
@@ -6,6 +6,16 @@
 
 r"""Gaussian Process Regression models with fully Bayesian inference.
 
+Fully Bayesian models use Bayesian inference over model hyperparameters, such
+as length scales and noise variance, learning a posterior distribution for each
+hyperparameter using NUTS. When we predict and compute acquisition functions
+from a fully Bayesian model, we are using varying sets of hyperparameters
+drawn from this posterior. By contrast, our “standard” models (e.g.
+`SingleTaskGP`) learn only a single best value for each hyperparameter using
+MAP. The fully Bayesian method generally results in a better and more
+well-calibrated model, but is more computationally intensive. For a full
+description, see [Eriksson2021saasbo].
+
 We use a lightweight PyTorch implementation of a Matern-5/2 kernel as there are some
 performance issues with running NUTS on top of standard GPyTorch models. The resulting
 hyperparameter samples are loaded into a batched GPyTorch model after fitting.
@@ -71,9 +81,19 @@ def reshape_and_detach(target: Tensor, new_value: Tensor) -> None:
 
 class PyroModel:
     r"""
-    Base class for a Pyro model.
-
-    :meta ignore:
+    Base class for a Pyro model; used to assist in learning hyperparameters.
+
+    This class and its subclasses are not standard BoTorch models; instead
+    the subclasses are used as inputs to a `SaasFullyBayesianSingleTaskGP`,
+    which should then have its hyperparameters fit with
+    `fit_fully_bayesian_model_nuts`. (By default, its subclass `SaasPyroModel`
+    is used). A `PyroModel`’s `sample` method should specify lightweight
+    PyTorch functionality, which will be used for fast model fitting with NUTS.
+    The utility of `PyroModel` is in enabling fast fitting with NUTS, since we
+    would otherwise need to use GPyTorch, which is computationally infeasible
+    in combination with Pyro.
+
+    :meta private:
     """
 
     def set_inputs(
@@ -115,6 +135,12 @@ class SaasPyroModel(PyroModel):
     The SAAS model uses sparsity-inducing priors to identift the most important
     parameters. This model is suitable for high-dimensional BO with potentially
     hundreds of tunable parameters. See [Eriksson2021saasbo]_ for more details.
+
+    `SaasPyroModel` is not a standard BoTorch model; instead, it is used as
+    an input to `SaasFullyBayesianSingleTaskGP`. It is used as a default keyword
+    argument, and end users are not likely to need to instantiate or modify a
+    `SaasPyroModel` unless they want to customize its attributes (such as
+    `covar_module`).
     """
 
     def sample(self) -> None:
@@ -274,9 +300,9 @@ class SaasFullyBayesianSingleTaskGP(SingleTaskGP):
     isn't compatible with `fit_gpytorch_model`.
 
     Example:
-    >>> saas_gp = SaasFullyBayesianSingleTaskGP(train_X, train_Y)
-    >>> fit_fully_bayesian_model_nuts(saas_gp)
-    >>> posterior = saas_gp.posterior(test_X)
+        >>> saas_gp = SaasFullyBayesianSingleTaskGP(train_X, train_Y)
+        >>> fit_fully_bayesian_model_nuts(saas_gp)
+        >>> posterior = saas_gp.posterior(test_X)
     """
 
     def __init__(

From 3d8e8938213f3f20fe47b81d38ec860c5426a96d Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Tue, 2 Aug 2022 10:56:14 -0400
Subject: [PATCH 2/3] Apply suggestions from code review

Co-authored-by: David Eriksson
---
 botorch/models/fully_bayesian.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/botorch/models/fully_bayesian.py b/botorch/models/fully_bayesian.py
index a3ed499d79..0bddcf17cb 100644
--- a/botorch/models/fully_bayesian.py
+++ b/botorch/models/fully_bayesian.py
@@ -7,10 +7,11 @@
 r"""Gaussian Process Regression models with fully Bayesian inference.
 
 Fully Bayesian models use Bayesian inference over model hyperparameters, such
-as length scales and noise variance, learning a posterior distribution for each
-hyperparameter using NUTS. When we predict and compute acquisition functions
-from a fully Bayesian model, we are using varying sets of hyperparameters
-drawn from this posterior. By contrast, our “standard” models (e.g.
+as lengthscales and noise variance, learning a posterior distribution for the
+hyperparameters using the No-U-Turn-Sampler (NUTS). This is followed by
+sampling a small set of hyperparameters (often ~16) from the posterior
+that we will use for model predictions and for computing acquisition function 
+values. By contrast, our “standard” models (e.g.
 `SingleTaskGP`) learn only a single best value for each hyperparameter using
 MAP. The fully Bayesian method generally results in a better and more
 well-calibrated model, but is more computationally intensive. For a full

From 2a506e9e726310f3e25f1bf4ab4f3f5030b2c3f3 Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Tue, 2 Aug 2022 11:03:07 -0400
Subject: [PATCH 3/3] Fixed trailing whitespace

---
 botorch/models/fully_bayesian.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/botorch/models/fully_bayesian.py b/botorch/models/fully_bayesian.py
index 0bddcf17cb..e71d7837bf 100644
--- a/botorch/models/fully_bayesian.py
+++ b/botorch/models/fully_bayesian.py
@@ -10,16 +10,17 @@
 as lengthscales and noise variance, learning a posterior distribution for the
 hyperparameters using the No-U-Turn-Sampler (NUTS). This is followed by
 sampling a small set of hyperparameters (often ~16) from the posterior
-that we will use for model predictions and for computing acquisition function 
+that we will use for model predictions and for computing acquisition function
 values. By contrast, our “standard” models (e.g.
 `SingleTaskGP`) learn only a single best value for each hyperparameter using
 MAP. The fully Bayesian method generally results in a better and more
 well-calibrated model, but is more computationally intensive. For a full
 description, see [Eriksson2021saasbo].
 
-We use a lightweight PyTorch implementation of a Matern-5/2 kernel as there are some
-performance issues with running NUTS on top of standard GPyTorch models. The resulting
-hyperparameter samples are loaded into a batched GPyTorch model after fitting.
+We use a lightweight PyTorch implementation of a Matern-5/2 kernel as there are
+some performance issues with running NUTS on top of standard GPyTorch models.
+The resulting hyperparameter samples are loaded into a batched GPyTorch model
+after fitting.
 
 References:
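
A minimal end-to-end sketch of the workflow the updated module docstring describes, mirroring the `Example` block added in PATCH 1/3. The import paths (`botorch.models.fully_bayesian.SaasFullyBayesianSingleTaskGP`, `botorch.fit.fit_fully_bayesian_model_nuts`) are BoTorch's public API; the synthetic data, tensor shapes, and dtype are illustrative assumptions and are not part of these patches:

    # Illustrative sketch only; training data and shapes are assumed, not from the patch.
    import torch

    from botorch.fit import fit_fully_bayesian_model_nuts
    from botorch.models.fully_bayesian import SaasFullyBayesianSingleTaskGP

    train_X = torch.rand(20, 4, dtype=torch.float64)        # 20 points, 4 parameters
    train_Y = train_X.sin().sum(dim=-1, keepdim=True)        # toy single-output response
    test_X = torch.rand(5, 4, dtype=torch.float64)

    # NUTS learns a posterior over the hyperparameters (lengthscales, noise, etc.);
    # the resulting samples are loaded into a batched GPyTorch model, so the
    # posterior below is a mixture over those hyperparameter draws.
    saas_gp = SaasFullyBayesianSingleTaskGP(train_X, train_Y)
    fit_fully_bayesian_model_nuts(saas_gp)
    posterior = saas_gp.posterior(test_X)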