120 changes: 85 additions & 35 deletions botorch/utils/datasets.py
@@ -9,8 +9,7 @@
from __future__ import annotations

import warnings
from itertools import count, repeat
from typing import Any, Dict, Hashable, Iterable, Optional, TypeVar, Union
from typing import Any, Iterable, List, Optional, TypeVar, Union

import torch
from botorch.utils.containers import BotorchContainer, SliceContainer
@@ -31,10 +30,19 @@ class SupervisedDataset:

X = torch.rand(16, 2)
Y = torch.rand(16, 1)
A = SupervisedDataset(X, Y)
feature_names = ["learning_rate", "embedding_dim"]
outcome_names = ["neg training loss"]
A = SupervisedDataset(
X=X,
Y=Y,
feature_names=feature_names,
outcome_names=outcome_names,
)
B = SupervisedDataset(
DenseContainer(X, event_shape=X.shape[-1:]),
DenseContainer(Y, event_shape=Y.shape[-1:]),
X=DenseContainer(X, event_shape=X.shape[-1:]),
Y=DenseContainer(Y, event_shape=Y.shape[-1:]),
feature_names=feature_names,
outcome_names=outcome_names,
)
assert A == B
"""
@@ -43,6 +51,9 @@ def __init__(
self,
X: Union[BotorchContainer, Tensor],
Y: Union[BotorchContainer, Tensor],
*,
feature_names: List[str],
outcome_names: List[str],
Yvar: Union[BotorchContainer, Tensor, None] = None,
validate_init: bool = True,
) -> None:
@@ -51,13 +62,17 @@ def __init__(
Args:
X: A `Tensor` or `BotorchContainer` representing the input features.
Y: A `Tensor` or `BotorchContainer` representing the outcomes.
feature_names: A list of names of the features in `X`.
outcome_names: A list of names of the outcomes in `Y`.
Yvar: An optional `Tensor` or `BotorchContainer` representing
the observation noise.
validate_init: If `True`, validates the input shapes.
"""
self._X = X
self._Y = Y
self._Yvar = Yvar
self.feature_names = feature_names
self.outcome_names = outcome_names
if validate_init:
self._validate()

@@ -79,7 +94,23 @@ def Yvar(self) -> Optional[Tensor]:
return self._Yvar
return self._Yvar()

def _validate(self) -> None:
def _validate(
self,
validate_feature_names: bool = True,
validate_outcome_names: bool = True,
) -> None:
r"""Checks that the shapes of the inputs are compatible with each other.

Args:
validate_feature_names: By default, we validate that the length of
`feature_names` matches the # of columns of `self.X`. If a
particular dataset, e.g., `RankingDataset`, is known to violate
this assumption, this can be set to `False`.
validate_outcome_names: By default, we validate that the length of
`outcome_names` matches the # of columns of `self.Y`. If a
particular dataset, e.g., `RankingDataset`, is known to violate
this assumption, this can be set to `False`.
"""
shape_X = self.X.shape
if isinstance(self._X, BotorchContainer):
shape_X = shape_X[: len(shape_X) - len(self._X.event_shape)]
@@ -94,31 +125,16 @@ def _validate(self) -> None:
raise ValueError("Batch dimensions of `X` and `Y` are incompatible.")
if self.Yvar is not None and self.Yvar.shape != self.Y.shape:
raise ValueError("Shapes of `Y` and `Yvar` are incompatible.")

@classmethod
def dict_from_iter(
cls,
X: MaybeIterable[Union[BotorchContainer, Tensor]],
Y: MaybeIterable[Union[BotorchContainer, Tensor]],
Yvar: Optional[MaybeIterable[Union[BotorchContainer, Tensor]]] = None,
*,
keys: Optional[Iterable[Hashable]] = None,
) -> Dict[Hashable, SupervisedDataset]:
r"""Returns a dictionary of `SupervisedDataset` from iterables."""
single_X = isinstance(X, (Tensor, BotorchContainer))
single_Y = isinstance(Y, (Tensor, BotorchContainer))
if single_X:
X = (X,) if single_Y else repeat(X)
if single_Y:
Y = (Y,) if single_X else repeat(Y)
Yvar = repeat(Yvar) if isinstance(Yvar, (Tensor, BotorchContainer)) else Yvar

# Pass in Yvar only if it is not None.
iterables = (X, Y) if Yvar is None else (X, Y, Yvar)
return {
elements[0]: cls(*elements[1:])
for elements in zip(keys or count(), *iterables)
}
if validate_feature_names and len(self.feature_names) != self.X.shape[-1]:
raise ValueError(
"`X` must have the same number of columns as the number of "
"features in `feature_names`."
)
if validate_outcome_names and len(self.outcome_names) != self.Y.shape[-1]:
raise ValueError(
"`Y` must have the same number of columns as the number of "
"outcomes in `outcome_names`."
)

def __eq__(self, other: Any) -> bool:
return (
@@ -130,6 +146,8 @@ def __eq__(self, other: Any) -> bool:
if self.Yvar is None
else torch.equal(self.Yvar, other.Yvar)
)
and self.feature_names == other.feature_names
and self.outcome_names == other.outcome_names
)


@@ -145,14 +163,23 @@ def __init__(
X: Union[BotorchContainer, Tensor],
Y: Union[BotorchContainer, Tensor],
Yvar: Union[BotorchContainer, Tensor],
feature_names: List[str],
outcome_names: List[str],
validate_init: bool = True,
) -> None:
r"""Initialize a `FixedNoiseDataset` -- deprecated!"""
warnings.warn(
"`FixedNoiseDataset` is deprecated. Use `SupervisedDataset` instead.",
DeprecationWarning,
)
super().__init__(X=X, Y=Y, Yvar=Yvar, validate_init=validate_init)
super().__init__(
X=X,
Y=Y,
feature_names=feature_names,
outcome_names=outcome_names,
Yvar=Yvar,
validate_init=validate_init,
)


class RankingDataset(SupervisedDataset):
@@ -177,26 +204,49 @@ class RankingDataset(SupervisedDataset):
torch.stack([torch.randperm(3) for _ in range(8)]),
event_shape=torch.Size([3])
)
dataset = RankingDataset(X, Y)
feature_names = ["item_0", "item_1"]
outcome_names = ["ranking outcome"]
dataset = RankingDataset(
X=X,
Y=Y,
feature_names=feature_names,
outcome_names=outcome_names,
)
"""

def __init__(
self,
X: SliceContainer,
Y: Union[BotorchContainer, Tensor],
feature_names: List[str],
outcome_names: List[str],
validate_init: bool = True,
) -> None:
r"""Construct a `RankingDataset`.

Args:
X: A `SliceContainer` representing the input features being ranked.
Y: A `Tensor` or `BotorchContainer` representing the rankings.
feature_names: A list of names of the features in `X`.
outcome_names: A list of names of the outcomes in `Y`.
validate_init: If `True`, validates the input shapes.
"""
super().__init__(X=X, Y=Y, Yvar=None, validate_init=validate_init)
super().__init__(
X=X,
Y=Y,
feature_names=feature_names,
outcome_names=outcome_names,
Yvar=None,
validate_init=validate_init,
)

def _validate(self) -> None:
super()._validate()
super()._validate(validate_feature_names=False, validate_outcome_names=False)
if len(self.feature_names) != self._X.values.shape[-1]:
raise ValueError(
"The `values` field of `X` must have the same number of columns as "
"the number of features in `feature_names`."
)

Y = self.Y
arity = self._X.indices.shape[-1]
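Reviewer note: to summarize the new contract above, `feature_names` and `outcome_names` are required keyword-only arguments whose lengths are validated against the trailing dimensions of `X` and `Y` at construction time. A minimal sketch (the tensor shapes and names below are illustrative, not taken from this diff):

import torch
from botorch.utils.datasets import SupervisedDataset

X = torch.rand(16, 2)
Y = torch.rand(16, 1)

# Names are now required and must match the trailing dims of X / Y.
dataset = SupervisedDataset(
    X=X,
    Y=Y,
    feature_names=["x0", "x1"],  # len == X.shape[-1]
    outcome_names=["y"],  # len == Y.shape[-1]
)

# A length mismatch fails fast in __init__ when validate_init=True.
try:
    SupervisedDataset(X=X, Y=Y, feature_names=["x0"], outcome_names=["y"])
except ValueError as err:
    print(err)  # "`X` must have the same number of columns ..."
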
26 changes: 23 additions & 3 deletions test/acquisition/test_input_constructors.py
@@ -108,10 +108,30 @@ def setUp(self, suppress_input_warnings: bool = True) -> None:
X2 = torch.rand(3, 2)
Y1 = torch.rand(3, 1)
Y2 = torch.rand(3, 1)
feature_names = ["X1", "X2"]
outcome_names = ["Y"]

self.blockX_blockY = SupervisedDataset.dict_from_iter(X1, Y1)
self.blockX_multiY = SupervisedDataset.dict_from_iter(X1, (Y1, Y2))
self.multiX_multiY = SupervisedDataset.dict_from_iter((X1, X2), (Y1, Y2))
self.blockX_blockY = {
0: SupervisedDataset(
X1, Y1, feature_names=feature_names, outcome_names=outcome_names
)
}
self.blockX_multiY = {
0: SupervisedDataset(
X1, Y1, feature_names=feature_names, outcome_names=outcome_names
),
1: SupervisedDataset(
X1, Y2, feature_names=feature_names, outcome_names=outcome_names
),
}
self.multiX_multiY = {
0: SupervisedDataset(
X1, Y1, feature_names=feature_names, outcome_names=outcome_names
),
1: SupervisedDataset(
X2, Y2, feature_names=feature_names, outcome_names=outcome_names
),
}
self.bounds = 2 * [(0.0, 1.0)]


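Reviewer note: with `SupervisedDataset.dict_from_iter` removed, call sites now build the key-to-dataset mapping directly, as the updated `setUp` above does. A minimal migration sketch under the same assumptions (integer keys, one dataset per outcome):

import torch
from botorch.utils.datasets import SupervisedDataset

X1, X2 = torch.rand(3, 2), torch.rand(3, 2)
Y1, Y2 = torch.rand(3, 1), torch.rand(3, 1)
feature_names = ["X1", "X2"]

# Replaces the old SupervisedDataset.dict_from_iter((X1, X2), (Y1, Y2)).
multiX_multiY = {
    i: SupervisedDataset(
        X, Y, feature_names=feature_names, outcome_names=[f"Y{i + 1}"]
    )
    for i, (X, Y) in enumerate([(X1, Y1), (X2, Y2)])
}
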
4 changes: 3 additions & 1 deletion test/models/test_fully_bayesian.py
@@ -550,7 +550,9 @@ def test_construct_inputs(self):
X, Y, Yvar, model = self._get_data_and_model(
infer_noise=infer_noise, **tkwargs
)
training_data = SupervisedDataset(X, Y, Yvar)
training_data = SupervisedDataset(
X, Y, Yvar=Yvar, feature_names=["1", "2", "3", "4"], outcome_names=["1"]
)

data_dict = model.construct_inputs(training_data)
self.assertTrue(X.equal(data_dict["train_X"]))
11 changes: 3 additions & 8 deletions test/models/test_fully_bayesian_multitask.py
@@ -566,14 +566,9 @@ def test_construct_inputs(self):
for dtype, infer_noise in [(torch.float, False), (torch.double, True)]:
tkwargs = {"device": self.device, "dtype": dtype}
task_feature = 0

if infer_noise:
datasets, (train_X, train_Y) = _gen_datasets(yvar=None, **tkwargs)
train_Yvar = None
else:
datasets, (train_X, train_Y, train_Yvar) = _gen_datasets(
yvar=0.05, **tkwargs
)
datasets, (train_X, train_Y, train_Yvar) = _gen_datasets(
yvar=None if infer_noise else 0.05, **tkwargs
)

model = SaasFullyBayesianMultiTaskGP(
train_X=train_X,
15 changes: 13 additions & 2 deletions test/models/test_gp_regression.py
@@ -374,7 +374,12 @@ def test_construct_inputs(self):
)
X = model_kwargs["train_X"]
Y = model_kwargs["train_Y"]
training_data = SupervisedDataset(X, Y)
training_data = SupervisedDataset(
X,
Y,
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)
data_dict = model.construct_inputs(training_data)
self.assertTrue(X.equal(data_dict["train_X"]))
self.assertTrue(Y.equal(data_dict["train_Y"]))
@@ -448,7 +453,13 @@ def test_construct_inputs(self):
X = model_kwargs["train_X"]
Y = model_kwargs["train_Y"]
Yvar = model_kwargs["train_Yvar"]
training_data = SupervisedDataset(X, Y, Yvar)
training_data = SupervisedDataset(
X,
Y,
Yvar=Yvar,
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)
data_dict = model.construct_inputs(training_data)
self.assertTrue(X.equal(data_dict["train_X"]))
self.assertTrue(Y.equal(data_dict["train_Y"]))
19 changes: 17 additions & 2 deletions test/models/test_gp_regression_fidelity.py
@@ -414,7 +414,14 @@ def test_construct_inputs(self):
lin_truncated=lin_trunc,
**tkwargs,
)
training_data = SupervisedDataset(kwargs["train_X"], kwargs["train_Y"])

X = kwargs["train_X"]
training_data = SupervisedDataset(
X=X,
Y=kwargs["train_Y"],
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)

# missing fidelity features
with self.assertRaisesRegex(TypeError, "argument: 'fidelity_features'"):
@@ -523,7 +530,13 @@ def test_construct_inputs(self):
lin_truncated=lin_trunc,
**tkwargs,
)
training_data = SupervisedDataset(kwargs["train_X"], kwargs["train_Y"])
X = kwargs["train_X"]
training_data = SupervisedDataset(
X=X,
Y=kwargs["train_Y"],
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)
data_dict = model.construct_inputs(training_data, fidelity_features=[1])
self.assertTrue("train_Yvar" not in data_dict)

@@ -532,6 +545,8 @@ def test_construct_inputs(self):
X=kwargs["train_X"],
Y=kwargs["train_Y"],
Yvar=torch.full(kwargs["train_Y"].shape[:-1] + (1,), 0.1),
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)

# missing fidelity features
15 changes: 13 additions & 2 deletions test/models/test_gp_regression_mixed.py
@@ -273,7 +273,12 @@ def test_construct_inputs(self):
tkwargs = {"device": self.device, "dtype": dtype}
X, Y = _get_random_data(batch_shape=batch_shape, m=1, d=d, **tkwargs)
cat_dims = list(range(ncat))
training_data = SupervisedDataset(X, Y)
training_data = SupervisedDataset(
X,
Y,
feature_names=[f"x{i}" for i in range(d)],
outcome_names=["y"],
)
model_kwargs = MixedSingleTaskGP.construct_inputs(
training_data, categorical_features=cat_dims
)
@@ -283,7 +288,13 @@ def test_construct_inputs(self):
self.assertIsNone(model_kwargs["likelihood"])

# With train_Yvar.
training_data = SupervisedDataset(X, Y, Y)
training_data = SupervisedDataset(
X,
Y,
Yvar=Y,
feature_names=[f"x{i}" for i in range(d)],
outcome_names=["y"],
)
with self.assertWarnsRegex(InputDataWarning, "train_Yvar"):
model_kwargs = MixedSingleTaskGP.construct_inputs(
training_data, categorical_features=cat_dims