diff --git a/botorch/utils/datasets.py b/botorch/utils/datasets.py
index 670f9399f1..fd67f388b9 100644
--- a/botorch/utils/datasets.py
+++ b/botorch/utils/datasets.py
@@ -9,8 +9,7 @@
 from __future__ import annotations
 
 import warnings
-from itertools import count, repeat
-from typing import Any, Dict, Hashable, Iterable, Optional, TypeVar, Union
+from typing import Any, Iterable, List, Optional, TypeVar, Union
 
 import torch
 from botorch.utils.containers import BotorchContainer, SliceContainer
@@ -31,10 +30,19 @@ class SupervisedDataset:
         X = torch.rand(16, 2)
         Y = torch.rand(16, 1)
-        A = SupervisedDataset(X, Y)
+        feature_names = ["learning_rate", "embedding_dim"]
+        outcome_names = ["neg training loss"]
+        A = SupervisedDataset(
+            X=X,
+            Y=Y,
+            feature_names=feature_names,
+            outcome_names=outcome_names,
+        )
         B = SupervisedDataset(
-            DenseContainer(X, event_shape=X.shape[-1:]),
-            DenseContainer(Y, event_shape=Y.shape[-1:]),
+            X=DenseContainer(X, event_shape=X.shape[-1:]),
+            Y=DenseContainer(Y, event_shape=Y.shape[-1:]),
+            feature_names=feature_names,
+            outcome_names=outcome_names,
         )
         assert A == B
     """
@@ -43,6 +51,9 @@ def __init__(
         self,
         X: Union[BotorchContainer, Tensor],
         Y: Union[BotorchContainer, Tensor],
+        *,
+        feature_names: List[str],
+        outcome_names: List[str],
         Yvar: Union[BotorchContainer, Tensor, None] = None,
         validate_init: bool = True,
     ) -> None:
@@ -51,6 +62,8 @@ def __init__(
         Args:
             X: A `Tensor` or `BotorchContainer` representing the input features.
             Y: A `Tensor` or `BotorchContainer` representing the outcomes.
+            feature_names: A list of names of the features in `X`.
+            outcome_names: A list of names of the outcomes in `Y`.
             Yvar: An optional `Tensor` or `BotorchContainer` representing the
                 observation noise.
             validate_init: If `True`, validates the input shapes.
@@ -58,6 +71,8 @@ def __init__(
         self._X = X
         self._Y = Y
         self._Yvar = Yvar
+        self.feature_names = feature_names
+        self.outcome_names = outcome_names
         if validate_init:
             self._validate()
 
@@ -79,7 +94,23 @@ def Yvar(self) -> Optional[Tensor]:
             return self._Yvar
         return self._Yvar()
 
-    def _validate(self) -> None:
+    def _validate(
+        self,
+        validate_feature_names: bool = True,
+        validate_outcome_names: bool = True,
+    ) -> None:
+        r"""Checks that the shapes of the inputs are compatible with each other.
+
+        Args:
+            validate_feature_names: By default, we validate that the length of
+                `feature_names` matches the # of columns of `self.X`. If a
+                particular dataset, e.g., `RankingDataset`, is known to violate
+                this assumption, this can be set to `False`.
+            validate_outcome_names: By default, we validate that the length of
+                `outcome_names` matches the # of columns of `self.Y`. If a
+                particular dataset, e.g., `RankingDataset`, is known to violate
+                this assumption, this can be set to `False`.
+        """
         shape_X = self.X.shape
         if isinstance(self._X, BotorchContainer):
             shape_X = shape_X[: len(shape_X) - len(self._X.event_shape)]
@@ -94,31 +125,16 @@ def _validate(self) -> None:
             raise ValueError("Batch dimensions of `X` and `Y` are incompatible.")
         if self.Yvar is not None and self.Yvar.shape != self.Y.shape:
             raise ValueError("Shapes of `Y` and `Yvar` are incompatible.")
-
-    @classmethod
-    def dict_from_iter(
-        cls,
-        X: MaybeIterable[Union[BotorchContainer, Tensor]],
-        Y: MaybeIterable[Union[BotorchContainer, Tensor]],
-        Yvar: Optional[MaybeIterable[Union[BotorchContainer, Tensor]]] = None,
-        *,
-        keys: Optional[Iterable[Hashable]] = None,
-    ) -> Dict[Hashable, SupervisedDataset]:
-        r"""Returns a dictionary of `SupervisedDataset` from iterables."""
-        single_X = isinstance(X, (Tensor, BotorchContainer))
-        single_Y = isinstance(Y, (Tensor, BotorchContainer))
-        if single_X:
-            X = (X,) if single_Y else repeat(X)
-        if single_Y:
-            Y = (Y,) if single_X else repeat(Y)
-        Yvar = repeat(Yvar) if isinstance(Yvar, (Tensor, BotorchContainer)) else Yvar
-
-        # Pass in Yvar only if it is not None.
-        iterables = (X, Y) if Yvar is None else (X, Y, Yvar)
-        return {
-            elements[0]: cls(*elements[1:])
-            for elements in zip(keys or count(), *iterables)
-        }
+        if validate_feature_names and len(self.feature_names) != self.X.shape[-1]:
+            raise ValueError(
+                "`X` must have the same number of columns as the number of "
+                "features in `feature_names`."
+            )
+        if validate_outcome_names and len(self.outcome_names) != self.Y.shape[-1]:
+            raise ValueError(
+                "`Y` must have the same number of columns as the number of "
+                "outcomes in `outcome_names`."
+            )
 
     def __eq__(self, other: Any) -> bool:
         return (
@@ -130,6 +146,8 @@ def __eq__(self, other: Any) -> bool:
                 if self.Yvar is None
                 else torch.equal(self.Yvar, other.Yvar)
             )
+            and self.feature_names == other.feature_names
+            and self.outcome_names == other.outcome_names
         )
 
 
@@ -145,6 +163,8 @@ def __init__(
         X: Union[BotorchContainer, Tensor],
         Y: Union[BotorchContainer, Tensor],
         Yvar: Union[BotorchContainer, Tensor],
+        feature_names: List[str],
+        outcome_names: List[str],
         validate_init: bool = True,
     ) -> None:
         r"""Initialize a `FixedNoiseDataset` -- deprecated!"""
@@ -152,7 +172,14 @@ def __init__(
             "`FixedNoiseDataset` is deprecated. Use `SupervisedDataset` instead.",
             DeprecationWarning,
         )
-        super().__init__(X=X, Y=Y, Yvar=Yvar, validate_init=validate_init)
+        super().__init__(
+            X=X,
+            Y=Y,
+            feature_names=feature_names,
+            outcome_names=outcome_names,
+            Yvar=Yvar,
+            validate_init=validate_init,
+        )
 
 
 class RankingDataset(SupervisedDataset):
@@ -177,13 +204,22 @@ class RankingDataset(SupervisedDataset):
             torch.stack([torch.randperm(3) for _ in range(8)]),
             event_shape=torch.Size([3])
         )
-        dataset = RankingDataset(X, Y)
+        feature_names = ["item_0", "item_1"]
+        outcome_names = ["ranking outcome"]
+        dataset = RankingDataset(
+            X=X,
+            Y=Y,
+            feature_names=feature_names,
+            outcome_names=outcome_names,
+        )
     """
 
     def __init__(
         self,
         X: SliceContainer,
         Y: Union[BotorchContainer, Tensor],
+        feature_names: List[str],
+        outcome_names: List[str],
         validate_init: bool = True,
     ) -> None:
         r"""Construct a `RankingDataset`.
@@ -191,12 +227,26 @@ def __init__(
         Args:
             X: A `SliceContainer` representing the input features being ranked.
             Y: A `Tensor` or `BotorchContainer` representing the rankings.
+            feature_names: A list of names of the features in X.
+            outcome_names: A list of names of the outcomes in Y.
             validate_init: If `True`, validates the input shapes.
""" - super().__init__(X=X, Y=Y, Yvar=None, validate_init=validate_init) + super().__init__( + X=X, + Y=Y, + feature_names=feature_names, + outcome_names=outcome_names, + Yvar=None, + validate_init=validate_init, + ) def _validate(self) -> None: - super()._validate() + super()._validate(validate_feature_names=False, validate_outcome_names=False) + if len(self.feature_names) != self._X.values.shape[-1]: + raise ValueError( + "The `values` field of `X` must have the same number of columns as " + "the number of features in `feature_names`." + ) Y = self.Y arity = self._X.indices.shape[-1] diff --git a/test/acquisition/test_input_constructors.py b/test/acquisition/test_input_constructors.py index 79cb77fa60..53b36c9aad 100644 --- a/test/acquisition/test_input_constructors.py +++ b/test/acquisition/test_input_constructors.py @@ -108,10 +108,30 @@ def setUp(self, suppress_input_warnings: bool = True) -> None: X2 = torch.rand(3, 2) Y1 = torch.rand(3, 1) Y2 = torch.rand(3, 1) + feature_names = ["X1", "X2"] + outcome_names = ["Y"] - self.blockX_blockY = SupervisedDataset.dict_from_iter(X1, Y1) - self.blockX_multiY = SupervisedDataset.dict_from_iter(X1, (Y1, Y2)) - self.multiX_multiY = SupervisedDataset.dict_from_iter((X1, X2), (Y1, Y2)) + self.blockX_blockY = { + 0: SupervisedDataset( + X1, Y1, feature_names=feature_names, outcome_names=outcome_names + ) + } + self.blockX_multiY = { + 0: SupervisedDataset( + X1, Y1, feature_names=feature_names, outcome_names=outcome_names + ), + 1: SupervisedDataset( + X1, Y2, feature_names=feature_names, outcome_names=outcome_names + ), + } + self.multiX_multiY = { + 0: SupervisedDataset( + X1, Y1, feature_names=feature_names, outcome_names=outcome_names + ), + 1: SupervisedDataset( + X2, Y2, feature_names=feature_names, outcome_names=outcome_names + ), + } self.bounds = 2 * [(0.0, 1.0)] diff --git a/test/models/test_fully_bayesian.py b/test/models/test_fully_bayesian.py index 654babebe3..afd8b54dd5 100644 --- a/test/models/test_fully_bayesian.py +++ b/test/models/test_fully_bayesian.py @@ -550,7 +550,9 @@ def test_construct_inputs(self): X, Y, Yvar, model = self._get_data_and_model( infer_noise=infer_noise, **tkwargs ) - training_data = SupervisedDataset(X, Y, Yvar) + training_data = SupervisedDataset( + X, Y, Yvar=Yvar, feature_names=["1", "2", "3", "4"], outcome_names=["1"] + ) data_dict = model.construct_inputs(training_data) self.assertTrue(X.equal(data_dict["train_X"])) diff --git a/test/models/test_fully_bayesian_multitask.py b/test/models/test_fully_bayesian_multitask.py index 3a8405b659..79954c7341 100644 --- a/test/models/test_fully_bayesian_multitask.py +++ b/test/models/test_fully_bayesian_multitask.py @@ -566,14 +566,9 @@ def test_construct_inputs(self): for dtype, infer_noise in [(torch.float, False), (torch.double, True)]: tkwargs = {"device": self.device, "dtype": dtype} task_feature = 0 - - if infer_noise: - datasets, (train_X, train_Y) = _gen_datasets(yvar=None, **tkwargs) - train_Yvar = None - else: - datasets, (train_X, train_Y, train_Yvar) = _gen_datasets( - yvar=0.05, **tkwargs - ) + datasets, (train_X, train_Y, train_Yvar) = _gen_datasets( + yvar=None if infer_noise else 0.05, **tkwargs + ) model = SaasFullyBayesianMultiTaskGP( train_X=train_X, diff --git a/test/models/test_gp_regression.py b/test/models/test_gp_regression.py index 3e400a8354..289b08bc64 100644 --- a/test/models/test_gp_regression.py +++ b/test/models/test_gp_regression.py @@ -374,7 +374,12 @@ def test_construct_inputs(self): ) X = model_kwargs["train_X"] Y = 
model_kwargs["train_Y"] - training_data = SupervisedDataset(X, Y) + training_data = SupervisedDataset( + X, + Y, + feature_names=[f"x{i}" for i in range(X.shape[-1])], + outcome_names=["y"], + ) data_dict = model.construct_inputs(training_data) self.assertTrue(X.equal(data_dict["train_X"])) self.assertTrue(Y.equal(data_dict["train_Y"])) @@ -448,7 +453,13 @@ def test_construct_inputs(self): X = model_kwargs["train_X"] Y = model_kwargs["train_Y"] Yvar = model_kwargs["train_Yvar"] - training_data = SupervisedDataset(X, Y, Yvar) + training_data = SupervisedDataset( + X, + Y, + Yvar=Yvar, + feature_names=[f"x{i}" for i in range(X.shape[-1])], + outcome_names=["y"], + ) data_dict = model.construct_inputs(training_data) self.assertTrue(X.equal(data_dict["train_X"])) self.assertTrue(Y.equal(data_dict["train_Y"])) diff --git a/test/models/test_gp_regression_fidelity.py b/test/models/test_gp_regression_fidelity.py index 512d67617c..66b41d0d9c 100644 --- a/test/models/test_gp_regression_fidelity.py +++ b/test/models/test_gp_regression_fidelity.py @@ -414,7 +414,14 @@ def test_construct_inputs(self): lin_truncated=lin_trunc, **tkwargs, ) - training_data = SupervisedDataset(kwargs["train_X"], kwargs["train_Y"]) + + X = kwargs["train_X"] + training_data = SupervisedDataset( + X=X, + Y=kwargs["train_Y"], + feature_names=[f"x{i}" for i in range(X.shape[-1])], + outcome_names=["y"], + ) # missing fidelity features with self.assertRaisesRegex(TypeError, "argument: 'fidelity_features'"): @@ -523,7 +530,13 @@ def test_construct_inputs(self): lin_truncated=lin_trunc, **tkwargs, ) - training_data = SupervisedDataset(kwargs["train_X"], kwargs["train_Y"]) + X = kwargs["train_X"] + training_data = SupervisedDataset( + X=X, + Y=kwargs["train_Y"], + feature_names=[f"x{i}" for i in range(X.shape[-1])], + outcome_names=["y"], + ) data_dict = model.construct_inputs(training_data, fidelity_features=[1]) self.assertTrue("train_Yvar" not in data_dict) @@ -532,6 +545,8 @@ def test_construct_inputs(self): X=kwargs["train_X"], Y=kwargs["train_Y"], Yvar=torch.full(kwargs["train_Y"].shape[:-1] + (1,), 0.1), + feature_names=[f"x{i}" for i in range(X.shape[-1])], + outcome_names=["y"], ) # missing fidelity features diff --git a/test/models/test_gp_regression_mixed.py b/test/models/test_gp_regression_mixed.py index 58afb16957..2de7abcdf0 100644 --- a/test/models/test_gp_regression_mixed.py +++ b/test/models/test_gp_regression_mixed.py @@ -273,7 +273,12 @@ def test_construct_inputs(self): tkwargs = {"device": self.device, "dtype": dtype} X, Y = _get_random_data(batch_shape=batch_shape, m=1, d=d, **tkwargs) cat_dims = list(range(ncat)) - training_data = SupervisedDataset(X, Y) + training_data = SupervisedDataset( + X, + Y, + feature_names=[f"x{i}" for i in range(d)], + outcome_names=["y"], + ) model_kwargs = MixedSingleTaskGP.construct_inputs( training_data, categorical_features=cat_dims ) @@ -283,7 +288,13 @@ def test_construct_inputs(self): self.assertIsNone(model_kwargs["likelihood"]) # With train_Yvar. 
-            training_data = SupervisedDataset(X, Y, Y)
+            training_data = SupervisedDataset(
+                X,
+                Y,
+                Yvar=Y,
+                feature_names=[f"x{i}" for i in range(d)],
+                outcome_names=["y"],
+            )
             with self.assertWarnsRegex(InputDataWarning, "train_Yvar"):
                 model_kwargs = MixedSingleTaskGP.construct_inputs(
                     training_data, categorical_features=cat_dims
diff --git a/test/models/test_multitask.py b/test/models/test_multitask.py
index 280f8c05ff..750368ebcd 100644
--- a/test/models/test_multitask.py
+++ b/test/models/test_multitask.py
@@ -7,7 +7,7 @@
 import itertools
 import math
 import warnings
-from typing import List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import torch
 from botorch.acquisition.objective import ScalarizedPosteriorTransform
@@ -43,23 +43,31 @@
 from gpytorch.priors import GammaPrior, LogNormalPrior, SmoothedBoxPrior
 from gpytorch.priors.lkj_prior import LKJCovariancePrior
 from gpytorch.settings import max_cholesky_size, max_root_decomposition_size
+from torch import Tensor
 from torch.nn.functional import pad
 
 
-def _gen_datasets(yvar: Optional[float] = None, **tkwargs):
+def _gen_datasets(
+    yvar: Optional[float] = None, **tkwargs
+) -> Tuple[Dict[int, SupervisedDataset], Tuple[Tensor, Tensor, Tensor]]:
     X = torch.linspace(0, 0.95, 10, **tkwargs) + 0.05 * torch.rand(10, **tkwargs)
     X = X.unsqueeze(dim=-1)
     Y1 = torch.sin(X * (2 * math.pi)) + torch.randn_like(X) * 0.2
     Y2 = torch.cos(X * (2 * math.pi)) + torch.randn_like(X) * 0.2
     train_X = torch.cat([pad(X, (1, 0), value=i) for i in range(2)])
     train_Y = torch.cat([Y1, Y2])
-    if yvar is None:
-        return SupervisedDataset.dict_from_iter(X, (Y1, Y2)), (train_X, train_Y)
-    Yvar1 = torch.full_like(Y1, yvar)
-    Yvar2 = torch.full_like(Y2, yvar)
-    train_Yvar = torch.cat([Yvar1, Yvar2])
-    datasets = {0: SupervisedDataset(X, Y1, Yvar1), 1: SupervisedDataset(X, Y2, Yvar2)}
+    Yvar1 = None if yvar is None else torch.full_like(Y1, yvar)
+    Yvar2 = None if yvar is None else torch.full_like(Y2, yvar)
+    train_Yvar = None if yvar is None else torch.cat([Yvar1, Yvar2])
+    datasets = {
+        0: SupervisedDataset(
+            X, Y1, Yvar=Yvar1, feature_names=["X"], outcome_names=["y"]
+        ),
+        1: SupervisedDataset(
+            X, Y2, Yvar=Yvar2, feature_names=["X"], outcome_names=["y"]
+        ),
+    }
     return datasets, (train_X, train_Y, train_Yvar)
@@ -70,7 +78,7 @@ def _gen_model_and_data(
     outcome_transform=None,
     **tkwargs
 ):
-    datasets, (train_X, train_Y) = _gen_datasets(**tkwargs)
+    datasets, (train_X, train_Y, _) = _gen_datasets(**tkwargs)
     model = MultiTaskGP(
         train_X,
         train_Y,
@@ -83,7 +91,7 @@ def _gen_model_and_data(
 
 
 def _gen_model_single_output(**tkwargs):
-    _, (train_X, train_Y) = _gen_datasets(**tkwargs)
+    _, (train_X, train_Y, _) = _gen_datasets(**tkwargs)
     model = MultiTaskGP(train_X, train_Y, task_feature=0, output_tasks=[1])
     return model.to(**tkwargs)
@@ -117,7 +125,7 @@ def _gen_fixed_noise_model_single_output(**tkwargs):
 
 
 def _gen_fixed_prior_model(**tkwargs):
-    _, (train_X, train_Y) = _gen_datasets(**tkwargs)
+    _, (train_X, train_Y, _) = _gen_datasets(**tkwargs)
     sd_prior = GammaPrior(2.0, 0.15)
     sd_prior._event_shape = torch.Size([2])
     model = MultiTaskGP(
@@ -130,7 +138,7 @@ def _gen_fixed_prior_model(**tkwargs):
 
 
 def _gen_given_covar_module_model(**tkwargs):
-    _, (train_X, train_Y) = _gen_datasets(**tkwargs)
+    _, (train_X, train_Y, _) = _gen_datasets(**tkwargs)
     model = MultiTaskGP(
         train_X,
         train_Y,
@@ -296,7 +304,7 @@ def test_MultiTaskGP(self):
             MultiTaskGP(torch.rand(2, 2, 2), torch.rand(2, 2, 1), 0)
 
         # test that bad feature index throws correct error
-        _, (train_X, train_Y) = _gen_datasets(**tkwargs)
+        _, (train_X, train_Y, _) = _gen_datasets(**tkwargs)
         with self.assertRaises(ValueError):
             MultiTaskGP(train_X, train_Y, 2)
@@ -380,7 +388,7 @@ def test_MultiTaskGP_given_covar_module(self):
 
     def test_custom_mean_and_likelihood(self):
         tkwargs = {"device": self.device, "dtype": torch.double}
-        _, (train_X, train_Y) = _gen_datasets(**tkwargs)
+        _, (train_X, train_Y, _) = _gen_datasets(**tkwargs)
         mean_module = LinearMean(input_size=train_X.shape[-1])
         likelihood = GaussianLikelihood(noise_prior=LogNormalPrior(0, 1))
         model = MultiTaskGP(
@@ -496,7 +504,7 @@ def test_FixedNoiseMultiTaskGP(self):
             )
 
         # test that bad feature index throws correct error
-        _, (train_X, train_Y) = _gen_datasets(**tkwargs)
+        _, (train_X, train_Y, _) = _gen_datasets(**tkwargs)
         train_Yvar = torch.full_like(train_Y, 0.05)
         with self.assertRaises(ValueError):
             FixedNoiseMultiTaskGP(train_X, train_Y, train_Yvar, 2)
diff --git a/test/models/utils/test_parse_training_data.py b/test/models/utils/test_parse_training_data.py
index b43396388d..2e150742ad 100644
--- a/test/models/utils/test_parse_training_data.py
+++ b/test/models/utils/test_parse_training_data.py
@@ -22,7 +22,9 @@ def test_supervised(self):
         with self.assertRaisesRegex(NotImplementedError, "Could not find signature"):
             parse_training_data(Model, None)
 
-        dataset = SupervisedDataset(X=rand(3, 2), Y=rand(3, 1))
+        dataset = SupervisedDataset(
+            X=rand(3, 2), Y=rand(3, 1), feature_names=["a", "b"], outcome_names=["y"]
+        )
         with self.assertRaisesRegex(NotImplementedError, "Could not find signature"):
             parse_training_data(None, dataset)
 
@@ -33,14 +35,22 @@ def test_supervised(self):
 
     def test_fixedNoise(self):
         # Test passing a `SupervisedDataset`
-        dataset = SupervisedDataset(X=rand(3, 2), Y=rand(3, 1))
+        dataset = SupervisedDataset(
+            X=rand(3, 2), Y=rand(3, 1), feature_names=["a", "b"], outcome_names=["y"]
+        )
         parse = parse_training_data(FixedNoiseGP, dataset)
         self.assertTrue("train_Yvar" not in parse)
         self.assertTrue(torch.equal(dataset.X, parse["train_X"]))
         self.assertTrue(torch.equal(dataset.Y, parse["train_Y"]))
 
         # Test passing a `FixedNoiseDataset`
-        dataset = FixedNoiseDataset(X=rand(3, 2), Y=rand(3, 1), Yvar=rand(3, 1))
+        dataset = FixedNoiseDataset(
+            X=rand(3, 2),
+            Y=rand(3, 1),
+            Yvar=rand(3, 1),
+            feature_names=["a", "b"],
+            outcome_names=["y"],
+        )
         parse = parse_training_data(FixedNoiseGP, dataset)
         self.assertTrue(torch.equal(dataset.X, parse["train_X"]))
         self.assertTrue(torch.equal(dataset.Y, parse["train_Y"]))
@@ -53,7 +63,9 @@ def test_pairwiseGP_ranking(self):
         event_shape = Size([2 * datapoints.shape[-1]])
         dataset_X = SliceContainer(datapoints, indices, event_shape=event_shape)
         dataset_Y = tensor([[0, 1], [1, 0]]).expand(indices.shape)
-        dataset = RankingDataset(X=dataset_X, Y=dataset_Y)
+        dataset = RankingDataset(
+            X=dataset_X, Y=dataset_Y, feature_names=["a", "b"], outcome_names=["y"]
+        )
         parse = parse_training_data(PairwiseGP, dataset)
         self.assertTrue(dataset._X.values.equal(parse["datapoints"]))
 
@@ -63,13 +75,27 @@ def test_dict(self):
         n = 3
         m = 2
-        datasets = {i: SupervisedDataset(X=rand(n, 2), Y=rand(n, 1)) for i in range(m)}
+        datasets = {
+            i: SupervisedDataset(
+                X=rand(n, 2),
+                Y=rand(n, 1),
+                feature_names=["a", "b"],
+                outcome_names=["y"],
+            )
+            for i in range(m)
+        }
         parse_training_data(Model, {0: datasets[0]})
         with self.assertRaisesRegex(UnsupportedError, "multiple datasets to single"):
             parse_training_data(Model, datasets)
 
         _datasets = datasets.copy()
-        _datasets[m] = SupervisedDataset(rand(n, 2), rand(n, 1), rand(n, 1))
+        _datasets[m] = SupervisedDataset(
+            rand(n, 2),
+            rand(n, 1),
+            Yvar=rand(n, 1),
+            feature_names=["a", "b"],
+            outcome_names=["y"],
+        )
         with self.assertRaisesRegex(UnsupportedError, "Cannot combine .* hetero"):
             parse_training_data(MultiTaskGP, _datasets)
diff --git a/test/utils/test_datasets.py b/test/utils/test_datasets.py
index 6df06f9649..1575b2622b 100644
--- a/test/utils/test_datasets.py
+++ b/test/utils/test_datasets.py
@@ -14,45 +14,72 @@ class TestDatasets(BotorchTestCase):
     def test_supervised(self):
         # Generate some data
-        Xs = rand(4, 3, 2)
-        Ys = rand(4, 3, 1)
+        X = rand(3, 2)
+        Y = rand(3, 1)
+        feature_names = ["x1", "x2"]
+        outcome_names = ["y"]
 
         # Test `__init__`
-        dataset = SupervisedDataset(X=Xs[0], Y=Ys[0])
+        dataset = SupervisedDataset(
+            X=X, Y=Y, feature_names=feature_names, outcome_names=outcome_names
+        )
         self.assertIsInstance(dataset.X, Tensor)
         self.assertIsInstance(dataset._X, Tensor)
         self.assertIsInstance(dataset.Y, Tensor)
         self.assertIsInstance(dataset._Y, Tensor)
-
-        dataset = SupervisedDataset(
-            X=DenseContainer(Xs[0], Xs[0].shape[-1:]),
-            Y=DenseContainer(Ys[0], Ys[0].shape[-1:]),
+        self.assertEqual(dataset.feature_names, feature_names)
+        self.assertEqual(dataset.outcome_names, outcome_names)
+
+        dataset2 = SupervisedDataset(
+            X=DenseContainer(X, X.shape[-1:]),
+            Y=DenseContainer(Y, Y.shape[-1:]),
+            feature_names=feature_names,
+            outcome_names=outcome_names,
         )
-        self.assertIsInstance(dataset.X, Tensor)
-        self.assertIsInstance(dataset._X, DenseContainer)
-        self.assertIsInstance(dataset.Y, Tensor)
-        self.assertIsInstance(dataset._Y, DenseContainer)
+        self.assertIsInstance(dataset2.X, Tensor)
+        self.assertIsInstance(dataset2._X, DenseContainer)
+        self.assertIsInstance(dataset2.Y, Tensor)
+        self.assertIsInstance(dataset2._Y, DenseContainer)
+        self.assertEqual(dataset, dataset2)
 
         # Test `_validate`
         with self.assertRaisesRegex(ValueError, "Batch dimensions .* incompatible."):
-            SupervisedDataset(X=rand(1, 2), Y=rand(2, 1))
-
-        # Test `dict_from_iter` and `__eq__`
-        datasets = SupervisedDataset.dict_from_iter(X=Xs.unbind(), Y=Ys.unbind())
-        self.assertIsInstance(datasets, dict)
-        self.assertEqual(tuple(datasets.keys()), tuple(range(len(Xs))))
-        for i, dataset in datasets.items():
-            self.assertEqual(dataset, SupervisedDataset(Xs[i], Ys[i]))
-        self.assertNotEqual(datasets[0], datasets)
-
-        datasets = SupervisedDataset.dict_from_iter(X=Xs[0], Y=Ys.unbind())
-        self.assertEqual(len(datasets), len(Xs))
-        for i in range(1, len(Xs)):
-            self.assertTrue(torch.equal(datasets[0].X, datasets[i].X))
+            SupervisedDataset(
+                X=rand(1, 2),
+                Y=rand(2, 1),
+                feature_names=feature_names,
+                outcome_names=outcome_names,
+            )
+        with self.assertRaisesRegex(ValueError, "`Y` and `Yvar`"):
+            SupervisedDataset(
+                X=rand(2, 2),
+                Y=rand(2, 1),
+                Yvar=rand(2),
+                feature_names=feature_names,
+                outcome_names=outcome_names,
+            )
+        with self.assertRaisesRegex(ValueError, "feature_names"):
+            SupervisedDataset(
+                X=rand(2, 2),
+                Y=rand(2, 1),
+                feature_names=[],
+                outcome_names=outcome_names,
+            )
+        with self.assertRaisesRegex(ValueError, "outcome_names"):
+            SupervisedDataset(
+                X=rand(2, 2),
+                Y=rand(2, 1),
+                feature_names=feature_names,
+                outcome_names=[],
+            )
 
         # Test with Yvar.
         dataset = SupervisedDataset(
-            X=Xs[0], Y=Ys[0], Yvar=DenseContainer(Ys[0], Ys[0].shape[-1:])
+            X=X,
+            Y=Y,
+            Yvar=DenseContainer(Y, Y.shape[-1:]),
+            feature_names=feature_names,
+            outcome_names=outcome_names,
         )
         self.assertIsInstance(dataset.X, Tensor)
         self.assertIsInstance(dataset._X, Tensor)
@@ -63,54 +90,103 @@ def test_supervised(self):
 
     def test_fixedNoise(self):
         # Generate some data
-        Xs = rand(4, 3, 2)
-        Ys = rand(4, 3, 1)
-        Ys_var = rand(4, 3, 1)
-
-        # Test `dict_from_iter`
-        datasets = FixedNoiseDataset.dict_from_iter(
-            X=Xs.unbind(),
-            Y=Ys.unbind(),
-            Yvar=Ys_var.unbind(),
-        )
-        for i, dataset in datasets.items():
-            self.assertTrue(dataset.X.equal(Xs[i]))
-            self.assertTrue(dataset.Y.equal(Ys[i]))
-            self.assertTrue(dataset.Yvar.equal(Ys_var[i]))
-
-        # Test handling of Tensor-valued arguments to `dict_from_iter`
-        datasets = FixedNoiseDataset.dict_from_iter(
-            X=Xs[0],
-            Y=Ys[1],
-            Yvar=Ys_var.unbind(),
+        X = rand(3, 2)
+        Y = rand(3, 1)
+        Yvar = rand(3, 1)
+        feature_names = ["x1", "x2"]
+        outcome_names = ["y"]
+        dataset = FixedNoiseDataset(
+            X=X,
+            Y=Y,
+            Yvar=Yvar,
+            feature_names=feature_names,
+            outcome_names=outcome_names,
         )
-        for dataset in datasets.values():
-            self.assertTrue(Xs[0].equal(dataset.X))
-            self.assertTrue(Ys[1].equal(dataset.Y))
+        self.assertTrue(torch.equal(dataset.X, X))
+        self.assertTrue(torch.equal(dataset.Y, Y))
+        self.assertTrue(torch.equal(dataset.Yvar, Yvar))
+        self.assertEqual(dataset.feature_names, feature_names)
+        self.assertEqual(dataset.outcome_names, outcome_names)
 
         with self.assertRaisesRegex(
             ValueError, "`Y` and `Yvar`"
         ), self.assertWarnsRegex(DeprecationWarning, "SupervisedDataset"):
-            FixedNoiseDataset(X=Xs, Y=Ys, Yvar=Ys_var[0])
+            FixedNoiseDataset(
+                X=X,
+                Y=Y,
+                Yvar=Yvar.squeeze(),
+                feature_names=feature_names,
+                outcome_names=outcome_names,
+            )
 
     def test_ranking(self):
         # Test `_validate`
         X_val = rand(16, 2)
         X_idx = stack([randperm(len(X_val))[:3] for _ in range(1)])
         X = SliceContainer(X_val, X_idx, event_shape=Size([3 * X_val.shape[-1]]))
+        feature_names = ["x1", "x2"]
+        outcome_names = ["ranking indices"]
+
+        with self.assertRaisesRegex(ValueError, "The `values` field of `X`"):
+            RankingDataset(
+                X=X,
+                Y=tensor([[-1, 0, 1]]),
+                feature_names=feature_names[:1],
+                outcome_names=outcome_names,
+            )
         with self.assertRaisesRegex(ValueError, "out-of-bounds"):
-            RankingDataset(X=X, Y=tensor([[-1, 0, 1]]))
-        RankingDataset(X=X, Y=tensor([[2, 0, 1]]))
+            RankingDataset(
+                X=X,
+                Y=tensor([[-1, 0, 1]]),
+                feature_names=feature_names,
+                outcome_names=outcome_names,
+            )
+        RankingDataset(
+            X=X,
+            Y=tensor([[2, 0, 1]]),
+            feature_names=feature_names,
+            outcome_names=outcome_names,
+        )
 
         with self.assertRaisesRegex(ValueError, "out-of-bounds"):
-            RankingDataset(X=X, Y=tensor([[0, 1, 3]]))
-        RankingDataset(X=X, Y=tensor([[0, 1, 2]]))
+            RankingDataset(
+                X=X,
+                Y=tensor([[0, 1, 3]]),
+                feature_names=feature_names,
+                outcome_names=outcome_names,
+            )
+        RankingDataset(
+            X=X,
+            Y=tensor([[0, 1, 2]]),
+            feature_names=feature_names,
+            outcome_names=outcome_names,
+        )
 
         with self.assertRaisesRegex(ValueError, "missing zero-th rank."):
-            RankingDataset(X=X, Y=tensor([[1, 2, 2]]))
-        RankingDataset(X=X, Y=tensor([[0, 1, 1]]))
+            RankingDataset(
+                X=X,
+                Y=tensor([[1, 2, 2]]),
+                feature_names=feature_names,
+                outcome_names=outcome_names,
+            )
+        RankingDataset(
+            X=X,
+            Y=tensor([[0, 1, 1]]),
+            feature_names=feature_names,
+            outcome_names=outcome_names,
+        )
 
         with self.assertRaisesRegex(ValueError, "ranks not skipped after ties."):
-            RankingDataset(X=X, Y=tensor([[0, 0, 1]]))
-        RankingDataset(X=X, Y=tensor([[0, 0, 2]]))
+            RankingDataset(
+                X=X,
+                Y=tensor([[0, 0, 1]]),
+                feature_names=feature_names,
+                outcome_names=outcome_names,
+            )
+        RankingDataset(
+            X=X,
+            Y=tensor([[0, 0, 2]]),
+            feature_names=feature_names,
+            outcome_names=outcome_names,
+        )
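
The diff above makes `feature_names` and `outcome_names` required keyword-only arguments of `SupervisedDataset`. The following is a minimal usage sketch of the updated constructor, assuming the hunks above are applied as-is; the toy shapes and names are purely illustrative:

import torch
from botorch.utils.datasets import SupervisedDataset

# Toy data: 8 observations, 2 features, 1 outcome.
X = torch.rand(8, 2)
Y = torch.rand(8, 1)

# The name lists must match the number of columns of `X` and `Y`, respectively.
dataset = SupervisedDataset(
    X=X,
    Y=Y,
    feature_names=["x1", "x2"],
    outcome_names=["y"],
)

# A mismatched number of names now fails validation at construction time.
try:
    SupervisedDataset(X=X, Y=Y, feature_names=["x1"], outcome_names=["y"])
except ValueError as exc:
    print(exc)  # "`X` must have the same number of columns as ... `feature_names`."

Note that `Yvar` remains optional in `SupervisedDataset` but, like the name lists, now follows the `*` marker in the signature, so it must be passed by keyword.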
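For `RankingDataset`, the base shape check is skipped and the feature names are instead validated against the `values` field of the `SliceContainer`. A sketch under the same assumption (the data and the feature/outcome names are made up for illustration):

import torch
from botorch.utils.containers import SliceContainer
from botorch.utils.datasets import RankingDataset

# 16 candidate items with 2 features each; each of 8 queries ranks 3 items.
values = torch.rand(16, 2)
indices = torch.stack([torch.randperm(16)[:3] for _ in range(8)])
X = SliceContainer(values, indices, event_shape=torch.Size([3 * values.shape[-1]]))
Y = torch.stack([torch.randperm(3) for _ in range(8)])  # valid rankings 0..2

# `feature_names` must have as many entries as `values` has columns (2 here),
# not as many as the flattened event shape (6), hence the relaxed validation.
dataset = RankingDataset(
    X=X,
    Y=Y,
    feature_names=["item_feature_0", "item_feature_1"],
    outcome_names=["ranking outcome"],
)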