120 changes: 85 additions & 35 deletions botorch/utils/datasets.py
@@ -9,8 +9,7 @@
from __future__ import annotations

import warnings
from itertools import count, repeat
from typing import Any, Dict, Hashable, Iterable, Optional, TypeVar, Union
from typing import Any, Iterable, List, Optional, TypeVar, Union

import torch
from botorch.utils.containers import BotorchContainer, SliceContainer
@@ -31,10 +30,19 @@ class SupervisedDataset:

X = torch.rand(16, 2)
Y = torch.rand(16, 1)
A = SupervisedDataset(X, Y)
feature_names = ["learning_rate", "embedding_dim"]
outcome_names = ["neg training loss"]
A = SupervisedDataset(
X=X,
Y=Y,
feature_names=feature_names,
outcome_names=outcome_names,
)
B = SupervisedDataset(
DenseContainer(X, event_shape=X.shape[-1:]),
DenseContainer(Y, event_shape=Y.shape[-1:]),
X=DenseContainer(X, event_shape=X.shape[-1:]),
Y=DenseContainer(Y, event_shape=Y.shape[-1:]),
feature_names=feature_names,
outcome_names=outcome_names,
)
assert A == B
"""
@@ -43,6 +51,9 @@ def __init__(
self,
X: Union[BotorchContainer, Tensor],
Y: Union[BotorchContainer, Tensor],
*,
feature_names: List[str],
outcome_names: List[str],
Yvar: Union[BotorchContainer, Tensor, None] = None,
validate_init: bool = True,
) -> None:
@@ -51,13 +62,17 @@ def __init__(
Args:
X: A `Tensor` or `BotorchContainer` representing the input features.
Y: A `Tensor` or `BotorchContainer` representing the outcomes.
feature_names: A list of names of the features in `X`.
outcome_names: A list of names of the outcomes in `Y`.
Yvar: An optional `Tensor` or `BotorchContainer` representing
the observation noise.
validate_init: If `True`, validates the input shapes.
"""
self._X = X
self._Y = Y
self._Yvar = Yvar
self.feature_names = feature_names
self.outcome_names = outcome_names
if validate_init:
self._validate()

@@ -79,7 +94,23 @@ def Yvar(self) -> Optional[Tensor]:
return self._Yvar
return self._Yvar()

def _validate(self) -> None:
def _validate(
self,
validate_feature_names: bool = True,
validate_outcome_names: bool = True,
) -> None:
r"""Checks that the shapes of the inputs are compatible with each other.

Args:
validate_feature_names: By default, we validate that the length of
`feature_names` matches the # of columns of `self.X`. If a
particular dataset, e.g., `RankingDataset`, is known to violate
this assumption, this can be set to `False`.
validate_outcome_names: By default, we validate that the length of
`outcome_names` matches the # of columns of `self.Y`. If a
particular dataset, e.g., `RankingDataset`, is known to violate
this assumption, this can be set to `False`.
"""
shape_X = self.X.shape
if isinstance(self._X, BotorchContainer):
shape_X = shape_X[: len(shape_X) - len(self._X.event_shape)]
@@ -94,31 +125,16 @@ def _validate(self) -> None:
raise ValueError("Batch dimensions of `X` and `Y` are incompatible.")
if self.Yvar is not None and self.Yvar.shape != self.Y.shape:
raise ValueError("Shapes of `Y` and `Yvar` are incompatible.")

@classmethod
def dict_from_iter(
cls,
X: MaybeIterable[Union[BotorchContainer, Tensor]],
Y: MaybeIterable[Union[BotorchContainer, Tensor]],
Yvar: Optional[MaybeIterable[Union[BotorchContainer, Tensor]]] = None,
*,
keys: Optional[Iterable[Hashable]] = None,
) -> Dict[Hashable, SupervisedDataset]:
r"""Returns a dictionary of `SupervisedDataset` from iterables."""
single_X = isinstance(X, (Tensor, BotorchContainer))
single_Y = isinstance(Y, (Tensor, BotorchContainer))
if single_X:
X = (X,) if single_Y else repeat(X)
if single_Y:
Y = (Y,) if single_X else repeat(Y)
Yvar = repeat(Yvar) if isinstance(Yvar, (Tensor, BotorchContainer)) else Yvar

# Pass in Yvar only if it is not None.
iterables = (X, Y) if Yvar is None else (X, Y, Yvar)
return {
elements[0]: cls(*elements[1:])
for elements in zip(keys or count(), *iterables)
}
if validate_feature_names and len(self.feature_names) != self.X.shape[-1]:
raise ValueError(
"`X` must have the same number of columns as the number of "
"features in `feature_names`."
)
if validate_outcome_names and len(self.outcome_names) != self.Y.shape[-1]:
raise ValueError(
"`Y` must have the same number of columns as the number of "
"outcomes in `outcome_names`."
)

def __eq__(self, other: Any) -> bool:
return (
@@ -130,6 +146,8 @@ def __eq__(self, other: Any) -> bool:
if self.Yvar is None
else torch.equal(self.Yvar, other.Yvar)
)
and self.feature_names == other.feature_names
and self.outcome_names == other.outcome_names
)


@@ -145,14 +163,23 @@ def __init__(
X: Union[BotorchContainer, Tensor],
Y: Union[BotorchContainer, Tensor],
Yvar: Union[BotorchContainer, Tensor],
feature_names: List[str],
outcome_names: List[str],
validate_init: bool = True,
) -> None:
r"""Initialize a `FixedNoiseDataset` -- deprecated!"""
warnings.warn(
"`FixedNoiseDataset` is deprecated. Use `SupervisedDataset` instead.",
DeprecationWarning,
)
super().__init__(X=X, Y=Y, Yvar=Yvar, validate_init=validate_init)
super().__init__(
X=X,
Y=Y,
feature_names=feature_names,
outcome_names=outcome_names,
Yvar=Yvar,
validate_init=validate_init,
)


class RankingDataset(SupervisedDataset):
@@ -177,26 +204,49 @@ class RankingDataset(SupervisedDataset):
torch.stack([torch.randperm(3) for _ in range(8)]),
event_shape=torch.Size([3])
)
dataset = RankingDataset(X, Y)
feature_names = ["item_0", "item_1"]
outcome_names = ["ranking outcome"]
dataset = RankingDataset(
X=X,
Y=Y,
feature_names=feature_names,
outcome_names=outcome_names,
)
"""

def __init__(
self,
X: SliceContainer,
Y: Union[BotorchContainer, Tensor],
feature_names: List[str],
outcome_names: List[str],
validate_init: bool = True,
) -> None:
r"""Construct a `RankingDataset`.

Args:
X: A `SliceContainer` representing the input features being ranked.
Y: A `Tensor` or `BotorchContainer` representing the rankings.
feature_names: A list of names of the features in `X`.
outcome_names: A list of names of the outcomes in `Y`.
validate_init: If `True`, validates the input shapes.
"""
super().__init__(X=X, Y=Y, Yvar=None, validate_init=validate_init)
super().__init__(
X=X,
Y=Y,
feature_names=feature_names,
outcome_names=outcome_names,
Yvar=None,
validate_init=validate_init,
)

def _validate(self) -> None:
super()._validate()
super()._validate(validate_feature_names=False, validate_outcome_names=False)
if len(self.feature_names) != self._X.values.shape[-1]:
raise ValueError(
"The `values` field of `X` must have the same number of columns as "
"the number of features in `feature_names`."
)

Y = self.Y
arity = self._X.indices.shape[-1]
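Reviewer note: to summarize the new contract above, `feature_names` and `outcome_names` are required keyword-only arguments whose lengths are validated against the trailing dimensions of `X` and `Y` at construction time. A minimal sketch (the tensor shapes and names below are illustrative, not taken from this diff):

import torch
from botorch.utils.datasets import SupervisedDataset

X = torch.rand(16, 2)
Y = torch.rand(16, 1)

# Names are now required and must match the trailing dims of X / Y.
dataset = SupervisedDataset(
    X=X,
    Y=Y,
    feature_names=["x0", "x1"],  # len == X.shape[-1]
    outcome_names=["y"],  # len == Y.shape[-1]
)

# A length mismatch fails fast in __init__ when validate_init=True.
try:
    SupervisedDataset(X=X, Y=Y, feature_names=["x0"], outcome_names=["y"])
except ValueError as err:
    print(err)  # "`X` must have the same number of columns ..."
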
26 changes: 23 additions & 3 deletions test/acquisition/test_input_constructors.py
@@ -108,10 +108,30 @@ def setUp(self, suppress_input_warnings: bool = True) -> None:
X2 = torch.rand(3, 2)
Y1 = torch.rand(3, 1)
Y2 = torch.rand(3, 1)
feature_names = ["X1", "X2"]
outcome_names = ["Y"]

self.blockX_blockY = SupervisedDataset.dict_from_iter(X1, Y1)
self.blockX_multiY = SupervisedDataset.dict_from_iter(X1, (Y1, Y2))
self.multiX_multiY = SupervisedDataset.dict_from_iter((X1, X2), (Y1, Y2))
self.blockX_blockY = {
0: SupervisedDataset(
X1, Y1, feature_names=feature_names, outcome_names=outcome_names
)
}
self.blockX_multiY = {
0: SupervisedDataset(
X1, Y1, feature_names=feature_names, outcome_names=outcome_names
),
1: SupervisedDataset(
X1, Y2, feature_names=feature_names, outcome_names=outcome_names
),
}
self.multiX_multiY = {
0: SupervisedDataset(
X1, Y1, feature_names=feature_names, outcome_names=outcome_names
),
1: SupervisedDataset(
X2, Y2, feature_names=feature_names, outcome_names=outcome_names
),
}
self.bounds = 2 * [(0.0, 1.0)]


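Reviewer note: with `SupervisedDataset.dict_from_iter` removed, call sites now build the key-to-dataset mapping directly, as the updated `setUp` above does. A minimal migration sketch under the same assumptions (integer keys, one dataset per outcome):

import torch
from botorch.utils.datasets import SupervisedDataset

X1, X2 = torch.rand(3, 2), torch.rand(3, 2)
Y1, Y2 = torch.rand(3, 1), torch.rand(3, 1)
feature_names = ["X1", "X2"]

# Replaces the old SupervisedDataset.dict_from_iter((X1, X2), (Y1, Y2)).
multiX_multiY = {
    i: SupervisedDataset(
        X, Y, feature_names=feature_names, outcome_names=[f"Y{i + 1}"]
    )
    for i, (X, Y) in enumerate([(X1, Y1), (X2, Y2)])
}
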
4 changes: 3 additions & 1 deletion test/models/test_fully_bayesian.py
@@ -550,7 +550,9 @@ def test_construct_inputs(self):
X, Y, Yvar, model = self._get_data_and_model(
infer_noise=infer_noise, **tkwargs
)
training_data = SupervisedDataset(X, Y, Yvar)
training_data = SupervisedDataset(
X, Y, Yvar=Yvar, feature_names=["1", "2", "3", "4"], outcome_names=["1"]
)

data_dict = model.construct_inputs(training_data)
self.assertTrue(X.equal(data_dict["train_X"]))
11 changes: 3 additions & 8 deletions test/models/test_fully_bayesian_multitask.py
@@ -566,14 +566,9 @@ def test_construct_inputs(self):
for dtype, infer_noise in [(torch.float, False), (torch.double, True)]:
tkwargs = {"device": self.device, "dtype": dtype}
task_feature = 0

if infer_noise:
datasets, (train_X, train_Y) = _gen_datasets(yvar=None, **tkwargs)
train_Yvar = None
else:
datasets, (train_X, train_Y, train_Yvar) = _gen_datasets(
yvar=0.05, **tkwargs
)
datasets, (train_X, train_Y, train_Yvar) = _gen_datasets(
yvar=None if infer_noise else 0.05, **tkwargs
)

model = SaasFullyBayesianMultiTaskGP(
train_X=train_X,
15 changes: 13 additions & 2 deletions test/models/test_gp_regression.py
@@ -374,7 +374,12 @@ def test_construct_inputs(self):
)
X = model_kwargs["train_X"]
Y = model_kwargs["train_Y"]
training_data = SupervisedDataset(X, Y)
training_data = SupervisedDataset(
X,
Y,
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)
data_dict = model.construct_inputs(training_data)
self.assertTrue(X.equal(data_dict["train_X"]))
self.assertTrue(Y.equal(data_dict["train_Y"]))
@@ -448,7 +453,13 @@ def test_construct_inputs(self):
X = model_kwargs["train_X"]
Y = model_kwargs["train_Y"]
Yvar = model_kwargs["train_Yvar"]
training_data = SupervisedDataset(X, Y, Yvar)
training_data = SupervisedDataset(
X,
Y,
Yvar=Yvar,
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)
data_dict = model.construct_inputs(training_data)
self.assertTrue(X.equal(data_dict["train_X"]))
self.assertTrue(Y.equal(data_dict["train_Y"]))
19 changes: 17 additions & 2 deletions test/models/test_gp_regression_fidelity.py
@@ -414,7 +414,14 @@ def test_construct_inputs(self):
lin_truncated=lin_trunc,
**tkwargs,
)
training_data = SupervisedDataset(kwargs["train_X"], kwargs["train_Y"])

X = kwargs["train_X"]
training_data = SupervisedDataset(
X=X,
Y=kwargs["train_Y"],
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)

# missing fidelity features
with self.assertRaisesRegex(TypeError, "argument: 'fidelity_features'"):
@@ -523,7 +530,13 @@ def test_construct_inputs(self):
lin_truncated=lin_trunc,
**tkwargs,
)
training_data = SupervisedDataset(kwargs["train_X"], kwargs["train_Y"])
X = kwargs["train_X"]
training_data = SupervisedDataset(
X=X,
Y=kwargs["train_Y"],
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)
data_dict = model.construct_inputs(training_data, fidelity_features=[1])
self.assertTrue("train_Yvar" not in data_dict)

@@ -532,6 +545,8 @@ def test_construct_inputs(self):
X=kwargs["train_X"],
Y=kwargs["train_Y"],
Yvar=torch.full(kwargs["train_Y"].shape[:-1] + (1,), 0.1),
feature_names=[f"x{i}" for i in range(X.shape[-1])],
outcome_names=["y"],
)

# missing fidelity features
15 changes: 13 additions & 2 deletions test/models/test_gp_regression_mixed.py
@@ -273,7 +273,12 @@ def test_construct_inputs(self):
tkwargs = {"device": self.device, "dtype": dtype}
X, Y = _get_random_data(batch_shape=batch_shape, m=1, d=d, **tkwargs)
cat_dims = list(range(ncat))
training_data = SupervisedDataset(X, Y)
training_data = SupervisedDataset(
X,
Y,
feature_names=[f"x{i}" for i in range(d)],
outcome_names=["y"],
)
model_kwargs = MixedSingleTaskGP.construct_inputs(
training_data, categorical_features=cat_dims
)
@@ -283,7 +288,13 @@ def test_construct_inputs(self):
self.assertIsNone(model_kwargs["likelihood"])

# With train_Yvar.
training_data = SupervisedDataset(X, Y, Y)
training_data = SupervisedDataset(
X,
Y,
Yvar=Y,
feature_names=[f"x{i}" for i in range(d)],
outcome_names=["y"],
)
with self.assertWarnsRegex(InputDataWarning, "train_Yvar"):
model_kwargs = MixedSingleTaskGP.construct_inputs(
training_data, categorical_features=cat_dims