Skip to content

Commit

Permalink
Modify how collection rank is computed in dataset subqueries.
Browse files Browse the repository at this point in the history
This case-based approach is much closer to Andy Salnikov's initial
implementation of find-first dataset queries; I think it's likely that
changing it was at most unnecessary performance-wise (I was looking
for ways to simplify a query the planner was having trouble with, but
this construct - despite being hard for humans to parse when reading
the SQL - probably wasn't playing a role in that).  And some other
query system changes have made it preferable from a code-organization
standpoint, providing it's no worse from a performance standpoint.

To that end, in some quick benchmarks (on PostgreSQL, querying 16
collections with the same dataset types and a spatial constraint to
make the overall query nontrivial), this was consistently faster than
the old, UNION-based rank expression, but only barely; I'm by no means
certain that difference is significant, but I'm at least pretty
confident that this is no worse for performance.
  • Loading branch information
TallJimbo committed Sep 6, 2022
1 parent 9866319 commit bb7ba89
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 23 deletions.
28 changes: 26 additions & 2 deletions python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@ def select(
run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
rank: SimpleQuery.Select.Or[None] = None,
) -> sqlalchemy.sql.Selectable:
# Docstring inherited from DatasetRecordStorage.
collection_types = {collection.type for collection in collections}
Expand Down Expand Up @@ -406,7 +407,13 @@ def select(
TimespanReprClass.fromLiteral(None).overlaps(TimespanReprClass.fromLiteral(timespan))
)
self._finish_single_select(
tags_query, self._tags, collections, id=id, run=run, ingestDate=ingestDate
tags_query,
self._tags,
collections,
id=id,
run=run,
ingestDate=ingestDate,
rank=rank,
)
else:
tags_query = None
Expand All @@ -430,7 +437,13 @@ def select(
)
)
self._finish_single_select(
calibs_query, self._calibs, collections, id=id, run=run, ingestDate=ingestDate
calibs_query,
self._calibs,
collections,
id=id,
run=run,
ingestDate=ingestDate,
rank=rank,
)
else:
calibs_query = None
Expand All @@ -451,6 +464,7 @@ def _finish_single_select(
id: SimpleQuery.Select.Or[Optional[int]],
run: SimpleQuery.Select.Or[None],
ingestDate: SimpleQuery.Select.Or[Optional[Timespan]],
rank: SimpleQuery.Select.Or[None],
) -> None:
dataset_id_col = table.columns.dataset_id
collection_col = table.columns[self._collections.getCollectionForeignKeyName()]
Expand All @@ -468,6 +482,16 @@ def _finish_single_select(
query.where.append(sqlalchemy.sql.literal(False))
else:
query.where.append(collection_col.in_([collection.key for collection in collections]))
# Add rank, if requested, as a CASE-based calculation on the
# collection column.
if rank is not None:
assert rank is SimpleQuery.Select, "Cannot constrain rank, only select it."
query.columns.append(
sqlalchemy.sql.case(
{record.key: n for n, record in enumerate(collections)},
value=collection_col,
).label("rank")
)
# We can always get the dataset_id from the tags/calibs table or
# constrain it there. Can't use kwargs for that because we need to
# alias it to 'id'.
Expand Down
5 changes: 5 additions & 0 deletions python/lsst/daf/butler/registry/interfaces/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ def select(
run: SimpleQuery.Select.Or[None] = SimpleQuery.Select,
timespan: SimpleQuery.Select.Or[Optional[Timespan]] = SimpleQuery.Select,
ingestDate: SimpleQuery.Select.Or[Optional[Timespan]] = None,
rank: SimpleQuery.Select.Or[None] = None,
) -> sqlalchemy.sql.Selectable:
"""Return a SQLAlchemy object that represents a ``SELECT`` query for
this `DatasetType`.
Expand Down Expand Up @@ -432,6 +433,10 @@ def select(
ingest times which are inside given timespan and also include
timestamp in the result columns. If `None` (default) then there is
no constraint and timestamp is not returned.
rank : `Select` or `None`
If `Select`, include a calculated column that is the integer rank
of the row's collection in the given list of collections, starting
from zero.
Returns
-------
Expand Down
33 changes: 12 additions & 21 deletions python/lsst/daf/butler/registry/queries/_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,27 +406,18 @@ def _build_dataset_search_subquery(
# {dst}_window.rownum = 1;
#
# We'll start with the Common Table Expression (CTE) at the top.
subqueries = []
for rank, collection_record in enumerate(collections):
ssq = storage.select(
collection_record,
dataId=SimpleQuery.Select,
id=SimpleQuery.Select,
run=SimpleQuery.Select,
ingestDate=SimpleQuery.Select,
timespan=None,
)
subqueries.append(ssq.add_columns(sqlalchemy.sql.literal(rank).label("rank")))
# Although one would expect that these subqueries can be UNION ALL
# instead of UNION because each subquery is already distinct, it turns
# out that with many subqueries this causes catastrophic performance
# problems with both sqlite and postgres. Using UNION may require more
# table scans, but a much simpler query plan given our table
# structures. See DM-31429.
search = sqlalchemy.sql.union(*subqueries).cte(f"{storage.datasetType.name}_search")
# Now we fill out the SELECT the CTE, and the subquery it contains (at
# the same time, since they have the same columns, aside from the OVER
# clause).
search = storage.select(
*collections,
dataId=SimpleQuery.Select,
id=SimpleQuery.Select,
run=SimpleQuery.Select,
ingestDate=SimpleQuery.Select,
timespan=None,
rank=SimpleQuery.Select,
).cte(f"{storage.datasetType.name}_search")
# Now we fill out the SELECT from the CTE, and the subquery it contains
# (at the same time, since they have the same columns, aside from the
# OVER clause).
run_key_name = self._managers.collections.getRunForeignKeyName()
window_data_id_cols = [
search.columns[name].label(name) for name in storage.datasetType.dimensions.required.names
Expand Down

0 comments on commit bb7ba89

Please sign in to comment.