DM-27344: Add butler query-dimension-records subcommand #442

Merged: 5 commits, Nov 24, 2020
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/cli/cmd/__init__.py
@@ -29,6 +29,7 @@
    "query_data_ids",
    "query_dataset_types",
    "query_datasets",
    "query_dimension_records",
    "remove_dataset_type",
)

@@ -43,5 +44,6 @@
    query_data_ids,
    query_dataset_types,
    query_datasets,
    query_dimension_records,
    remove_dataset_type,
)
38 changes: 29 additions & 9 deletions python/lsst/daf/butler/cli/cmd/commands.py
@@ -20,16 +20,17 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import click
import yaml

from ..opt import (
    collection_type_option,
    collection_argument,
    collections_option,
    components_option,
    dataset_type_option,
    datasets_option,
    dimensions_argument,
    directory_argument,
    element_argument,
    glob_argument,
    options_file_option,
    repo_argument,
@@ -179,17 +180,15 @@ def query_collections(*args, **kwargs):
@glob_argument(help="GLOB is one or more glob-style expressions that fully or partially identify the "
                    "dataset types to return.")
@verbose_option(help="Include dataset type name, dimensions, and storage class in output.")
@click.option("--components/--no-components",
              default=None,
              help="For --components, apply all expression patterns to component dataset type names as well. "
                   "For --no-components, never apply patterns to components. Default (where neither is "
                   "specified) is to apply patterns to components only if their parent datasets were not "
                   "matched by the expression. Fully-specified component datasets (`str` or `DatasetType` "
                   "instances) are always included.")
@components_option()
@options_file_option()
def query_dataset_types(*args, **kwargs):
    """Get the dataset types in a repository."""
    print(yaml.dump(cli_handle_exception(script.queryDatasetTypes, *args, **kwargs), sort_keys=False))
    table = cli_handle_exception(script.queryDatasetTypes, *args, **kwargs)
    if table:
        table.pprint_all()
    else:
        print("No results. Try --help for more information.")


@click.command(cls=ButlerCommand)
@@ -269,3 +268,24 @@ def query_data_ids(**kwargs):
print("No results. Try requesting some dimensions or datasets, see --help for more information.")
else:
print("No results. Try --help for more information.")


@click.command(cls=ButlerCommand)
@repo_argument(required=True)
@element_argument(required=True)
@datasets_option(help=unwrap("""An expression that fully or partially identifies dataset types that should
                             constrain the yielded records. Only affects results when used with
                             --collections."""))
@collections_option(help=collections_option.help + " Only affects results when used with --datasets.")
@where_option(help=whereHelp)
@click.option("--no-check", is_flag=True,
help=unwrap("""Don't check the query before execution. By default the query is checked before it
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TallJimbo what do we gain by having this option? When is disabling the check a good idea?

Copy link
Member

@TallJimbo TallJimbo Nov 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The check can generate false positives for some valid-but-rare queries. Those all involve wanting multiple instruments and a WHERE constraint on a table derived from instrument (e.g. exposure) on a column in that table whose values are nevertheless instrument-agnostic (e.g. exposure.exposure_time > 30). They then fall into two categories:

  • Cases where you do restrict the instruments (or skymaps) you want, but use IN instead of = and OR: instrument IN ('HSC', 'DECam') AND exposure.exposure_time > 30. The checker just isn't smart enough to rewrite the IN.
  • Cases where you actually want to query all of the instruments (or skymaps) in the registry: just exposure.exposure_time > 30. Note that I think we still want to check this case by default, because usually the user is thinking of a specific instrument but we don't know that.

Similar cases may exist for skymaps, but it's harder to think of practically useful cases given the attributes actually on the tract and patch tables.

I'm open to opinions on whether that's sufficiently niche as to make this option not worth its maintenance weight.

executed, this may reject some valid queries that resemble common mistakes."""))
@options_file_option()
def query_dimension_records(**kwargs):
    """Query for dimension information."""
    table = cli_handle_exception(script.queryDimensionRecords, **kwargs)
    if table:
        table.pprint_all()
    else:
        print("No results. Try --help for more information.")
5 changes: 5 additions & 0 deletions python/lsst/daf/butler/cli/opt/arguments.py
@@ -31,6 +31,11 @@
directory_argument = MWArgumentDecorator("directory",
                                         help="DIRECTORY is the folder containing dataset files.")


element_argument = MWArgumentDecorator("element",
                                       help="ELEMENT is the dimension element to obtain.")


glob_argument = MWArgumentDecorator("glob",
                                    callback=split_commas,
                                    help="GLOB is one or more strings to apply to the search.",
13 changes: 12 additions & 1 deletion python/lsst/daf/butler/cli/opt/options.py
@@ -62,12 +62,23 @@ def makeCollectionTypes(context, param, value):

collections_option = MWOptionDecorator("--collections",
                                       help=unwrap("""One or more expressions that fully or partially identify
                                                   the collections to search for datasets.If not provided all
                                                   the collections to search for datasets. If not provided all
                                                   datasets are returned."""),
                                       multiple=True,
                                       callback=split_commas)


components_option = MWOptionDecorator("--components/--no-components",
                                      default=None,
                                      help=unwrap("""For --components, apply all expression patterns to
                                                  component dataset type names as well. For --no-components,
                                                  never apply patterns to components. Default (where neither
                                                  is specified) is to apply patterns to components only if
                                                  their parent datasets were not matched by the expression.
                                                  Fully-specified component datasets (`str` or `DatasetType`
                                                  instances) are always included."""))


config_option = MWOptionDecorator("-c", "--config",
                                  callback=split_kv,
                                  help="Config override, as a key-value pair.",
6 changes: 6 additions & 0 deletions python/lsst/daf/butler/cli/utils.py
@@ -638,6 +638,12 @@ def opts(self):
line."""
return self._opts

@property
def help(self):
"""Get the help text for this option. Returns an empty string if no
help was defined."""
return self.partialOpt.keywords.get("help", "")

def __call__(self, *args, **kwargs):
return self.partialOpt(*args, **kwargs)
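This property enables the help-composition pattern used by query_dimension_records above. A minimal sketch, assuming the option decorators exported from the cli.opt package; the command itself is hypothetical:

    from lsst.daf.butler.cli.opt import collections_option

    # Append command-specific detail to the option's base help text;
    # MWOptionDecorator.help returns "" when no help was defined.
    @collections_option(help=collections_option.help + " Only used with --datasets.")
    def my_command(**kwargs):  # hypothetical click command body
        pass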

3 changes: 2 additions & 1 deletion python/lsst/daf/butler/script/__init__.py
@@ -20,6 +20,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from .butlerImport import butlerImport
from .certifyCalibrations import certifyCalibrations
from .createRepo import createRepo
from .configDump import configDump
from .configValidate import configValidate
@@ -28,5 +29,5 @@
from .queryDataIds import queryDataIds
from .queryDatasets import queryDatasets
from .queryDatasetTypes import queryDatasetTypes
from .queryDimensionRecords import queryDimensionRecords
from .removeDatasetType import removeDatasetType
from .certifyCalibrations import certifyCalibrations
16 changes: 10 additions & 6 deletions python/lsst/daf/butler/script/queryDatasetTypes.py
@@ -22,6 +22,9 @@

from typing import Any, List

from astropy.table import Table
from numpy import array

from .. import Butler
from ..core.utils import globToRegex

@@ -63,10 +66,11 @@ def queryDatasetTypes(repo, verbose, glob, components):
    datasetTypes = butler.registry.queryDatasetTypes(components=components, **kwargs)
    info: List[Any]
    if verbose:
        info = [dict(name=datasetType.name,
                     dimensions=list(datasetType.dimensions.names),
                     storageClass=datasetType.storageClass.name)
                for datasetType in datasetTypes]
        table = Table(array([(d.name, str(list(d.dimensions.names)) or "None", d.storageClass.name)
                             for d in datasetTypes]),
                      names=("name", "dimensions", "storage class"))
    else:
        info = [datasetType.name for datasetType in datasetTypes]
    return {"datasetTypes": info}
        rows = ([d.name for d in datasetTypes],)
        table = Table(rows, names=("name",))
    table.sort("name")
    return table
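For reference, a self-contained sketch of the verbose table construction above, with illustrative values standing in for the rows that registry.queryDatasetTypes provides:

    from astropy.table import Table
    from numpy import array

    # Each tuple is one (name, dimensions, storage class) row; array() gives
    # Table a homogeneous 2-D string array, one column per tuple position.
    rows = array([("test", "['instrument', 'visit']", "testDatasetType")])
    table = Table(rows, names=("name", "dimensions", "storage class"))
    table.sort("name")
    table.pprint_all()  # unlike pprint(), prints every row and column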
51 changes: 51 additions & 0 deletions python/lsst/daf/butler/script/queryDimensionRecords.py
@@ -0,0 +1,51 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from astropy.table import Table

from .. import Butler
from ..core.utils import globToRegex


def queryDimensionRecords(repo, element, datasets, collections, where, no_check):
    # Docstring for supported parameters is the same as
    # Registry.queryDimensionRecords, except for ``no_check``, which is the
    # inverse of ``check``.

    if collections:
        collections = globToRegex(collections)
    else:
        collections = ...

    butler = Butler(repo)

    records = list(butler.registry.queryDimensionRecords(element,
                                                         datasets=datasets,
                                                         collections=collections,
                                                         where=where,
                                                         check=not no_check))
    if not records:
        return None

    records.sort(key=lambda r: r.dataId)  # use the dataId to sort the rows
    keys = records[0].fields.names  # order the columns the same as the record's ``fields.names``

    return Table([[getattr(record, key, None) for record in records] for key in keys], names=keys)
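A minimal sketch of the record-to-table conversion above, using a namedtuple as a hypothetical stand-in for the registry's dimension records (real records expose column names via fields.names and are sorted on their dataId):

    from collections import namedtuple
    from astropy.table import Table

    # Toy stand-in for a dimension record.
    Visit = namedtuple("Visit", ("instrument", "id", "name"))
    records = [Visit("HSC", 2, "second"), Visit("HSC", 1, "first")]
    records.sort(key=lambda r: (r.instrument, r.id))  # stand-in for sorting on dataId
    keys = records[0]._fields  # stand-in for records[0].fields.names
    table = Table([[getattr(r, k) for r in records] for k in keys], names=keys)
    table.pprint_all()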
1 change: 1 addition & 0 deletions python/lsst/daf/butler/tests/utils.py
@@ -87,6 +87,7 @@ def readTable(textTable):
"""
return AstropyTable.read(textTable,
format="ascii",
data_start=2, # skip the header row and the header row underlines.
                             fill_values=[("", 0, "")])

Review comment (Member): I'm a bit surprised that Astropy doesn't have a parser for the default pretty-print table output. Part of me is wondering whether we should add an output format option to all the tabular output subcommands so that users can easily parse the output if they want. That would also let you specify csv format in tests, for example.

Reply (Contributor Author): Formatter options are not a bad idea. Another format might be an option to choose between Table and yaml, if there's ever going to be a case where a user may pipe the output of one command to the input of another? How to do naming/data structure with yaml would probably take a little thought, though, or maybe you've got good ideas already.

In this case, AstropyTable.read normally does the Right Thing, but I ran into a situation where it was getting confused. The table had one column and one row, something like

    name
    ----
    foo

and it was reading the ---- as a value row. I spent a little time trying to figure out why but didn't really get anywhere. It seemed OK to have the unit test function start reading at index 2 (the third row) since we're consistent with table output format (until we decide not to be... then we have to come up with a better or more elaborate fix).


28 changes: 14 additions & 14 deletions tests/test_cliCmdQueryDatasetTypes.py
@@ -22,14 +22,16 @@
"""Unit tests for daf_butler CLI query-collections command.
"""

from astropy.table import Table as AstropyTable
from numpy import array
import unittest
import yaml

from lsst.daf.butler import Butler, DatasetType, StorageClass
from lsst.daf.butler.cli.butler import cli
from lsst.daf.butler.cli.cmd import query_dataset_types
from lsst.daf.butler.cli.utils import clickResultMsg, LogCliRunner
from lsst.daf.butler.tests import CliCmdTestBase
from lsst.daf.butler.tests.utils import ButlerTestHelper, readTable


class QueryDatasetTypesCmdTest(CliCmdTestBase, unittest.TestCase):
@@ -63,15 +65,15 @@ def test_all(self):
self.makeExpected(repo="here", verbose=True, glob=("foo*", ), components=False))


class QueryDatasetTypesScriptTest(unittest.TestCase):
class QueryDatasetTypesScriptTest(ButlerTestHelper, unittest.TestCase):

    def testQueryDatasetTypes(self):
        self.maxDiff = None
        datasetName = "test"
        instrumentDimension = "instrument"
        visitDimension = "visit"
        storageClassName = "testDatasetType"
        expectedNotVerbose = {"datasetTypes": [datasetName]}
        expectedNotVerbose = AstropyTable((("test",),), names=("name",))
        runner = LogCliRunner()
        with runner.isolated_filesystem():
            butlerCfg = Butler.makeRepo("here")
@@ -84,22 +86,20 @@ def testQueryDatasetTypes(self):
            # check not-verbose output:
            result = runner.invoke(cli, ["query-dataset-types", "here"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            self.assertEqual(expectedNotVerbose, yaml.safe_load(result.output))
            self.assertAstropyTablesEqual(readTable(result.output), expectedNotVerbose)
            # check glob output:
            result = runner.invoke(cli, ["query-dataset-types", "here", "t*"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            self.assertEqual(expectedNotVerbose, yaml.safe_load(result.output))
            self.assertAstropyTablesEqual(readTable(result.output), expectedNotVerbose)
            # check verbose output:
            result = runner.invoke(cli, ["query-dataset-types", "here", "--verbose"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            response = yaml.safe_load(result.output)
            # output dimension names contain all required dimensions, more than
            # the registered dimensions, so verify the expected components
            # individually.
            self.assertEqual(response["datasetTypes"][0]["name"], datasetName)
            self.assertEqual(response["datasetTypes"][0]["storageClass"], storageClassName)
            self.assertIn(instrumentDimension, response["datasetTypes"][0]["dimensions"])
            self.assertIn(visitDimension, response["datasetTypes"][0]["dimensions"])
            expected = AstropyTable(array((
                "test",
                "['band', 'instrument', 'physical_filter', 'visit_system', 'visit']",
                "testDatasetType")),
                names=("name", "dimensions", "storage class"))
            self.assertAstropyTablesEqual(readTable(result.output), expected)

            # Now remove and check that it was removed
            # First a non-existent one
            # and check that it has gone
            result = runner.invoke(cli, ["query-dataset-types", "here"])
            self.assertEqual(result.exit_code, 0, clickResultMsg(result))
            self.assertEqual({"datasetTypes": []}, yaml.safe_load(result.output))
            self.assertIn("No results", result.output)


if __name__ == "__main__":