Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-41062: Add topic finding to efdUtils #64

Merged
merged 4 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
50 changes: 49 additions & 1 deletion python/lsst/summit/utils/efdUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import datetime
import logging
import pandas as pd
import re
from deprecated.sphinx import deprecated

from .utils import getSite

Expand All @@ -46,7 +48,8 @@
'getDayObsStartTime',
'getDayObsEndTime',
'getDayObsForTime',
'getSubTopics',
'getSubTopics', # deprecated, being removed in w_2023_50
'getTopics',
]


Expand Down Expand Up @@ -530,6 +533,12 @@ def getDayObsForTime(time):
return int((time + offset).utc.isot[:10].replace('-', ''))


@deprecated(
reason="getSubTopics() has been replaced by getTopics() and using wildcards. "
"Will be removed after w_2023_50.",
version="w_2023_40",
category=FutureWarning,
)
def getSubTopics(client, topic):
"""Get all the sub topics within a given topic.

Expand All @@ -553,3 +562,42 @@ def getSubTopics(client, topic):
loop = asyncio.get_event_loop()
topics = loop.run_until_complete(client.get_topics())
return sorted([t for t in topics if t.startswith(topic)])

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that we have this new findTopic function, do you still want the getSubTopics function above?

If yes, would you consider changing their names? From the names alone, it's unclear which function is for what. It might be even better to have just one function that takes a regular expression.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I do think that we still need both - here is how I imagine using them:

image

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And I thought that the naming made the usage pretty clear, but maybe I am wrong. I don't like the idea of the regex, mainly because I wrote these to be easy, and as a self-declared regex-phobe, not only would I not be confident in how to best write that, I wouldn't even know how to use my own tools! 😄

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that, what do you think is best here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naively I would be tempted to use a utility this way:

getTopics(client, "*airflow*")  # for finding whatever containing the airflow

getTopics(client, "lsst.sal.ESS.*") # to see what lives inside ESS 

But I'm not a user of this and won't be in the short term, so I'd leave the judgement to you.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Actually, that wasn't really a regexp... but you get my idea about wildcarding.)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those exact examples now work 🙂

image


def getTopics(client, toFind, caseSensitive=False):
    """Return all the topics which match the topic query string.

    Supports wildcards, which are denoted as ``*``. Every other character in
    the query is matched literally, so the ``.`` separators in topic names
    only match a real ``.`` and characters such as ``[`` or ``(`` are safe
    to use.

    The query is anchored at the start of the topic name only, so a query
    without a trailing wildcard also matches any topic which merely starts
    with it. Use a leading ``*`` to match anywhere in the topic name.

    Example:
    >>> # assume topics are ['apple', 'banana', 'grape']
    >>> getTopics(client, '*ap*')
    ['apple', 'grape']

    Parameters
    ----------
    client : `lsst_efd_client.efd_helper.EfdClient`
        The EFD client to use.
    toFind : `str`
        The query string, with optional wildcards denoted as ``*``.
    caseSensitive : `bool`, optional
        If ``True``, the query is case sensitive. Defaults to ``False``.

    Returns
    -------
    matches : `list` of `str`
        The list of matching topics, in the order in which the client
        returned them.
    """
    loop = asyncio.get_event_loop()
    topics = loop.run_until_complete(client.get_topics())

    # Escape each literal chunk of the query and re-join the chunks with the
    # regex equivalent of the wildcard, so that ``*`` is the only character
    # with a special meaning.
    pattern = '.*'.join(re.escape(part) for part in toFind.split('*'))
    flags = 0 if caseSensitive else re.IGNORECASE

    # Compile once rather than re-parsing the pattern for every topic.
    regex = re.compile(pattern, flags)
    return [topic for topic in topics if regex.match(topic)]
11 changes: 6 additions & 5 deletions tests/test_butlerUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
)
from lsst.summit.utils.butlerUtils import removeDataProduct # noqa: F401
import lsst.daf.butler as dafButler
from lsst.daf.butler import DatasetRef
from lsst.resources import ResourcePath


Expand Down Expand Up @@ -110,7 +111,7 @@ def setUp(self):
self.assertIsInstance(self.dataCoordMinimal, dafButler.dimensions.DataCoordinate)
# NB the type check below is currently using a non-public API, but
# at present there isn't a good alternative
viewType = dafButler.core.dimensions._coordinate._DataCoordinateFullView
viewType = dafButler.dimensions._coordinate._DataCoordinateFullView
Copy link
Contributor

@TallJimbo TallJimbo Oct 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to drop this assertIsInstance check or replace it with a check on something that is public, like lsst.daf.butler.NamedKeyMapping. Leaving this dependency on a private symbol is just asking for your code to break unexpectedly in the future, and it's just a test, not a production use case.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, thanks.

self.assertIsInstance(self.dataCoordFullView, viewType)

def test_getLatissDefaultCollections(self):
Expand Down Expand Up @@ -178,14 +179,14 @@ def test_getMostRecentDataId(self):

def test_getDatasetRefForDataId(self):
dRef = getDatasetRefForDataId(self.butler, 'raw', self.rawDataId)
self.assertIsInstance(dRef, lsst.daf.butler.core.datasets.ref.DatasetRef)
self.assertIsInstance(dRef, DatasetRef)

dRef = getDatasetRefForDataId(self.butler, 'raw', self.rawDataIdNoDayObSeqNum)
self.assertIsInstance(dRef, lsst.daf.butler.core.datasets.ref.DatasetRef)
self.assertIsInstance(dRef, DatasetRef)
dRef = getDatasetRefForDataId(self.butler, 'raw', self.dataCoordMinimal)
self.assertIsInstance(dRef, lsst.daf.butler.core.datasets.ref.DatasetRef)
self.assertIsInstance(dRef, DatasetRef)
dRef = getDatasetRefForDataId(self.butler, 'raw', self.dataCoordFullView)
self.assertIsInstance(dRef, lsst.daf.butler.core.datasets.ref.DatasetRef)
self.assertIsInstance(dRef, DatasetRef)

def test__dayobs_present(self):
goods = [{'day_obs': 123}, {'exposure.day_obs': 234}, {'day_obs': 345, 'otherkey': -1}]
Expand Down
28 changes: 19 additions & 9 deletions tests/test_efdUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
getDayObsStartTime,
getDayObsEndTime,
getDayObsForTime,
getSubTopics,
getTopics,
)

from utils import getVcr
Expand Down Expand Up @@ -118,14 +118,24 @@ def test_getDayObsAsTimes(self):
self.assertEqual(dayEnd.jd, dayStart.jd + 1)

@vcr.use_cassette()
def test_getSubTopics(self):
subTopics = getSubTopics(self.client, 'lsst.sal.MTMount')
self.assertIsInstance(subTopics, list)
self.assertGreater(len(subTopics), 0)

subTopics = getSubTopics(self.client, 'fake.topics.does.not.exist')
self.assertIsInstance(subTopics, list)
self.assertEqual(len(subTopics), 0)
def test_getTopics(self):
    """Exercise getTopics() with trailing, surrounding and mixed-case
    wildcard queries.
    """
    # a trailing wildcard should find at least one MTMount topic
    mountTopics = getTopics(self.client, 'lsst.sal.MTMount*')
    self.assertIsInstance(mountTopics, list)
    self.assertGreater(len(mountTopics), 0)

    # a query which matches nothing still returns a list, just an empty one
    noTopics = getTopics(self.client, '*fake.topics.does.not.exist*')
    self.assertIsInstance(noTopics, list)
    self.assertEqual(len(noTopics), 0)

    # check we can find the mount with a preceding wildcard, and that the
    # default matching ignores case
    mixedCaseTopics = getTopics(self.client, '*mTmoUnt*')
    self.assertIsInstance(mixedCaseTopics, list)
    self.assertGreater(len(mixedCaseTopics), 0)

    # check it fails if we don't allow case insensitivity
    strictCaseTopics = getTopics(self.client, '*mTmoUnt*', caseSensitive=True)
    self.assertIsInstance(strictCaseTopics, list)
    self.assertEqual(len(strictCaseTopics), 0)

@vcr.use_cassette()
def test_getEfdData(self):
Expand Down