Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-28674: Allow # to mean fragment in schemeless URIs #469

Merged
merged 2 commits into from
Feb 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions python/lsst/daf/butler/core/_butlerUri/_butlerUri.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
# Regex for looking for URI escapes
ESCAPES_RE = re.compile(r"%[A-F0-9]{2}")

# Precomputed escaped hash
ESCAPED_HASH = urllib.parse.quote("#")


class ButlerURI:
"""Convenience wrapper around URI parsers.
Expand Down Expand Up @@ -144,6 +147,14 @@ def __new__(cls, uri: Union[str, urllib.parse.ParseResult, ButlerURI, Path],
log.warning("Possible double encoding of %s", uri)
else:
uri = urllib.parse.quote(uri)
# Special case hash since we must support fragments
# even in schemeless URIs -- although try to only replace
# them in file part and not directory part
if ESCAPED_HASH in uri:
dirpos = uri.rfind("/")
# Do replacement after this /
uri = uri[:dirpos+1] + uri[dirpos+1:].replace(ESCAPED_HASH, "#")

parsed = urllib.parse.urlparse(uri)
elif isinstance(uri, urllib.parse.ParseResult):
parsed = copy.copy(uri)
Expand Down
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/core/_butlerUri/schemeless.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,9 @@ def _fixupPathUri(parsed: urllib.parse.ParseResult, root: Optional[Union[str, Bu
# ParseResult is a NamedTuple so _replace is standard API
parsed = parsed._replace(**replacements)

if parsed.params or parsed.fragment or parsed.query:
# We do allow fragment but do not expect params or query to be
# specified for schemeless
if parsed.params or parsed.query:
log.warning("Additional items unexpectedly encountered in schemeless URI: %s", parsed.geturl())

return parsed, dirLike
7 changes: 5 additions & 2 deletions python/lsst/daf/butler/core/fileTemplates.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,9 +531,12 @@ def format(self, ref: DatasetRef) -> str:
output = output + literal + format(value, format_spec)

# Replace periods with underscores in the non-directory part to
# prevent file extension confusion.
# prevent file extension confusion. Also replace # in the non-dir
# part to avoid confusion with URI fragments
head, tail = os.path.split(output)
output = os.path.join(head, tail.replace(".", "_"))
tail = tail.replace(".", "_")
tail = tail.replace("#", "HASH")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably dont care about speed here (at the number of times this will be called its negligible), and what is more readable probably depends on who reads it, but FWIW, tail.replace(".", "_").replace("#", "HASH") is a bit faster (I personally find it easier to grok, but others might not). This is more a side comment and not an explicit request to change.

output = os.path.join(head, tail)

# Complain if we were meant to use a component
if component is not None and not usedComponent:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ def testQuotedRoot(self):

for pathInStore in ("relative/path/file.ext.gz",
"relative/path+2/file.ext.gz",
"relative/path+3/file#.ext.gz"):
"relative/path+3/file&.ext.gz"):
loc1 = factory.fromPath(pathInStore)

self.assertEqual(loc1.pathInStore.path, pathInStore)
Expand Down
7 changes: 7 additions & 0 deletions tests/test_uri.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,13 @@ def testEscapes(self):
uri = ButlerURI(urllib.parse.quote(plus_path), forceDirectory=True)
self.assertEqual(uri.ospath, plus_path)

# Check that # is not escaped for schemeless URIs
hash_path = "/a/b#/c&d#xyz"
hpos = hash_path.rfind("#")
uri = ButlerURI(hash_path)
self.assertEqual(uri.ospath, hash_path[:hpos])
self.assertEqual(uri.fragment, hash_path[hpos + 1:])


@unittest.skipIf(not boto3, "Warning: boto3 AWS SDK not found!")
@mock_s3
Expand Down