Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-42306: Performance optimizations #76

Merged
merged 28 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9a0c337
Now that schemeless is never absolute, return False every time for isabs
timj Jan 11, 2024
d75f51a
No longer call isdir() on schemeless URI
timj Jan 11, 2024
8f708ef
Modify how dirLike is calculated
timj Jan 11, 2024
4291c61
Simplify relativeToPathRoot
timj Jan 11, 2024
525cdff
forceDirectory=False now means that it is definitely a file-like URI
timj Jan 12, 2024
11cb4a1
For updatedFile check if it is a directory directly
timj Jan 12, 2024
e6fc028
Speed up getExtension
timj Jan 12, 2024
385cfd3
Use str.removesuffix
timj Jan 12, 2024
5d79fb0
More simplifications and forceDirectory cleanups
timj Jan 12, 2024
d789db1
Use own getExtension implementation rather than pathlib
timj Jan 12, 2024
c5fa1c8
Must use isdir() not dirLike when checking
timj Jan 12, 2024
8bc00da
Fix forceDirectory type annotations
timj Jan 16, 2024
e433233
Fix some bad quoting in sphinx docs
timj Jan 16, 2024
ef211ea
Add news fragments
timj Jan 16, 2024
8af7208
Remove file system check from package resource constructor
timj Jan 16, 2024
c6a30a4
Now raise if forceDirectory=False and URI path ends with /
timj Jan 16, 2024
88ddfdd
Remove posixpath.isdir call on file:// constructor
timj Jan 16, 2024
c22a44d
Fix schemeless.isdir
timj Jan 16, 2024
08c8cd9
Add test specifically for schemeless isdir()
timj Jan 17, 2024
69012e3
Simplify uri.join()
timj Jan 19, 2024
5bb3c4d
Simplify Schemeless relative_to now that it can never be absolute URI
timj Jan 19, 2024
d5df587
When copying allow forceDirectory to override
timj Jan 19, 2024
4ea37fe
Clean up parent to use dirLike
timj Jan 19, 2024
fc7646a
Change updatedFile to use dirLike and .parent
timj Jan 22, 2024
6a639cc
Change getExtension to also work on directories
timj Jan 22, 2024
58dcf84
Remove code checking for abspath schemeless
timj Jan 22, 2024
d2c5172
Improve test coverage
timj Jan 22, 2024
1d37903
Add extra news fragment
timj Jan 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/changes/DM-42306.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
The ``forceDirectory`` flag now has three states.
As before `True` indicates that it is known that the URI refers to a directory-like entity.
Now `False` indicates that it is known that the URI refers to a file-like entity.
`None` is the new default and that indicates that the caller does not know and that the status should be inferred either by checking the file system or looking for a trailing ``/``.
It is now an error to create a ``ResourcePath`` which has a trailing ``/`` but with ``forceDirectory=False``.
1 change: 1 addition & 0 deletions doc/changes/DM-42306.misc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* ``getExtension()`` now works for directories.
3 changes: 3 additions & 0 deletions doc/changes/DM-42306.perf.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
* Schemeless URIs no longer check the file system on construction.
* Both `getExtension` and `relativeToPathRoot` have been rewritten to no longer use `pathlib`.
* It is now possible to declare that a URI is file-like on construction. Use `forceDirectory=False`.
244 changes: 147 additions & 97 deletions python/lsst/resources/_resourcePath.py

Large diffs are not rendered by default.

29 changes: 16 additions & 13 deletions python/lsst/resources/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,10 @@ def isdir(self) -> bool:
`True` if this URI is a directory or looks like a directory,
else `False`.
"""
return self.dirLike or os.path.isdir(self.ospath)
if self.dirLike is None:
# Cache state for next time.
self.dirLike = os.path.isdir(self.ospath)
Copy link
Member Author

@timj timj Jan 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ResourcePath is nominally immutable but this cache can be updated from None to bool. This will presumably need locking around it and in packageresource and schemeless implementations.

I am also wondering if this and isTemporary and isLocal should be made read-only getters with _dirLike etc used internally.

return self.dirLike

def transfer_from(
self,
Expand Down Expand Up @@ -381,8 +384,8 @@ def _fixupPathUri(
parsed: urllib.parse.ParseResult,
root: ResourcePath | None = None,
forceAbsolute: bool = False,
forceDirectory: bool = False,
) -> tuple[urllib.parse.ParseResult, bool]:
forceDirectory: bool | None = None,
) -> tuple[urllib.parse.ParseResult, bool | None]:
"""Fix up relative paths in URI instances.

Parameters
Expand All @@ -404,9 +407,10 @@ def _fixupPathUri(
-------
modified : `~urllib.parse.ParseResult`
Update result if a URI is being handled.
dirLike : `bool`
dirLike : `bool` or `None`
`True` if given parsed URI has a trailing separator or
forceDirectory is True. Otherwise `False`.
``forceDirectory`` is `True`. Otherwise can return the given
value of ``forceDirectory``.

Notes
-----
Expand All @@ -416,21 +420,20 @@ def _fixupPathUri(
always done regardless of the ``forceAbsolute`` parameter.
"""
# assume we are not dealing with a directory like URI
dirLike = False
dirLike = forceDirectory

# file URI implies POSIX path separators so split as POSIX,
# then join as os, and convert to abspath. Do not handle
# home directories since "file" scheme is explicitly documented
# to not do tilde expansion.
sep = posixpath.sep

# For local file system we can explicitly check to see if this
# really is a directory. The URI might point to a location that
# does not exists yet but all that matters is if it is a directory
# then we make sure use that fact. No need to do the check if
# we are already being told.
if not forceDirectory and posixpath.isdir(parsed.path):
forceDirectory = True
# Consistency check.
if forceDirectory is False and parsed.path.endswith(sep):
raise ValueError(
f"URI {parsed.geturl()} ends with {sep} but "
"forceDirectory parameter declares it to be a file."
)

# For an absolute path all we need to do is check if we need
# to force the directory separator
Expand Down
3 changes: 2 additions & 1 deletion python/lsst/resources/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,14 @@ class Location:
path separator if a ``file`` scheme is being used for the URI,
else a POSIX separator. Can be a full URI if the root URI is `None`.
Can also be a schemeless URI if it refers to a relative path.
This path is required to be a file name and not a directory.
"""

__slots__ = ("_datastoreRootUri", "_path", "_uri")

def __init__(self, datastoreRootUri: None | ResourcePath | str, path: ResourcePath | str):
# Be careful not to force a relative local path to absolute path
path_uri = ResourcePath(path, forceAbsolute=False)
path_uri = ResourcePath(path, forceAbsolute=False, forceDirectory=False)

if isinstance(datastoreRootUri, str):
datastoreRootUri = ResourcePath(datastoreRootUri, forceDirectory=True)
Expand Down
40 changes: 10 additions & 30 deletions python/lsst/resources/packageresource.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,10 @@
from importlib import resources # type: ignore[no-redef]

from collections.abc import Iterator
from typing import TYPE_CHECKING

from ._resourceHandles._baseResourceHandle import ResourceHandleProtocol
from ._resourcePath import ResourcePath

if TYPE_CHECKING:
import urllib.parse

log = logging.getLogger(__name__)


Expand All @@ -47,25 +43,6 @@ class PackageResourcePath(ResourcePath):
resource name.
"""

@classmethod
def _fixDirectorySep(
cls, parsed: urllib.parse.ParseResult, forceDirectory: bool = False
) -> tuple[urllib.parse.ParseResult, bool]:
"""Ensure that a path separator is present on directory paths."""
parsed, dirLike = super()._fixDirectorySep(parsed, forceDirectory=forceDirectory)
if not dirLike:
try:
# If the resource location does not exist this can
# fail immediately. It is possible we are doing path
# manipulation and not wanting to read the resource now,
# so catch the error and move on.
ref = resources.files(parsed.netloc).joinpath(parsed.path.lstrip("/"))
except ModuleNotFoundError:
pass
else:
dirLike = ref.is_dir()
return parsed, dirLike

def _get_ref(self) -> resources.abc.Traversable | None:
"""Obtain the object representing the resource.

Expand All @@ -76,20 +53,23 @@ def _get_ref(self) -> resources.abc.Traversable | None:
associated with the resources is not accessible. This can happen
if Python can't import the Python package defining the resource.
"""
# Need the path without the leading /.
path = self.path.lstrip("/")
try:
ref = resources.files(self.netloc).joinpath(self.relativeToPathRoot)
ref = resources.files(self.netloc).joinpath(path)
except ModuleNotFoundError:
return None
return ref

def isdir(self) -> bool:
"""Return True if this URI is a directory, else False."""
if self.dirLike: # Always bypass if we guessed the resource is a directory.
return True
ref = self._get_ref()
if ref is None:
return False # Does not seem to exist so assume not a directory.
return ref.is_dir()
if self.dirLike is None:
ref = self._get_ref()
if ref is not None:
self.dirLike = ref.is_dir()
else:
return False
return self.dirLike

def exists(self) -> bool:
"""Check that the python resource exists."""
Expand Down
78 changes: 51 additions & 27 deletions python/lsst/resources/schemeless.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import logging
import os
import os.path
import stat
import urllib.parse
from pathlib import PurePath

Expand Down Expand Up @@ -46,9 +47,10 @@ def isabs(self) -> bool:
Returns
-------
isabs : `bool`
`True` if the file is absolute, `False` otherwise.
`True` if the file is absolute, `False` otherwise. Will always
be `False` for schemeless URIs.
"""
return os.path.isabs(self.ospath)
return False
timj marked this conversation as resolved.
Show resolved Hide resolved

def abspath(self) -> ResourcePath:
"""Force a schemeless URI to a file URI.
Expand All @@ -70,9 +72,39 @@ def abspath(self) -> ResourcePath:
# the options that will force the code below in _fixupPathUri to
# return a file URI from a scheme-less one.
return ResourcePath(
str(self), forceAbsolute=True, forceDirectory=self.isdir(), isTemporary=self.isTemporary
str(self), forceAbsolute=True, forceDirectory=self.dirLike, isTemporary=self.isTemporary
)

def isdir(self) -> bool:
"""Return whether this URI is a directory.

Returns
-------
isdir : `bool`
`True` if this URI is a directory or looks like a directory,
else `False`.

Notes
-----
If the URI is not known to refer to a file or a directory the file
system will be checked. The relative path will be resolved using
the current working directory. If the path can not be found, `False`
will be returned (matching `os.path.isdir` semantics) but the result
will not be stored in ``dirLike`` and will be checked again on request
in case the working directory has been updated.
"""
if self.dirLike is None:
try:
status = os.stat(self.ospath)
except FileNotFoundError:
# Do not update dirLike flag.
return False

# Do not cache. We do not know if this really refers to a file or
# not and changing directory might change the answer.
return stat.S_ISDIR(status.st_mode)
return self.dirLike

def relative_to(self, other: ResourcePath) -> str | None:
"""Return the relative path from this URI to the other URI.

Expand Down Expand Up @@ -102,24 +134,14 @@ def relative_to(self, other: ResourcePath) -> str | None:
# to convert from scheme-less to absolute URI.
child = None

if not self.isabs() and not other.isabs():
if not other.isabs():
# Both are schemeless relative. Use parent implementation
# rather than trying to convert both to file: first since schemes
# match.
pass
elif not self.isabs() and other.isabs():
elif other.isabs():
# Append child to other. This can account for .. in child path.
child = other.join(self.path)
elif self.isabs() and not other.isabs():
# Finding common paths is not possible if the parent is
# relative and the child is absolute.
return None
elif self.isabs() and other.isabs():
# Both are absolute so convert schemeless to file
# if necessary.
child = self.abspath()
if not other.scheme:
other = other.abspath()
else:
raise RuntimeError(f"Unexpected combination of {child}.relative_to({other}).")

Expand All @@ -133,8 +155,8 @@ def _fixupPathUri(
parsed: urllib.parse.ParseResult,
root: ResourcePath | None = None,
forceAbsolute: bool = False,
forceDirectory: bool = False,
) -> tuple[urllib.parse.ParseResult, bool]:
forceDirectory: bool | None = None,
) -> tuple[urllib.parse.ParseResult, bool | None]:
"""Fix up relative paths for local file system.

Parameters
Expand All @@ -153,7 +175,9 @@ def _fixupPathUri(
absolute path.
forceDirectory : `bool`, optional
If `True` forces the URI to end with a separator, otherwise given
URI is interpreted as is.
URI is interpreted as is. `False` can be used to indicate that
the URI is known to correspond to a file. `None` means that the
status is unknown.

Returns
-------
Expand All @@ -174,7 +198,7 @@ def _fixupPathUri(
expanded.
"""
# assume we are not dealing with a directory URI
dirLike = False
dirLike = forceDirectory

# Replacement values for the URI
replacements = {}
Expand Down Expand Up @@ -221,18 +245,18 @@ def _fixupPathUri(
# normpath strips trailing "/" which makes it hard to keep
# track of directory vs file when calling replaceFile

# For local file system we can explicitly check to see if this
# really is a directory. The URI might point to a location that
# does not exists yet but all that matters is if it is a directory
# then we make sure use that fact. No need to do the check if
# we are already being told.
if not forceDirectory and os.path.isdir(replacements["path"]):
forceDirectory = True

# add the trailing separator only if explicitly required or
# if it was stripped by normpath. Acknowledge that trailing
# separator exists.
endsOnSep = expandedPath.endswith(os.sep) and not replacements["path"].endswith(os.sep)

# Consistency check.
if forceDirectory is False and endsOnSep:
raise ValueError(
f"URI {parsed.geturl()} ends with {os.sep} but "
"forceDirectory parameter declares it to be a file."
)

if forceDirectory or endsOnSep or dirLike:
dirLike = True
if not replacements["path"].endswith(os.sep):
Expand Down
39 changes: 36 additions & 3 deletions python/lsst/resources/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,10 +225,18 @@ def test_creation(self) -> None:
with self.assertRaises(RuntimeError):
ResourcePath(self.root_uri, isTemporary=True)

file = self.root_uri.join("file.txt")
file = self.root_uri.join("file.txt", forceDirectory=False)
with self.assertRaises(RuntimeError):
ResourcePath(file, forceDirectory=True)

file = self.root_uri.join("file.txt")
file_as_dir = ResourcePath(file, forceDirectory=True)
self.assertTrue(file_as_dir.isdir())

dir = self._make_uri("a/b/c/")
with self.assertRaises(ValueError):
ResourcePath(dir, forceDirectory=False)

with self.assertRaises(NotImplementedError):
ResourcePath("unknown://netloc")

Expand Down Expand Up @@ -256,6 +264,15 @@ def test_extension(self) -> None:
extension = extensionless.updatedExtension(".fits")
self.assertEqual(extension.getExtension(), ".fits")

uri = ResourcePath("test.txt", forceAbsolute=False)
self.assertEqual(uri.getExtension(), ".txt")
uri = ResourcePath(self._make_uri("dir.1/dir.2/test.txt"), forceDirectory=False)
self.assertEqual(uri.getExtension(), ".txt")
uri = ResourcePath(self._make_uri("dir.1/dir.2/"), forceDirectory=True)
self.assertEqual(uri.getExtension(), ".2")
uri = ResourcePath(self._make_uri("dir.1/dir/"), forceDirectory=True)
self.assertEqual(uri.getExtension(), "")

def test_relative(self) -> None:
"""Check that we can get subpaths back from two URIs."""
parent = ResourcePath(self._make_uri(self.path1), forceDirectory=True)
Expand Down Expand Up @@ -331,6 +348,7 @@ def test_parents(self) -> None:
self.assertEqual(derived_parent, parent)
self.assertTrue(derived_parent.isdir())
self.assertEqual(child_file.parent().parent(), parent)
self.assertEqual(child_subdir.dirname(), child_subdir)

def test_escapes(self) -> None:
"""Special characters in file paths."""
Expand Down Expand Up @@ -431,7 +449,11 @@ def test_join(self) -> None:

other = ResourcePath(f"{self.root}test.txt")
self.assertEqual(root.join(other), other)
self.assertEqual(other.join("b/new.txt").geturl(), f"{self.root}b/new.txt")
self.assertEqual(other.join("b/new.txt").geturl(), f"{self.root}test.txt/b/new.txt")

other = ResourcePath(f"{self.root}text.txt", forceDirectory=False)
with self.assertRaises(ValueError):
other.join("b/new.text")

joined = ResourcePath(f"{self.root}hsc/payload/").join(
ResourcePath("test.qgraph", forceAbsolute=False)
Expand All @@ -442,6 +464,17 @@ def test_join(self) -> None:
joined = ResourcePath(f"{self.root}hsc/payload/").join(qgraph)
self.assertEqual(joined, qgraph)

with self.assertRaises(ValueError):
root.join("dir/", forceDirectory=False)

temp = root.join("dir2/", isTemporary=True)
with self.assertRaises(RuntimeError):
temp.join("test.txt", isTemporary=False)

rel = ResourcePath("new.txt", forceAbsolute=False, forceDirectory=False)
with self.assertRaises(RuntimeError):
root.join(rel, forceDirectory=True)

def test_quoting(self) -> None:
"""Check that quoting works."""
parent = ResourcePath(self._make_uri("rootdir"), forceDirectory=True)
Expand Down Expand Up @@ -483,7 +516,7 @@ def setUp(self) -> None:
if self.scheme == "file":
# Use a local tempdir because on macOS the temp dirs use symlinks
# so relsymlink gets quite confused.
self.tmpdir = ResourcePath(makeTestTempDir(self.testdir))
self.tmpdir = ResourcePath(makeTestTempDir(self.testdir), forceDirectory=True)
else:
# Create random tmp directory relative to the test root.
self.tmpdir = self.root_uri.join(
Expand Down