From 01666ec8c0a18248dd2364e384a1b35b86f11aa2 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Sun, 22 Mar 2026 00:03:24 +0000 Subject: [PATCH 1/4] feat(upath): add UPath support for remote-path-aware type conversion and file hashing --- pyproject.toml | 1 + src/orcapod/contexts/data/v0.1.json | 7 + src/orcapod/hashing/hash_utils.py | 31 ++++- .../semantic_hashing/builtin_handlers.py | 38 ++++++ .../semantic_struct_converters.py | 106 +++++++++------ .../semantic_types/universal_converter.py | 13 ++ .../test_universal_converter.py | 35 +++++ .../test_upath_struct_converter.py | 125 ++++++++++++++++++ uv.lock | 26 +++- 9 files changed, 336 insertions(+), 46 deletions(-) create mode 100644 tests/test_semantic_types/test_upath_struct_converter.py diff --git a/pyproject.toml b/pyproject.toml index 028decef..c2ae6e1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "deltalake>=1.0.2", "graphviz>=0.21", "gitpython>=3.1.45", + "universal-pathlib>=0.3.8", "starfix @ git+https://github.com/nauticalab/starfix-python.git@344617bc6f7fcabab5c011d5774ed47de33f21de", "pygraphviz>=1.14", "tzdata>=2024.1", diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index f75798b5..4a842344 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -12,6 +12,12 @@ "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", "_config": { "converters": { + "upath": { + "_class": "orcapod.semantic_types.semantic_struct_converters.UPathStructConverter", + "_config": { + "file_hasher": {"_ref": "file_hasher"} + } + }, "path": { "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", "_config": { @@ -52,6 +58,7 @@ [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}], [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}], [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathContentHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathContentHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDHandler", "_config": {}}], [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], [{"_type": "types.BuiltinFunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 0addcb77..9c3f4122 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -7,7 +7,7 @@ import xxhash -from orcapod.types import ContentHash +from orcapod.types import ContentHash, PathLike logger = logging.getLogger(__name__) @@ -43,9 +43,26 @@ def combine_hashes( return combined_hash -def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: +def _to_path(file_path: PathLike) -> Path: + """Convert a path-like to a Path, preserving UPath instances. + + If ``file_path`` is already a ``Path`` (or a path-like with ``is_file`` + and ``open`` methods, such as a remote ``UPath``), return it as-is so + that remote-filesystem semantics are retained. Otherwise wrap it in + ``Path()``. + """ + if isinstance(file_path, Path): + return file_path + if hasattr(file_path, "is_file") and hasattr(file_path, "open"): + return file_path # type: ignore[return-value] + return Path(file_path) + + +def hash_file(file_path: PathLike, algorithm="sha256", buffer_size=65536) -> ContentHash: """Calculate the hash of a file using the specified algorithm. + Supports both local ``pathlib.Path`` and remote ``UPath`` objects. + Args: file_path: Path to the file to hash. algorithm: Hash algorithm to use — options include: @@ -56,7 +73,9 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: A ContentHash with method set to the algorithm name and digest containing the raw hash bytes. """ - if not Path(file_path).is_file(): + path = _to_path(file_path) + + if not path.is_file(): raise FileNotFoundError(f"The file {file_path} does not exist") # Hash the path string itself rather than file content @@ -67,7 +86,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: if algorithm == "xxh64": hasher = xxhash.xxh64() - with open(file_path, "rb") as file: + with path.open("rb") as file: while True: data = file.read(buffer_size) if not data: @@ -77,7 +96,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: if algorithm == "crc32": crc = 0 - with open(file_path, "rb") as file: + with path.open("rb") as file: while True: data = file.read(buffer_size) if not data: @@ -96,7 +115,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: f"Invalid algorithm: {algorithm}. Available algorithms: {valid_algorithms}, xxh64, crc32" ) - with open(file_path, "rb") as file: + with path.open("rb") as file: while True: data = file.read(buffer_size) if not data: diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 931d7cc5..829dc59b 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -5,6 +5,7 @@ knows how to process out of the box: - PathContentHandler -- pathlib.Path: returns ContentHash of file content + - UPathContentHandler -- upath.UPath: returns ContentHash of file content (remote-aware) - UUIDHandler -- uuid.UUID: canonical string representation - BytesHandler -- bytes / bytearray: hex string representation - FunctionHandler -- callable with __code__: via FunctionInfoExtractorProtocol @@ -96,6 +97,43 @@ def handle(self, obj: PathLike, hasher: "SemanticHasherProtocol") -> Any: return self.file_hasher.hash_file(path) +class UPathContentHandler: + """ + Handler for universal_pathlib.UPath objects. + + Behaves identically to ``PathContentHandler`` but preserves the UPath + instance so that remote filesystem semantics (e.g. S3, GCS) are retained + during file content hashing. + + Args: + file_hasher: Any object with a ``hash_file(path) -> ContentHash`` + method (satisfies the FileContentHasherProtocol protocol). + """ + + def __init__(self, file_hasher: FileContentHasherProtocol) -> None: + self.file_hasher = file_hasher + + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + from upath import UPath + + path = UPath(obj) if not isinstance(obj, UPath) else obj + + if not path.exists(): + raise FileNotFoundError( + f"UPathContentHandler: path does not exist: {path!r}. " + "Paths must refer to existing files for content-based hashing." + ) + + if path.is_dir(): + raise IsADirectoryError( + f"UPathContentHandler: path is a directory: {path!r}. " + "Only regular files are supported for content-based hashing." + ) + + logger.debug("UPathContentHandler: hashing file content at %s", path) + return self.file_hasher.hash_file(path) + + class UUIDHandler: """ Handler for uuid.UUID objects. diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index 63d1e236..86b9f864 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -5,9 +5,12 @@ making semantic types visible in schemas and preserved through operations. """ +from abc import ABC, abstractmethod from pathlib import Path from typing import TYPE_CHECKING, Any +from upath import UPath + from orcapod.types import ContentHash from orcapod.utils.lazy_module import LazyModule @@ -74,68 +77,73 @@ def _compute_content_hash(self, content: bytes) -> ContentHash: return ContentHash(method=f"{self.semantic_type_name}:sha256", digest=digest) -# Path-specific implementation -class PathStructConverter(SemanticStructConverterBase): - """Converter for pathlib.Path objects to/from semantic structs of form { path: "/value/of/path"}""" +class FilePathStructConverterBase(SemanticStructConverterBase, ABC): + """Base converter for file path types (Path and UPath). - def __init__(self, file_hasher: "FileContentHasherProtocol"): - super().__init__("path") - self._python_type = Path - self._file_hasher = file_hasher + Extracts the shared conversion logic since Path and UPath have + identical APIs for the operations we need (str conversion, + construction from string, ``read_bytes``). + """ - # Define the Arrow struct type for paths - self._arrow_struct_type = pa.struct( - [ - pa.field("path", pa.large_string()), - ] - ) + def __init__( + self, + name: str, + path_type: type, + file_hasher: "FileContentHasherProtocol", + ): + super().__init__(name) + self._python_type = path_type + self._field_name = name + self._file_hasher = file_hasher + self._arrow_struct_type = pa.struct([ + pa.field(name, pa.large_string()), + ]) @property def python_type(self) -> type: return self._python_type @property - def arrow_struct_type(self) -> pa.StructType: + def arrow_struct_type(self) -> "pa.StructType": return self._arrow_struct_type - def python_to_struct_dict(self, value: Path) -> dict[str, Any]: - """Convert Path to struct dictionary.""" - if not isinstance(value, Path): - raise TypeError(f"Expected Path, got {type(value)}") + @abstractmethod + def _make_path(self, path_str: str) -> Any: + """Construct the appropriate path object from a string.""" + ... - return { - "path": str(value), - } + def python_to_struct_dict(self, value: Any) -> dict[str, Any]: + """Convert path object to struct dictionary.""" + if not isinstance(value, self._python_type): + raise TypeError(f"Expected {self._python_type.__name__}, got {type(value)}") + return {self._field_name: str(value)} - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Path: - """Convert struct dictionary back to Path.""" - path_str = struct_dict.get("path") + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: + """Convert struct dictionary back to path object.""" + path_str = struct_dict.get(self._field_name) if path_str is None: - raise ValueError("Missing 'path' field in struct") - - return Path(path_str) + raise ValueError(f"Missing '{self._field_name}' field in struct") + return self._make_path(path_str) def can_handle_python_type(self, python_type: type) -> bool: """Check if this converter can handle the given Python type.""" - return issubclass(python_type, Path) + return issubclass(python_type, self._python_type) - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: """Check if this converter can handle the given struct type.""" - # Check if struct has the expected fields for field in self._arrow_struct_type: if ( field.name not in struct_type.names or struct_type[field.name].type != field.type ): return False - return True def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: """Check if a struct dictionary represents this semantic type.""" - # TODO: infer this check based on identified struct type as defined in the __init__ - return set(struct_dict.keys()) == {"path"} and isinstance( - struct_dict["path"], str + return ( + set(struct_dict.keys()) == {self._field_name} + and isinstance(struct_dict[self._field_name], str) ) def hash_struct_dict( @@ -144,8 +152,8 @@ def hash_struct_dict( """Compute hash of a path semantic type by hashing the file content. Args: - struct_dict: Dict with a "path" key containing a file path string. - add_prefix: If True, prefix with "path:sha256:...". + struct_dict: Dict with the path field containing a file path string. + add_prefix: If True, prefix with semantic type and algorithm info. Returns: Hash string of the file content. @@ -154,11 +162,11 @@ def hash_struct_dict( FileNotFoundError: If the path does not exist. IsADirectoryError: If the path is a directory. """ - path_str = struct_dict.get("path") + path_str = struct_dict.get(self._field_name) if path_str is None: - raise ValueError("Missing 'path' field in struct dict") + raise ValueError(f"Missing '{self._field_name}' field in struct dict") - path = Path(path_str) + path = self._make_path(path_str) if not path.exists(): raise FileNotFoundError(f"Path does not exist: {path}") if path.is_dir(): @@ -166,3 +174,23 @@ def hash_struct_dict( content_hash = self._file_hasher.hash_file(path) return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) + + +class PathStructConverter(FilePathStructConverterBase): + """Converter for pathlib.Path objects to/from semantic structs.""" + + def __init__(self, file_hasher: "FileContentHasherProtocol"): + super().__init__("path", Path, file_hasher) + + def _make_path(self, path_str: str) -> Path: + return Path(path_str) + + +class UPathStructConverter(FilePathStructConverterBase): + """Converter for universal_pathlib.UPath objects to/from semantic structs.""" + + def __init__(self, file_hasher: "FileContentHasherProtocol"): + super().__init__("upath", UPath, file_hasher) + + def _make_path(self, path_str: str) -> UPath: + return UPath(path_str) diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index be76e808..809dd5a0 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -660,6 +660,19 @@ def _create_python_to_arrow_converter( f"f{i}": converters[i](item) for i, item in enumerate(value) } + # Handle Optional[T] unions; complex unions (e.g., A | B) are not currently supported + elif origin is typing.Union or origin is types.UnionType: + non_none_types = [t for t in args if t is not type(None)] + if len(non_none_types) == 1: + # Optional[T] - use converter for T, pass through None + inner_converter = self.get_python_to_arrow_converter(non_none_types[0]) + return lambda value: inner_converter(value) if value is not None else None + else: + raise ValueError( + f"Complex unions with multiple non-None types are not supported: {python_type}. " + f"Only Optional[T] (i.e., T | None) is allowed." + ) + else: # Default passthrough return lambda value: value diff --git a/tests/test_semantic_types/test_universal_converter.py b/tests/test_semantic_types/test_universal_converter.py index 38b46eb7..55b85e2f 100644 --- a/tests/test_semantic_types/test_universal_converter.py +++ b/tests/test_semantic_types/test_universal_converter.py @@ -33,6 +33,41 @@ def test_python_type_to_arrow_type_custom(): assert field.type == pa.large_string() +def test_python_type_to_arrow_type_upath(): + from upath import UPath + + arrow_type = universal_converter.python_type_to_arrow_type(UPath) + # Should be a StructType with field 'upath' of type large_string + assert isinstance(arrow_type, pa.StructType) + assert len(arrow_type) == 1 + field = arrow_type[0] + assert field.name == "upath" + assert field.type == pa.large_string() + + +def test_optional_upath_converter(): + """Test that Optional[UPath] correctly converts UPath values.""" + from upath import UPath + + to_arrow, to_python = universal_converter.get_conversion_functions(UPath | None) + + # Test with UPath value + path = UPath("/tmp/test.txt") + result = to_arrow(path) + assert result == {"upath": "/tmp/test.txt"} + + # Test with None + assert to_arrow(None) is None + + +def test_complex_union_raises_error(): + """Test that complex unions (multiple non-None types) raise ValueError.""" + from upath import UPath + + with pytest.raises(ValueError, match="Complex unions"): + universal_converter.get_conversion_functions(UPath | Path) + + def test_python_type_to_arrow_type_context(): ctx = get_default_context() assert universal_converter.python_type_to_arrow_type(int, ctx) == pa.int64() diff --git a/tests/test_semantic_types/test_upath_struct_converter.py b/tests/test_semantic_types/test_upath_struct_converter.py new file mode 100644 index 00000000..42e50ce7 --- /dev/null +++ b/tests/test_semantic_types/test_upath_struct_converter.py @@ -0,0 +1,125 @@ +from pathlib import Path +from typing import cast + +import pytest +from upath import UPath + +from orcapod.hashing.file_hashers import BasicFileHasher +from orcapod.semantic_types.semantic_struct_converters import UPathStructConverter + + +@pytest.fixture +def file_hasher(): + return BasicFileHasher(algorithm="sha256") + + +@pytest.fixture +def converter(file_hasher): + return UPathStructConverter(file_hasher=file_hasher) + + +def test_upath_to_struct_and_back(converter): + path_obj = UPath("/tmp/test.txt") + struct_dict = converter.python_to_struct_dict(path_obj) + assert struct_dict["upath"] == str(path_obj) + restored = converter.struct_dict_to_python(struct_dict) + assert isinstance(restored, UPath) + assert str(restored) == str(path_obj) + + +def test_upath_to_struct_invalid_type(converter): + with pytest.raises(TypeError): + converter.python_to_struct_dict(Path("/tmp/test.txt")) # type: ignore + + +def test_struct_to_python_missing_field(converter): + with pytest.raises(ValueError): + converter.struct_dict_to_python({}) + + +def test_can_handle_python_type(converter): + assert converter.can_handle_python_type(UPath) + assert not converter.can_handle_python_type(str) + assert not converter.can_handle_python_type(Path) + + +def test_can_handle_struct_type(converter): + struct_type = converter.arrow_struct_type + assert converter.can_handle_struct_type(struct_type) + + +def test_is_semantic_struct(converter): + assert converter.is_semantic_struct({"upath": "/tmp/test.txt"}) + assert not converter.is_semantic_struct({"path": "/tmp/test.txt"}) + assert not converter.is_semantic_struct({"upath": 123}) + + +def test_hash_struct_dict_file_not_found(converter, tmp_path): + struct_dict = {"upath": str(tmp_path / "does_not_exist.txt")} + with pytest.raises(FileNotFoundError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_is_directory(converter, tmp_path): + struct_dict = {"upath": str(tmp_path)} + with pytest.raises(IsADirectoryError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_content_based(converter, tmp_path): + """Two distinct files with identical content produce the same hash.""" + file1 = tmp_path / "file1.txt" + file2 = tmp_path / "file2.txt" + content = "identical content" + file1.write_text(content) + file2.write_text(content) + hash1 = converter.hash_struct_dict({"upath": str(file1)}) + hash2 = converter.hash_struct_dict({"upath": str(file2)}) + assert hash1 == hash2 + + +def test_hash_struct_dict_with_prefix(converter, tmp_path): + """Prefixed hash starts with 'upath:sha256:'.""" + file = tmp_path / "file.txt" + file.write_text("hello") + hash_str = converter.hash_struct_dict({"upath": str(file)}, add_prefix=True) + assert hash_str.startswith("upath:sha256:") + + +def test_hash_struct_dict_different_content(converter, tmp_path): + """Same path with modified content yields a different hash.""" + file = tmp_path / "mutable.txt" + file.write_text("version 1") + hash1 = converter.hash_struct_dict({"upath": str(file)}) + file.write_text("version 2") + hash2 = converter.hash_struct_dict({"upath": str(file)}) + assert hash1 != hash2 + + +def test_hash_struct_dict_missing_field(converter): + with pytest.raises(ValueError, match="Missing 'upath' field"): + converter.hash_struct_dict({}) + + +def test_upath_arrow_struct_type(converter): + """The Arrow struct type has a single 'upath' field of large_string.""" + import pyarrow as pa + + struct_type = converter.arrow_struct_type + assert isinstance(struct_type, pa.StructType) + assert len(struct_type) == 1 + assert struct_type[0].name == "upath" + assert struct_type[0].type == pa.large_string() + + +def test_path_and_upath_struct_types_differ(): + """Path and UPath converters produce distinct Arrow struct types.""" + from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + + file_hasher = BasicFileHasher(algorithm="sha256") + path_conv = PathStructConverter(file_hasher=file_hasher) + upath_conv = UPathStructConverter(file_hasher=file_hasher) + + assert path_conv.arrow_struct_type != upath_conv.arrow_struct_type + assert path_conv.arrow_struct_type[0].name == "path" + assert upath_conv.arrow_struct_type[0].name == "upath" diff --git a/uv.lock b/uv.lock index eeae09e0..886a1c4b 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.11.0" resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'darwin'", @@ -2028,6 +2028,7 @@ dependencies = [ { name = "starfix" }, { name = "typing-extensions" }, { name = "tzdata" }, + { name = "universal-pathlib" }, { name = "uuid-utils" }, { name = "xxhash" }, ] @@ -2096,6 +2097,7 @@ requires-dist = [ { name = "starfix", git = "https://github.com/nauticalab/starfix-python.git?rev=344617bc6f7fcabab5c011d5774ed47de33f21de" }, { name = "typing-extensions" }, { name = "tzdata", specifier = ">=2024.1" }, + { name = "universal-pathlib", specifier = ">=0.3.8" }, { name = "uuid-utils", specifier = ">=0.11.1" }, { name = "xxhash" }, ] @@ -2195,6 +2197,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" }, ] +[[package]] +name = "pathlib-abc" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/cb/448649d7f25d228bf0be3a04590ab7afa77f15e056f8fa976ed05ec9a78f/pathlib_abc-0.5.2.tar.gz", hash = "sha256:fcd56f147234645e2c59c7ae22808b34c364bb231f685ddd9f96885aed78a94c", size = 33342, upload-time = "2025-10-10T18:37:20.524Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/29/c028a0731e202035f0e2e0bfbf1a3e46ad6c628cbb17f6f1cc9eea5d9ff1/pathlib_abc-0.5.2-py3-none-any.whl", hash = "sha256:4c9d94cf1b23af417ce7c0417b43333b06a106c01000b286c99de230d95eefbb", size = 19070, upload-time = "2025-10-10T18:37:19.437Z" }, +] + [[package]] name = "pathspec" version = "1.0.4" @@ -3576,6 +3587,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, ] +[[package]] +name = "universal-pathlib" +version = "0.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fsspec" }, + { name = "pathlib-abc" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/6e/d997a70ee8f4c61f9a7e2f4f8af721cf072a3326848fc881b05187e52558/universal_pathlib-0.3.10.tar.gz", hash = "sha256:4487cbc90730a48cfb64f811d99e14b6faed6d738420cd5f93f59f48e6930bfb", size = 261110, upload-time = "2026-02-22T14:40:58.87Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/1a/5d9a402b39ec892d856bbdd9db502ff73ce28cdf4aff72eb1ce1d6843506/universal_pathlib-0.3.10-py3-none-any.whl", hash = "sha256:dfaf2fb35683d2eb1287a3ed7b215e4d6016aa6eaf339c607023d22f90821c66", size = 83528, upload-time = "2026-02-22T14:40:57.316Z" }, +] + [[package]] name = "urllib3" version = "2.4.0" From dd5d8cf17ead399b187b3aac573620fe225de535 Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Sun, 22 Mar 2026 01:12:29 +0000 Subject: [PATCH 2/4] fix(upath): address review issues in UPath support - Move top-level UPath import to TYPE_CHECKING with lazy imports in methods - Replace duck-typing in _to_path() with isinstance check (UPath subclasses Path) - Make PathStructConverter reject UPath instances to avoid dispatch ambiguity - Raise ValueError for complex unions in python_type_to_arrow_type (was silently picking first type) - Reject non-UPath objects in UPathContentHandler.handle() instead of silently wrapping - Add tests for PathStructConverter/UPath boundary and consistent union error --- src/orcapod/hashing/hash_utils.py | 9 ++--- .../semantic_hashing/builtin_handlers.py | 18 +++++---- .../semantic_struct_converters.py | 37 +++++++++++++++++-- .../semantic_types/universal_converter.py | 12 +++--- .../test_universal_converter.py | 3 ++ .../test_upath_struct_converter.py | 23 ++++++++++++ 6 files changed, 80 insertions(+), 22 deletions(-) diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 9c3f4122..30bcab74 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -46,15 +46,12 @@ def combine_hashes( def _to_path(file_path: PathLike) -> Path: """Convert a path-like to a Path, preserving UPath instances. - If ``file_path`` is already a ``Path`` (or a path-like with ``is_file`` - and ``open`` methods, such as a remote ``UPath``), return it as-is so - that remote-filesystem semantics are retained. Otherwise wrap it in - ``Path()``. + If ``file_path`` is already a ``Path`` (including ``UPath`` subclasses), + return it as-is so that remote-filesystem semantics are retained. + Otherwise wrap it in ``Path()``. """ if isinstance(file_path, Path): return file_path - if hasattr(file_path, "is_file") and hasattr(file_path, "open"): - return file_path # type: ignore[return-value] return Path(file_path) diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 829dc59b..a5e7fa9b 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -116,22 +116,26 @@ def __init__(self, file_hasher: FileContentHasherProtocol) -> None: def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: from upath import UPath - path = UPath(obj) if not isinstance(obj, UPath) else obj + if not isinstance(obj, UPath): + raise TypeError( + f"UPathContentHandler: expected a UPath, got {type(obj)!r}. " + "Use PathContentHandler for pathlib.Path objects." + ) - if not path.exists(): + if not obj.exists(): raise FileNotFoundError( - f"UPathContentHandler: path does not exist: {path!r}. " + f"UPathContentHandler: path does not exist: {obj!r}. " "Paths must refer to existing files for content-based hashing." ) - if path.is_dir(): + if obj.is_dir(): raise IsADirectoryError( - f"UPathContentHandler: path is a directory: {path!r}. " + f"UPathContentHandler: path is a directory: {obj!r}. " "Only regular files are supported for content-based hashing." ) - logger.debug("UPathContentHandler: hashing file content at %s", path) - return self.file_hasher.hash_file(path) + logger.debug("UPathContentHandler: hashing file content at %s", obj) + return self.file_hasher.hash_file(obj) class UUIDHandler: diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index 86b9f864..fbe57227 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -9,13 +9,12 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -from upath import UPath - from orcapod.types import ContentHash from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: import pyarrow as pa + from upath import UPath from orcapod.protocols.hashing_protocols import FileContentHasherProtocol else: @@ -177,7 +176,11 @@ def hash_struct_dict( class PathStructConverter(FilePathStructConverterBase): - """Converter for pathlib.Path objects to/from semantic structs.""" + """Converter for pathlib.Path objects to/from semantic structs. + + Rejects ``UPath`` instances to avoid ambiguity with + ``UPathStructConverter``, since ``UPath`` is a ``Path`` subclass. + """ def __init__(self, file_hasher: "FileContentHasherProtocol"): super().__init__("path", Path, file_hasher) @@ -185,12 +188,38 @@ def __init__(self, file_hasher: "FileContentHasherProtocol"): def _make_path(self, path_str: str) -> Path: return Path(path_str) + def python_to_struct_dict(self, value: Any) -> dict[str, Any]: + """Convert Path to struct dictionary, rejecting UPath instances.""" + from upath import UPath + + if isinstance(value, UPath): + raise TypeError( + f"Expected Path (not UPath), got {type(value)}. " + "Use UPathStructConverter for UPath instances." + ) + return super().python_to_struct_dict(value) + + def can_handle_python_type(self, python_type: type) -> bool: + """Check if this converter can handle the given Python type. + + Returns False for UPath (and its subclasses) to avoid ambiguity. + """ + from upath import UPath + + if issubclass(python_type, UPath): + return False + return issubclass(python_type, Path) + class UPathStructConverter(FilePathStructConverterBase): """Converter for universal_pathlib.UPath objects to/from semantic structs.""" def __init__(self, file_hasher: "FileContentHasherProtocol"): + from upath import UPath + super().__init__("upath", UPath, file_hasher) - def _make_path(self, path_str: str) -> UPath: + def _make_path(self, path_str: str) -> "UPath": + from upath import UPath + return UPath(path_str) diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index 809dd5a0..88112ca4 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -404,13 +404,15 @@ def _convert_python_to_arrow(self, python_type: DataType) -> pa.DataType: # Handle Union/Optional types elif origin is typing.Union or origin is types.UnionType: - if len(args) == 2 and type(None) in args: + non_none_types = [t for t in args if t is not type(None)] + if len(non_none_types) == 1: # Optional[T] → just T (nullability handled at field level) - non_none_type = args[0] if args[1] is type(None) else args[1] - return self.python_type_to_arrow_type(non_none_type) + return self.python_type_to_arrow_type(non_none_types[0]) else: - # Complex unions → use first type as fallback - return self.python_type_to_arrow_type(args[0]) + raise ValueError( + f"Complex unions with multiple non-None types are not supported: {python_type}. " + f"Only Optional[T] (i.e., T | None) is allowed." + ) # Handle set types → lists elif origin is set: diff --git a/tests/test_semantic_types/test_universal_converter.py b/tests/test_semantic_types/test_universal_converter.py index 55b85e2f..4113f935 100644 --- a/tests/test_semantic_types/test_universal_converter.py +++ b/tests/test_semantic_types/test_universal_converter.py @@ -67,6 +67,9 @@ def test_complex_union_raises_error(): with pytest.raises(ValueError, match="Complex unions"): universal_converter.get_conversion_functions(UPath | Path) + with pytest.raises(ValueError, match="Complex unions"): + universal_converter.python_type_to_arrow_type(UPath | Path) + def test_python_type_to_arrow_type_context(): ctx = get_default_context() diff --git a/tests/test_semantic_types/test_upath_struct_converter.py b/tests/test_semantic_types/test_upath_struct_converter.py index 42e50ce7..10ac7d39 100644 --- a/tests/test_semantic_types/test_upath_struct_converter.py +++ b/tests/test_semantic_types/test_upath_struct_converter.py @@ -123,3 +123,26 @@ def test_path_and_upath_struct_types_differ(): assert path_conv.arrow_struct_type != upath_conv.arrow_struct_type assert path_conv.arrow_struct_type[0].name == "path" assert upath_conv.arrow_struct_type[0].name == "upath" + + +def test_path_converter_rejects_upath(): + """PathStructConverter rejects UPath instances to avoid ambiguity.""" + from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + + file_hasher = BasicFileHasher(algorithm="sha256") + path_conv = PathStructConverter(file_hasher=file_hasher) + + upath_val = UPath("/tmp/test.txt") + with pytest.raises(TypeError, match="not UPath"): + path_conv.python_to_struct_dict(upath_val) + + +def test_path_converter_cannot_handle_upath_type(): + """PathStructConverter.can_handle_python_type returns False for UPath.""" + from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + + file_hasher = BasicFileHasher(algorithm="sha256") + path_conv = PathStructConverter(file_hasher=file_hasher) + + assert not path_conv.can_handle_python_type(UPath) + assert path_conv.can_handle_python_type(Path) From 9e3553e9c8eb0f933b1d5375c80fe0f1e476139c Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Sun, 22 Mar 2026 01:45:11 +0000 Subject: [PATCH 3/4] =?UTF-8?q?refactor(upath):=20address=20PR=20#94=20rev?= =?UTF-8?q?iew=20=E2=80=94=20rename=20converters,=20harden=20=5Fto=5Fpath?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../contexts/data/schemas/context_schema.json | 2 +- src/orcapod/contexts/data/v0.1.json | 2 +- src/orcapod/hashing/hash_utils.py | 9 +++++++++ src/orcapod/hashing/versioned_hashers.py | 4 ++-- .../semantic_types/semantic_struct_converters.py | 6 +++--- .../test_file_hashing_consistency.py | 12 ++++++------ .../test_path_struct_converter.py | 4 ++-- .../test_semantic_struct_converters.py | 2 +- .../test_upath_struct_converter.py | 16 ++++++++-------- 9 files changed, 33 insertions(+), 24 deletions(-) diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json index 1e9f5468..b2380124 100644 --- a/src/orcapod/contexts/data/schemas/context_schema.json +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -175,7 +175,7 @@ "_config": { "converters": [ { - "_class": "orcapod.types.semantic_types.PathStructConverter", + "_class": "orcapod.types.semantic_types.PythonPathStructConverter", "_config": {} } ] diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 4a842344..cbe8eee6 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -19,7 +19,7 @@ } }, "path": { - "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", + "_class": "orcapod.semantic_types.semantic_struct_converters.PythonPathStructConverter", "_config": { "file_hasher": {"_ref": "file_hasher"} } diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 30bcab74..b98fc3ea 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -50,6 +50,15 @@ def _to_path(file_path: PathLike) -> Path: return it as-is so that remote-filesystem semantics are retained. Otherwise wrap it in ``Path()``. """ + # Check UPath first to preserve remote-filesystem semantics even if + # the inheritance relationship with pathlib.Path ever changes. + try: + from upath import UPath + + if isinstance(file_path, UPath): + return file_path # type: ignore[return-value] + except ImportError: + pass if isinstance(file_path, Path): return file_path return Path(file_path) diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index fda8cd07..f736293b 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -126,7 +126,7 @@ def get_versioned_semantic_arrow_hasher( from orcapod.hashing.arrow_hashers import StarfixArrowHasher from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry - from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter # Build a default semantic registry populated with the standard converters. # We use Any-typed locals here to side-step type-checker false positives @@ -134,7 +134,7 @@ def get_versioned_semantic_arrow_hasher( # a slightly different hash_struct_dict signature than the concrete class. registry: Any = SemanticTypeRegistry() file_hasher = BasicFileHasher(algorithm="sha256") - path_converter: Any = PathStructConverter(file_hasher=file_hasher) + path_converter: Any = PythonPathStructConverter(file_hasher=file_hasher) registry.register_converter("path", path_converter) logger.debug( diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index fbe57227..65da6fc7 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -76,7 +76,7 @@ def _compute_content_hash(self, content: bytes) -> ContentHash: return ContentHash(method=f"{self.semantic_type_name}:sha256", digest=digest) -class FilePathStructConverterBase(SemanticStructConverterBase, ABC): +class PathStructConverterBase(SemanticStructConverterBase, ABC): """Base converter for file path types (Path and UPath). Extracts the shared conversion logic since Path and UPath have @@ -175,7 +175,7 @@ def hash_struct_dict( return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) -class PathStructConverter(FilePathStructConverterBase): +class PythonPathStructConverter(PathStructConverterBase): """Converter for pathlib.Path objects to/from semantic structs. Rejects ``UPath`` instances to avoid ambiguity with @@ -211,7 +211,7 @@ def can_handle_python_type(self, python_type: type) -> bool: return issubclass(python_type, Path) -class UPathStructConverter(FilePathStructConverterBase): +class UPathStructConverter(PathStructConverterBase): """Converter for universal_pathlib.UPath objects to/from semantic structs.""" def __init__(self, file_hasher: "FileContentHasherProtocol"): diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py index ef35379f..1ce9886b 100644 --- a/tests/test_hashing/test_file_hashing_consistency.py +++ b/tests/test_hashing/test_file_hashing_consistency.py @@ -2,7 +2,7 @@ Integration tests verifying that file hashing is consistent across both paths: 1. **Arrow hasher path**: SemanticArrowHasher processes an Arrow table containing a - path struct column → calls PathStructConverter.hash_struct_dict → file_hasher. + path struct column → calls PythonPathStructConverter.hash_struct_dict → file_hasher. 2. **Semantic hasher path**: BaseSemanticHasher hashes a Python Path object → calls PathContentHandler.handle → file_hasher. @@ -23,7 +23,7 @@ from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry -from orcapod.semantic_types.semantic_struct_converters import PathStructConverter +from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter # --------------------------------------------------------------------------- @@ -39,12 +39,12 @@ def file_hasher(): @pytest.fixture def path_converter(file_hasher): - return PathStructConverter(file_hasher=file_hasher) + return PythonPathStructConverter(file_hasher=file_hasher) @pytest.fixture def arrow_hasher(path_converter): - """SemanticArrowHasher wired with the shared file_hasher via PathStructConverter.""" + """SemanticArrowHasher wired with the shared file_hasher via PythonPathStructConverter.""" registry = SemanticTypeRegistry() registry.register_converter("path", path_converter) return SemanticArrowHasher(semantic_registry=registry) @@ -180,7 +180,7 @@ class TestCrossPathConsistency: def test_arrow_and_semantic_hash_same_file_content( self, path_converter, semantic_hasher, file_hasher, tmp_path ): - """The file content hash extracted by PathStructConverter.hash_struct_dict + """The file content hash extracted by PythonPathStructConverter.hash_struct_dict must match the ContentHash produced by PathContentHandler.handle (which the semantic hasher uses internally for Path objects). @@ -190,7 +190,7 @@ def test_arrow_and_semantic_hash_same_file_content( file = tmp_path / "shared.txt" file.write_text("shared content for both paths") - # Arrow path: PathStructConverter.hash_struct_dict (no prefix) + # Arrow path: PythonPathStructConverter.hash_struct_dict (no prefix) arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)}) # Semantic path: file_hasher.hash_file directly (same as PathContentHandler) diff --git a/tests/test_semantic_types/test_path_struct_converter.py b/tests/test_semantic_types/test_path_struct_converter.py index 73ae46e6..c66ca05d 100644 --- a/tests/test_semantic_types/test_path_struct_converter.py +++ b/tests/test_semantic_types/test_path_struct_converter.py @@ -4,7 +4,7 @@ import pytest from orcapod.hashing.file_hashers import BasicFileHasher -from orcapod.semantic_types.semantic_struct_converters import PathStructConverter +from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter @pytest.fixture @@ -14,7 +14,7 @@ def file_hasher(): @pytest.fixture def converter(file_hasher): - return PathStructConverter(file_hasher=file_hasher) + return PythonPathStructConverter(file_hasher=file_hasher) def test_path_to_struct_and_back(converter): diff --git a/tests/test_semantic_types/test_semantic_struct_converters.py b/tests/test_semantic_types/test_semantic_struct_converters.py index cd8f34f3..77bdfb1d 100644 --- a/tests/test_semantic_types/test_semantic_struct_converters.py +++ b/tests/test_semantic_types/test_semantic_struct_converters.py @@ -62,7 +62,7 @@ def test_compute_content_hash(): assert result.digest == hashlib.sha256(data).digest() -# --- PathStructConverter tests --- +# --- PythonPathStructConverter tests --- def test_extensibility_with_new_converter(): diff --git a/tests/test_semantic_types/test_upath_struct_converter.py b/tests/test_semantic_types/test_upath_struct_converter.py index 10ac7d39..ee503ae0 100644 --- a/tests/test_semantic_types/test_upath_struct_converter.py +++ b/tests/test_semantic_types/test_upath_struct_converter.py @@ -114,10 +114,10 @@ def test_upath_arrow_struct_type(converter): def test_path_and_upath_struct_types_differ(): """Path and UPath converters produce distinct Arrow struct types.""" - from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter file_hasher = BasicFileHasher(algorithm="sha256") - path_conv = PathStructConverter(file_hasher=file_hasher) + path_conv = PythonPathStructConverter(file_hasher=file_hasher) upath_conv = UPathStructConverter(file_hasher=file_hasher) assert path_conv.arrow_struct_type != upath_conv.arrow_struct_type @@ -126,11 +126,11 @@ def test_path_and_upath_struct_types_differ(): def test_path_converter_rejects_upath(): - """PathStructConverter rejects UPath instances to avoid ambiguity.""" - from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + """PythonPathStructConverter rejects UPath instances to avoid ambiguity.""" + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter file_hasher = BasicFileHasher(algorithm="sha256") - path_conv = PathStructConverter(file_hasher=file_hasher) + path_conv = PythonPathStructConverter(file_hasher=file_hasher) upath_val = UPath("/tmp/test.txt") with pytest.raises(TypeError, match="not UPath"): @@ -138,11 +138,11 @@ def test_path_converter_rejects_upath(): def test_path_converter_cannot_handle_upath_type(): - """PathStructConverter.can_handle_python_type returns False for UPath.""" - from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + """PythonPathStructConverter.can_handle_python_type returns False for UPath.""" + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter file_hasher = BasicFileHasher(algorithm="sha256") - path_conv = PathStructConverter(file_hasher=file_hasher) + path_conv = PythonPathStructConverter(file_hasher=file_hasher) assert not path_conv.can_handle_python_type(UPath) assert path_conv.can_handle_python_type(Path) From b665ca589cb82224294bb4dd39c88b7c457b9d7d Mon Sep 17 00:00:00 2001 From: Brian Arnold Date: Sun, 22 Mar 2026 16:14:37 +0000 Subject: [PATCH 4/4] refactor(upath): make UPath a top-level import and fix _to_path return type --- src/orcapod/hashing/hash_utils.py | 12 ++++-------- .../hashing/semantic_hashing/builtin_handlers.py | 4 ++-- .../semantic_types/semantic_struct_converters.py | 13 +++---------- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index b98fc3ea..c5040e39 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -6,6 +6,7 @@ from pathlib import Path import xxhash +from upath import UPath from orcapod.types import ContentHash, PathLike @@ -43,7 +44,7 @@ def combine_hashes( return combined_hash -def _to_path(file_path: PathLike) -> Path: +def _to_path(file_path: PathLike) -> Path | UPath: """Convert a path-like to a Path, preserving UPath instances. If ``file_path`` is already a ``Path`` (including ``UPath`` subclasses), @@ -52,13 +53,8 @@ def _to_path(file_path: PathLike) -> Path: """ # Check UPath first to preserve remote-filesystem semantics even if # the inheritance relationship with pathlib.Path ever changes. - try: - from upath import UPath - - if isinstance(file_path, UPath): - return file_path # type: ignore[return-value] - except ImportError: - pass + if isinstance(file_path, UPath): + return file_path if isinstance(file_path, Path): return file_path return Path(file_path) diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index a5e7fa9b..bbce461d 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -34,6 +34,8 @@ from typing import TYPE_CHECKING, Any from uuid import UUID +from upath import UPath + from orcapod.types import PathLike, Schema if TYPE_CHECKING: @@ -114,8 +116,6 @@ def __init__(self, file_hasher: FileContentHasherProtocol) -> None: self.file_hasher = file_hasher def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: - from upath import UPath - if not isinstance(obj, UPath): raise TypeError( f"UPathContentHandler: expected a UPath, got {type(obj)!r}. " diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index 65da6fc7..e80b3c5d 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -9,12 +9,13 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +from upath import UPath + from orcapod.types import ContentHash from orcapod.utils.lazy_module import LazyModule if TYPE_CHECKING: import pyarrow as pa - from upath import UPath from orcapod.protocols.hashing_protocols import FileContentHasherProtocol else: @@ -190,8 +191,6 @@ def _make_path(self, path_str: str) -> Path: def python_to_struct_dict(self, value: Any) -> dict[str, Any]: """Convert Path to struct dictionary, rejecting UPath instances.""" - from upath import UPath - if isinstance(value, UPath): raise TypeError( f"Expected Path (not UPath), got {type(value)}. " @@ -204,8 +203,6 @@ def can_handle_python_type(self, python_type: type) -> bool: Returns False for UPath (and its subclasses) to avoid ambiguity. """ - from upath import UPath - if issubclass(python_type, UPath): return False return issubclass(python_type, Path) @@ -215,11 +212,7 @@ class UPathStructConverter(PathStructConverterBase): """Converter for universal_pathlib.UPath objects to/from semantic structs.""" def __init__(self, file_hasher: "FileContentHasherProtocol"): - from upath import UPath - super().__init__("upath", UPath, file_hasher) - def _make_path(self, path_str: str) -> "UPath": - from upath import UPath - + def _make_path(self, path_str: str) -> UPath: return UPath(path_str)