diff --git a/pyproject.toml b/pyproject.toml index 028decef..c2ae6e1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "deltalake>=1.0.2", "graphviz>=0.21", "gitpython>=3.1.45", + "universal-pathlib>=0.3.8", "starfix @ git+https://github.com/nauticalab/starfix-python.git@344617bc6f7fcabab5c011d5774ed47de33f21de", "pygraphviz>=1.14", "tzdata>=2024.1", diff --git a/src/orcapod/contexts/data/schemas/context_schema.json b/src/orcapod/contexts/data/schemas/context_schema.json index 1e9f5468..b2380124 100644 --- a/src/orcapod/contexts/data/schemas/context_schema.json +++ b/src/orcapod/contexts/data/schemas/context_schema.json @@ -175,7 +175,7 @@ "_config": { "converters": [ { - "_class": "orcapod.types.semantic_types.PathStructConverter", + "_class": "orcapod.types.semantic_types.PythonPathStructConverter", "_config": {} } ] diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index f75798b5..cbe8eee6 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -12,8 +12,14 @@ "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", "_config": { "converters": { + "upath": { + "_class": "orcapod.semantic_types.semantic_struct_converters.UPathStructConverter", + "_config": { + "file_hasher": {"_ref": "file_hasher"} + } + }, "path": { - "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", + "_class": "orcapod.semantic_types.semantic_struct_converters.PythonPathStructConverter", "_config": { "file_hasher": {"_ref": "file_hasher"} } @@ -52,6 +58,7 @@ [{"_type": "builtins.bytes"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}], [{"_type": "builtins.bytearray"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.BytesHandler", "_config": {}}], [{"_type": "pathlib.Path"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.PathContentHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], + [{"_type": "upath.core.UPath"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UPathContentHandler", "_config": {"file_hasher": {"_ref": "file_hasher"}}}], [{"_type": "uuid.UUID"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.UUIDHandler", "_config": {}}], [{"_type": "types.FunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], [{"_type": "types.BuiltinFunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}], diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 0addcb77..c5040e39 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -6,8 +6,9 @@ from pathlib import Path import xxhash +from upath import UPath -from orcapod.types import ContentHash +from orcapod.types import ContentHash, PathLike logger = logging.getLogger(__name__) @@ -43,9 +44,27 @@ def combine_hashes( return combined_hash -def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: +def _to_path(file_path: PathLike) -> Path | UPath: + """Convert a path-like to a Path, preserving UPath instances. + + If ``file_path`` is already a ``Path`` (including ``UPath`` subclasses), + return it as-is so that remote-filesystem semantics are retained. + Otherwise wrap it in ``Path()``. + """ + # Check UPath first to preserve remote-filesystem semantics even if + # the inheritance relationship with pathlib.Path ever changes. + if isinstance(file_path, UPath): + return file_path + if isinstance(file_path, Path): + return file_path + return Path(file_path) + + +def hash_file(file_path: PathLike, algorithm="sha256", buffer_size=65536) -> ContentHash: """Calculate the hash of a file using the specified algorithm. + Supports both local ``pathlib.Path`` and remote ``UPath`` objects. + Args: file_path: Path to the file to hash. algorithm: Hash algorithm to use — options include: @@ -56,7 +75,9 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: A ContentHash with method set to the algorithm name and digest containing the raw hash bytes. """ - if not Path(file_path).is_file(): + path = _to_path(file_path) + + if not path.is_file(): raise FileNotFoundError(f"The file {file_path} does not exist") # Hash the path string itself rather than file content @@ -67,7 +88,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: if algorithm == "xxh64": hasher = xxhash.xxh64() - with open(file_path, "rb") as file: + with path.open("rb") as file: while True: data = file.read(buffer_size) if not data: @@ -77,7 +98,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: if algorithm == "crc32": crc = 0 - with open(file_path, "rb") as file: + with path.open("rb") as file: while True: data = file.read(buffer_size) if not data: @@ -96,7 +117,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: f"Invalid algorithm: {algorithm}. Available algorithms: {valid_algorithms}, xxh64, crc32" ) - with open(file_path, "rb") as file: + with path.open("rb") as file: while True: data = file.read(buffer_size) if not data: diff --git a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py index 931d7cc5..bbce461d 100644 --- a/src/orcapod/hashing/semantic_hashing/builtin_handlers.py +++ b/src/orcapod/hashing/semantic_hashing/builtin_handlers.py @@ -5,6 +5,7 @@ knows how to process out of the box: - PathContentHandler -- pathlib.Path: returns ContentHash of file content + - UPathContentHandler -- upath.UPath: returns ContentHash of file content (remote-aware) - UUIDHandler -- uuid.UUID: canonical string representation - BytesHandler -- bytes / bytearray: hex string representation - FunctionHandler -- callable with __code__: via FunctionInfoExtractorProtocol @@ -33,6 +34,8 @@ from typing import TYPE_CHECKING, Any from uuid import UUID +from upath import UPath + from orcapod.types import PathLike, Schema if TYPE_CHECKING: @@ -96,6 +99,45 @@ def handle(self, obj: PathLike, hasher: "SemanticHasherProtocol") -> Any: return self.file_hasher.hash_file(path) +class UPathContentHandler: + """ + Handler for universal_pathlib.UPath objects. + + Behaves identically to ``PathContentHandler`` but preserves the UPath + instance so that remote filesystem semantics (e.g. S3, GCS) are retained + during file content hashing. + + Args: + file_hasher: Any object with a ``hash_file(path) -> ContentHash`` + method (satisfies the FileContentHasherProtocol protocol). + """ + + def __init__(self, file_hasher: FileContentHasherProtocol) -> None: + self.file_hasher = file_hasher + + def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any: + if not isinstance(obj, UPath): + raise TypeError( + f"UPathContentHandler: expected a UPath, got {type(obj)!r}. " + "Use PathContentHandler for pathlib.Path objects." + ) + + if not obj.exists(): + raise FileNotFoundError( + f"UPathContentHandler: path does not exist: {obj!r}. " + "Paths must refer to existing files for content-based hashing." + ) + + if obj.is_dir(): + raise IsADirectoryError( + f"UPathContentHandler: path is a directory: {obj!r}. " + "Only regular files are supported for content-based hashing." + ) + + logger.debug("UPathContentHandler: hashing file content at %s", obj) + return self.file_hasher.hash_file(obj) + + class UUIDHandler: """ Handler for uuid.UUID objects. diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index fda8cd07..f736293b 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -126,7 +126,7 @@ def get_versioned_semantic_arrow_hasher( from orcapod.hashing.arrow_hashers import StarfixArrowHasher from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry - from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter # Build a default semantic registry populated with the standard converters. # We use Any-typed locals here to side-step type-checker false positives @@ -134,7 +134,7 @@ def get_versioned_semantic_arrow_hasher( # a slightly different hash_struct_dict signature than the concrete class. registry: Any = SemanticTypeRegistry() file_hasher = BasicFileHasher(algorithm="sha256") - path_converter: Any = PathStructConverter(file_hasher=file_hasher) + path_converter: Any = PythonPathStructConverter(file_hasher=file_hasher) registry.register_converter("path", path_converter) logger.debug( diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index 63d1e236..e80b3c5d 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -5,9 +5,12 @@ making semantic types visible in schemas and preserved through operations. """ +from abc import ABC, abstractmethod from pathlib import Path from typing import TYPE_CHECKING, Any +from upath import UPath + from orcapod.types import ContentHash from orcapod.utils.lazy_module import LazyModule @@ -74,68 +77,73 @@ def _compute_content_hash(self, content: bytes) -> ContentHash: return ContentHash(method=f"{self.semantic_type_name}:sha256", digest=digest) -# Path-specific implementation -class PathStructConverter(SemanticStructConverterBase): - """Converter for pathlib.Path objects to/from semantic structs of form { path: "/value/of/path"}""" +class PathStructConverterBase(SemanticStructConverterBase, ABC): + """Base converter for file path types (Path and UPath). - def __init__(self, file_hasher: "FileContentHasherProtocol"): - super().__init__("path") - self._python_type = Path - self._file_hasher = file_hasher + Extracts the shared conversion logic since Path and UPath have + identical APIs for the operations we need (str conversion, + construction from string, ``read_bytes``). + """ - # Define the Arrow struct type for paths - self._arrow_struct_type = pa.struct( - [ - pa.field("path", pa.large_string()), - ] - ) + def __init__( + self, + name: str, + path_type: type, + file_hasher: "FileContentHasherProtocol", + ): + super().__init__(name) + self._python_type = path_type + self._field_name = name + self._file_hasher = file_hasher + self._arrow_struct_type = pa.struct([ + pa.field(name, pa.large_string()), + ]) @property def python_type(self) -> type: return self._python_type @property - def arrow_struct_type(self) -> pa.StructType: + def arrow_struct_type(self) -> "pa.StructType": return self._arrow_struct_type - def python_to_struct_dict(self, value: Path) -> dict[str, Any]: - """Convert Path to struct dictionary.""" - if not isinstance(value, Path): - raise TypeError(f"Expected Path, got {type(value)}") + @abstractmethod + def _make_path(self, path_str: str) -> Any: + """Construct the appropriate path object from a string.""" + ... - return { - "path": str(value), - } + def python_to_struct_dict(self, value: Any) -> dict[str, Any]: + """Convert path object to struct dictionary.""" + if not isinstance(value, self._python_type): + raise TypeError(f"Expected {self._python_type.__name__}, got {type(value)}") + return {self._field_name: str(value)} - def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Path: - """Convert struct dictionary back to Path.""" - path_str = struct_dict.get("path") + def struct_dict_to_python(self, struct_dict: dict[str, Any]) -> Any: + """Convert struct dictionary back to path object.""" + path_str = struct_dict.get(self._field_name) if path_str is None: - raise ValueError("Missing 'path' field in struct") - - return Path(path_str) + raise ValueError(f"Missing '{self._field_name}' field in struct") + return self._make_path(path_str) def can_handle_python_type(self, python_type: type) -> bool: """Check if this converter can handle the given Python type.""" - return issubclass(python_type, Path) + return issubclass(python_type, self._python_type) - def can_handle_struct_type(self, struct_type: pa.StructType) -> bool: + def can_handle_struct_type(self, struct_type: "pa.StructType") -> bool: """Check if this converter can handle the given struct type.""" - # Check if struct has the expected fields for field in self._arrow_struct_type: if ( field.name not in struct_type.names or struct_type[field.name].type != field.type ): return False - return True def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: """Check if a struct dictionary represents this semantic type.""" - # TODO: infer this check based on identified struct type as defined in the __init__ - return set(struct_dict.keys()) == {"path"} and isinstance( - struct_dict["path"], str + return ( + set(struct_dict.keys()) == {self._field_name} + and isinstance(struct_dict[self._field_name], str) ) def hash_struct_dict( @@ -144,8 +152,8 @@ def hash_struct_dict( """Compute hash of a path semantic type by hashing the file content. Args: - struct_dict: Dict with a "path" key containing a file path string. - add_prefix: If True, prefix with "path:sha256:...". + struct_dict: Dict with the path field containing a file path string. + add_prefix: If True, prefix with semantic type and algorithm info. Returns: Hash string of the file content. @@ -154,11 +162,11 @@ def hash_struct_dict( FileNotFoundError: If the path does not exist. IsADirectoryError: If the path is a directory. """ - path_str = struct_dict.get("path") + path_str = struct_dict.get(self._field_name) if path_str is None: - raise ValueError("Missing 'path' field in struct dict") + raise ValueError(f"Missing '{self._field_name}' field in struct dict") - path = Path(path_str) + path = self._make_path(path_str) if not path.exists(): raise FileNotFoundError(f"Path does not exist: {path}") if path.is_dir(): @@ -166,3 +174,45 @@ def hash_struct_dict( content_hash = self._file_hasher.hash_file(path) return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) + + +class PythonPathStructConverter(PathStructConverterBase): + """Converter for pathlib.Path objects to/from semantic structs. + + Rejects ``UPath`` instances to avoid ambiguity with + ``UPathStructConverter``, since ``UPath`` is a ``Path`` subclass. + """ + + def __init__(self, file_hasher: "FileContentHasherProtocol"): + super().__init__("path", Path, file_hasher) + + def _make_path(self, path_str: str) -> Path: + return Path(path_str) + + def python_to_struct_dict(self, value: Any) -> dict[str, Any]: + """Convert Path to struct dictionary, rejecting UPath instances.""" + if isinstance(value, UPath): + raise TypeError( + f"Expected Path (not UPath), got {type(value)}. " + "Use UPathStructConverter for UPath instances." + ) + return super().python_to_struct_dict(value) + + def can_handle_python_type(self, python_type: type) -> bool: + """Check if this converter can handle the given Python type. + + Returns False for UPath (and its subclasses) to avoid ambiguity. + """ + if issubclass(python_type, UPath): + return False + return issubclass(python_type, Path) + + +class UPathStructConverter(PathStructConverterBase): + """Converter for universal_pathlib.UPath objects to/from semantic structs.""" + + def __init__(self, file_hasher: "FileContentHasherProtocol"): + super().__init__("upath", UPath, file_hasher) + + def _make_path(self, path_str: str) -> UPath: + return UPath(path_str) diff --git a/src/orcapod/semantic_types/universal_converter.py b/src/orcapod/semantic_types/universal_converter.py index be76e808..88112ca4 100644 --- a/src/orcapod/semantic_types/universal_converter.py +++ b/src/orcapod/semantic_types/universal_converter.py @@ -404,13 +404,15 @@ def _convert_python_to_arrow(self, python_type: DataType) -> pa.DataType: # Handle Union/Optional types elif origin is typing.Union or origin is types.UnionType: - if len(args) == 2 and type(None) in args: + non_none_types = [t for t in args if t is not type(None)] + if len(non_none_types) == 1: # Optional[T] → just T (nullability handled at field level) - non_none_type = args[0] if args[1] is type(None) else args[1] - return self.python_type_to_arrow_type(non_none_type) + return self.python_type_to_arrow_type(non_none_types[0]) else: - # Complex unions → use first type as fallback - return self.python_type_to_arrow_type(args[0]) + raise ValueError( + f"Complex unions with multiple non-None types are not supported: {python_type}. " + f"Only Optional[T] (i.e., T | None) is allowed." + ) # Handle set types → lists elif origin is set: @@ -660,6 +662,19 @@ def _create_python_to_arrow_converter( f"f{i}": converters[i](item) for i, item in enumerate(value) } + # Handle Optional[T] unions; complex unions (e.g., A | B) are not currently supported + elif origin is typing.Union or origin is types.UnionType: + non_none_types = [t for t in args if t is not type(None)] + if len(non_none_types) == 1: + # Optional[T] - use converter for T, pass through None + inner_converter = self.get_python_to_arrow_converter(non_none_types[0]) + return lambda value: inner_converter(value) if value is not None else None + else: + raise ValueError( + f"Complex unions with multiple non-None types are not supported: {python_type}. " + f"Only Optional[T] (i.e., T | None) is allowed." + ) + else: # Default passthrough return lambda value: value diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py index ef35379f..1ce9886b 100644 --- a/tests/test_hashing/test_file_hashing_consistency.py +++ b/tests/test_hashing/test_file_hashing_consistency.py @@ -2,7 +2,7 @@ Integration tests verifying that file hashing is consistent across both paths: 1. **Arrow hasher path**: SemanticArrowHasher processes an Arrow table containing a - path struct column → calls PathStructConverter.hash_struct_dict → file_hasher. + path struct column → calls PythonPathStructConverter.hash_struct_dict → file_hasher. 2. **Semantic hasher path**: BaseSemanticHasher hashes a Python Path object → calls PathContentHandler.handle → file_hasher. @@ -23,7 +23,7 @@ from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry -from orcapod.semantic_types.semantic_struct_converters import PathStructConverter +from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter # --------------------------------------------------------------------------- @@ -39,12 +39,12 @@ def file_hasher(): @pytest.fixture def path_converter(file_hasher): - return PathStructConverter(file_hasher=file_hasher) + return PythonPathStructConverter(file_hasher=file_hasher) @pytest.fixture def arrow_hasher(path_converter): - """SemanticArrowHasher wired with the shared file_hasher via PathStructConverter.""" + """SemanticArrowHasher wired with the shared file_hasher via PythonPathStructConverter.""" registry = SemanticTypeRegistry() registry.register_converter("path", path_converter) return SemanticArrowHasher(semantic_registry=registry) @@ -180,7 +180,7 @@ class TestCrossPathConsistency: def test_arrow_and_semantic_hash_same_file_content( self, path_converter, semantic_hasher, file_hasher, tmp_path ): - """The file content hash extracted by PathStructConverter.hash_struct_dict + """The file content hash extracted by PythonPathStructConverter.hash_struct_dict must match the ContentHash produced by PathContentHandler.handle (which the semantic hasher uses internally for Path objects). @@ -190,7 +190,7 @@ def test_arrow_and_semantic_hash_same_file_content( file = tmp_path / "shared.txt" file.write_text("shared content for both paths") - # Arrow path: PathStructConverter.hash_struct_dict (no prefix) + # Arrow path: PythonPathStructConverter.hash_struct_dict (no prefix) arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)}) # Semantic path: file_hasher.hash_file directly (same as PathContentHandler) diff --git a/tests/test_semantic_types/test_path_struct_converter.py b/tests/test_semantic_types/test_path_struct_converter.py index 73ae46e6..c66ca05d 100644 --- a/tests/test_semantic_types/test_path_struct_converter.py +++ b/tests/test_semantic_types/test_path_struct_converter.py @@ -4,7 +4,7 @@ import pytest from orcapod.hashing.file_hashers import BasicFileHasher -from orcapod.semantic_types.semantic_struct_converters import PathStructConverter +from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter @pytest.fixture @@ -14,7 +14,7 @@ def file_hasher(): @pytest.fixture def converter(file_hasher): - return PathStructConverter(file_hasher=file_hasher) + return PythonPathStructConverter(file_hasher=file_hasher) def test_path_to_struct_and_back(converter): diff --git a/tests/test_semantic_types/test_semantic_struct_converters.py b/tests/test_semantic_types/test_semantic_struct_converters.py index cd8f34f3..77bdfb1d 100644 --- a/tests/test_semantic_types/test_semantic_struct_converters.py +++ b/tests/test_semantic_types/test_semantic_struct_converters.py @@ -62,7 +62,7 @@ def test_compute_content_hash(): assert result.digest == hashlib.sha256(data).digest() -# --- PathStructConverter tests --- +# --- PythonPathStructConverter tests --- def test_extensibility_with_new_converter(): diff --git a/tests/test_semantic_types/test_universal_converter.py b/tests/test_semantic_types/test_universal_converter.py index 38b46eb7..4113f935 100644 --- a/tests/test_semantic_types/test_universal_converter.py +++ b/tests/test_semantic_types/test_universal_converter.py @@ -33,6 +33,44 @@ def test_python_type_to_arrow_type_custom(): assert field.type == pa.large_string() +def test_python_type_to_arrow_type_upath(): + from upath import UPath + + arrow_type = universal_converter.python_type_to_arrow_type(UPath) + # Should be a StructType with field 'upath' of type large_string + assert isinstance(arrow_type, pa.StructType) + assert len(arrow_type) == 1 + field = arrow_type[0] + assert field.name == "upath" + assert field.type == pa.large_string() + + +def test_optional_upath_converter(): + """Test that Optional[UPath] correctly converts UPath values.""" + from upath import UPath + + to_arrow, to_python = universal_converter.get_conversion_functions(UPath | None) + + # Test with UPath value + path = UPath("/tmp/test.txt") + result = to_arrow(path) + assert result == {"upath": "/tmp/test.txt"} + + # Test with None + assert to_arrow(None) is None + + +def test_complex_union_raises_error(): + """Test that complex unions (multiple non-None types) raise ValueError.""" + from upath import UPath + + with pytest.raises(ValueError, match="Complex unions"): + universal_converter.get_conversion_functions(UPath | Path) + + with pytest.raises(ValueError, match="Complex unions"): + universal_converter.python_type_to_arrow_type(UPath | Path) + + def test_python_type_to_arrow_type_context(): ctx = get_default_context() assert universal_converter.python_type_to_arrow_type(int, ctx) == pa.int64() diff --git a/tests/test_semantic_types/test_upath_struct_converter.py b/tests/test_semantic_types/test_upath_struct_converter.py new file mode 100644 index 00000000..ee503ae0 --- /dev/null +++ b/tests/test_semantic_types/test_upath_struct_converter.py @@ -0,0 +1,148 @@ +from pathlib import Path +from typing import cast + +import pytest +from upath import UPath + +from orcapod.hashing.file_hashers import BasicFileHasher +from orcapod.semantic_types.semantic_struct_converters import UPathStructConverter + + +@pytest.fixture +def file_hasher(): + return BasicFileHasher(algorithm="sha256") + + +@pytest.fixture +def converter(file_hasher): + return UPathStructConverter(file_hasher=file_hasher) + + +def test_upath_to_struct_and_back(converter): + path_obj = UPath("/tmp/test.txt") + struct_dict = converter.python_to_struct_dict(path_obj) + assert struct_dict["upath"] == str(path_obj) + restored = converter.struct_dict_to_python(struct_dict) + assert isinstance(restored, UPath) + assert str(restored) == str(path_obj) + + +def test_upath_to_struct_invalid_type(converter): + with pytest.raises(TypeError): + converter.python_to_struct_dict(Path("/tmp/test.txt")) # type: ignore + + +def test_struct_to_python_missing_field(converter): + with pytest.raises(ValueError): + converter.struct_dict_to_python({}) + + +def test_can_handle_python_type(converter): + assert converter.can_handle_python_type(UPath) + assert not converter.can_handle_python_type(str) + assert not converter.can_handle_python_type(Path) + + +def test_can_handle_struct_type(converter): + struct_type = converter.arrow_struct_type + assert converter.can_handle_struct_type(struct_type) + + +def test_is_semantic_struct(converter): + assert converter.is_semantic_struct({"upath": "/tmp/test.txt"}) + assert not converter.is_semantic_struct({"path": "/tmp/test.txt"}) + assert not converter.is_semantic_struct({"upath": 123}) + + +def test_hash_struct_dict_file_not_found(converter, tmp_path): + struct_dict = {"upath": str(tmp_path / "does_not_exist.txt")} + with pytest.raises(FileNotFoundError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_is_directory(converter, tmp_path): + struct_dict = {"upath": str(tmp_path)} + with pytest.raises(IsADirectoryError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_content_based(converter, tmp_path): + """Two distinct files with identical content produce the same hash.""" + file1 = tmp_path / "file1.txt" + file2 = tmp_path / "file2.txt" + content = "identical content" + file1.write_text(content) + file2.write_text(content) + hash1 = converter.hash_struct_dict({"upath": str(file1)}) + hash2 = converter.hash_struct_dict({"upath": str(file2)}) + assert hash1 == hash2 + + +def test_hash_struct_dict_with_prefix(converter, tmp_path): + """Prefixed hash starts with 'upath:sha256:'.""" + file = tmp_path / "file.txt" + file.write_text("hello") + hash_str = converter.hash_struct_dict({"upath": str(file)}, add_prefix=True) + assert hash_str.startswith("upath:sha256:") + + +def test_hash_struct_dict_different_content(converter, tmp_path): + """Same path with modified content yields a different hash.""" + file = tmp_path / "mutable.txt" + file.write_text("version 1") + hash1 = converter.hash_struct_dict({"upath": str(file)}) + file.write_text("version 2") + hash2 = converter.hash_struct_dict({"upath": str(file)}) + assert hash1 != hash2 + + +def test_hash_struct_dict_missing_field(converter): + with pytest.raises(ValueError, match="Missing 'upath' field"): + converter.hash_struct_dict({}) + + +def test_upath_arrow_struct_type(converter): + """The Arrow struct type has a single 'upath' field of large_string.""" + import pyarrow as pa + + struct_type = converter.arrow_struct_type + assert isinstance(struct_type, pa.StructType) + assert len(struct_type) == 1 + assert struct_type[0].name == "upath" + assert struct_type[0].type == pa.large_string() + + +def test_path_and_upath_struct_types_differ(): + """Path and UPath converters produce distinct Arrow struct types.""" + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter + + file_hasher = BasicFileHasher(algorithm="sha256") + path_conv = PythonPathStructConverter(file_hasher=file_hasher) + upath_conv = UPathStructConverter(file_hasher=file_hasher) + + assert path_conv.arrow_struct_type != upath_conv.arrow_struct_type + assert path_conv.arrow_struct_type[0].name == "path" + assert upath_conv.arrow_struct_type[0].name == "upath" + + +def test_path_converter_rejects_upath(): + """PythonPathStructConverter rejects UPath instances to avoid ambiguity.""" + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter + + file_hasher = BasicFileHasher(algorithm="sha256") + path_conv = PythonPathStructConverter(file_hasher=file_hasher) + + upath_val = UPath("/tmp/test.txt") + with pytest.raises(TypeError, match="not UPath"): + path_conv.python_to_struct_dict(upath_val) + + +def test_path_converter_cannot_handle_upath_type(): + """PythonPathStructConverter.can_handle_python_type returns False for UPath.""" + from orcapod.semantic_types.semantic_struct_converters import PythonPathStructConverter + + file_hasher = BasicFileHasher(algorithm="sha256") + path_conv = PythonPathStructConverter(file_hasher=file_hasher) + + assert not path_conv.can_handle_python_type(UPath) + assert path_conv.can_handle_python_type(Path) diff --git a/uv.lock b/uv.lock index eeae09e0..886a1c4b 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.11.0" resolution-markers = [ "python_full_version >= '3.14' and sys_platform == 'darwin'", @@ -2028,6 +2028,7 @@ dependencies = [ { name = "starfix" }, { name = "typing-extensions" }, { name = "tzdata" }, + { name = "universal-pathlib" }, { name = "uuid-utils" }, { name = "xxhash" }, ] @@ -2096,6 +2097,7 @@ requires-dist = [ { name = "starfix", git = "https://github.com/nauticalab/starfix-python.git?rev=344617bc6f7fcabab5c011d5774ed47de33f21de" }, { name = "typing-extensions" }, { name = "tzdata", specifier = ">=2024.1" }, + { name = "universal-pathlib", specifier = ">=0.3.8" }, { name = "uuid-utils", specifier = ">=0.11.1" }, { name = "xxhash" }, ] @@ -2195,6 +2197,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" }, ] +[[package]] +name = "pathlib-abc" +version = "0.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/cb/448649d7f25d228bf0be3a04590ab7afa77f15e056f8fa976ed05ec9a78f/pathlib_abc-0.5.2.tar.gz", hash = "sha256:fcd56f147234645e2c59c7ae22808b34c364bb231f685ddd9f96885aed78a94c", size = 33342, upload-time = "2025-10-10T18:37:20.524Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b1/29/c028a0731e202035f0e2e0bfbf1a3e46ad6c628cbb17f6f1cc9eea5d9ff1/pathlib_abc-0.5.2-py3-none-any.whl", hash = "sha256:4c9d94cf1b23af417ce7c0417b43333b06a106c01000b286c99de230d95eefbb", size = 19070, upload-time = "2025-10-10T18:37:19.437Z" }, +] + [[package]] name = "pathspec" version = "1.0.4" @@ -3576,6 +3587,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" }, ] +[[package]] +name = "universal-pathlib" +version = "0.3.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fsspec" }, + { name = "pathlib-abc" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/6e/d997a70ee8f4c61f9a7e2f4f8af721cf072a3326848fc881b05187e52558/universal_pathlib-0.3.10.tar.gz", hash = "sha256:4487cbc90730a48cfb64f811d99e14b6faed6d738420cd5f93f59f48e6930bfb", size = 261110, upload-time = "2026-02-22T14:40:58.87Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/1a/5d9a402b39ec892d856bbdd9db502ff73ce28cdf4aff72eb1ce1d6843506/universal_pathlib-0.3.10-py3-none-any.whl", hash = "sha256:dfaf2fb35683d2eb1287a3ed7b215e4d6016aa6eaf339c607023d22f90821c66", size = 83528, upload-time = "2026-02-22T14:40:57.316Z" }, +] + [[package]] name = "urllib3" version = "2.4.0"