From 26c576a331ecf97ab02d94a2fba08eb43dfe026d Mon Sep 17 00:00:00 2001 From: Owen Littlejohns Date: Fri, 17 Oct 2025 13:38:19 -0400 Subject: [PATCH 1/2] DAS-2397: Add single entry point function for all file types. --- CHANGELOG.md | 8 +- README.md | 32 +++++ docs/Using_earthdata-hashdiff.ipynb | 39 ++++++ docs/requirements.txt | 2 +- earthdata_hashdiff/__about__.py | 2 +- earthdata_hashdiff/__init__.py | 2 + earthdata_hashdiff/compare.py | 86 ++++++++++++++ tests/unit/test_compare.py | 178 ++++++++++++++++++++++++++++ 8 files changed, 346 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 03f9053..5c8e982 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,13 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [vX.Y.Z] - Unreleased +## [v1.2.0] - 2025-10-10 + +### Added + +- `matches_reference_hash_file` has been added to the top-level API. This function + can receive an optional argument specifying the file type, or will try to + infer the file type from the path of the file under comparison. ### Changed diff --git a/README.md b/README.md index 5005449..100cd01 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,38 @@ assert geotiff_matches_reference_hash_file( ) ``` +### A single entry point for comparison + +For convenience, you can use the `matches_reference_hash_file` for all of the +file types previously discussed. Each call will accept the paths to the binary +file and JSON hash file, along with appropriate optional kwargs relevant to +the file type. 
+ +```python +from earthdata_hashdiff import matches_reference_hash_file + +assert matches_reference_hash_file( + 'path/to/netcdf/file.nc4', + 'path/to/json/with/hashes.json', +) + +assert matches_reference_hash_file( + 'path/to/netcdf/file.nc4', + 'path/to/json/with/hashes.json', + skipped_metadata_attributes={'attribute_name_one', 'attribute_name_two'}, +) + +assert matches_reference_hash_file( + 'path/to/geotiff/file.tif', + 'path/to/json/with/hash.json', +) + +assert matches_reference_hash_file( + 'path/to/geotiff/file.tif', + 'path/to/json/with/hash.json', + skipped_metadata_tags={'tag_name_one'}, +) +``` ## Installing ### Using pip diff --git a/docs/Using_earthdata-hashdiff.ipynb b/docs/Using_earthdata-hashdiff.ipynb index 69a8768..8f2106e 100644 --- a/docs/Using_earthdata-hashdiff.ipynb +++ b/docs/Using_earthdata-hashdiff.ipynb @@ -473,6 +473,45 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "dda44bf9-1a3e-4702-b10b-f6b401e2217a", + "metadata": {}, + "source": [ + "## A single comparison entry point\n", + "\n", + "For convenience, you can use the `matches_reference_hash_file` for all of the file types previously discussed. Each call will accept the paths to the binary file and JSON hash file, along with appropriate optional kwargs relevant to the file type." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba1196f0-1dc6-4c17-97d8-7b7b8f047c73", + "metadata": {}, + "outputs": [], + "source": [ + "from earthdata_hashdiff import matches_reference_hash_file\n", + "\n", + "# GeoTIFF example\n", + "assert matches_reference_hash_file(\n", + " ecostress_granule,\n", + " f'{ecostress_granule}.json',\n", + ")\n", + "\n", + "# HDF-5 example\n", + "assert matches_reference_hash_file(\n", + " gpm_3imerghh_granule_one,\n", + " f'{gpm_3imerghh_granule_one}.json',\n", + "), 'Binary file did not match previously generated hashes.'\n", + "\n", + "# HDF-5 example with kwargs\n", + "assert matches_reference_hash_file(\n", + " gpm_3imerghh_granule_one,\n", + " f'{gpm_3imerghh_granule_one}.decode.json',\n", + " skipped_variables_or_groups={'/Grid/time', '/Grid/time_bnds'},\n", + "), 'Binary file did not match previously generated hashes.'" + ] + }, { "cell_type": "markdown", "id": "c61a5f43-2bf2-42f6-8c39-abeef381816f", diff --git a/docs/requirements.txt b/docs/requirements.txt index 9b4729c..d330452 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ # These packages are required to run the documentation Jupyter notebook. 
-earthdata-hashdiff ~= 1.1.0 +earthdata-hashdiff ~= 1.2.0 notebook ~= 7.4.5 requests ~= 2.32.4 diff --git a/earthdata_hashdiff/__about__.py b/earthdata_hashdiff/__about__.py index b8d8324..05b583b 100644 --- a/earthdata_hashdiff/__about__.py +++ b/earthdata_hashdiff/__about__.py @@ -1,3 +1,3 @@ """Version for the package - only edit when intending to release.""" -version = '1.1.0' +version = '1.2.0' diff --git a/earthdata_hashdiff/__init__.py b/earthdata_hashdiff/__init__.py index 1386e19..b098bc9 100644 --- a/earthdata_hashdiff/__init__.py +++ b/earthdata_hashdiff/__init__.py @@ -4,6 +4,7 @@ from earthdata_hashdiff.compare import ( geotiff_matches_reference_hash_file, h5_matches_reference_hash_file, + matches_reference_hash_file, nc4_matches_reference_hash_file, ) from earthdata_hashdiff.generate import ( @@ -26,5 +27,6 @@ 'get_hashes_from_nc4_file', 'geotiff_matches_reference_hash_file', 'h5_matches_reference_hash_file', + 'matches_reference_hash_file', 'nc4_matches_reference_hash_file', ] diff --git a/earthdata_hashdiff/compare.py b/earthdata_hashdiff/compare.py index 294bdc3..6da72c4 100644 --- a/earthdata_hashdiff/compare.py +++ b/earthdata_hashdiff/compare.py @@ -6,6 +6,9 @@ """ import json +from collections.abc import Callable +from os.path import splitext +from typing import Literal, TypedDict, overload from earthdata_hashdiff.generate import ( GEOTIFF_HASH_KEY, @@ -14,6 +17,89 @@ get_hashes_from_xarray_input, ) +HashedFileTypes = Literal['GeoTIFF', 'HDF-5', 'netCDF4'] + + +class XarrayParams(TypedDict): + """Typing for unique inputs to matches_reference_hash_file_using_xarray.""" + + skipped_variables_or_groups: set[str] + skipped_metadata_attributes: set[str] + xarray_kwargs: dict + + +class GeoTIFFParams(TypedDict): + """Typing for unique inputs to geotiff_matches_reference_hash_file.""" + + skipped_metadata_tags: set[str] + + +@overload +def matches_reference_hash_file( + binary_file_path: str, + reference_file_path: str, + **kwargs: XarrayParams, +) -> 
bool: ... + + +@overload +def matches_reference_hash_file( + binary_file_path: str, + reference_file_path: str, + **kwargs: GeoTIFFParams, +) -> bool: ... + + +def matches_reference_hash_file( + binary_file_path: str, + reference_file_path: str, + **kwargs: XarrayParams | GeoTIFFParams, +) -> bool: + """Generate hashes for request output and compare to reference file. + + Possible kwargs: + + * skipped_variables_or_groups - For netCDF4 or HDF-5 files. + * skipped_metadata_attributes - For netCDF4 or HDF-5 files. + * xarray_kwargs - For netCDF4 or HDF-5 files. + * skipped_metadata_tags - For GeoTIFF files. + + """ + file_type_comparisons: dict[HashedFileTypes, Callable[..., bool]] = { + 'GeoTIFF': geotiff_matches_reference_hash_file, + 'HDF-5': h5_matches_reference_hash_file, + 'netCDF4': nc4_matches_reference_hash_file, + } + + file_type = guess_file_type(binary_file_path) + + comparison_function = file_type_comparisons.get(file_type) + + if comparison_function is None: + raise ValueError(f'file_type not recognised: {file_type}') + + return comparison_function(binary_file_path, reference_file_path, **kwargs) + + +def guess_file_type(file_path: str) -> HashedFileTypes: + """Return a file type guessed based on the extension for the file. + + If the extension is an entirely unrecognised one, a `ValueError` is raised. 
+ + """ + file_extension = splitext(file_path)[-1].lower() + + if file_extension in ['.tif', '.tiff']: + file_type: HashedFileTypes = 'GeoTIFF' + elif file_extension in ['.h5', '.hdf', '.hdf5']: + file_type = 'HDF-5' + elif file_extension in ['.nc', '.nc4']: + file_type = 'netCDF4' + else: + raise ValueError(f'File extension not recognised: "{file_extension}"') + + return file_type + def matches_reference_hash_file_using_xarray( binary_file_path: str, diff --git a/tests/unit/test_compare.py b/tests/unit/test_compare.py index 059ed3c..f2bebb4 100644 --- a/tests/unit/test_compare.py +++ b/tests/unit/test_compare.py @@ -1,13 +1,17 @@ """Unit tests for the earthdata_hashdiff.compare.py module.""" from os.path import join as path_join +from unittest.mock import patch import numpy as np +import pytest from tifffile import imwrite from earthdata_hashdiff.compare import ( geotiff_matches_reference_hash_file, + guess_file_type, h5_matches_reference_hash_file, + matches_reference_hash_file, matches_reference_hash_file_using_xarray, nc4_matches_reference_hash_file, ) @@ -148,3 +152,177 @@ def test_geotiff_matches_reference_hash_file_metadata_difference_fails( amended_geotiff_path, sample_geotiff_hash_file, ) + + +@pytest.mark.parametrize( + 'file_path,expected_file_type', + [ + ('input.tif', 'GeoTIFF'), + ('input.tiff', 'GeoTIFF'), + ('input.h5', 'HDF-5'), + ('input.hdf', 'HDF-5'), + ('input.HDF', 'HDF-5'), + ('input.hdf5', 'HDF-5'), + ('input.nc', 'netCDF4'), + ('input.nc4', 'netCDF4'), + ], +) +def test_guess_file_type_known_extension(file_path, expected_file_type): + """Ensure known paths with known extensions return the expected file type.""" + assert guess_file_type(file_path) == expected_file_type + + +def test_guess_file_type_not_recognised(): + """Ensure path with unknown extension raises a ValueError.""" + with pytest.raises(ValueError, match=r'File extension not recognised: ".xyz"'): + guess_file_type('input.xyz') + + 
+@patch('earthdata_hashdiff.compare.geotiff_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.h5_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.nc4_matches_reference_hash_file', autospec=True) +def test_matches_reference_hash_file_netcdf4( + mock_nc4_matches_reference_hash_file, + mock_h5_matches_reference_hash_file, + mock_geotiff_matches_reference_hash_file, +): + """Ensure netCDF4 input is routed to the correct comparison function.""" + mock_nc4_matches_reference_hash_file.return_value = True + + assert matches_reference_hash_file('input.nc4', 'hashes.json') + mock_nc4_matches_reference_hash_file.assert_called_once_with( + 'input.nc4', + 'hashes.json', + ) + + # Ensure other comparison functions weren't called + mock_h5_matches_reference_hash_file.assert_not_called() + mock_geotiff_matches_reference_hash_file.assert_not_called() + + +@patch('earthdata_hashdiff.compare.geotiff_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.h5_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.nc4_matches_reference_hash_file', autospec=True) +def test_matches_reference_hash_file_netcdf4_kwargs( + mock_nc4_matches_reference_hash_file, + mock_h5_matches_reference_hash_file, + mock_geotiff_matches_reference_hash_file, +): + """Ensure netCDF4 input is routed to the comparison function with kwargs.""" + mock_nc4_matches_reference_hash_file.return_value = True + + variables_to_skip = {'variable_one', 'variable_two'} + + assert matches_reference_hash_file( + 'input.nc4', + 'hashes.json', + skipped_variables_or_groups=variables_to_skip, + ) + mock_nc4_matches_reference_hash_file.assert_called_once_with( + 'input.nc4', + 'hashes.json', + skipped_variables_or_groups=variables_to_skip, + ) + + # Ensure other comparison functions weren't called + mock_h5_matches_reference_hash_file.assert_not_called() + mock_geotiff_matches_reference_hash_file.assert_not_called() + 
+ +@patch('earthdata_hashdiff.compare.geotiff_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.h5_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.nc4_matches_reference_hash_file', autospec=True) +def test_matches_reference_hash_file_hdf5( + mock_nc4_matches_reference_hash_file, + mock_h5_matches_reference_hash_file, + mock_geotiff_matches_reference_hash_file, +): + """Ensure an HDF-5 input is routed to the correct comparison function.""" + mock_h5_matches_reference_hash_file.return_value = True + + metadata_to_skip = {'varying_parameter'} + + assert matches_reference_hash_file( + 'input.h5', + 'hashes.json', + skipped_metadata_attributes=metadata_to_skip, + ) + mock_h5_matches_reference_hash_file.assert_called_once_with( + 'input.h5', + 'hashes.json', + skipped_metadata_attributes=metadata_to_skip, + ) + + # Ensure other comparison functions weren't called + mock_nc4_matches_reference_hash_file.assert_not_called() + mock_geotiff_matches_reference_hash_file.assert_not_called() + + +@patch('earthdata_hashdiff.compare.geotiff_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.h5_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.nc4_matches_reference_hash_file', autospec=True) +def test_matches_reference_hash_file_geotiff( + mock_nc4_matches_reference_hash_file, + mock_h5_matches_reference_hash_file, + mock_geotiff_matches_reference_hash_file, +): + """Ensure a GeoTIFF input is routed to the correct comparison function.""" + mock_geotiff_matches_reference_hash_file.return_value = True + + assert matches_reference_hash_file('input.tiff', 'hashes.json') + + mock_geotiff_matches_reference_hash_file.assert_called_once_with( + 'input.tiff', + 'hashes.json', + ) + + # Ensure other comparison functions weren't called + mock_nc4_matches_reference_hash_file.assert_not_called() + mock_h5_matches_reference_hash_file.assert_not_called() + + +def 
test_matches_reference_hash_file_unknown_kwargs(): + """Ensure input with an unknown kwarg raises a TypeError.""" + with pytest.raises( + TypeError, match=r'got an unexpected keyword argument \'unknown_kwarg\'' + ): + matches_reference_hash_file( + 'input.nc4', + 'hashes.json', + unknown_kwarg='whatami', + ) + + +def test_matches_reference_hash_file_wrong_kwargs(): + """Ensure netCDF4 input with a GeoTIFF kwarg raises a TypeError. + + This test ensures that, even though the function signature is overloaded, + mixing kwargs from one comparison when trying to execute another will fail. + + """ + with pytest.raises( + TypeError, match=r'got an unexpected keyword argument \'skipped_metadata_tags\'' + ): + matches_reference_hash_file( + 'input.nc4', + 'hashes.json', + skipped_metadata_tags={'skipped'}, + ) + + +@patch('earthdata_hashdiff.compare.geotiff_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.h5_matches_reference_hash_file', autospec=True) +@patch('earthdata_hashdiff.compare.nc4_matches_reference_hash_file', autospec=True) +def test_matches_reference_hash_file_unknown_file_extension( + mock_nc4_matches_reference_hash_file, + mock_h5_matches_reference_hash_file, + mock_geotiff_matches_reference_hash_file, +): + """Ensure that a file with an unknown extension raises a ValueError.""" + with pytest.raises(ValueError, match=r'File extension not recognised: ".xyz"'): + matches_reference_hash_file('input.xyz', 'hashes.json') + + # Ensure other comparison functions weren't called + mock_nc4_matches_reference_hash_file.assert_not_called() + mock_h5_matches_reference_hash_file.assert_not_called() + mock_geotiff_matches_reference_hash_file.assert_not_called() From 4271436e6964ac8b98a9b1833e2b39765469c3e3 Mon Sep 17 00:00:00 2001 From: Owen Littlejohns Date: Fri, 17 Oct 2025 16:17:38 -0400 Subject: [PATCH 2/2] DAS-2397: Fix changelog. 
--- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c8e982..b07dcfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - `matches_reference_hash_file` has been added to the top-level API. This function - can receive an optional argument specifying the file type, or will try to - infer the file type from the path of the file under comparison. + will try to infer the file type from the path of the file under comparison and + then use the relevant comparison functionality for HDF-5, netCDF4 or GeoTIFF + files. ### Changed