Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [vX.Y.Z] - Unreleased
## [v1.2.0] - 2025-10-10

### Added

- `matches_reference_hash_file` has been added to the top-level API. This function
will try to infer the file type from the path of the file under comparison and
then use the relevant comparison functionality for HDF-5, netCDF4 or GeoTIFF
files.

### Changed

Expand Down
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,38 @@ assert geotiff_matches_reference_hash_file(
)
```

### A single entry point for comparison

For convenience, you can use the `matches_reference_hash_file` function for all
of the file types previously discussed. Each call will accept the paths to the
binary file and JSON hash file, along with appropriate optional kwargs relevant
to the file type.

```python
from earthdata_hashdiff import matches_reference_hash_file

assert matches_reference_hash_file(
'path/to/netcdf/file.nc4',
'path/to/json/with/hashes.json',
)

assert matches_reference_hash_file(
'path/to/netcdf/file.nc4',
'path/to/json/with/hashes.json',
skipped_metadata_attributes={'attribute_name_one', 'attribute_name_two'},
)

assert matches_reference_hash_file(
    'path/to/geotiff/file.tif',
    'path/to/json/with/hash.json',
)

assert matches_reference_hash_file(
    'path/to/geotiff/file.tif',
    'path/to/json/with/hash.json',
    skipped_metadata_tags={'tag_name_one'},
)
```

## Installing

### Using pip
Expand Down
39 changes: 39 additions & 0 deletions docs/Using_earthdata-hashdiff.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,45 @@
")"
]
},
{
"cell_type": "markdown",
"id": "dda44bf9-1a3e-4702-b10b-f6b401e2217a",
"metadata": {},
"source": [
"## A single comparison entry point\n",
"\n",
    "For convenience, you can use the `matches_reference_hash_file` function for all of the file types previously discussed. Each call will accept the paths to the binary file and JSON hash file, along with appropriate optional kwargs relevant to the file type."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba1196f0-1dc6-4c17-97d8-7b7b8f047c73",
"metadata": {},
"outputs": [],
"source": [
"from earthdata_hashdiff import matches_reference_hash_file\n",
"\n",
"# GeoTIFF example\n",
"assert matches_reference_hash_file(\n",
" ecostress_granule,\n",
" f'{ecostress_granule}.json',\n",
")\n",
"\n",
"# HDF-5 example\n",
"assert matches_reference_hash_file(\n",
" gpm_3imerghh_granule_one,\n",
" f'{gpm_3imerghh_granule_one}.json',\n",
"), 'Binary file did not match previously generated hashes.'\n",
"\n",
"# HDF-5 example with kwargs\n",
"assert matches_reference_hash_file(\n",
" gpm_3imerghh_granule_one,\n",
" f'{gpm_3imerghh_granule_one}.decode.json',\n",
" skipped_variables_or_groups={'/Grid/time', '/Grid/time_bnds'},\n",
"), 'Binary file did not match previously generated hashes.'"
]
},
{
"cell_type": "markdown",
"id": "c61a5f43-2bf2-42f6-8c39-abeef381816f",
Expand Down
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# These packages are required to run the documentation Jupyter notebook.
earthdata-hashdiff ~= 1.1.0
earthdata-hashdiff ~= 1.2.0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is in anticipation of the release from this PR merging.

notebook ~= 7.4.5
requests ~= 2.32.4
2 changes: 1 addition & 1 deletion earthdata_hashdiff/__about__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Version for the package - only edit when intending to release."""

version = '1.1.0'
version = '1.2.0'
2 changes: 2 additions & 0 deletions earthdata_hashdiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from earthdata_hashdiff.compare import (
geotiff_matches_reference_hash_file,
h5_matches_reference_hash_file,
matches_reference_hash_file,
nc4_matches_reference_hash_file,
)
from earthdata_hashdiff.generate import (
Expand All @@ -26,5 +27,6 @@
'get_hashes_from_nc4_file',
'geotiff_matches_reference_hash_file',
'h5_matches_reference_hash_file',
'matches_reference_hash_file',
'nc4_matches_reference_hash_file',
]
86 changes: 86 additions & 0 deletions earthdata_hashdiff/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
"""

import json
from collections.abc import Callable
from os.path import splitext
from typing import Literal, TypedDict, overload

from earthdata_hashdiff.generate import (
GEOTIFF_HASH_KEY,
Expand All @@ -14,6 +17,89 @@
get_hashes_from_xarray_input,
)

HashedFileTypes = Literal['GeoTIFF', 'HDF-5', 'netCDF4']


class XarrayParams(TypedDict, total=False):
    """Typing for unique inputs to matches_reference_hash_file_using_xarray.

    `total=False` marks every key optional, matching how callers of
    `matches_reference_hash_file` supply only the kwargs they need for
    netCDF4 or HDF-5 comparisons.

    """

    skipped_variables_or_groups: set[str]
    skipped_metadata_attributes: set[str]
    xarray_kwargs: dict


class GeoTIFFParams(TypedDict, total=False):
    """Typing for unique inputs to geotiff_matches_reference_hash_file.

    `total=False` marks the key optional, matching how callers of
    `matches_reference_hash_file` supply kwargs only when needed.

    """

    skipped_metadata_tags: set[str]


# Overload: netCDF4/HDF-5 comparison kwargs.
# NOTE(review): annotating `**kwargs` directly with a TypedDict types each
# keyword VALUE as XarrayParams; the intent appears to be
# `**kwargs: Unpack[XarrayParams]` (PEP 692, typing.Unpack) — confirm the
# minimum supported Python version before changing.
@overload
def matches_reference_hash_file(
    binary_file_path: str,
    reference_file_path: str,
    **kwargs: XarrayParams,
) -> bool: ...


# Overload: GeoTIFF comparison kwargs.
# NOTE(review): same as the xarray overload — `**kwargs: GeoTIFFParams` likely
# intends `Unpack[GeoTIFFParams]` (PEP 692); confirm before changing.
@overload
def matches_reference_hash_file(
    binary_file_path: str,
    reference_file_path: str,
    **kwargs: GeoTIFFParams,
) -> bool: ...


def matches_reference_hash_file(
    binary_file_path: str,
    reference_file_path: str,
    **kwargs: XarrayParams | GeoTIFFParams,
) -> bool:
    """Generate hashes for request output and compare to reference file.

    The file type is inferred from the extension of `binary_file_path` (see
    `guess_file_type`) and the matching comparison function for GeoTIFF,
    HDF-5 or netCDF4 files is invoked.

    Possible kwargs:

    * skipped_variables_or_groups - For netCDF4 or HDF-5 files.
    * skipped_metadata_attributes - For netCDF4 or HDF-5 files.
    * xarray_kwargs - For netCDF4 or HDF-5 files.
    * skipped_metadata_tags - For GeoTIFF files.

    Raises:
        ValueError: If the file extension is unrecognised (raised by
            `guess_file_type`), or if no comparison function is registered
            for the guessed file type.

    """
    file_type_comparisons: dict[HashedFileTypes, Callable[..., bool]] = {
        'GeoTIFF': geotiff_matches_reference_hash_file,
        'HDF-5': h5_matches_reference_hash_file,
        'netCDF4': nc4_matches_reference_hash_file,
    }

    file_type = guess_file_type(binary_file_path)

    comparison_function = file_type_comparisons.get(file_type)

    if comparison_function is None:
        # Defensive check: guess_file_type should only return keys present
        # in file_type_comparisons. Bug fix: the original message lacked the
        # f-string prefix, so "{file_type}" was never interpolated.
        raise ValueError(f'file_type not recognised: {file_type}')

    return comparison_function(binary_file_path, reference_file_path, **kwargs)


def guess_file_type(file_path: str) -> HashedFileTypes:
    """Infer the hashed file type from the extension of `file_path`.

    Extensions are matched case-insensitively against a lookup table; a
    `ValueError` is raised for any extension not in the table.

    """
    # Lookup table replacing an if/elif chain over extension groups.
    extension_to_type: dict[str, HashedFileTypes] = {
        '.tif': 'GeoTIFF',
        '.tiff': 'GeoTIFF',
        '.h5': 'HDF-5',
        '.hdf': 'HDF-5',
        '.hdf5': 'HDF-5',
        '.nc': 'netCDF4',
        '.nc4': 'netCDF4',
    }

    file_extension = splitext(file_path)[-1].lower()
    guessed_type = extension_to_type.get(file_extension)

    if guessed_type is None:
        raise ValueError(f'File extension not recognised: "{file_extension}"')

    return guessed_type


def matches_reference_hash_file_using_xarray(
binary_file_path: str,
Expand Down
Loading