From 0122fd4158882841dcce2bcdb3f7dd3621f51b32 Mon Sep 17 00:00:00 2001 From: Paul Haesler Date: Wed, 20 Dec 2023 14:19:20 +1100 Subject: [PATCH] Cherry picked PRs from develop (1.8) (#1524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update whats_new.rst for 1.8.17 release. (#1510) * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/adrienverge/yamllint.git: v1.32.0 → v1.33.0](https://github.com/adrienverge/yamllint.git/compare/v1.32.0...v1.33.0) * Bump conda-incubator/setup-miniconda from 2 to 3 Bumps [conda-incubator/setup-miniconda](https://github.com/conda-incubator/setup-miniconda) from 2 to 3. - [Release notes](https://github.com/conda-incubator/setup-miniconda/releases) - [Changelog](https://github.com/conda-incubator/setup-miniconda/blob/main/CHANGELOG.md) - [Commits](https://github.com/conda-incubator/setup-miniconda/compare/v2...v3) --- updated-dependencies: - dependency-name: conda-incubator/setup-miniconda dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] * Dataset CLI tool to find duplicates (#1517) * add dataset cli tool to find duplicates * convert timestamps to UTC * update whats_new * update memory driver search duplicates implementation --------- Co-authored-by: Ariana Barzinpour * Make solar_day() timezone aware. (#1521) * Warn if non-eo3 dataset has eo3 metadata type (#1523) * warn if non-eo3 dataset has eo3 metadata type * fix str contains * add test --------- Co-authored-by: Ariana Barzinpour * Fix merge oops * Resolve some merge issues arising from differences between SQLAlchemy 1.4 and 2.0. --------- Signed-off-by: dependabot[bot] Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ariana-B <40238244+ariana-b@users.noreply.github.com> Co-authored-by: Ariana Barzinpour --- .github/workflows/test-conda-build.yml | 2 +- .pre-commit-config.yaml | 2 +- datacube/api/query.py | 2 +- datacube/drivers/postgis/_api.py | 55 +++++++++++++++++- datacube/drivers/postgis/_fields.py | 15 ++++- datacube/drivers/postgres/_api.py | 60 +++++++++++++++++++- datacube/drivers/postgres/_fields.py | 15 ++++- datacube/index/hl.py | 15 +++++ datacube/index/memory/_datasets.py | 20 ++++--- datacube/index/postgis/_datasets.py | 9 ++- datacube/index/postgres/_datasets.py | 9 ++- datacube/scripts/dataset.py | 42 ++++++++++++++ docs/about/whats_new.rst | 9 +++ integration_tests/index/test_memory_index.py | 4 +- integration_tests/index/test_search_eo3.py | 30 +++++++++- integration_tests/test_cli_output.py | 26 +++++++++ integration_tests/test_dataset_add.py | 26 +++++++++ tests/api/test_query.py | 7 +++ 18 files changed, 316 insertions(+), 32 deletions(-) diff --git a/.github/workflows/test-conda-build.yml b/.github/workflows/test-conda-build.yml index 3ec372b21..abe41e2e1 100644 --- a/.github/workflows/test-conda-build.yml +++ b/.github/workflows/test-conda-build.yml @@ -30,7 +30,7 @@ jobs: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda-environment.yml') }} - - uses: conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v3 with: environment-file: conda-environment.yml auto-update-conda: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ba8832218..4f102e6d3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: 
https://github.com/adrienverge/yamllint.git - rev: v1.32.0 + rev: v1.33.0 hooks: - id: yamllint - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/datacube/api/query.py b/datacube/api/query.py index 78559b859..a3e8cf812 100644 --- a/datacube/api/query.py +++ b/datacube/api/query.py @@ -389,7 +389,7 @@ def solar_day(dataset: Dataset, longitude: Optional[float] = None) -> np.datetim :param longitude: If supplied correct timestamp for this longitude, rather than mid-point of the Dataset's footprint """ - utc = dataset.center_time + utc = dataset.center_time.astimezone(datetime.timezone.utc) if longitude is None: _lon = _ds_mid_longitude(dataset) diff --git a/datacube/drivers/postgis/_api.py b/datacube/drivers/postgis/_api.py index 2e91f80a3..a910f3fc2 100644 --- a/datacube/drivers/postgis/_api.py +++ b/datacube/drivers/postgis/_api.py @@ -740,11 +740,14 @@ def search_unique_datasets(self, expressions, select_fields=None, limit=None): def get_duplicates(self, match_fields: Sequence[PgField], expressions: Sequence[PgExpression]) -> Iterable[tuple]: # TODO + if "time" in [f.name for f in match_fields]: + return self.get_duplicates_with_time(match_fields, expressions) + group_expressions = tuple(f.alchemy_expression for f in match_fields) join_tables = PostgisDbAPI._join_tables(expressions, match_fields) query = select( - func.array_agg(Dataset.id), + func.array_agg(Dataset.id).label("ids"), *group_expressions ).select_from(Dataset) for joins in join_tables: @@ -759,6 +762,56 @@ def get_duplicates(self, match_fields: Sequence[PgField], expressions: Sequence[ ) return self._connection.execute(query) + def get_duplicates_with_time( + self, match_fields: Sequence[PgField], expressions: Sequence[PgExpression] + ) -> Iterable[tuple]: + fields = [] + for f in match_fields: + if f.name == "time": + time_field = f.expression_with_leniency + else: + fields.append(f.alchemy_expression) + + join_tables = PostgisDbAPI._join_tables(expressions, match_fields) + + cols = [Dataset.id, time_field.label('time'), *fields] + query = select( + *cols + ).select_from(Dataset) + for joins in join_tables: + query = query.join(*joins) + + query = query.where( + and_(Dataset.archived == None, *(PostgisDbAPI._alchemify_expressions(expressions))) + ) + + t1 = query.alias("t1") + t2 = query.alias("t2") + + time_overlap = select( + t1.c.id, + text("t1.time * t2.time as time_intersect"), + *fields + ).select_from( + t1.join( + t2, + and_(t1.c.time.overlaps(t2.c.time), t1.c.id != t2.c.id) + ) + ) + + query = select( + func.array_agg(func.distinct(time_overlap.c.id)).label("ids"), + *fields, + text("(lower(time_intersect) at time zone 'UTC', upper(time_intersect) at time zone 'UTC') as time") + ).select_from( + time_overlap + ).group_by( + *fields, text("time_intersect") + ).having( + func.count(time_overlap.c.id) > 1 + ) + return self._connection.execute(query) + def count_datasets(self, expressions): """ :type expressions: tuple[datacube.drivers.postgis._fields.PgExpression] diff --git a/datacube/drivers/postgis/_fields.py b/datacube/drivers/postgis/_fields.py index 3db4bacb7..35ec5bdcb 100755 --- a/datacube/drivers/postgis/_fields.py +++ b/datacube/drivers/postgis/_fields.py @@ -16,6 +16,7 @@ from sqlalchemy import cast, func, and_ from sqlalchemy.dialects import postgresql as postgres from sqlalchemy.dialects.postgresql import NUMRANGE, TSTZRANGE +from sqlalchemy.dialects.postgresql import INTERVAL from sqlalchemy.sql import ColumnElement from sqlalchemy.orm import aliased @@ -223,7 +224,7 @@ def 
__init__(self, name, description, alchemy_column, indexed, offset=None, sele @property def alchemy_expression(self): - return self._alchemy_offset_value(self.offset, self.aggregation.pg_calc) + return self._alchemy_offset_value(self.offset, self.aggregation.pg_calc).label(self.name) def __eq__(self, value): """ @@ -362,7 +363,7 @@ def value_to_alchemy(self, value): @property def alchemy_expression(self): - return self.value_to_alchemy((self.lower.alchemy_expression, self.greater.alchemy_expression)) + return self.value_to_alchemy((self.lower.alchemy_expression, self.greater.alchemy_expression)).label(self.name) def __eq__(self, value): """ @@ -441,6 +442,16 @@ def between(self, low, high): raise ValueError("Unknown comparison type for date range: " "expecting datetimes, got: (%r, %r)" % (low, high)) + @property + def expression_with_leniency(self): + return func.tstzrange( + self.lower.alchemy_expression - cast('500 milliseconds', INTERVAL), + self.greater.alchemy_expression + cast('500 milliseconds', INTERVAL), + # Inclusive on both sides. + '[]', + type_=TSTZRANGE, + ) + def _number_implies_year(v: Union[int, datetime]) -> datetime: """ diff --git a/datacube/drivers/postgres/_api.py b/datacube/drivers/postgres/_api.py index bd46ed2b2..9267dbc72 100644 --- a/datacube/drivers/postgres/_api.py +++ b/datacube/drivers/postgres/_api.py @@ -775,10 +775,13 @@ def search_unique_datasets(self, expressions, select_fields=None, limit=None): return self._connection.execute(select_query) def get_duplicates(self, match_fields: Iterable[PgField], expressions: Iterable[PgExpression]) -> Iterable[Tuple]: + if "time" in [f.name for f in match_fields]: + return self.get_duplicates_with_time(match_fields, expressions) + group_expressions = tuple(f.alchemy_expression for f in match_fields) select_query = select( - func.array_agg(DATASET.c.id), + func.array_agg(DATASET.c.id).label('ids'), *group_expressions ).select_from( PostgresDbAPI._from_expression(DATASET, expressions, match_fields) @@ -791,6 +794,61 @@ def get_duplicates(self, match_fields: Iterable[PgField], expressions: Iterable[ ) return self._connection.execute(select_query) + def get_duplicates_with_time( + self, match_fields: Iterable[PgField], expressions: Iterable[PgExpression] + ) -> Iterable[Tuple]: + """ + If considering time when searching for duplicates, we need to grant some amount of leniency + in case timestamps are not exactly the same. + From the set of datasets that are active and have the correct product (candidates), + find all those whose extended timestamp range overlap (overlapping), + then group them by the other fields. 
+ """ + fields = [] + for f in match_fields: + if f.name == "time": + time_field = f.expression_with_leniency + else: + fields.append(f.alchemy_expression) + + candidates_table = select( + DATASET.c.id, + time_field.label('time'), + *fields + ).select_from( + PostgresDbAPI._from_expression(DATASET, expressions, match_fields) + ).where( + and_(DATASET.c.archived == None, *(PostgresDbAPI._alchemify_expressions(expressions))) + ) + + t1 = candidates_table.alias("t1") + t2 = candidates_table.alias("t2") + + overlapping = select( + t1.c.id, + text("t1.time * t2.time as time_intersect"), + *fields + ).select_from( + t1.join( + t2, + and_(t1.c.time.overlaps(t2.c.time), t1.c.id != t2.c.id) + ) + ) + + final_query = select( + func.array_agg(func.distinct(overlapping.c.id)).label("ids"), + *fields, + text("(lower(time_intersect) at time zone 'UTC', upper(time_intersect) at time zone 'UTC') as time") + ).select_from( + overlapping + ).group_by( + *fields, text("time_intersect") + ).having( + func.count(overlapping.c.id) > 1 + ) + + return self._connection.execute(final_query) + def count_datasets(self, expressions): """ :type expressions: tuple[datacube.drivers.postgres._fields.PgExpression] diff --git a/datacube/drivers/postgres/_fields.py b/datacube/drivers/postgres/_fields.py index fc3023b05..c6a98470f 100755 --- a/datacube/drivers/postgres/_fields.py +++ b/datacube/drivers/postgres/_fields.py @@ -15,6 +15,7 @@ from sqlalchemy.dialects import postgresql as postgres from sqlalchemy.dialects.postgresql import INT4RANGE from sqlalchemy.dialects.postgresql import NUMRANGE, TSTZRANGE +from sqlalchemy.dialects.postgresql import INTERVAL from sqlalchemy.sql import ColumnElement from datacube import utils @@ -193,7 +194,7 @@ def __init__(self, name, description, alchemy_column, indexed, offset=None, sele @property def alchemy_expression(self): - return self._alchemy_offset_value(self.offset, self.aggregation.pg_calc) + return self._alchemy_offset_value(self.offset, self.aggregation.pg_calc).label(self.name) def __eq__(self, value): """ @@ -322,7 +323,7 @@ def postgres_index_type(self): @property def alchemy_expression(self): - return self.value_to_alchemy((self.lower.alchemy_expression, self.greater.alchemy_expression)) + return self.value_to_alchemy((self.lower.alchemy_expression, self.greater.alchemy_expression)).label(self.name) def __eq__(self, value): """ @@ -431,6 +432,16 @@ def between(self, low, high): raise ValueError("Unknown comparison type for date range: " "expecting datetimes, got: (%r, %r)" % (low, high)) + @property + def expression_with_leniency(self): + return func.tstzrange( + self.lower.alchemy_expression - cast('500 milliseconds', INTERVAL), + self.greater.alchemy_expression + cast('500 milliseconds', INTERVAL), + # Inclusive on both sides. 
+ '[]', + type_=TSTZRANGE, + ) + def _number_implies_year(v: Union[int, datetime]) -> datetime: """ diff --git a/datacube/index/hl.py b/datacube/index/hl.py index 2dd945948..8403dc7ae 100644 --- a/datacube/index/hl.py +++ b/datacube/index/hl.py @@ -5,6 +5,8 @@ """ High level indexing operations/utilities """ +import logging + import json import toolz from uuid import UUID @@ -18,6 +20,8 @@ from datacube.model import LineageDirection from .eo3 import prep_eo3, is_doc_eo3, is_doc_geo # type: ignore[attr-defined] +_LOG = logging.getLogger(__name__) + class ProductRule: def __init__(self, product: Product, signature: Mapping[str, Any]): @@ -145,6 +149,14 @@ def render_diff(offset, a, b): ] +def check_intended_eo3(ds: SimpleDocNav, product: Product) -> None: + # warn if it looks like dataset was meant to be eo3 but is not + if not is_doc_eo3(ds.doc) and ("eo3" in product.metadata_type.name): + _LOG.warning(f"Dataset {ds.id} has a product with an eo3 metadata type, " + "but the dataset definition does not include the $schema field " + "and so will not be recognised as an eo3 dataset.") + + def resolve_no_lineage(ds: SimpleDocNav, uri: str, matcher: ProductMatcher, @@ -156,6 +168,7 @@ def resolve_no_lineage(ds: SimpleDocNav, product = matcher(doc) except BadMatch as e: return None, e + check_intended_eo3(ds, product) return Dataset(product, doc, uris=[uri], sources={}), None @@ -190,6 +203,7 @@ def resolve_with_lineage(doc: SimpleDocNav, uri: str, matcher: ProductMatcher, if source_tree.direction == LineageDirection.DERIVED: raise ValueError("source_tree cannot be a derived tree.") source_tree = source_tree.find_subtree(uuid_) + check_intended_eo3(doc, product) return Dataset(product, doc.doc, source_tree=source_tree, @@ -268,6 +282,7 @@ def resolve_ds(ds: SimpleDocNav, else: product = matcher(doc) + check_intended_eo3(ds, product) return with_cache(Dataset(product, doc, uris=uris, sources=sources), ds.id, cache) try: return remap_lineage_doc(main_ds, resolve_ds, cache={}), None diff --git a/datacube/index/memory/_datasets.py b/datacube/index/memory/_datasets.py index ebc1e907d..67a2a969f 100755 --- a/datacube/index/memory/_datasets.py +++ b/datacube/index/memory/_datasets.py @@ -133,10 +133,11 @@ def search_product_duplicates(self, product: Product, *args: Union[str, Field] ) -> Iterable[Tuple[Tuple, Iterable[UUID]]]: - field_names: List[str] = [arg.name if isinstance(arg, Field) else arg for arg in args] - # Typing note: mypy cannot handle dynamically created namedtuples - GroupedVals = namedtuple('search_result', field_names) # type: ignore[misc] - + """ + Find dataset ids of a given product that have duplicates of the given set of field names. + Returns each set of those field values and the datasets that have them. + Note that this implementation does not account for slight timestamp discrepancies. 
+ """ def to_field(f: Union[str, Field]) -> Field: if isinstance(f, str): f = product.metadata_type.dataset_fields[f] @@ -144,6 +145,8 @@ def to_field(f: Union[str, Field]) -> Field: return f fields = [to_field(f) for f in args] + # Typing note: mypy cannot handle dynamically created namedtuples + GroupedVals = namedtuple('search_result', list(f.name for f in fields)) # type: ignore[misc] def values(ds: Dataset) -> GroupedVals: vals = [] @@ -151,16 +154,17 @@ def values(ds: Dataset) -> GroupedVals: vals.append(field.extract(ds.metadata_doc)) # type: ignore[attr-defined] return GroupedVals(*vals) - dups: Dict[Tuple, List[UUID]] = {} + dups: Dict[Tuple, set[UUID]] = {} for ds in self.active_by_id.values(): if ds.product.name != product.name: continue vals = values(ds) if vals in dups: - dups[vals].append(ds.id) + dups[vals].add(ds.id) else: - dups[vals] = [ds.id] - return list(dups.items()) + dups[vals] = set([ds.id]) # avoid duplicate entries + # only return entries with more than one dataset + return list({k: v for k, v in dups.items() if len(v) > 1}) def can_update(self, dataset: Dataset, diff --git a/datacube/index/postgis/_datasets.py b/datacube/index/postgis/_datasets.py index 4ea756635..d3c54ed6a 100755 --- a/datacube/index/postgis/_datasets.py +++ b/datacube/index/postgis/_datasets.py @@ -270,15 +270,14 @@ def load_field(f: Union[str, fields.Field]) -> fields.Field: return f group_fields: List[fields.Field] = [load_field(f) for f in args] - result_type = namedtuple('search_result', list(f.name for f in group_fields)) # type: ignore - expressions = [product.metadata_type.dataset_fields.get('product') == product.name] with self._db_connection() as connection: for record in connection.get_duplicates(group_fields, expressions): - dataset_ids = set(record[0]) - grouped_fields = tuple(record[1:]) - yield result_type(*grouped_fields), dataset_ids + as_dict = record._asdict() + if 'ids' in as_dict.keys(): + ids = as_dict.pop('ids') + yield namedtuple('search_result', as_dict.keys())(**as_dict), set(ids) def can_update(self, dataset, updates_allowed=None): """ diff --git a/datacube/index/postgres/_datasets.py b/datacube/index/postgres/_datasets.py index 8f22536ba..006965cc9 100755 --- a/datacube/index/postgres/_datasets.py +++ b/datacube/index/postgres/_datasets.py @@ -245,15 +245,14 @@ def load_field(f: Union[str, fields.Field]) -> fields.Field: return f group_fields: List[fields.Field] = [load_field(f) for f in args] - result_type = namedtuple('search_result', list(f.name for f in group_fields)) # type: ignore - expressions = [product.metadata_type.dataset_fields.get('product') == product.name] with self._db_connection() as connection: for record in connection.get_duplicates(group_fields, expressions): - dataset_ids = set(record[0]) - grouped_fields = tuple(record[1:]) - yield result_type(*grouped_fields), dataset_ids + as_dict = record._asdict() + if "ids" in as_dict.keys(): + ids = as_dict.pop('ids') + yield namedtuple('search_result', as_dict.keys())(**as_dict), set(ids) def can_update(self, dataset, updates_allowed=None): """ diff --git a/datacube/scripts/dataset.py b/datacube/scripts/dataset.py index fe2e71632..184f575bc 100644 --- a/datacube/scripts/dataset.py +++ b/datacube/scripts/dataset.py @@ -648,3 +648,45 @@ def purge_cmd(index: Index, dry_run: bool, all_ds: bool, ids: List[str]): click.echo(f'{len(datasets_for_archive)} datasets not purged (dry run)') click.echo('Completed dataset purge.') + + +@dataset_cmd.command('find-duplicates', help="Search for duplicate indexed 
datasets") +@click.option('--product', '-p', 'product_names', + help=("Only search within product(s) specified with this option. " + "You can supply several by repeating this option with a new product name."), + multiple=True) +@click.argument('fields', nargs=-1) +@ui.pass_index() +def find_duplicates(index: Index, product_names, fields): + """ + Find dataset ids of two or more active datasets that have duplicate values in the specified fields. + If products are specified, search only within those products. Otherwise, search within any products that + have the fields. + """ + if not fields: + click.echo('Error: must provide field names to match on\n') + sys.exit(1) + + # if no products were specified, use whichever ones have the specified search fields + # if products were specified, check they all have the required fields + products_with_fields = list(index.products.get_with_fields(fields)) + if not products_with_fields: + click.echo(f'Error: no products found with fields {", ".join(fields)}\n') + sys.exit(1) + if not list(product_names): + products = products_with_fields + else: + products = [index.products.get_by_name(name) for name in product_names] + products_without_fields = set(products).difference(set(products_with_fields)) + if len(products_without_fields): + click.echo(f'Error: specified products {", ".join(p.name for p in products_without_fields)} ' + 'do not contain all required fields\n') + sys.exit(1) + + dupes = [] + for product in products: + dupes.extend(list(index.datasets.search_product_duplicates(product, *fields))) + if len(dupes): + print(dupes) + else: + click.echo('No potential duplicates found.') diff --git a/docs/about/whats_new.rst b/docs/about/whats_new.rst index 0e19faf7d..3b8718bc1 100644 --- a/docs/about/whats_new.rst +++ b/docs/about/whats_new.rst @@ -25,11 +25,20 @@ v1.9.next - Increase minimum Python version to 3.10 (:pull:`1509`) - Virtual product tests using odc-geo GridSpec (:pull:`1512`) - New Configuration API, as per ODC-EP10 (:pull:`1505`) +- Alembic migrations for postgis driver (:pull:`1520`) v1.8.next ========= +- Add dataset cli tool ``find-duplicates`` to identify duplicate indexed datasets (:pull:`1517`) +- Make solar_day() timezone aware (:pull:`1521`) +- Warn if non-eo3 dataset has eo3 metadata type (:pull:`1523`) + +v1.8.17 (8th November 2023) +=========================== - Fix schema creation with postgres driver when initialising system with ``--no-init-users`` (:pull:`1504`) +- Switch to new jsonschema 'referencing' API and repin jsonschema to >=4.18 (:pull:`1477`) +- Update whats_new.rst for release (:pull:`1510`) v1.8.16 (17th October 2023) =========================== diff --git a/integration_tests/index/test_memory_index.py b/integration_tests/index/test_memory_index.py index f691b111f..f35e8efc2 100644 --- a/integration_tests/index/test_memory_index.py +++ b/integration_tests/index/test_memory_index.py @@ -204,9 +204,7 @@ def test_mem_ds_search_dups(mem_eo3_data: ODCEnvironment): dc, ls8_id, wo_id = mem_eo3_data ls8_ds = dc.index.datasets.get(ls8_id) dup_results = dc.index.datasets.search_product_duplicates(ls8_ds.product, "cloud_cover", "dataset_maturity") - assert len(dup_results) == 1 - assert dup_results[0][0].cloud_cover == ls8_ds.metadata.cloud_cover - assert ls8_id in dup_results[0][1] + assert len(dup_results) == 0 def test_mem_ds_locations(mem_eo3_data: ODCEnvironment): diff --git a/integration_tests/index/test_search_eo3.py b/integration_tests/index/test_search_eo3.py index 6813c0711..fe0a758ee 100644 --- 
a/integration_tests/index/test_search_eo3.py +++ b/integration_tests/index/test_search_eo3.py @@ -7,6 +7,7 @@ """ import datetime from typing import Any +from collections import namedtuple import pytest import yaml @@ -777,13 +778,14 @@ def test_find_duplicates_eo3(index, assert len(all_datasets) == 5 # First two ls8 datasets have the same path/row, last two have a different row. + dupe_fields = namedtuple('search_result', ['region_code', 'dataset_maturity']) expected_ls8_path_row_duplicates = [ ( - ('090086', 'final'), + dupe_fields('090086', 'final'), {ls8_eo3_dataset.id, ls8_eo3_dataset2.id} ), ( - ('101077', 'final'), + dupe_fields('101077', 'final'), {ls8_eo3_dataset3.id, ls8_eo3_dataset4.id} ), @@ -811,6 +813,30 @@ def test_find_duplicates_eo3(index, assert sat_res == [] +def test_find_duplicates_with_time(index, nrt_dataset, final_dataset, ls8_eo3_dataset): + index.datasets.add(nrt_dataset, with_lineage=False) + index.datasets.add(final_dataset, with_lineage=False) + index.datasets.get(nrt_dataset.id).is_active + index.datasets.get(final_dataset.id).is_active + + all_datasets = index.datasets.search_eager() + assert len(all_datasets) == 3 + + dupe_fields = namedtuple('search_result', ['region_code', 'time']) + expected_result = [ + ( + dupe_fields('090086', '("2023-04-30 23:50:33.884549","2023-04-30 23:50:34.884549")'), + {nrt_dataset.id, final_dataset.id} + ) + ] + res = sorted(index.datasets.search_product_duplicates( + nrt_dataset.product, + 'region_code', 'time', + )) + + assert res == expected_result + + def test_csv_search_via_cli_eo3(clirunner: Any, ls8_eo3_dataset: Dataset, ls8_eo3_dataset2: Dataset) -> None: diff --git a/integration_tests/test_cli_output.py b/integration_tests/test_cli_output.py index be2726720..7c53fa899 100644 --- a/integration_tests/test_cli_output.py +++ b/integration_tests/test_cli_output.py @@ -83,6 +83,32 @@ def test_cli_dataset_subcommand(index, clirunner, for path in eo3_dataset_paths: result = clirunner(['dataset', 'add', "--ignore-lineage", path]) + runner = clirunner(['dataset', 'find-duplicates'], verbose_flag=False, expect_success=False) + assert "Error: must provide field names to match on" in runner.output + assert runner.exit_code == 1 + + runner = clirunner(['dataset', 'find-duplicates', 'region_code', 'fake_field'], + verbose_flag=False, expect_success=False) + assert "Error: no products found with fields region_code, fake_field" in runner.output + assert runner.exit_code == 1 + + runner = clirunner(['dataset', 'find-duplicates', 'region_code', 'landsat_scene_id', + '-p', 'ga_ls8c_ard_3', '-p', 'ga_ls_wo_3'], + verbose_flag=False, + expect_success=False) + assert "Error: specified products ga_ls_wo_3 do not contain all required fields" + assert runner.exit_code == 1 + + runner = clirunner(['dataset', 'find-duplicates', 'region_code', 'uri'], verbose_flag=False) + assert "No potential duplicates found." in runner.output + assert runner.exit_code == 0 + + runner = clirunner(['dataset', 'find-duplicates', 'region_code', 'dataset_maturity'], verbose_flag=False) + assert "No potential duplicates found." not in runner.output + assert "(region_code='090086', dataset_maturity='final')" in runner.output + assert "(region_code='101077', dataset_maturity='final')" in runner.output + assert runner.exit_code == 0 + runner = clirunner(['dataset', 'archive'], verbose_flag=False, expect_success=False) assert "Completed dataset archival." 
not in runner.output assert "Usage: [OPTIONS] [IDS]" in runner.output diff --git a/integration_tests/test_dataset_add.py b/integration_tests/test_dataset_add.py index 53ff2f42d..1a0e3de0f 100644 --- a/integration_tests/test_dataset_add.py +++ b/integration_tests/test_dataset_add.py @@ -7,6 +7,7 @@ import pytest import toolz import yaml +import logging from datacube.index import Index from datacube.index.hl import Doc2Dataset @@ -15,6 +16,8 @@ from datacube.utils import SimpleDocNav from datacube.scripts.dataset import _resolve_uri +logger = logging.getLogger(__name__) + def check_skip_lineage_test(clirunner, index): ds = SimpleDocNav(gen_dataset_test_dag(11, force_tree=True)) @@ -207,6 +210,29 @@ def test_dataset_add_not_eo3(index, ls8_eo3_product, eo3_wo_dataset_doc): assert isinstance(_err, BadMatch) +@pytest.mark.parametrize('datacube_env_name', ('datacube', )) +def test_dataset_eo3_no_schema(dataset_add_configs, index_empty, clirunner, caplog): + p = dataset_add_configs + index = index_empty + ds = load_dataset_definition(p.datasets_eo3).doc + + clirunner(['metadata', 'add', p.metadata]) + clirunner(['product', 'add', p.products]) + + # no warnings if eo3 dataset has $schema + doc2ds = Doc2Dataset(index) + doc2ds(ds, 'file:///something') + warnings = [record for record in caplog.records if record.levelno == logging.WARNING] + assert len(warnings) == 0 + + # warn if eo3 metadata type but no $schema + del ds["$schema"] + doc2ds(ds, 'file:///something') + warnings = [record for record in caplog.records if record.levelno == logging.WARNING] + assert len(warnings) == 1 + assert "will not be recognised as an eo3 dataset" in warnings[0].msg + + # Current formulation of this test relies on non-EO3 test data @pytest.mark.parametrize('datacube_env_name', ('datacube', )) def test_dataset_add(dataset_add_configs, index_empty, clirunner): diff --git a/tests/api/test_query.py b/tests/api/test_query.py index aa59561db..c136cda9a 100644 --- a/tests/api/test_query.py +++ b/tests/api/test_query.py @@ -171,6 +171,13 @@ def test_solar_day(): assert 'Cannot compute solar_day: dataset is missing spatial info' in str(e.value) + # Test with Non-UTC timestamp in index. + ds = _s(center_time=parse_time('1997-05-22 22:07:44.2270250-7:00'), + metadata=_s(lon=Range(begin=-136.615, + end=-134.325))) + assert solar_day(ds) == np.datetime64('1997-05-22', 'D') + assert solar_day(ds, longitude=0) == np.datetime64('1997-05-23', 'D') + def test_solar_offset(): from odc.geo.geom import point