From 0122fd4158882841dcce2bcdb3f7dd3621f51b32 Mon Sep 17 00:00:00 2001 From: Paul Haesler Date: Wed, 20 Dec 2023 14:19:20 +1100 Subject: [PATCH] Cherry picked PRs from develop (1.8) (#1524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update whats_new.rst for 1.8.17 release. (#1510) * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/adrienverge/yamllint.git: v1.32.0 → v1.33.0](https://github.com/adrienverge/yamllint.git/compare/v1.32.0...v1.33.0) * Bump conda-incubator/setup-miniconda from 2 to 3 Bumps [conda-incubator/setup-miniconda](https://github.com/conda-incubator/setup-miniconda) from 2 to 3. - [Release notes](https://github.com/conda-incubator/setup-miniconda/releases) - [Changelog](https://github.com/conda-incubator/setup-miniconda/blob/main/CHANGELOG.md) - [Commits](https://github.com/conda-incubator/setup-miniconda/compare/v2...v3) --- updated-dependencies: - dependency-name: conda-incubator/setup-miniconda dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] * Dataset CLI tool to find duplicates (#1517) * add dataset cli tool to find duplicates * convert timestamps to UTC * update whats_new * update memory driver search duplicates implementation --------- Co-authored-by: Ariana Barzinpour * Make solar_day() timezone aware. (#1521) * Warn if non-eo3 dataset has eo3 metadata type (#1523) * warn if non-eo3 dataset has eo3 metadata type * fix str contains * add test --------- Co-authored-by: Ariana Barzinpour * Fix merge oops * Resolve some merge issues arising from differences between SQLAlchemy 1.4 and 2.0. --------- Signed-off-by: dependabot[bot] Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ariana-B <40238244+ariana-b@users.noreply.github.com> Co-authored-by: Ariana Barzinpour --- .github/workflows/test-conda-build.yml | 2 +- .pre-commit-config.yaml | 2 +- datacube/api/query.py | 2 +- datacube/drivers/postgis/_api.py | 55 +++++++++++++++++- datacube/drivers/postgis/_fields.py | 15 ++++- datacube/drivers/postgres/_api.py | 60 +++++++++++++++++++- datacube/drivers/postgres/_fields.py | 15 ++++- datacube/index/hl.py | 15 +++++ datacube/index/memory/_datasets.py | 20 ++++--- datacube/index/postgis/_datasets.py | 9 ++- datacube/index/postgres/_datasets.py | 9 ++- datacube/scripts/dataset.py | 42 ++++++++++++++ docs/about/whats_new.rst | 9 +++ integration_tests/index/test_memory_index.py | 4 +- integration_tests/index/test_search_eo3.py | 30 +++++++++- integration_tests/test_cli_output.py | 26 +++++++++ integration_tests/test_dataset_add.py | 26 +++++++++ tests/api/test_query.py | 7 +++ 18 files changed, 316 insertions(+), 32 deletions(-) diff --git a/.github/workflows/test-conda-build.yml b/.github/workflows/test-conda-build.yml index 3ec372b21..abe41e2e1 100644 --- a/.github/workflows/test-conda-build.yml +++ b/.github/workflows/test-conda-build.yml @@ -30,7 +30,7 @@ jobs: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('conda-environment.yml') }} - - uses: conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v3 with: environment-file: conda-environment.yml auto-update-conda: true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ba8832218..4f102e6d3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: 
https://github.com/adrienverge/yamllint.git - rev: v1.32.0 + rev: v1.33.0 hooks: - id: yamllint - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/datacube/api/query.py b/datacube/api/query.py index 78559b859..a3e8cf812 100644 --- a/datacube/api/query.py +++ b/datacube/api/query.py @@ -389,7 +389,7 @@ def solar_day(dataset: Dataset, longitude: Optional[float] = None) -> np.datetim :param longitude: If supplied correct timestamp for this longitude, rather than mid-point of the Dataset's footprint """ - utc = dataset.center_time + utc = dataset.center_time.astimezone(datetime.timezone.utc) if longitude is None: _lon = _ds_mid_longitude(dataset) diff --git a/datacube/drivers/postgis/_api.py b/datacube/drivers/postgis/_api.py index 2e91f80a3..a910f3fc2 100644 --- a/datacube/drivers/postgis/_api.py +++ b/datacube/drivers/postgis/_api.py @@ -740,11 +740,14 @@ def search_unique_datasets(self, expressions, select_fields=None, limit=None): def get_duplicates(self, match_fields: Sequence[PgField], expressions: Sequence[PgExpression]) -> Iterable[tuple]: # TODO + if "time" in [f.name for f in match_fields]: + return self.get_duplicates_with_time(match_fields, expressions) + group_expressions = tuple(f.alchemy_expression for f in match_fields) join_tables = PostgisDbAPI._join_tables(expressions, match_fields) query = select( - func.array_agg(Dataset.id), + func.array_agg(Dataset.id).label("ids"), *group_expressions ).select_from(Dataset) for joins in join_tables: @@ -759,6 +762,56 @@ def get_duplicates(self, match_fields: Sequence[PgField], expressions: Sequence[ ) return self._connection.execute(query) + def get_duplicates_with_time( + self, match_fields: Sequence[PgField], expressions: Sequence[PgExpression] + ) -> Iterable[tuple]: + fields = [] + for f in match_fields: + if f.name == "time": + time_field = f.expression_with_leniency + else: + fields.append(f.alchemy_expression) + + join_tables = PostgisDbAPI._join_tables(expressions, match_fields) + + cols = [Dataset.id, time_field.label('time'), *fields] + query = select( + *cols + ).select_from(Dataset) + for joins in join_tables: + query = query.join(*joins) + + query = query.where( + and_(Dataset.archived == None, *(PostgisDbAPI._alchemify_expressions(expressions))) + ) + + t1 = query.alias("t1") + t2 = query.alias("t2") + + time_overlap = select( + t1.c.id, + text("t1.time * t2.time as time_intersect"), + *fields + ).select_from( + t1.join( + t2, + and_(t1.c.time.overlaps(t2.c.time), t1.c.id != t2.c.id) + ) + ) + + query = select( + func.array_agg(func.distinct(time_overlap.c.id)).label("ids"), + *fields, + text("(lower(time_intersect) at time zone 'UTC', upper(time_intersect) at time zone 'UTC') as time") + ).select_from( + time_overlap + ).group_by( + *fields, text("time_intersect") + ).having( + func.count(time_overlap.c.id) > 1 + ) + return self._connection.execute(query) + def count_datasets(self, expressions): """ :type expressions: tuple[datacube.drivers.postgis._fields.PgExpression] diff --git a/datacube/drivers/postgis/_fields.py b/datacube/drivers/postgis/_fields.py index 3db4bacb7..35ec5bdcb 100755 --- a/datacube/drivers/postgis/_fields.py +++ b/datacube/drivers/postgis/_fields.py @@ -16,6 +16,7 @@ from sqlalchemy import cast, func, and_ from sqlalchemy.dialects import postgresql as postgres from sqlalchemy.dialects.postgresql import NUMRANGE, TSTZRANGE +from sqlalchemy.dialects.postgresql import INTERVAL from sqlalchemy.sql import ColumnElement from sqlalchemy.orm import aliased @@ -223,7 +224,7 @@ def 
__init__(self, name, description, alchemy_column, indexed, offset=None, sele @property def alchemy_expression(self): - return self._alchemy_offset_value(self.offset, self.aggregation.pg_calc) + return self._alchemy_offset_value(self.offset, self.aggregation.pg_calc).label(self.name) def __eq__(self, value): """ @@ -362,7 +363,7 @@ def value_to_alchemy(self, value): @property def alchemy_expression(self): - return self.value_to_alchemy((self.lower.alchemy_expression, self.greater.alchemy_expression)) + return self.value_to_alchemy((self.lower.alchemy_expression, self.greater.alchemy_expression)).label(self.name) def __eq__(self, value): """ @@ -441,6 +442,16 @@ def between(self, low, high): raise ValueError("Unknown comparison type for date range: " "expecting datetimes, got: (%r, %r)" % (low, high)) + @property + def expression_with_leniency(self): + return func.tstzrange( + self.lower.alchemy_expression - cast('500 milliseconds', INTERVAL), + self.greater.alchemy_expression + cast('500 milliseconds', INTERVAL), + # Inclusive on both sides. + '[]', + type_=TSTZRANGE, + ) + def _number_implies_year(v: Union[int, datetime]) -> datetime: """ diff --git a/datacube/drivers/postgres/_api.py b/datacube/drivers/postgres/_api.py index bd46ed2b2..9267dbc72 100644 --- a/datacube/drivers/postgres/_api.py +++ b/datacube/drivers/postgres/_api.py @@ -775,10 +775,13 @@ def search_unique_datasets(self, expressions, select_fields=None, limit=None): return self._connection.execute(select_query) def get_duplicates(self, match_fields: Iterable[PgField], expressions: Iterable[PgExpression]) -> Iterable[Tuple]: + if "time" in [f.name for f in match_fields]: + return self.get_duplicates_with_time(match_fields, expressions) + group_expressions = tuple(f.alchemy_expression for f in match_fields) select_query = select( - func.array_agg(DATASET.c.id), + func.array_agg(DATASET.c.id).label('ids'), *group_expressions ).select_from( PostgresDbAPI._from_expression(DATASET, expressions, match_fields) @@ -791,6 +794,61 @@ def get_duplicates(self, match_fields: Iterable[PgField], expressions: Iterable[ ) return self._connection.execute(select_query) + def get_duplicates_with_time( + self, match_fields: Iterable[PgField], expressions: Iterable[PgExpression] + ) -> Iterable[Tuple]: + """ + If considering time when searching for duplicates, we need to grant some amount of leniency + in case timestamps are not exactly the same. + From the set of datasets that are active and have the correct product (candidates), + find all those whose extended timestamp range overlap (overlapping), + then group them by the other fields. 
+ """ + fields = [] + for f in match_fields: + if f.name == "time": + time_field = f.expression_with_leniency + else: + fields.append(f.alchemy_expression) + + candidates_table = select( + DATASET.c.id, + time_field.label('time'), + *fields + ).select_from( + PostgresDbAPI._from_expression(DATASET, expressions, match_fields) + ).where( + and_(DATASET.c.archived == None, *(PostgresDbAPI._alchemify_expressions(expressions))) + ) + + t1 = candidates_table.alias("t1") + t2 = candidates_table.alias("t2") + + overlapping = select( + t1.c.id, + text("t1.time * t2.time as time_intersect"), + *fields + ).select_from( + t1.join( + t2, + and_(t1.c.time.overlaps(t2.c.time), t1.c.id != t2.c.id) + ) + ) + + final_query = select( + func.array_agg(func.distinct(overlapping.c.id)).label("ids"), + *fields, + text("(lower(time_intersect) at time zone 'UTC', upper(time_intersect) at time zone 'UTC') as time") + ).select_from( + overlapping + ).group_by( + *fields, text("time_intersect") + ).having( + func.count(overlapping.c.id) > 1 + ) + + return self._connection.execute(final_query) + def count_datasets(self, expressions): """ :type expressions: tuple[datacube.drivers.postgres._fields.PgExpression] diff --git a/datacube/drivers/postgres/_fields.py b/datacube/drivers/postgres/_fields.py index fc3023b05..c6a98470f 100755 --- a/datacube/drivers/postgres/_fields.py +++ b/datacube/drivers/postgres/_fields.py @@ -15,6 +15,7 @@ from sqlalchemy.dialects import postgresql as postgres from sqlalchemy.dialects.postgresql import INT4RANGE from sqlalchemy.dialects.postgresql import NUMRANGE, TSTZRANGE +from sqlalchemy.dialects.postgresql import INTERVAL from sqlalchemy.sql import ColumnElement from datacube import utils @@ -193,7 +194,7 @@ def __init__(self, name, description, alchemy_column, indexed, offset=None, sele @property def alchemy_expression(self): - return self._alchemy_offset_value(self.offset, self.aggregation.pg_calc) + return self._alchemy_offset_value(self.offset, self.aggregation.pg_calc).label(self.name) def __eq__(self, value): """ @@ -322,7 +323,7 @@ def postgres_index_type(self): @property def alchemy_expression(self): - return self.value_to_alchemy((self.lower.alchemy_expression, self.greater.alchemy_expression)) + return self.value_to_alchemy((self.lower.alchemy_expression, self.greater.alchemy_expression)).label(self.name) def __eq__(self, value): """ @@ -431,6 +432,16 @@ def between(self, low, high): raise ValueError("Unknown comparison type for date range: " "expecting datetimes, got: (%r, %r)" % (low, high)) + @property + def expression_with_leniency(self): + return func.tstzrange( + self.lower.alchemy_expression - cast('500 milliseconds', INTERVAL), + self.greater.alchemy_expression + cast('500 milliseconds', INTERVAL), + # Inclusive on both sides. 
+ '[]', + type_=TSTZRANGE, + ) + def _number_implies_year(v: Union[int, datetime]) -> datetime: """ diff --git a/datacube/index/hl.py b/datacube/index/hl.py index 2dd945948..8403dc7ae 100644 --- a/datacube/index/hl.py +++ b/datacube/index/hl.py @@ -5,6 +5,8 @@ """ High level indexing operations/utilities """ +import logging + import json import toolz from uuid import UUID @@ -18,6 +20,8 @@ from datacube.model import LineageDirection from .eo3 import prep_eo3, is_doc_eo3, is_doc_geo # type: ignore[attr-defined] +_LOG = logging.getLogger(__name__) + class ProductRule: def __init__(self, product: Product, signature: Mapping[str, Any]): @@ -145,6 +149,14 @@ def render_diff(offset, a, b): ] +def check_intended_eo3(ds: SimpleDocNav, product: Product) -> None: + # warn if it looks like dataset was meant to be eo3 but is not + if not is_doc_eo3(ds.doc) and ("eo3" in product.metadata_type.name): + _LOG.warning(f"Dataset {ds.id} has a product with an eo3 metadata type, " + "but the dataset definition does not include the $schema field " + "and so will not be recognised as an eo3 dataset.") + + def resolve_no_lineage(ds: SimpleDocNav, uri: str, matcher: ProductMatcher, @@ -156,6 +168,7 @@ def resolve_no_lineage(ds: SimpleDocNav, product = matcher(doc) except BadMatch as e: return None, e + check_intended_eo3(ds, product) return Dataset(product, doc, uris=[uri], sources={}), None @@ -190,6 +203,7 @@ def resolve_with_lineage(doc: SimpleDocNav, uri: str, matcher: ProductMatcher, if source_tree.direction == LineageDirection.DERIVED: raise ValueError("source_tree cannot be a derived tree.") source_tree = source_tree.find_subtree(uuid_) + check_intended_eo3(doc, product) return Dataset(product, doc.doc, source_tree=source_tree, @@ -268,6 +282,7 @@ def resolve_ds(ds: SimpleDocNav, else: product = matcher(doc) + check_intended_eo3(ds, product) return with_cache(Dataset(product, doc, uris=uris, sources=sources), ds.id, cache) try: return remap_lineage_doc(main_ds, resolve_ds, cache={}), None diff --git a/datacube/index/memory/_datasets.py b/datacube/index/memory/_datasets.py index ebc1e907d..67a2a969f 100755 --- a/datacube/index/memory/_datasets.py +++ b/datacube/index/memory/_datasets.py @@ -133,10 +133,11 @@ def search_product_duplicates(self, product: Product, *args: Union[str, Field] ) -> Iterable[Tuple[Tuple, Iterable[UUID]]]: - field_names: List[str] = [arg.name if isinstance(arg, Field) else arg for arg in args] - # Typing note: mypy cannot handle dynamically created namedtuples - GroupedVals = namedtuple('search_result', field_names) # type: ignore[misc] - + """ + Find dataset ids of a given product that have duplicates of the given set of field names. + Returns each set of those field values and the datasets that have them. + Note that this implementation does not account for slight timestamp discrepancies. 
+ """ def to_field(f: Union[str, Field]) -> Field: if isinstance(f, str): f = product.metadata_type.dataset_fields[f] @@ -144,6 +145,8 @@ def to_field(f: Union[str, Field]) -> Field: return f fields = [to_field(f) for f in args] + # Typing note: mypy cannot handle dynamically created namedtuples + GroupedVals = namedtuple('search_result', list(f.name for f in fields)) # type: ignore[misc] def values(ds: Dataset) -> GroupedVals: vals = [] @@ -151,16 +154,17 @@ def values(ds: Dataset) -> GroupedVals: vals.append(field.extract(ds.metadata_doc)) # type: ignore[attr-defined] return GroupedVals(*vals) - dups: Dict[Tuple, List[UUID]] = {} + dups: Dict[Tuple, set[UUID]] = {} for ds in self.active_by_id.values(): if ds.product.name != product.name: continue vals = values(ds) if vals in dups: - dups[vals].append(ds.id) + dups[vals].add(ds.id) else: - dups[vals] = [ds.id] - return list(dups.items()) + dups[vals] = set([ds.id]) # avoid duplicate entries + # only return entries with more than one dataset + return list({k: v for k, v in dups.items() if len(v) > 1}) def can_update(self, dataset: Dataset, diff --git a/datacube/index/postgis/_datasets.py b/datacube/index/postgis/_datasets.py index 4ea756635..d3c54ed6a 100755 --- a/datacube/index/postgis/_datasets.py +++ b/datacube/index/postgis/_datasets.py @@ -270,15 +270,14 @@ def load_field(f: Union[str, fields.Field]) -> fields.Field: return f group_fields: List[fields.Field] = [load_field(f) for f in args] - result_type = namedtuple('search_result', list(f.name for f in group_fields)) # type: ignore - expressions = [product.metadata_type.dataset_fields.get('product') == product.name] with self._db_connection() as connection: for record in connection.get_duplicates(group_fields, expressions): - dataset_ids = set(record[0]) - grouped_fields = tuple(record[1:]) - yield result_type(*grouped_fields), dataset_ids + as_dict = record._asdict() + if 'ids' in as_dict.keys(): + ids = as_dict.pop('ids') + yield namedtuple('search_result', as_dict.keys())(**as_dict), set(ids) def can_update(self, dataset, updates_allowed=None): """ diff --git a/datacube/index/postgres/_datasets.py b/datacube/index/postgres/_datasets.py index 8f22536ba..006965cc9 100755 --- a/datacube/index/postgres/_datasets.py +++ b/datacube/index/postgres/_datasets.py @@ -245,15 +245,14 @@ def load_field(f: Union[str, fields.Field]) -> fields.Field: return f group_fields: List[fields.Field] = [load_field(f) for f in args] - result_type = namedtuple('search_result', list(f.name for f in group_fields)) # type: ignore - expressions = [product.metadata_type.dataset_fields.get('product') == product.name] with self._db_connection() as connection: for record in connection.get_duplicates(group_fields, expressions): - dataset_ids = set(record[0]) - grouped_fields = tuple(record[1:]) - yield result_type(*grouped_fields), dataset_ids + as_dict = record._asdict() + if "ids" in as_dict.keys(): + ids = as_dict.pop('ids') + yield namedtuple('search_result', as_dict.keys())(**as_dict), set(ids) def can_update(self, dataset, updates_allowed=None): """ diff --git a/datacube/scripts/dataset.py b/datacube/scripts/dataset.py index fe2e71632..184f575bc 100644 --- a/datacube/scripts/dataset.py +++ b/datacube/scripts/dataset.py @@ -648,3 +648,45 @@ def purge_cmd(index: Index, dry_run: bool, all_ds: bool, ids: List[str]): click.echo(f'{len(datasets_for_archive)} datasets not purged (dry run)') click.echo('Completed dataset purge.') + + +@dataset_cmd.command('find-duplicates', help="Search for duplicate indexed 
datasets") +@click.option('--product', '-p', 'product_names', + help=("Only search within product(s) specified with this option. " + "You can supply several by repeating this option with a new product name."), + multiple=True) +@click.argument('fields', nargs=-1) +@ui.pass_index() +def find_duplicates(index: Index, product_names, fields): + """ + Find dataset ids of two or more active datasets that have duplicate values in the specified fields. + If products are specified, search only within those products. Otherwise, search within any products that + have the fields. + """ + if not fields: + click.echo('Error: must provide field names to match on\n') + sys.exit(1) + + # if no products were specified, use whichever ones have the specified search fields + # if products were specified, check they all have the required fields + products_with_fields = list(index.products.get_with_fields(fields)) + if not products_with_fields: + click.echo(f'Error: no products found with fields {", ".join(fields)}\n') + sys.exit(1) + if not list(product_names): + products = products_with_fields + else: + products = [index.products.get_by_name(name) for name in product_names] + products_without_fields = set(products).difference(set(products_with_fields)) + if len(products_without_fields): + click.echo(f'Error: specified products {", ".join(p.name for p in products_without_fields)} ' + 'do not contain all required fields\n') + sys.exit(1) + + dupes = [] + for product in products: + dupes.extend(list(index.datasets.search_product_duplicates(product, *fields))) + if len(dupes): + print(dupes) + else: + click.echo('No potential duplicates found.') diff --git a/docs/about/whats_new.rst b/docs/about/whats_new.rst index 0e19faf7d..3b8718bc1 100644 --- a/docs/about/whats_new.rst +++ b/docs/about/whats_new.rst @@ -25,11 +25,20 @@ v1.9.next - Increase minimum Python version to 3.10 (:pull:`1509`) - Virtual product tests using odc-geo GridSpec (:pull:`1512`) - New Configuration API, as per ODC-EP10 (:pull:`1505`) +- Alembic migrations for postgis driver (:pull:`1520`) v1.8.next ========= +- Add dataset cli tool ``find-duplicates`` to identify duplicate indexed datasets (:pull:`1517`) +- Make solar_day() timezone aware (:pull:`1521`) +- Warn if non-eo3 dataset has eo3 metadata type (:pull:`1523`) + +v1.8.17 (8th November 2023) +=========================== - Fix schema creation with postgres driver when initialising system with ``--no-init-users`` (:pull:`1504`) +- Switch to new jsonschema 'referencing' API and repin jsonschema to >=4.18 (:pull:`1477`) +- Update whats_new.rst for release (:pull:`1510`) v1.8.16 (17th October 2023) =========================== diff --git a/integration_tests/index/test_memory_index.py b/integration_tests/index/test_memory_index.py index f691b111f..f35e8efc2 100644 --- a/integration_tests/index/test_memory_index.py +++ b/integration_tests/index/test_memory_index.py @@ -204,9 +204,7 @@ def test_mem_ds_search_dups(mem_eo3_data: ODCEnvironment): dc, ls8_id, wo_id = mem_eo3_data ls8_ds = dc.index.datasets.get(ls8_id) dup_results = dc.index.datasets.search_product_duplicates(ls8_ds.product, "cloud_cover", "dataset_maturity") - assert len(dup_results) == 1 - assert dup_results[0][0].cloud_cover == ls8_ds.metadata.cloud_cover - assert ls8_id in dup_results[0][1] + assert len(dup_results) == 0 def test_mem_ds_locations(mem_eo3_data: ODCEnvironment): diff --git a/integration_tests/index/test_search_eo3.py b/integration_tests/index/test_search_eo3.py index 6813c0711..fe0a758ee 100644 --- 
a/integration_tests/index/test_search_eo3.py +++ b/integration_tests/index/test_search_eo3.py @@ -7,6 +7,7 @@ """ import datetime from typing import Any +from collections import namedtuple import pytest import yaml @@ -777,13 +778,14 @@ def test_find_duplicates_eo3(index, assert len(all_datasets) == 5 # First two ls8 datasets have the same path/row, last two have a different row. + dupe_fields = namedtuple('search_result', ['region_code', 'dataset_maturity']) expected_ls8_path_row_duplicates = [ ( - ('090086', 'final'), + dupe_fields('090086', 'final'), {ls8_eo3_dataset.id, ls8_eo3_dataset2.id} ), ( - ('101077', 'final'), + dupe_fields('101077', 'final'), {ls8_eo3_dataset3.id, ls8_eo3_dataset4.id} ), @@ -811,6 +813,30 @@ def test_find_duplicates_eo3(index, assert sat_res == [] +def test_find_duplicates_with_time(index, nrt_dataset, final_dataset, ls8_eo3_dataset): + index.datasets.add(nrt_dataset, with_lineage=False) + index.datasets.add(final_dataset, with_lineage=False) + index.datasets.get(nrt_dataset.id).is_active + index.datasets.get(final_dataset.id).is_active + + all_datasets = index.datasets.search_eager() + assert len(all_datasets) == 3 + + dupe_fields = namedtuple('search_result', ['region_code', 'time']) + expected_result = [ + ( + dupe_fields('090086', '("2023-04-30 23:50:33.884549","2023-04-30 23:50:34.884549")'), + {nrt_dataset.id, final_dataset.id} + ) + ] + res = sorted(index.datasets.search_product_duplicates( + nrt_dataset.product, + 'region_code', 'time', + )) + + assert res == expected_result + + def test_csv_search_via_cli_eo3(clirunner: Any, ls8_eo3_dataset: Dataset, ls8_eo3_dataset2: Dataset) -> None: diff --git a/integration_tests/test_cli_output.py b/integration_tests/test_cli_output.py index be2726720..7c53fa899 100644 --- a/integration_tests/test_cli_output.py +++ b/integration_tests/test_cli_output.py @@ -83,6 +83,32 @@ def test_cli_dataset_subcommand(index, clirunner, for path in eo3_dataset_paths: result = clirunner(['dataset', 'add', "--ignore-lineage", path]) + runner = clirunner(['dataset', 'find-duplicates'], verbose_flag=False, expect_success=False) + assert "Error: must provide field names to match on" in runner.output + assert runner.exit_code == 1 + + runner = clirunner(['dataset', 'find-duplicates', 'region_code', 'fake_field'], + verbose_flag=False, expect_success=False) + assert "Error: no products found with fields region_code, fake_field" in runner.output + assert runner.exit_code == 1 + + runner = clirunner(['dataset', 'find-duplicates', 'region_code', 'landsat_scene_id', + '-p', 'ga_ls8c_ard_3', '-p', 'ga_ls_wo_3'], + verbose_flag=False, + expect_success=False) + assert "Error: specified products ga_ls_wo_3 do not contain all required fields" + assert runner.exit_code == 1 + + runner = clirunner(['dataset', 'find-duplicates', 'region_code', 'uri'], verbose_flag=False) + assert "No potential duplicates found." in runner.output + assert runner.exit_code == 0 + + runner = clirunner(['dataset', 'find-duplicates', 'region_code', 'dataset_maturity'], verbose_flag=False) + assert "No potential duplicates found." not in runner.output + assert "(region_code='090086', dataset_maturity='final')" in runner.output + assert "(region_code='101077', dataset_maturity='final')" in runner.output + assert runner.exit_code == 0 + runner = clirunner(['dataset', 'archive'], verbose_flag=False, expect_success=False) assert "Completed dataset archival." 
not in runner.output assert "Usage: [OPTIONS] [IDS]" in runner.output diff --git a/integration_tests/test_dataset_add.py b/integration_tests/test_dataset_add.py index 53ff2f42d..1a0e3de0f 100644 --- a/integration_tests/test_dataset_add.py +++ b/integration_tests/test_dataset_add.py @@ -7,6 +7,7 @@ import pytest import toolz import yaml +import logging from datacube.index import Index from datacube.index.hl import Doc2Dataset @@ -15,6 +16,8 @@ from datacube.utils import SimpleDocNav from datacube.scripts.dataset import _resolve_uri +logger = logging.getLogger(__name__) + def check_skip_lineage_test(clirunner, index): ds = SimpleDocNav(gen_dataset_test_dag(11, force_tree=True)) @@ -207,6 +210,29 @@ def test_dataset_add_not_eo3(index, ls8_eo3_product, eo3_wo_dataset_doc): assert isinstance(_err, BadMatch) +@pytest.mark.parametrize('datacube_env_name', ('datacube', )) +def test_dataset_eo3_no_schema(dataset_add_configs, index_empty, clirunner, caplog): + p = dataset_add_configs + index = index_empty + ds = load_dataset_definition(p.datasets_eo3).doc + + clirunner(['metadata', 'add', p.metadata]) + clirunner(['product', 'add', p.products]) + + # no warnings if eo3 dataset has $schema + doc2ds = Doc2Dataset(index) + doc2ds(ds, 'file:///something') + warnings = [record for record in caplog.records if record.levelno == logging.WARNING] + assert len(warnings) == 0 + + # warn if eo3 metadata type but no $schema + del ds["$schema"] + doc2ds(ds, 'file:///something') + warnings = [record for record in caplog.records if record.levelno == logging.WARNING] + assert len(warnings) == 1 + assert "will not be recognised as an eo3 dataset" in warnings[0].msg + + # Current formulation of this test relies on non-EO3 test data @pytest.mark.parametrize('datacube_env_name', ('datacube', )) def test_dataset_add(dataset_add_configs, index_empty, clirunner): diff --git a/tests/api/test_query.py b/tests/api/test_query.py index aa59561db..c136cda9a 100644 --- a/tests/api/test_query.py +++ b/tests/api/test_query.py @@ -171,6 +171,13 @@ def test_solar_day(): assert 'Cannot compute solar_day: dataset is missing spatial info' in str(e.value) + # Test with Non-UTC timestamp in index. + ds = _s(center_time=parse_time('1997-05-22 22:07:44.2270250-7:00'), + metadata=_s(lon=Range(begin=-136.615, + end=-134.325))) + assert solar_day(ds) == np.datetime64('1997-05-22', 'D') + assert solar_day(ds, longitude=0) == np.datetime64('1997-05-23', 'D') + def test_solar_offset(): from odc.geo.geom import point