Postgis performance tuning (#1480)
* Document best practice for bringing changes over from master and update constraints.txt

* Types -> Datasets in postgis/_datasets.py

* GIST indexes for range types.

* Remove redundant indexes.

* Basic profiling script.

* Update whats_new.rst.

* Linting of profiling script.
SpacemanPaul committed Aug 1, 2023
1 parent 822b7d2 commit 9ef6456
Showing 4 changed files with 123 additions and 17 deletions.
6 changes: 3 additions & 3 deletions datacube/drivers/postgis/_schema.py
@@ -263,7 +263,7 @@ class DatasetSearchNumeric:
__table_args__ = (
_core.METADATA,
PrimaryKeyConstraint("dataset_ref", "search_key"),
Index("ix_num_search", "search_key", "search_val"),
Index("ix_num_search", "search_val", postgresql_using="gist"),
{
"schema": sql.SCHEMA_NAME,
"comment": "Index for searching datasets by search fields of numeric type"
@@ -276,7 +276,7 @@ class DatasetSearchNumeric:
comment="The name of the search field")
search_val = Column(NUMRANGE,
nullable=True,
comment="The value of the numeric range search field")
comment="The value of the numeric range search field",)


@orm_registry.mapped
@@ -285,7 +285,7 @@ class DatasetSearchDateTime:
__table_args__ = (
_core.METADATA,
PrimaryKeyConstraint("dataset_ref", "search_key"),
Index("ix_dt_search", "search_key", "search_val"),
Index("ix_dt_search", "search_val", postgresql_using="gist"),
{
"schema": sql.SCHEMA_NAME,
"comment": "Index for searching datasets by search fields of datetime type"
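
The change above replaces the composite btree indexes on (search_key, search_val) with GiST indexes on the range-valued search_val columns: btree cannot serve the range overlap/containment operators these searches rely on, while GiST can. As a rough illustration of what the new SQLAlchemy declaration produces, here is a minimal, self-contained sketch (the schema name "odc", the table name and the column types are simplifications and assumptions, not the driver's exact definitions):

from sqlalchemy import Column, Index, MetaData, Table, Text
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import NUMRANGE, UUID
from sqlalchemy.schema import CreateIndex

metadata = MetaData(schema="odc")                  # assumed schema name
search_num = Table(
    "dataset_search_num", metadata,                # illustrative table name
    Column("dataset_ref", UUID, primary_key=True),
    Column("search_key", Text),
    Column("search_val", NUMRANGE),
)

# The pattern used in the diff: a GiST index on the range column alone.
ix = Index("ix_num_search", search_num.c.search_val, postgresql_using="gist")
print(CreateIndex(ix).compile(dialect=postgresql.dialect()))
# CREATE INDEX ix_num_search ON odc.dataset_search_num USING gist (search_val)

A range predicate on search_val (overlap or containment, roughly the shape a search such as cloud_cover=Range(None, 0.2) reduces to) can then be answered from the GiST index, which the old (search_key, search_val) btree could not do efficiently.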
28 changes: 14 additions & 14 deletions datacube/index/postgis/_datasets.py
@@ -40,7 +40,7 @@
class DatasetResource(AbstractDatasetResource, IndexResourceAddIn):
"""
:type _db: datacube.drivers.postgis._connections.PostgresDb
-    :type types: datacube.index._products.ProductResource
+    :type products: datacube.index._products.ProductResource
"""

def __init__(self, db, index):
@@ -346,7 +346,7 @@ def update(self, dataset: Dataset, updates_allowed=None, archive_less_mature=Fal

_LOG.info("Updating dataset %s", dataset.id)

-        product = self.types.get_by_name(dataset.product.name)
+        product = self.products.get_by_name(dataset.product.name)
with self._db_connection(transaction=True) as transaction:
if not transaction.update_dataset(dataset.metadata_doc_without_lineage(), dataset.id, product.id):
raise ValueError("Failed to update dataset %s..." % dataset.id)
@@ -426,13 +426,13 @@ def get_field_names(self, product_name=None):
:rtype: set[str]
"""
if product_name is None:
-            types = self.types.get_all()
+            products = self.products.get_all()
else:
-            types = [self.types.get_by_name(product_name)]
+            products = [self.products.get_by_name(product_name)]

out = set()
-        for type_ in types:
-            out.update(type_.metadata_type.dataset_fields)
+        for prod_ in products:
+            out.update(prod_.metadata_type.dataset_fields)
return out

def get_locations(self, id_):
@@ -538,7 +538,7 @@ def _make(self, dataset_res, full_info=False, product=None):
else:
uris = []

-        product = product or self.types.get(dataset_res.product_ref)
+        product = product or self.products.get(dataset_res.product_ref)

return Dataset(
product=product,
@@ -665,19 +665,19 @@ def count_product_through_time(self, period, **query):
return next(self._do_time_count(period, query, ensure_single=True))[1]

def _get_products(self, q):
-        types = set()
+        products = set()
if 'product' in q.keys():
-            types.add(self.types.get_by_name(q['product']))
+            products.add(self.products.get_by_name(q['product']))
else:
# Otherwise search any metadata type that has all the given search fields.
-            types = self.types.get_with_fields(tuple(q.keys()))
-            if not types:
+            products = self.products.get_with_fields(tuple(q.keys()))
+            if not products:
raise ValueError('No type of dataset has fields: {}'.format(q.keys()))

-        return types
+        return products

def _get_product_queries(self, query):
-        for product, q in self.types.search_robust(**query):
+        for product, q in self.products.search_robust(**query):
q['product_id'] = product.id
yield q, product

@@ -782,7 +782,7 @@ def get_product_time_bounds(self, product: str):
"""

# Get the offsets from dataset doc
-        product = self.types.get_by_name(product)
+        product = self.products.get_by_name(product)
dataset_section = product.metadata_type.definition['dataset']
min_offset = dataset_section['search_fields']['time']['min_offset']
max_offset = dataset_section['search_fields']['time']['max_offset']
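
The _datasets.py changes above are a mechanical rename of the dataset resource's product-resource handle from types to products (matching the docstring fix at the top of the class); behaviour is unchanged. A minimal usage sketch, assuming a configured postgis environment and an illustrative product name:

from datacube import Datacube

dc = Datacube(env="datacube_real")   # assumption: a reachable postgis-indexed environment
dsr = dc.index.datasets

# Internally this now resolves products via dsr.products rather than dsr.types,
# but the result is the same: the union of the product's dataset search fields.
print(sorted(dsr.get_field_names("ga_ls8c_ard_3")))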
1 change: 1 addition & 0 deletions docs/about/whats_new.rst
@@ -20,6 +20,7 @@ v1.9.next
- Remove ingestion methods, `GridWorkflow` and `Tile` classes (:pull:`1465`)
- Fix postgis queries for numeric custom search fields (:pull:`1475`)
- Document best practice for pulling in changes from develop and update constraints.txt (:pull:`1478`)
- Postgis index driver performance tuning (:pull:`1480`)


v1.8.next
105 changes: 105 additions & 0 deletions odc_search_profile.py
@@ -0,0 +1,105 @@
import sys

from time import monotonic
from datacube import Datacube
from datacube.model import Range
from datetime import timezone
from datetime import datetime
from odc.geo.geom import CRS, polygon


def benchmark(test, dc, label, n):
total = 0.0
total_first = 0.0
last_count = None
for i in range(n):
start = monotonic()
# count, first = test(dc)
count = 0
first = None
for ds in test(dc):
if not count:
first = monotonic()
count += 1
if count == 0:
first = start
end = monotonic()
if last_count and count != last_count:
print(f"Count mismatch in {label}: {count} vs {last_count}")
last_count = count
print(f"Test {label}#{i+1}: {end-start}s ({first-start}s to first returned dataset)")
total += end - start
total_first += first - start
print(f"Test {label}-count: {count} rows")
print(f"Test {label}-avg: {total/n}s ({total/(n*count)})s/row")
print(f"Test {label}-avg-to-first-return: {total_first/n}s ({total/(n*count)})s/row")
print()
print("-----------------------------------------------------------------")


def test_less_than(dc):
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
cloud_cover=Range(None, 0.2)
)


def test_geospatial_search(dc):
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
lat=Range(-30.0, -25.0),
lon=Range(140.0, 145.0),
)


def test_offset_geom(dc):
if dc.index.supports_external_lineage:
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
geometry=polygon(
[
[140.0, -25.0],
[142.0, -25.0],
[145.0, -30.0],
[145.0, -30.0],
[140.0, -25.0],
],
crs=CRS("epsg:4326")
)
)
else:
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
lat=Range(-30.0, -25.0),
lon=Range(140.0, 145.0),
)


def test_temporal_search(dc):
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
time=Range(datetime(2016, 1, 1, tzinfo=timezone.utc), datetime(2016, 4, 5, tzinfo=timezone.utc)),
)


def main(args):
if args:
env = args.pop()
else:
env = "datacube_real"
print("Testing on database ", env)
dc = Datacube(env=env)
benchmark(test_less_than, dc, "less_than", 20)
benchmark(test_geospatial_search, dc, "geospatial", 20)
benchmark(test_temporal_search, dc, "temporal", 20)
benchmark(test_offset_geom, dc, "geom", 20)


if __name__ == "__main__":
args = sys.argv[1:]
main(args)

# For custom CRS search tests (TODO)
# BoundingBox(
# left=762759.2567816022, bottom=-3326371.8490792206,
# right=1295116.9248742603, top=-2727561.09954842, crs=CRS('EPSG:3577'))
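
The profiling script takes an optional datacube environment name on the command line (python odc_search_profile.py <environment>); with no argument it falls back to the datacube_real environment. Each benchmark runs its search 20 times against the hard-coded ga_ls8c_ard_3 product and reports the total time, the time to the first returned dataset, and per-row averages. Adding another case follows the same pattern: a function returning the lazy dc.index.datasets.search(...) generator plus one benchmark(...) call in main(). A small illustrative sketch (the cloud-cover bounds here are assumptions, not part of the commit):

def test_cloud_cover_band(dc):
    # same lazy-search pattern as the cases above
    return dc.index.datasets.search(
        product='ga_ls8c_ard_3',
        cloud_cover=Range(0.2, 0.5),
    )

# and, in main():
#     benchmark(test_cloud_cover_band, dc, "cloud_cover_band", 20)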
