Postgis performance tuning (#1480)
* Document best practice for bringing changes over from master and update constraints.txt

* Types -> Datasets in postgis/_datasets.py

* GIST indexes for range types.

* Remove redundant indexes.

* Basic profiling script.

* Update whats_new.rst.

* Linting of profiling script.
SpacemanPaul committed Aug 1, 2023
1 parent 822b7d2 commit 9ef6456
Showing 4 changed files with 123 additions and 17 deletions.
6 changes: 3 additions & 3 deletions datacube/drivers/postgis/_schema.py
@@ -263,7 +263,7 @@ class DatasetSearchNumeric:
__table_args__ = (
_core.METADATA,
PrimaryKeyConstraint("dataset_ref", "search_key"),
Index("ix_num_search", "search_key", "search_val"),
Index("ix_num_search", "search_val", postgresql_using="gist"),
{
"schema": sql.SCHEMA_NAME,
"comment": "Index for searching datasets by search fields of numeric type"
@@ -276,7 +276,7 @@ class DatasetSearchNumeric:
comment="The name of the search field")
search_val = Column(NUMRANGE,
nullable=True,
comment="The value of the numeric range search field")
comment="The value of the numeric range search field",)


@orm_registry.mapped
@@ -285,7 +285,7 @@ class DatasetSearchDateTime:
__table_args__ = (
_core.METADATA,
PrimaryKeyConstraint("dataset_ref", "search_key"),
Index("ix_dt_search", "search_key", "search_val"),
Index("ix_dt_search", "search_val", postgresql_using="gist"),
{
"schema": sql.SCHEMA_NAME,
"comment": "Index for searching datasets by search fields of datetime type"
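
The change above replaces the composite btree indexes on (search_key, search_val) with GiST indexes on the range-valued search_val columns: btree cannot serve the range overlap/containment operators these searches rely on, while GiST can. As a rough illustration of what the new SQLAlchemy declaration produces, here is a minimal, self-contained sketch (the schema name "odc", the table name and the column types are simplifications and assumptions, not the driver's exact definitions):

from sqlalchemy import Column, Index, MetaData, Table, Text
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import NUMRANGE, UUID
from sqlalchemy.schema import CreateIndex

metadata = MetaData(schema="odc")                  # assumed schema name
search_num = Table(
    "dataset_search_num", metadata,                # illustrative table name
    Column("dataset_ref", UUID, primary_key=True),
    Column("search_key", Text),
    Column("search_val", NUMRANGE),
)

# The pattern used in the diff: a GiST index on the range column alone.
ix = Index("ix_num_search", search_num.c.search_val, postgresql_using="gist")
print(CreateIndex(ix).compile(dialect=postgresql.dialect()))
# CREATE INDEX ix_num_search ON odc.dataset_search_num USING gist (search_val)

A range predicate on search_val (overlap or containment, roughly the shape a search such as cloud_cover=Range(None, 0.2) reduces to) can then be answered from the GiST index, which the old (search_key, search_val) btree could not do efficiently.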
28 changes: 14 additions & 14 deletions datacube/index/postgis/_datasets.py
@@ -40,7 +40,7 @@
class DatasetResource(AbstractDatasetResource, IndexResourceAddIn):
"""
:type _db: datacube.drivers.postgis._connections.PostgresDb
-    :type types: datacube.index._products.ProductResource
+    :type products: datacube.index._products.ProductResource
"""

def __init__(self, db, index):
@@ -346,7 +346,7 @@ def update(self, dataset: Dataset, updates_allowed=None, archive_less_mature=Fal

_LOG.info("Updating dataset %s", dataset.id)

-        product = self.types.get_by_name(dataset.product.name)
+        product = self.products.get_by_name(dataset.product.name)
with self._db_connection(transaction=True) as transaction:
if not transaction.update_dataset(dataset.metadata_doc_without_lineage(), dataset.id, product.id):
raise ValueError("Failed to update dataset %s..." % dataset.id)
@@ -426,13 +426,13 @@ def get_field_names(self, product_name=None):
:rtype: set[str]
"""
if product_name is None:
-            types = self.types.get_all()
+            products = self.products.get_all()
else:
-            types = [self.types.get_by_name(product_name)]
+            products = [self.products.get_by_name(product_name)]

out = set()
-        for type_ in types:
-            out.update(type_.metadata_type.dataset_fields)
+        for prod_ in products:
+            out.update(prod_.metadata_type.dataset_fields)
return out

def get_locations(self, id_):
@@ -538,7 +538,7 @@ def _make(self, dataset_res, full_info=False, product=None):
else:
uris = []

-        product = product or self.types.get(dataset_res.product_ref)
+        product = product or self.products.get(dataset_res.product_ref)

return Dataset(
product=product,
@@ -665,19 +665,19 @@ def count_product_through_time(self, period, **query):
return next(self._do_time_count(period, query, ensure_single=True))[1]

def _get_products(self, q):
-        types = set()
+        products = set()
if 'product' in q.keys():
-            types.add(self.types.get_by_name(q['product']))
+            products.add(self.products.get_by_name(q['product']))
else:
# Otherwise search any metadata type that has all the given search fields.
-            types = self.types.get_with_fields(tuple(q.keys()))
-            if not types:
+            products = self.products.get_with_fields(tuple(q.keys()))
+            if not products:
raise ValueError('No type of dataset has fields: {}'.format(q.keys()))

-        return types
+        return products

def _get_product_queries(self, query):
-        for product, q in self.types.search_robust(**query):
+        for product, q in self.products.search_robust(**query):
q['product_id'] = product.id
yield q, product

@@ -782,7 +782,7 @@ def get_product_time_bounds(self, product: str):
"""

# Get the offsets from dataset doc
-        product = self.types.get_by_name(product)
+        product = self.products.get_by_name(product)
dataset_section = product.metadata_type.definition['dataset']
min_offset = dataset_section['search_fields']['time']['min_offset']
max_offset = dataset_section['search_fields']['time']['max_offset']
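
The _datasets.py changes above are a mechanical rename of the dataset resource's product-resource handle from types to products (matching the docstring fix at the top of the class); behaviour is unchanged. A minimal usage sketch, assuming a configured postgis environment and an illustrative product name:

from datacube import Datacube

dc = Datacube(env="datacube_real")   # assumption: a reachable postgis-indexed environment
dsr = dc.index.datasets

# Internally this now resolves products via dsr.products rather than dsr.types,
# but the result is the same: the union of the product's dataset search fields.
print(sorted(dsr.get_field_names("ga_ls8c_ard_3")))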
1 change: 1 addition & 0 deletions docs/about/whats_new.rst
@@ -20,6 +20,7 @@ v1.9.next
- Remove ingestion methods, `GridWorkflow` and `Tile` classes (:pull:`1465`)
- Fix postgis queries for numeric custom search fields (:pull:`1475`)
- Document best practice for pulling in changes from develop and update constraints.txt (:pull:`1478`)
- Postgis index driver performance tuning (:pull:`1480`)


v1.8.next
105 changes: 105 additions & 0 deletions odc_search_profile.py
@@ -0,0 +1,105 @@
import sys

from time import monotonic
from datacube import Datacube
from datacube.model import Range
from datetime import timezone
from datetime import datetime
from odc.geo.geom import CRS, polygon


def benchmark(test, dc, label, n):
total = 0.0
total_first = 0.0
last_count = None
for i in range(n):
start = monotonic()
# count, first = test(dc)
count = 0
first = None
for ds in test(dc):
if not count:
first = monotonic()
count += 1
if count == 0:
first = start
end = monotonic()
if last_count and count != last_count:
print(f"Count mismatch in {label}: {count} vs {last_count}")
last_count = count
print(f"Test {label}#{i+1}: {end-start}s ({first-start}s to first returned dataset)")
total += end - start
total_first += first - start
print(f"Test {label}-count: {count} rows")
print(f"Test {label}-avg: {total/n}s ({total/(n*count)})s/row")
print(f"Test {label}-avg-to-first-return: {total_first/n}s ({total/(n*count)})s/row")
print()
print("-----------------------------------------------------------------")


def test_less_than(dc):
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
cloud_cover=Range(None, 0.2)
)


def test_geospatial_search(dc):
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
lat=Range(-30.0, -25.0),
lon=Range(140.0, 145.0),
)


def test_offset_geom(dc):
if dc.index.supports_external_lineage:
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
geometry=polygon(
[
[140.0, -25.0],
[142.0, -25.0],
[145.0, -30.0],
[145.0, -30.0],
[140.0, -25.0],
],
crs=CRS("epsg:4326")
)
)
else:
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
lat=Range(-30.0, -25.0),
lon=Range(140.0, 145.0),
)


def test_temporal_search(dc):
return dc.index.datasets.search(
product='ga_ls8c_ard_3',
time=Range(datetime(2016, 1, 1, tzinfo=timezone.utc), datetime(2016, 4, 5, tzinfo=timezone.utc)),
)


def main(args):
if args:
env = args.pop()
else:
env = "datacube_real"
print("Testing on database ", env)
dc = Datacube(env=env)
benchmark(test_less_than, dc, "less_than", 20)
benchmark(test_geospatial_search, dc, "geospatial", 20)
benchmark(test_temporal_search, dc, "temporal", 20)
benchmark(test_offset_geom, dc, "geom", 20)


if __name__ == "__main__":
args = sys.argv[1:]
main(args)

# For custom CRS search tests (TODO)
# BoundingBox(
# left=762759.2567816022, bottom=-3326371.8490792206,
# right=1295116.9248742603, top=-2727561.09954842, crs=CRS('EPSG:3577'))
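
The profiling script takes an optional datacube environment name on the command line (python odc_search_profile.py <environment>); with no argument it falls back to the datacube_real environment. Each benchmark runs its search 20 times against the hard-coded ga_ls8c_ard_3 product and reports the total time, the time to the first returned dataset, and per-row averages. Adding another case follows the same pattern: a function returning the lazy dc.index.datasets.search(...) generator plus one benchmark(...) call in main(). A small illustrative sketch (the cloud-cover bounds here are assumptions, not part of the commit):

def test_cloud_cover_band(dc):
    # same lazy-search pattern as the cases above
    return dc.index.datasets.search(
        product='ga_ls8c_ard_3',
        cloud_cover=Range(0.2, 0.5),
    )

# and, in main():
#     benchmark(test_cloud_cover_band, dc, "cloud_cover_band", 20)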
