Merge pull request #5 from scottstanie/add-datasets

Add datasets pulled by `pooch`
opera-adt · Nov 11, 2023 · 73e6059 · 73e6059
2 parents c261907 + 8d7baf3
commit 73e6059
Show file tree

Hide file tree

Showing 13 changed files with 974 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -33,8 +33,8 @@ mamba install -c conda-forge opera-utils
 (Note: [using `mamba`](https://mamba.readthedocs.io/en/latest/mamba-installation.html#mamba-install) is recommended for conda-forge packages, but miniconda can also be used.)
 
 While not required for all, some utilities use the GDAL package, which can be installed most easily on conda-forge:
-``` bash
-mamba install -c conda-forge gdal
+```bash
+mamba env update --file environment-geo.yml
 ```
 
 ## Setup for Developers

diff --git a/environment-geo.yml b/environment-geo.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+dependencies:
+  - pyogrio>=0.5
+  - gdal>=3.3
+  - geopandas-base>=0.12
diff --git a/environment.yml b/environment.yml
@@ -6,5 +6,5 @@ dependencies:
   - pip>=21.3  # https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/#editable-installation
   - git  # for pip install, due to setuptools_scm
   - h5py>=1.10
+  - pooch>=1.7
   - shapely>=1.8
-  # - gdal>=3.3  # For now, gdal will be an optional dependency
diff --git a/pyproject.toml b/pyproject.toml
@@ -74,7 +74,7 @@ profile = "black"
 known_first_party = [" opera_utils"]
 
 [tool.mypy]
-python_version = "3.8"
+python_version = "3.10"
 ignore_missing_imports = true
 
 

diff --git a/src/opera_utils/__init__.py b/src/opera_utils/__init__.py
@@ -7,7 +7,10 @@
 from ._io import *
 from ._version import version as __version__
 from .bursts import *
+from .burst_frame_db import *
 from .constants import *
+from .datasets import *
 from .missing_data import *
 
+
 __all__ = ["__version__"]
diff --git a/src/opera_utils/burst_frame_db.py b/src/opera_utils/burst_frame_db.py
@@ -0,0 +1,162 @@
+from __future__ import annotations
+
+import json
+import zipfile
+from pathlib import Path
+from typing import Optional, Sequence
+
+from . import datasets
+from ._types import Bbox, PathOrStr
+
+
+def read_zipped_json(filename: PathOrStr) -> dict:
+    """Read a JSON file, optionally stored inside a zip archive.
+
+    If `filename` ends in ".zip", the archive must contain a member named
+    like the archive without its final ".zip" suffix (for example,
+    "frames.json.zip" must contain "frames.json"). Any other file is read
+    directly as plain JSON.
+
+    Parameters
+    ----------
+    filename : PathOrStr
+        The path to the zipped (or plain) JSON file.
+
+    Returns
+    -------
+    dict
+        The parsed contents of the JSON file.
+
+    Raises
+    ------
+    KeyError
+        If the zip archive has no member with the expected name.
+    """
+    path = Path(filename)
+    if path.suffix == ".zip":
+        # Strip only the trailing ".zip": `str.replace` would also remove any
+        # ".zip" occurring earlier in the name and look up the wrong member.
+        with zipfile.ZipFile(path) as zf:
+            raw = zf.read(path.stem)
+        return json.loads(raw.decode("utf-8"))
+
+    # Decode as UTF-8 like the zip branch, rather than the locale default.
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def get_frame_to_burst_mapping(
+    frame_id: int, json_file: Optional[PathOrStr] = None
+) -> dict:
+    """Look up the frame-to-burst mapping entry for a single frame.
+
+    Parameters
+    ----------
+    frame_id : int
+        The ID of the frame to look up.
+    json_file : PathOrStr, optional
+        The path to the JSON file containing the frame-to-burst mapping.
+        If `None`, fetches the remote zip file from `datasets`.
+
+    Returns
+    -------
+    dict
+        The entry stored under `frame_id` in the "data" section of the file.
+    """
+    mapping_file = (
+        datasets.fetch_frame_to_burst_mapping_file()
+        if json_file is None
+        else json_file
+    )
+    # JSON object keys are strings, so convert the integer ID before lookup.
+    frame_key = str(frame_id)
+    return read_zipped_json(mapping_file)["data"][frame_key]
+
+
+def get_frame_geojson(
+    as_geodataframe: bool = False,
+    columns: Optional[Sequence[str]] = None,
+    frame_ids: Optional[Sequence[str]] = None,
+) -> dict:
+    """Get the frame geometries, optionally restricted to `frame_ids`.
+
+    `columns` and `frame_ids` are only applied when `as_geodataframe` is True;
+    otherwise the full contents of the GeoJSON file are returned.
+    """
+    if frame_ids:
+        where = _form_where_in_query(frame_ids, "frame_id")
+    else:
+        where = None
+    geojson_file = datasets.fetch_frame_geometries_simple()
+    return _get_geojson(
+        geojson_file, as_geodataframe=as_geodataframe, columns=columns, where=where
+    )
+
+
+def get_burst_id_geojson(
+    as_geodataframe: bool = False,
+    columns: Optional[Sequence[str]] = None,
+    burst_ids: Optional[Sequence[str]] = None,
+) -> dict:
+    """Get the burst ID geometries, optionally restricted to `burst_ids`.
+
+    `columns` and `burst_ids` are only applied when `as_geodataframe` is True;
+    otherwise the full contents of the GeoJSON file are returned.
+    """
+    if burst_ids:
+        where = _form_where_in_query(burst_ids, "burst_id_jpl")
+    else:
+        where = None
+    geojson_file = datasets.fetch_burst_id_geometries_simple()
+    return _get_geojson(
+        geojson_file, as_geodataframe=as_geodataframe, columns=columns, where=where
+    )
+
+
+def _form_where_in_query(values: Sequence[str], column_name: str) -> str:
+    """Build a SQL `<column_name> IN (...)` filter matching any of `values`.
+
+    Example:
+    "burst_id_jpl IN ('t005_009471_iw2','t007_013706_iw2','t008_015794_iw1')"
+
+    Each value is written as a single-quoted string literal. Embedded single
+    quotes are doubled so that a value cannot terminate its literal early.
+    """
+    quoted = ",".join("'{}'".format(str(v).replace("'", "''")) for v in values)
+    return f"{column_name} IN ({quoted})"
+
+
+def _get_geojson(
+    f,
+    as_geodataframe: bool = False,
+    columns: Optional[Sequence[str]] = None,
+    where: Optional[str] = None,
+) -> dict:
+    """Load the geometries in `f` as parsed JSON or as a data frame.
+
+    `columns` and `where` are only forwarded to `pyogrio.read_dataframe`, so
+    they take effect only when `as_geodataframe` is True. For the `where`
+    syntax, see:
+    https://gdal.org/user/ogr_sql_dialect.html#where
+    https://pyogrio.readthedocs.io/en/latest/introduction.html#filter-records-by-attribute-value
+    """
+    if not as_geodataframe:
+        return read_zipped_json(f)
+
+    # Imported lazily so that pyogrio is only needed for data frame output.
+    from pyogrio import read_dataframe
+
+    return read_dataframe(f, columns=columns, where=where)
+
+
+def get_frame_bbox(
+    frame_id: int, json_file: Optional[PathOrStr] = None
+) -> tuple[int, Bbox]:
+    """Look up the EPSG code and bounding box of one frame.
+
+    Parameters
+    ----------
+    frame_id : int
+        The ID of the frame to get the bounding box for.
+    json_file : PathOrStr, optional
+        The path to the JSON file containing the frame-to-burst mapping.
+        If `None`, fetches the remote zip file from `datasets`
+
+    Returns
+    -------
+    epsg : int
+        EPSG code for the bounds coordinates
+    tuple[float, float, float, float]
+        bounding box coordinates (xmin, ymin, xmax, ymax)
+    """
+    frame_info = get_frame_to_burst_mapping(frame_id=frame_id, json_file=json_file)
+    epsg = int(frame_info["epsg"])
+    # Convert each bound to float, in (xmin, ymin, xmax, ymax) order.
+    xmin, ymin, xmax, ymax = (
+        float(frame_info[key]) for key in ("xmin", "ymin", "xmax", "ymax")
+    )
+    return epsg, (xmin, ymin, xmax, ymax)
+
+
+def get_burst_ids_for_frame(
+    frame_id: int, json_file: Optional[PathOrStr] = None
+) -> list[str]:
+    """Look up the burst IDs belonging to a single frame.
+
+    Parameters
+    ----------
+    frame_id : int
+        The ID of the frame whose burst IDs are requested.
+    json_file : PathOrStr, optional
+        The path to the JSON file containing the frame-to-burst mapping.
+        If `None`, fetches the remote zip file from `datasets`
+
+    Returns
+    -------
+    list[str]
+        The "burst_ids" entry of the mapping for the given frame ID.
+    """
+    return get_frame_to_burst_mapping(frame_id, json_file)["burst_ids"]
diff --git a/src/opera_utils/bursts.py b/src/opera_utils/bursts.py
@@ -149,36 +149,43 @@ def sort_by_burst_id(file_list, burst_id_fmt):
 @overload
 def filter_by_burst_id(
     files: Iterable[PathLikeT],
-    burst_ids: Iterable[str],
+    burst_ids: str | Iterable[str],
 ) -> list[PathLikeT]:
     ...
 
 
 @overload
 def filter_by_burst_id(
     files: Iterable[str],
-    burst_ids: Iterable[str],
+    burst_ids: str | Iterable[str],
 ) -> list[str]:
     ...
 
 
 def filter_by_burst_id(files, burst_ids):
     """Keep only items from `files` which contain a burst ID in `burst_ids`.
 
+    Searches only the burst ID in the base name, not the full path.
+
     Parameters
     ----------
     files : Iterable[PathLikeT] or Iterable[str]
         Iterable of files to filter
-    burst_ids : Iterable[str]
-        Iterable containing the of burst IDs to keep
+    burst_ids : str | Iterable[str]
+        A single burst ID, or an iterable of the burst IDs to keep
 
     Returns
     -------
     list[PathLikeT] or list[str]
         filtered list of files
     """
+    if isinstance(burst_ids, str):
+        # Wrap a lone ID so `set()` does not split it into characters.
+        burst_ids = [burst_ids]
+
     burst_id_set = set(burst_ids)
-    return [f for f in files if get_burst_id(f) in burst_id_set]
+    # Only search the burst ID in the name, not the full path.
+    # Parse each name inside a single pass over `files`: iterating `files`
+    # twice would yield nothing the second time for a one-shot iterator.
+    return [f for f in files if get_burst_id(Path(f).name) in burst_id_set]
 
 
 def get_cslc_polygon(

diff --git a/src/opera_utils/datasets.py b/src/opera_utils/datasets.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+import pooch
+
+__all__ = [
+    "fetch_frame_geometries_simple",
+    "fetch_burst_id_geometries_simple",
+    "fetch_burst_to_frame_mapping_file",
+    "fetch_frame_to_burst_mapping_file",
+]
+
+# See: https://github.com/opera-adt/burst_db/tree/main/src/burst_db/data
+# BASE_URL = "https://github.com/opera-adt/burst_db/raw/v{version}/src/burst_db/data"
+# BASE_URL = "https://github.com/opera-adt/burst_db/raw/v0.3.0/src/burst_db/data"
+# Release-download URL; `{version}` is filled in from the `version` argument
+# given to `pooch.create` below.
+BASE_URL = "https://github.com/opera-adt/burst_db/releases/download/v{version}/"
+
+# Command used to compute the SHA256 hashes of the data files:
+# $ ls *json.zip | xargs -n1 shasum -a 256
+# 8ee9cae079b9adb24e223b9ff9c81c66506a2a1a72a456220133a9f7f5d4d93b  burst_id_geometries_simple.geojson.zip
+# 86657e4e578cfced18a66984758fff9a1bf94e8591a288be0d1ad391399f2e59  frame_geometries_simple.geojson.zip
+# 436cce345378dc31e81ed661497bab2e744217a5d63c0bb92817dc837786cd22  opera-s1-disp-burst-to-frame.json.zip
+# 8b7ed8c8d90ef3d3348bc226958a26a2cb8ab302a6466762aa971b8f7333517f  opera-s1-disp-frame-to-burst.json.zip
+# NOTE(review): the file names and three of the four hashes listed above differ
+# from the `registry` entries below; they look like output from an earlier
+# release -- confirm and refresh when bumping BURST_DB_VERSION.
+
+# Version of the burst_db release to download; also embedded in every file name
+# of the registry.
+BURST_DB_VERSION = "0.3.1"
+
+POOCH = pooch.create(
+    # Folder where the data will be stored. For a sensible default, use the
+    # default cache folder for your OS.
+    path=pooch.os_cache("opera_utils"),
+    # Base URL of the remote data store. Will call .format on this string
+    # to insert the version (see below).
+    base_url=BASE_URL,
+    # Pooches are versioned so that you can use multiple versions of a
+    # package simultaneously. Use PEP440 compliant version number. The
+    # version will be appended to the path.
+    version=BURST_DB_VERSION,
+    # If a version has a "+XX.XXXXX" suffix, we'll assume that this is a dev
+    # version and replace the version with this string.
+    version_dev="main",
+    # An environment variable that overwrites the path.
+    env="OPERA_UTILS_DATA_DIR",
+    # The cache file registry. A dictionary with all files managed by this
+    # pooch. Keys are the file names (relative to *base_url*) and values
+    # are their respective SHA256 hashes. Files will be downloaded
+    # automatically when needed.
+    registry={
+        f"frame-geometries-simple-{BURST_DB_VERSION}.geojson.zip": "f0094f4cdc287d56d7a126a42f1e3075e50309afe8a431f49df1ecd8d8b26c8b",
+        f"burst-id-geometries-simple-{BURST_DB_VERSION}.geojson.zip": "d9cfe71ec836facd5a782ea82625c30a824b78f2b2689106c4d6808bbfce0898",
+        f"opera-s1-disp-burst-to-frame-{BURST_DB_VERSION}.json.zip": "436cce345378dc31e81ed661497bab2e744217a5d63c0bb92817dc837786cd22",
+        f"opera-s1-disp-frame-to-burst-{BURST_DB_VERSION}.json.zip": "a48382afcb89f0ff681982b0fc24476ec9c6c1b8a67ae1a26cf380a450ffadc0",
+    },
+)
+
+
+def fetch_frame_geometries_simple() -> str:
+    """Fetch the simplified frame geometries of the burst database.
+
+    Returns
+    -------
+    str
+        The result of `POOCH.fetch` for the zipped GeoJSON file.
+    """
+    filename = f"frame-geometries-simple-{BURST_DB_VERSION}.geojson.zip"
+    return POOCH.fetch(filename)
+
+
+def fetch_burst_id_geometries_simple() -> str:
+    """Fetch the simplified burst ID geometries of the burst database.
+
+    Returns
+    -------
+    str
+        The result of `POOCH.fetch` for the zipped GeoJSON file.
+    """
+    filename = f"burst-id-geometries-simple-{BURST_DB_VERSION}.geojson.zip"
+    return POOCH.fetch(filename)
+
+
+def fetch_burst_to_frame_mapping_file() -> str:
+    """Fetch the burst-to-frame mapping file of the burst database.
+
+    Returns
+    -------
+    str
+        The result of `POOCH.fetch` for the zipped JSON file.
+    """
+    filename = f"opera-s1-disp-burst-to-frame-{BURST_DB_VERSION}.json.zip"
+    return POOCH.fetch(filename)
+
+
+def fetch_frame_to_burst_mapping_file() -> str:
+    """Fetch the frame-to-burst mapping file of the burst database.
+
+    Returns
+    -------
+    str
+        The result of `POOCH.fetch` for the zipped JSON file.
+    """
+    filename = f"opera-s1-disp-frame-to-burst-{BURST_DB_VERSION}.json.zip"
+    return POOCH.fetch(filename)