Skip to content

Commit

Permalink
Merge pull request #5 from scottstanie/add-datasets
Browse files Browse the repository at this point in the history
Add datasets pulled by `pooch`
  • Loading branch information
scottstanie committed Nov 11, 2023
2 parents c261907 + 8d7baf3 commit 73e6059
Show file tree
Hide file tree
Showing 13 changed files with 974 additions and 13 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ mamba install -c conda-forge opera-utils
(Note: [using `mamba`](https://mamba.readthedocs.io/en/latest/mamba-installation.html#mamba-install) is recommended for conda-forge packages, but miniconda can also be used.)

While not every utility requires it, some use the GDAL package, which is most easily installed from conda-forge:
``` bash
mamba install -c conda-forge gdal
```bash
mamba env update --file environment-geo.yml
```

## Setup for Developers
Expand Down
6 changes: 6 additions & 0 deletions environment-geo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
dependencies:
- pyogrio>=0.5
- gdal>=3.3
- geopandas-base>=0.12
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ dependencies:
- pip>=21.3 # https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/#editable-installation
- git # for pip install, due to setuptools_scm
- h5py>=1.10
- pooch>=1.7
- shapely>=1.8
# - gdal>=3.3 # For now, gdal will be an optional dependency
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ profile = "black"
known_first_party = [" opera_utils"]

[tool.mypy]
python_version = "3.8"
python_version = "3.10"
ignore_missing_imports = true


Expand Down
3 changes: 3 additions & 0 deletions src/opera_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
from ._io import *
from ._version import version as __version__
from .bursts import *
from .burst_frame_db import *
from .constants import *
from .datasets import *
from .missing_data import *


__all__ = ["__version__"]
162 changes: 162 additions & 0 deletions src/opera_utils/burst_frame_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
from __future__ import annotations

import json
import zipfile
from pathlib import Path
from typing import Optional, Sequence

from . import datasets
from ._types import Bbox, PathOrStr


def read_zipped_json(filename: PathOrStr):
    """Read a JSON file, optionally wrapped in a ``.zip`` archive.

    Parameters
    ----------
    filename : PathOrStr
        Path to a plain JSON file, or to a zip archive whose name ends in
        ``.zip``. The archive must contain a member named like the archive
        itself minus the trailing ``.zip`` (``data.json.zip`` holds
        ``data.json``).

    Returns
    -------
    dict
        The parsed JSON content (a dictionary when the top-level JSON value
        is an object).

    Raises
    ------
    KeyError
        If the archive has no member with the expected name.
    """
    path = Path(filename)
    if path.suffix == ".zip":
        # `stem` strips only the final ".zip". A plain str.replace would also
        # mangle names that contain ".zip" elsewhere (e.g. "my.zipped.json.zip").
        member_name = path.stem
        with zipfile.ZipFile(path) as zf:
            raw = zf.read(member_name)
        return json.loads(raw.decode("utf-8"))

    # Read as UTF-8 explicitly so both branches decode the same way,
    # independent of the platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def get_frame_to_burst_mapping(
    frame_id: int, json_file: Optional[PathOrStr] = None
) -> dict:
    """Look up the frame-to-burst database entry for a single frame.

    Parameters
    ----------
    frame_id : int
        The ID of the frame to look up.
    json_file : PathOrStr, optional
        Path to the (optionally zipped) JSON file containing the
        frame-to-burst mapping.
        If `None`, the file is obtained from
        `datasets.fetch_frame_to_burst_mapping_file`.

    Returns
    -------
    dict
        The entry stored under ``str(frame_id)`` in the file's ``"data"``
        table.
    """
    mapping_file = (
        datasets.fetch_frame_to_burst_mapping_file()
        if json_file is None
        else json_file
    )
    frame_table = read_zipped_json(mapping_file)["data"]
    # JSON object keys are strings, so the integer ID is converted first.
    return frame_table[str(frame_id)]


def get_frame_geojson(
    as_geodataframe: bool = False,
    columns: Optional[Sequence[str]] = None,
    frame_ids: Optional[Sequence[str]] = None,
) -> dict:
    """Get the GeoJSON for the frame geometries.

    Parameters
    ----------
    as_geodataframe : bool, default False
        If True, the file is read with `pyogrio.read_dataframe` instead of
        being returned as parsed JSON.
    columns : Sequence[str], optional
        Columns to read. Only applied when `as_geodataframe` is True.
    frame_ids : Sequence[str], optional
        If non-empty, restricts the result to these frame IDs through a
        ``frame_id IN (...)`` filter. Only applied when `as_geodataframe`
        is True; the plain JSON path returns the whole file.

    Returns
    -------
    dict
        The parsed GeoJSON, or the object returned by
        `pyogrio.read_dataframe` when `as_geodataframe` is True.
    """
    where = None
    if frame_ids:
        where = _form_where_in_query(frame_ids, "frame_id")

    geometry_file = datasets.fetch_frame_geometries_simple()
    return _get_geojson(
        geometry_file,
        as_geodataframe=as_geodataframe,
        columns=columns,
        where=where,
    )


def get_burst_id_geojson(
    as_geodataframe: bool = False,
    columns: Optional[Sequence[str]] = None,
    burst_ids: Optional[Sequence[str]] = None,
) -> dict:
    """Get the GeoJSON for the burst_id geometries.

    Parameters
    ----------
    as_geodataframe : bool, default False
        If True, the file is read with `pyogrio.read_dataframe` instead of
        being returned as parsed JSON.
    columns : Sequence[str], optional
        Columns to read. Only applied when `as_geodataframe` is True.
    burst_ids : Sequence[str], optional
        If non-empty, restricts the result to these burst IDs through a
        ``burst_id_jpl IN (...)`` filter. Only applied when
        `as_geodataframe` is True; the plain JSON path returns the whole
        file.

    Returns
    -------
    dict
        The parsed GeoJSON, or the object returned by
        `pyogrio.read_dataframe` when `as_geodataframe` is True.
    """
    where = None
    if burst_ids:
        where = _form_where_in_query(burst_ids, "burst_id_jpl")

    geometry_file = datasets.fetch_burst_id_geometries_simple()
    return _get_geojson(
        geometry_file,
        as_geodataframe=as_geodataframe,
        columns=columns,
        where=where,
    )


def _form_where_in_query(values: Sequence[str], column_name):
    """Build an SQL ``<column> IN (...)`` clause for attribute filtering.

    Parameters
    ----------
    values : Sequence[str]
        Values to match. A single string is treated as one value rather
        than being split into its characters.
    column_name : str
        Name of the attribute column to filter on.

    Returns
    -------
    str
        The filter string, for example
        ``"burst_id_jpl IN ('t005_009471_iw2','t007_013706_iw2')"``.
    """
    if isinstance(values, str):
        # A bare string is itself a sequence of characters; without this
        # guard "abc" would be rendered as IN ('a','b','c').
        values = [values]
    quoted = ",".join(f"'{v}'" for v in values)
    return f"{column_name} IN ({quoted})"


def _get_geojson(
    f,
    as_geodataframe: bool = False,
    columns: Optional[Sequence[str]] = None,
    where: Optional[str] = None,
) -> dict:
    """Load a geometry file as parsed JSON, or through `pyogrio`.

    Parameters
    ----------
    f
        Path to the (optionally zipped) GeoJSON file.
    as_geodataframe : bool, default False
        If True, read the file with `pyogrio.read_dataframe`.
    columns : Sequence[str], optional
        Forwarded to `pyogrio.read_dataframe`; unused otherwise.
    where : str, optional
        Attribute filter forwarded to `pyogrio.read_dataframe`; unused
        otherwise.

    Returns
    -------
    dict
        The parsed JSON, or the object returned by
        `pyogrio.read_dataframe` when `as_geodataframe` is True.
    """
    if not as_geodataframe:
        # `columns` and `where` are not applied on this path.
        return read_zipped_json(f)

    # Imported here rather than at module level, so pyogrio is only needed
    # when a dataframe is requested.
    from pyogrio import read_dataframe

    # Filter syntax:
    # https://gdal.org/user/ogr_sql_dialect.html#where
    # https://pyogrio.readthedocs.io/en/latest/introduction.html#filter-records-by-attribute-value
    return read_dataframe(f, columns=columns, where=where)


def get_frame_bbox(
    frame_id: int, json_file: Optional[PathOrStr] = None
) -> tuple[int, Bbox]:
    """Get the EPSG code and bounding box of one frame.

    Parameters
    ----------
    frame_id : int
        The ID of the frame to get the bounding box for.
    json_file : PathOrStr, optional
        The path to the JSON file containing the frame-to-burst mapping.
        If `None`, fetches the remote zip file from `datasets`

    Returns
    -------
    epsg : int
        EPSG code for the bounds coordinates
    tuple[float, float, float, float]
        bounding box coordinates (xmin, ymin, xmax, ymax)
    """
    frame_info = get_frame_to_burst_mapping(frame_id=frame_id, json_file=json_file)
    epsg_code = int(frame_info["epsg"])
    # Stored values are converted to float, in (xmin, ymin, xmax, ymax) order.
    xmin, ymin, xmax, ymax = (
        float(frame_info[key]) for key in ("xmin", "ymin", "xmax", "ymax")
    )
    return epsg_code, (xmin, ymin, xmax, ymax)


def get_burst_ids_for_frame(
    frame_id: int, json_file: Optional[PathOrStr] = None
) -> list[str]:
    """Get the burst IDs for one frame ID.

    Parameters
    ----------
    frame_id : int
        The ID of the frame to get the burst IDs for.
    json_file : PathOrStr, optional
        The path to the JSON file containing the frame-to-burst mapping.
        If `None`, fetches the remote zip file from `datasets`

    Returns
    -------
    list[str]
        The value stored under ``"burst_ids"`` for the given frame ID.
    """
    frame_entry = get_frame_to_burst_mapping(frame_id=frame_id, json_file=json_file)
    burst_ids = frame_entry["burst_ids"]
    return burst_ids
17 changes: 12 additions & 5 deletions src/opera_utils/bursts.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,36 +149,43 @@ def sort_by_burst_id(file_list, burst_id_fmt):
@overload
def filter_by_burst_id(
files: Iterable[PathLikeT],
burst_ids: Iterable[str],
burst_ids: str | Iterable[str],
) -> list[PathLikeT]:
...


@overload
def filter_by_burst_id(
files: Iterable[str],
burst_ids: Iterable[str],
burst_ids: str | Iterable[str],
) -> list[str]:
...


def filter_by_burst_id(files, burst_ids):
    """Keep only items from `files` which contain a burst ID in `burst_ids`.

    Searches only the burst ID in the base name, not the full path.

    Parameters
    ----------
    files : Iterable[PathLikeT] or Iterable[str]
        Iterable of files to filter
    burst_ids : str | Iterable[str]
        A single burst ID, or an iterable of the burst IDs to keep

    Returns
    -------
    list[PathLikeT] or list[str]
        filtered list of files
    """
    if isinstance(burst_ids, str):
        # A bare string would otherwise become a set of single characters.
        burst_ids = [burst_ids]

    burst_id_set = set(burst_ids)
    # Single pass over `files`: iterating it twice (once to parse the IDs,
    # once to zip) would silently return nothing for generators and other
    # one-shot iterables.
    # Only the file name is parsed, so directory names cannot match.
    return [f for f in files if get_burst_id(Path(f).name) in burst_id_set]


def get_cslc_polygon(
Expand Down
71 changes: 71 additions & 0 deletions src/opera_utils/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from __future__ import annotations

import pooch

# Public API of this module: the dataset fetchers defined later in this file.
__all__ = [
"fetch_frame_geometries_simple",
"fetch_burst_id_geometries_simple",
"fetch_burst_to_frame_mapping_file",
"fetch_frame_to_burst_mapping_file",
]

# Remote location of the burst database files.
# See: https://github.com/opera-adt/burst_db/tree/main/src/burst_db/data
# Alternative URL schemes, kept commented out for reference:
# BASE_URL = "https://github.com/opera-adt/burst_db/raw/v{version}/src/burst_db/data"
# BASE_URL = "https://github.com/opera-adt/burst_db/raw/v0.3.0/src/burst_db/data"
# The "{version}" placeholder is filled in from the `version` argument
# given to `pooch.create` below.
BASE_URL = "https://github.com/opera-adt/burst_db/releases/download/v{version}/"

# SHA256 hashes of the data files can be listed with:
# $ ls *json.zip | xargs -n1 shasum -a 256
# 8ee9cae079b9adb24e223b9ff9c81c66506a2a1a72a456220133a9f7f5d4d93b burst_id_geometries_simple.geojson.zip
# 86657e4e578cfced18a66984758fff9a1bf94e8591a288be0d1ad391399f2e59 frame_geometries_simple.geojson.zip
# 436cce345378dc31e81ed661497bab2e744217a5d63c0bb92817dc837786cd22 opera-s1-disp-burst-to-frame.json.zip
# 8b7ed8c8d90ef3d3348bc226958a26a2cb8ab302a6466762aa971b8f7333517f opera-s1-disp-frame-to-burst.json.zip
# NOTE(review): the file names and three of the four hashes in this listing
# differ from the registry below (only the burst-to-frame hash matches);
# presumably it was produced from an earlier release -- confirm or refresh.

# Version of the burst database to download; it is also embedded in every
# file name of the registry below.
BURST_DB_VERSION = "0.3.1"

POOCH = pooch.create(
# Folder where the data will be stored. For a sensible default, use the
# default cache folder for your OS.
path=pooch.os_cache("opera_utils"),
# Base URL of the remote data store. Will call .format on this string
# to insert the version (see below).
base_url=BASE_URL,
# Pooches are versioned so that you can use multiple versions of a
# package simultaneously. Use PEP440 compliant version number. The
# version will be appended to the path.
version=BURST_DB_VERSION,
# If a version has a "+XX.XXXXX" suffix, we'll assume that this is a dev
# version and replace the version with this string.
version_dev="main",
# An environment variable that overwrites the path.
env="OPERA_UTILS_DATA_DIR",
# The cache file registry. A dictionary with all files managed by this
# pooch. Keys are the file names (relative to *base_url*) and values
# are their respective SHA256 hashes. Files will be downloaded
# automatically when needed.
registry={
f"frame-geometries-simple-{BURST_DB_VERSION}.geojson.zip": "f0094f4cdc287d56d7a126a42f1e3075e50309afe8a431f49df1ecd8d8b26c8b",
f"burst-id-geometries-simple-{BURST_DB_VERSION}.geojson.zip": "d9cfe71ec836facd5a782ea82625c30a824b78f2b2689106c4d6808bbfce0898",
f"opera-s1-disp-burst-to-frame-{BURST_DB_VERSION}.json.zip": "436cce345378dc31e81ed661497bab2e744217a5d63c0bb92817dc837786cd22",
f"opera-s1-disp-frame-to-burst-{BURST_DB_VERSION}.json.zip": "a48382afcb89f0ff681982b0fc24476ec9c6c1b8a67ae1a26cf380a450ffadc0",
},
)


def fetch_frame_geometries_simple() -> str:
    """Fetch the simplified frame geometries of the burst database.

    Returns
    -------
    str
        The path returned by ``POOCH.fetch`` for the zipped GeoJSON file.
    """
    registry_key = f"frame-geometries-simple-{BURST_DB_VERSION}.geojson.zip"
    return POOCH.fetch(registry_key)


def fetch_burst_id_geometries_simple() -> str:
    """Fetch the simplified burst ID geometries of the burst database.

    Returns
    -------
    str
        The path returned by ``POOCH.fetch`` for the zipped GeoJSON file.
    """
    registry_key = f"burst-id-geometries-simple-{BURST_DB_VERSION}.geojson.zip"
    return POOCH.fetch(registry_key)


def fetch_burst_to_frame_mapping_file() -> str:
    """Fetch the burst-to-frame mapping of the burst database.

    Returns
    -------
    str
        The path returned by ``POOCH.fetch`` for the zipped JSON file.
    """
    registry_key = f"opera-s1-disp-burst-to-frame-{BURST_DB_VERSION}.json.zip"
    return POOCH.fetch(registry_key)


def fetch_frame_to_burst_mapping_file() -> str:
    """Fetch the frame-to-burst mapping of the burst database.

    Returns
    -------
    str
        The path returned by ``POOCH.fetch`` for the zipped JSON file.
    """
    registry_key = f"opera-s1-disp-frame-to-burst-{BURST_DB_VERSION}.json.zip"
    return POOCH.fetch(registry_key)

0 comments on commit 73e6059

Please sign in to comment.