Skip to content

Commit

Permalink
Fix missing_data_options to exit early for lots of missing data opt…
Browse files Browse the repository at this point in the history
…ions (#24)

* add candidates to burst options attrs

* adjust env/gha for no scipy

* Stop considering missing options after 1e4

* export `sort_files_by_date`

* update missing data test for `num_candidate_bursts`
  • Loading branch information
scottstanie committed Feb 6, 2024
1 parent 8257eb5 commit d717b39
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 9 deletions.
1 change: 0 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ jobs:
rasterio=1.3
gdal=3.5
asf_search=6.7.2
scipy=1.5
fail-fast: false
name: ${{ matrix.os }} • ${{ matrix.deps.label }}
Expand Down
14 changes: 12 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# [Unreleased](https://github.com/opera-adt/opera-utils/compare/v0.1.6...HEAD)
# [Unreleased](https://github.com/opera-adt/opera-utils/compare/v0.2.1...HEAD)

# [v0.1.6](https://github.com/opera-adt/opera-utils/compare/v0.1.5...v0.1.6)
# [v0.2.1](https://github.com/opera-adt/opera-utils/compare/v0.2.0...v0.2.1)

**Fixed**
- Export `sort_files_by_date`
- Stop considering missing data options after 10,000 in `get_missing_data_options`

**Requirements**
- Remove scipy


# [v0.2.0](https://github.com/opera-adt/opera-utils/compare/v0.1.5...v0.2.0)

**Added**

Expand Down
1 change: 0 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ dependencies:
- click>=7.0
- h5py>=1.10
- numpy>=1.20
- scipy>=1.5
- pooch>=1.7
- pyproj>=3.3
- shapely>=1.8
Expand Down
2 changes: 2 additions & 0 deletions src/opera_utils/_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
"get_dates",
"filter_by_date",
"group_by_date",
"sort_files_by_date",
"DATE_FORMAT",
"DATETIME_FORMAT",
]

DATE_FORMAT = "%Y%m%d"
Expand Down
18 changes: 13 additions & 5 deletions src/opera_utils/missing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ class BurstSubsetOption:
dates: tuple[date, ...]
"""Dates used in this subset."""
# subset_selected: list[bool]
# """True if the corresponding file/ (burst ID, date) was selected."""
num_candidate_bursts: int
"""The number of (burst_id, date) pairs that were passed as options."""

@property
def num_dates(self) -> int:
Expand Down Expand Up @@ -206,7 +207,8 @@ def generate_burst_subset_options(
that the first option is the one that uses the most data.
"""
options = []

num_candidate_bursts = B.sum()
logger.debug("Number of candidates: %s", num_candidate_bursts)
# Get the idxs where there are any missing dates for each burst
# We're going to try all possible combinations of these *groups*,
# not all possible combinations of the individual missing dates
Expand All @@ -215,9 +217,14 @@ def generate_burst_subset_options(
missing_date_idxs.add(tuple(np.where(~row)[0]))

# Generate all unique combinations of idxs to exclude
date_idxs_to_exclude_combinations = [
set(flatten(combo)) for combo in powerset(missing_date_idxs)
]
date_idxs_to_exclude_combinations = []
# NOTE: the number of combinations grows exponentially, so if
# `missing_date_idxs` is larger than ~25, this blows up.
# Since most cases take milliseconds, we'll cap the number of combinations
# considered at roughly 10,000 (1e4), which is more than we need.
for i, combo in enumerate(powerset(missing_date_idxs)):
if i > 1e4:
break
date_idxs_to_exclude_combinations.append(set(flatten(combo)))

all_column_idxs = set(range(B.shape[1]))
all_row_idxs = set(range(B.shape[0]))
Expand Down Expand Up @@ -272,6 +279,7 @@ def generate_burst_subset_options(
total_num_bursts=total_num_bursts,
burst_ids=selected_burst_ids,
dates=selected_dates,
num_candidate_bursts=num_candidate_bursts,
)
)

Expand Down
3 changes: 3 additions & 0 deletions tests/test_missing_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,18 +99,21 @@ def test_get_missing_data_options(filenames):
assert len(mdo.burst_ids) == 2
assert mdo.num_burst_ids == 2
assert mdo.total_num_bursts == 8
assert mdo.num_candidate_bursts == len(filenames)

mdo = mdos[1]
assert mdo.num_dates == 2
assert len(mdo.burst_ids) == 3
assert mdo.num_burst_ids == 3
assert mdo.total_num_bursts == 6
assert mdo.num_candidate_bursts == len(filenames)

mdo = mdos[2]
assert mdo.num_dates == 3
assert len(mdo.burst_ids) == 1
assert mdo.num_burst_ids == 1
assert mdo.total_num_bursts == 3
assert mdo.num_candidate_bursts == len(filenames)


@pytest.fixture
Expand Down

0 comments on commit d717b39

Please sign in to comment.