In [1]:
import logging

import click
import fsspec
import geopandas as gpd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.datasets import get_datasets_ids
from deafrica_waterbodies.io import check_if_s3_uri

In [2]:
import os

# Default AWS settings that the Analysis Sandbox places in the
# environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# Public buckets can only be read once these credentials are absent from
# the environment; otherwise the request fails with:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for env_var in aws_default_config:
    # pop() with a default removes the variable when it is set and is a
    # no-op when it is not.
    os.environ.pop(env_var, None)

In [3]:
# Notebook parameters.
# Verbosity handed to the logging setup.
verbose = 1
# Number of workers handed to the dataset id search.
num_workers = 16
# Vector file describing the area of interest.
aoi_vector_file = "data/SenegalBasin.geojson"
# Text file the dataset ids are written to.
dataset_ids_text_file = (
    "s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/dataset_ids.txt"
)

In [4]:
# Set up logger.
# Configure logging through the project's logging_setup helper, passing the
# verbosity chosen in the parameters cell.
logging_setup(verbose=verbose)
# Module-level logger used by the later cells of this notebook.
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib Paths by normalising the locations to plain strings.
# The area-of-interest file is optional: None must be passed through
# unchanged, because str(None) yields the literal string "None", which
# would make the later `aoi_vector_file is not None` check always true and
# send a non-existent path called "None" to the file reader.
if aoi_vector_file is not None:
    aoi_vector_file = str(aoi_vector_file)
# The output location is always converted; it is used unconditionally
# when the dataset ids are written.
dataset_ids_text_file = str(dataset_ids_text_file)

In [6]:
# Read the area of interest into a GeoDataFrame. When no vector file is
# configured there is no area of interest and aoi_gdf is None.
if aoi_vector_file is None:
    aoi_gdf = None
else:
    try:
        aoi_gdf = gpd.read_file(aoi_vector_file)
    except Exception as error:
        # Record the failure (with traceback) before propagating it.
        _log.exception(f"Could not read the file {aoi_vector_file}")
        raise error

In [7]:
# Get the WOfS All Time Summary scene ids for the scenes whose extent
# intersects with the area of interest.
# NOTE(review): aoi_gdf is None when no vector file is configured; how
# get_datasets_ids treats that case is not visible from this notebook —
# confirm against the helper.
# num_workers is forwarded unchanged from the parameters cell.
dataset_ids = get_datasets_ids(aoi_gdf=aoi_gdf, num_workers=num_workers)

4461it [00:07, 586.18it/s]


In [8]:
# Pick the fsspec filesystem matching the output location: "s3" when the
# path is an S3 URI, the local "file" filesystem otherwise.
fs_protocol = "s3" if check_if_s3_uri(dataset_ids_text_file) else "file"
fs = fsspec.filesystem(fs_protocol)

In [9]:
# Save the dataset ids to the text file, one id per line.
with fs.open(dataset_ids_text_file, "w") as ids_file:
    ids_file.writelines(f"{dataset_id}\n" for dataset_id in dataset_ids)

_log.info(f"Dataset IDs written to: {dataset_ids_text_file}.")

[2023-10-06 16:39:44,855] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials
[2023-10-06 16:39:45,022] {2876337735.py:6} INFO - Dataset IDs written to: s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/dataset_ids.txt.
