In [1]:
import logging
import os

import click
import geopandas as gpd
import pandas as pd

import deafrica_waterbodies.io
import deafrica_waterbodies.make_polygons
from deafrica_waterbodies.cli.logs import logging_setup

In [2]:
import os

# Placeholder AWS credentials that the Analysis Sandbox sets by default
# through environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# A public bucket can only be read once these credentials are removed from
# the environment; leaving them in place fails with:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for credential_name in aws_default_config:
    # pop() with a default is a no-op when the variable is not set.
    os.environ.pop(credential_name, None)

In [3]:
# Verbosity value handed to `logging_setup` when the logger is configured.
verbose = 1
# Base location: the thresholded polygons are read from a sub-directory of
# it and the merged polygons are written back into it.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir"

In [4]:
# Set up logger.
# Logging is configured through the project's `logging_setup` helper, which
# receives the notebook's `verbose` setting; `_log` is the logger the
# remaining cells use for their progress messages.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths: coerce the configured location to a plain string.
# NOTE(review): a pathlib.Path built from an "s3://..." URI collapses the
# double slash to "s3:/...", so Path values are only safe for local
# directories — confirm that S3 locations are always supplied as strings.
output_directory = str(output_directory)

In [6]:
# Location of the water body polygons generated by thresholding the
# WOfS All time summary scenes; it sits directly under the output directory.
polygons_from_thresholds_dir = os.path.join(
    output_directory,
    "polygons_from_thresholds",
)

In [7]:
# Collect the parquet files found under the thresholds directory that match
# the primary threshold pattern, and report how many there are.
primary_threshold_polygons_paths = deafrica_waterbodies.io.find_parquet_files(
    path=polygons_from_thresholds_dir,
    pattern=".*primary.*",
)
_log.info(
    f"Found {len(primary_threshold_polygons_paths)} parquet files for the primary threshold polygons."
)

[2023-10-04 07:55:34,283] {credentials.py:620} INFO - Found credentials in shared credentials file: ~/.aws/credentials
[2023-10-04 07:55:34,558] {3351114371.py:5} INFO - Found 74 parquet files for the primary threshold polygons.


In [8]:
# Read every primary threshold parquet file and stack the results into a
# single GeoDataFrame with a fresh, continuous index.
_log.info("Loading the primary threshold polygons parquet files..")
primary_threshold_polygons_list = [
    gpd.read_parquet(parquet_path) for parquet_path in primary_threshold_polygons_paths
]

primary_threshold_polygons = pd.concat(
    primary_threshold_polygons_list,
    ignore_index=True,
)
_log.info(f"Found {len(primary_threshold_polygons)} primary threshold polygons.")

[2023-10-04 07:55:34,563] {370454188.py:2} INFO - Loading the primary threshold polygons parquet files..
[2023-10-04 07:55:40,851] {370454188.py:9} INFO - Found 42728 primary threshold polygons.


In [9]:
# Pass the combined primary threshold polygons through the project's
# dataset-boundary merge step and report the resulting polygon count.
_log.info("Merging primary threshold waterbody polygons located at dataset/scene boundaries...")
primary_threshold_polygons_merged = deafrica_waterbodies.make_polygons.merge_polygons_at_dataset_boundaries(
    primary_threshold_polygons
)
_log.info(f"Primary threshold polygons count {len(primary_threshold_polygons_merged)}.")

[2023-10-04 07:55:40,856] {2486117815.py:1} INFO - Merging primary threshold waterbody polygons located at dataset/scene boundaries...
[2023-10-04 07:55:46,235] {2486117815.py:7} INFO - Primary threshold polygons count 42588.


In [10]:
# Save the merged primary threshold polygons as a parquet file inside the
# output directory, logging before the write and again once it is done.
_log.info("Writing primary threshold polygons merged at dataset boundaries to disk..")
primary_threshold_polygons_output_fp = os.path.join(
    output_directory,
    "primary_threshold_polygons_merged_at_ds_boundaries.parquet",
)
primary_threshold_polygons_merged.to_parquet(primary_threshold_polygons_output_fp)
_log.info(f"Polygons written to {primary_threshold_polygons_output_fp}")

[2023-10-04 07:55:46,240] {2868347883.py:1} INFO - Writing primary threshold polygons merged at dataset boundaries to disk..
[2023-10-04 07:55:46,576] {2868347883.py:7} INFO - Polygons written to s3://deafrica-waterbodies-dev/test_out_dir/primary_threshold_polygons_merged_at_ds_boundaries.parquet


In [11]:
# Collect the parquet files found under the thresholds directory that match
# the secondary threshold pattern, and report how many there are.
secondary_threshold_polygons_paths = deafrica_waterbodies.io.find_parquet_files(
    path=polygons_from_thresholds_dir,
    pattern=".*secondary.*",
)
_log.info(
    f"Found {len(secondary_threshold_polygons_paths)} parquet files for the secondary threshold polygons."
)

[2023-10-04 07:55:46,637] {2573623230.py:5} INFO - Found 74 parquet files for the secondary threshold polygons.


In [12]:
# Read every secondary threshold parquet file and stack the results into a
# single GeoDataFrame with a fresh, continuous index.
_log.info("Loading the secondary threshold polygons parquet files...")
secondary_threshold_polygons_list = [
    gpd.read_parquet(parquet_path) for parquet_path in secondary_threshold_polygons_paths
]

secondary_threshold_polygons = pd.concat(
    secondary_threshold_polygons_list,
    ignore_index=True,
)
_log.info(f"Found {len(secondary_threshold_polygons)} secondary threshold polygons.")

[2023-10-04 07:55:46,642] {4278796675.py:2} INFO - Loading the secondary threshold polygons parquet files...
[2023-10-04 07:55:53,743] {4278796675.py:9} INFO - Found 81652 secondary threshold polygons.


In [13]:
# Pass the combined secondary threshold polygons through the project's
# dataset-boundary merge step and report the resulting polygon count.
_log.info("Merging secondary threshold waterbody polygons located at dataset/scene boundaries...")
secondary_threshold_polygons_merged = deafrica_waterbodies.make_polygons.merge_polygons_at_dataset_boundaries(
    secondary_threshold_polygons
)
_log.info(f"Secondary threshold polygons count {len(secondary_threshold_polygons_merged)}.")

[2023-10-04 07:55:53,748] {2569896688.py:1} INFO - Merging secondary threshold waterbody polygons located at dataset/scene boundaries...
[2023-10-04 07:56:05,352] {2569896688.py:7} INFO - Secondary threshold polygons count 81450.


In [14]:
# Save the merged secondary threshold polygons as a parquet file inside the
# output directory, logging before the write and again once it is done.
_log.info("Writing secondary threshold polygons merged at dataset boundaries to disk..")
secondary_threshold_polygons_output_fp = os.path.join(
    output_directory,
    "secondary_threshold_polygons_merged_at_ds_boundaries.parquet",
)
secondary_threshold_polygons_merged.to_parquet(secondary_threshold_polygons_output_fp)
_log.info(f"Polygons written to {secondary_threshold_polygons_output_fp}")


[2023-10-04 07:56:05,357] {4268535924.py:1} INFO - Writing secondary threshold polygons merged at dataset boundaries to disk..
[2023-10-04 07:56:05,895] {4268535924.py:8} INFO - Polygons written to s3://deafrica-waterbodies-dev/test_out_dir/secondary_threshold_polygons_merged_at_ds_boundaries.parquet
