In [1]:
import logging
import math

import deafrica_waterbodies.filters
import geopandas as gpd
import pandas as pd
from deafrica_waterbodies.cli.logs import logging_setup

In [2]:
import os

# Default AWS configuration of the Analysis Sandbox, as set in the
# environment variables.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# Accessing a public bucket requires that these placeholder credentials are
# removed from the environment variables; otherwise the following error occurs:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
for key in aws_default_config:
    # pop() with a default is a no-op for variables that are not set.
    os.environ.pop(key, None)

In [3]:
# --- Notebook parameters -----------------------------------------------------
# Verbosity value handed to logging_setup() when the logger is configured.
verbose = 1
# Location the input parquet files are read from and every intermediate
# output of this notebook is written to.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir"
# Primary threshold polygons are kept only if their area is strictly greater
# than this value.
min_polygon_size = 4500  # 5 pixels
# Polygons with an area above this value are dropped; with math.inf the upper
# bound never removes anything.
max_polygon_size = math.inf
# Mask files read with gpd.read_file(); an empty string skips the
# corresponding filtering step.
land_sea_mask_fp = "GOaS_v1_20211214/goas_v01.shp"
major_rivers_mask_fp = ""
urban_mask_fp = ""
# Passed as `method` to split_large_polygons().
handle_large_polygons = "erode-dilate-v2"
# Passed as `pp_thresh` to split_large_polygons().
pp_test_threshold = 0.005

In [4]:
# Set up logger.
# logging_setup() receives the `verbose` value from the parameters cell
# (presumably it configures handlers/levels -- see deafrica_waterbodies.cli.logs).
# `_log` is the logger used for every progress message in this notebook.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths: coerce output_directory to a plain string before it
# is used to build the input/output file paths. Re-running this cell is
# harmless because str() of a str returns the same value.
output_directory = str(output_directory)

In [6]:
# Read the primary and secondary threshold polygons from the parquet files
# found under the output directory.
primary_threshold_polygons_fp = os.path.join(output_directory, "primary_threshold_polygons_merged_at_ds_boundaries.parquet")
secondary_threshold_polygons_fp = os.path.join(output_directory, "secondary_threshold_polygons_merged_at_ds_boundaries.parquet")

_log.info("Loading primary and secondary threshold polygons...")
primary_threshold_polygons = gpd.read_parquet(primary_threshold_polygons_fp)
secondary_threshold_polygons = gpd.read_parquet(secondary_threshold_polygons_fp)

# Report how many polygons each set contains.
_log.info(
    f"Primary threshold polygons count {len(primary_threshold_polygons)}."
)
_log.info(
    f"Secondary threshold polygons count {len(secondary_threshold_polygons)}."
)

[2023-10-04 20:32:17,363] {7015612.py:2} INFO - Loading primary and secondary threshold polygons...
[2023-10-04 20:32:18,241] {7015612.py:13} INFO - Primary threshold polygons count 42588.
[2023-10-04 20:32:18,242] {7015612.py:14} INFO - Secondary threshold polygons count 81450.


In [7]:
# Both polygon sets must share one CRS: it is the CRS every mask is
# reprojected to (`.to_crs(crs)`) before the intersection filters run.
# The message makes a failure self-explanatory instead of a bare AssertionError.
assert primary_threshold_polygons.crs == secondary_threshold_polygons.crs, (
    "CRS mismatch between primary "
    f"({primary_threshold_polygons.crs}) and secondary "
    f"({secondary_threshold_polygons.crs}) threshold polygons."
)
crs = primary_threshold_polygons.crs

In [8]:
# Keep only the primary threshold polygons whose area lies in the interval
# (min_polygon_size, max_polygon_size].
_log.info(
    f"Filtering primary threshold polygons by minimum area {min_polygon_size} and max area {max_polygon_size}..."
)

# The computed area is stored on the input frame as an "area" column.
primary_threshold_polygons["area"] = pd.to_numeric(primary_threshold_polygons.area)
primary_area = primary_threshold_polygons["area"]
primary_area_in_range = primary_area.gt(min_polygon_size) & primary_area.le(max_polygon_size)
area_filtered_primary_threshold_polygons = primary_threshold_polygons.loc[primary_area_in_range]
_log.info(f"Filtered out {len(primary_threshold_polygons) - len(area_filtered_primary_threshold_polygons)} primary threshold polygons")

# Persist the intermediate result next to the inputs.
area_filtered_primary_threshold_polygons_fp = os.path.join(
    output_directory, "area_filtered_primary_threshold_polygons.parquet"
)
area_filtered_primary_threshold_polygons.to_parquet(area_filtered_primary_threshold_polygons_fp)
_log.info(
    f"Area filtered primary threshold polygons written to {area_filtered_primary_threshold_polygons_fp}"
)

[2023-10-04 20:32:18,252] {676914410.py:1} INFO - Filtering primary threshold polygons by minimum area 4500 and max area inf...
[2023-10-04 20:32:18,264] {676914410.py:7} INFO - Filtered out 30951 primary threshold polygons
[2023-10-04 20:32:18,583] {676914410.py:13} INFO - Area filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/area_filtered_primary_threshold_polygons.parquet


In [9]:
# Keep only the secondary threshold polygons whose area does not exceed
# max_polygon_size (no minimum area is applied to this set).
_log.info(
    f"Filtering secondary threshold polygons by max area {max_polygon_size}..."
)

# The computed area is stored on the input frame as an "area" column.
secondary_threshold_polygons["area"] = pd.to_numeric(secondary_threshold_polygons.area)
secondary_area_in_range = secondary_threshold_polygons["area"].le(max_polygon_size)
area_filtered_secondary_threshold_polygons = secondary_threshold_polygons.loc[secondary_area_in_range]
_log.info(f"Filtered out {len(secondary_threshold_polygons) - len(area_filtered_secondary_threshold_polygons)} secondary threshold polygons")

# Persist the intermediate result next to the inputs.
area_filtered_secondary_threshold_polygons_fp = os.path.join(
    output_directory, "area_filtered_secondary_threshold_polygons.parquet"
)
area_filtered_secondary_threshold_polygons.to_parquet(area_filtered_secondary_threshold_polygons_fp)
_log.info(
    f"Area filtered secondary threshold polygons written to {area_filtered_secondary_threshold_polygons_fp}"
)

[2023-10-04 20:32:18,589] {45407760.py:1} INFO - Filtering secondary threshold polygons by max area inf...
[2023-10-04 20:32:18,612] {45407760.py:7} INFO - Filtered out 0 secondary threshold polygons
[2023-10-04 20:32:19,137] {45407760.py:13} INFO - Area filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/area_filtered_secondary_threshold_polygons.parquet


In [None]:
# Ocean filter: when a land/sea mask path is configured, run both area-filtered
# polygon sets through filter_geodataframe_by_intersection() against the mask
# (with invert_mask=True) and write each result to parquet. With an empty path
# the step is skipped and the area-filtered polygons are passed through as-is.
if land_sea_mask_fp:
    _log.info("Filtering out ocean polygons from the primary and secondary threshold waterbody polygons.")
    try:
        # Reproject the mask to the CRS shared by the polygon sets.
        land_sea_mask = gpd.read_file(land_sea_mask_fp).to_crs(crs)
    except Exception as error:
        # Log the *path*: `land_sea_mask` is unbound when the read fails, so
        # referencing it here would raise NameError and hide the real error
        # (or dump a stale GeoDataFrame left over from a previous run).
        _log.exception(f"Could not read file {land_sea_mask_fp}")
        raise error
    else:
        # Primary threshold polygons.
        inland_primary_threshold_polygons, _ = deafrica_waterbodies.filters.filter_geodataframe_by_intersection(
            area_filtered_primary_threshold_polygons, land_sea_mask, invert_mask=True
        )
        _log.info(f"Filtered out {len(area_filtered_primary_threshold_polygons) - len(inland_primary_threshold_polygons)} primary threshold polygons.")

        inland_primary_threshold_polygons_fp = os.path.join(output_directory, "inland_primary_threshold_polygons.parquet")
        inland_primary_threshold_polygons.to_parquet(inland_primary_threshold_polygons_fp)
        _log.info(f"Ocean filtered primary threshold polygons written to {inland_primary_threshold_polygons_fp}")

        # Secondary threshold polygons.
        inland_secondary_threshold_polygons, _ = deafrica_waterbodies.filters.filter_geodataframe_by_intersection(
            area_filtered_secondary_threshold_polygons, land_sea_mask, invert_mask=True
        )
        _log.info(f"Filtered out {len(area_filtered_secondary_threshold_polygons) - len(inland_secondary_threshold_polygons)} secondary threshold polygons.")

        inland_secondary_threshold_polygons_fp = os.path.join(output_directory, "inland_secondary_threshold_polygons.parquet")
        inland_secondary_threshold_polygons.to_parquet(inland_secondary_threshold_polygons_fp)
        _log.info(f"Ocean filtered secondary threshold polygons written to {inland_secondary_threshold_polygons_fp}")

else:
    _log.info("Skipping filtering out ocean polygons step.")
    inland_primary_threshold_polygons = area_filtered_primary_threshold_polygons
    inland_secondary_threshold_polygons = area_filtered_secondary_threshold_polygons

[2023-10-04 20:32:19,144] {2040202294.py:2} INFO - Filtering out ocean polygons from the primary and secondary threshold waterbody polygons.
[2023-10-04 20:33:02,727] {2040202294.py:10} INFO - Filtered out 107 primary threshold polygons.
[2023-10-04 20:33:03,067] {2040202294.py:14} INFO - Ocean filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/inland_primary_threshold_polygons.parquet


In [None]:
# CBD filter: only the primary threshold polygons are filtered against the
# urban mask. With an empty mask path the inland polygons pass through as-is.
if not urban_mask_fp:
    _log.info("Skipping filtering out CBDs polygons step.")
    cbd_filtered_primary_threshold_polygons = inland_primary_threshold_polygons
else:
    _log.info("Filtering out CBDs polygons from the primary threshold polygons...")
    try:
        # Reproject the mask to the CRS shared by the polygon sets.
        urban_mask = gpd.read_file(urban_mask_fp).to_crs(crs)
    except Exception as error:
        _log.exception(f"Could not read file {urban_mask_fp}")
        raise error

    # Only reached when the mask was read successfully (the handler re-raises).
    cbd_filtered_primary_threshold_polygons, _ = deafrica_waterbodies.filters.filter_geodataframe_by_intersection(
        inland_primary_threshold_polygons, urban_mask
    )
    _log.info(
        f"Filtered out {len(inland_primary_threshold_polygons) - len(cbd_filtered_primary_threshold_polygons)} primary threshold polygons."
    )

    cbd_filtered_primary_threshold_polygons_fp = os.path.join(
        output_directory, "cbd_filtered_primary_threshold_polygons.parquet"
    )
    cbd_filtered_primary_threshold_polygons.to_parquet(cbd_filtered_primary_threshold_polygons_fp)
    _log.info(
        f"CBDs filtered primary threshold polygons written to {cbd_filtered_primary_threshold_polygons_fp}"
    )

In [None]:
%%time
# Combine the CBD-filtered primary threshold polygons with the inland
# secondary threshold polygons, then write the result to parquet.
merged_polygons_fp = os.path.join(output_directory, "merged_polygons.parquet")

_log.info("Merging the primary threshold and secondary threshold polygons...")
merged_polygons = deafrica_waterbodies.filters.merge_primary_and_secondary_threshold_polygons(
    primary_threshold_polygons=cbd_filtered_primary_threshold_polygons,
    secondary_threshold_polygons=inland_secondary_threshold_polygons,
)
_log.info(
    f"Total waterbody polygons count after merge: {len(merged_polygons)}."
)

merged_polygons.to_parquet(merged_polygons_fp)
_log.info(
    f"Merged waterbody polygons written to {merged_polygons_fp}"
)

In [None]:
# Major rivers filter: the merged waterbody polygons are filtered against the
# major rivers mask. With an empty mask path the merged polygons pass through
# as-is.
if not major_rivers_mask_fp:
    _log.info("Skipping filtering out major rivers polygons step.")
    major_rivers_filtered_polygons = merged_polygons
else:
    _log.info("Filtering out major rivers polygons from the waterbody polygons...")
    try:
        # Reproject the mask to the CRS shared by the polygon sets.
        major_rivers = gpd.read_file(major_rivers_mask_fp).to_crs(crs)
    except Exception as error:
        _log.exception(f"Could not read file {major_rivers_mask_fp}")
        raise error

    # Only reached when the mask was read successfully (the handler re-raises).
    major_rivers_filtered_polygons, _ = deafrica_waterbodies.filters.filter_geodataframe_by_intersection(
        merged_polygons, major_rivers
    )
    _log.info(
        f"Filtered out {len(merged_polygons) - len(major_rivers_filtered_polygons)} waterbody polygons."
    )

    major_rivers_filtered_polygons_fp = os.path.join(
        output_directory, "major_rivers_filtered_polygons.parquet"
    )
    major_rivers_filtered_polygons.to_parquet(major_rivers_filtered_polygons_fp)
    _log.info(
        f"Major rivers filtered polygons written to {major_rivers_filtered_polygons_fp}"
    )

In [None]:
# Split large polygons using the configured method and threshold, then write
# the result to parquet.
large_polygons_handled_fp = os.path.join(output_directory, "large_polygons_handled.parquet")

_log.info("Splitting large polygons...")
large_polygons_handled = deafrica_waterbodies.filters.split_large_polygons(
    input_gdf=major_rivers_filtered_polygons,
    pp_thresh=pp_test_threshold,
    method=handle_large_polygons,
)
_log.info(
    f"Waterbody polygons count after splitting large polygons {len(large_polygons_handled)}"
)

large_polygons_handled.to_parquet(large_polygons_handled_fp)
_log.info(
    f"Waterbodies with large polygons handled written to {large_polygons_handled_fp}"
)