In [1]:
import logging
import math

import deafrica_waterbodies.filters
import geopandas as gpd
import pandas as pd
from deafrica_waterbodies.cli.logs import logging_setup

In [2]:
import os

# Credential variables that the Analysis Sandbox sets by default in the
# environment (placeholder values only).
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# Public buckets cannot be accessed while these placeholder credentials are
# present in the environment; the request fails with:
#   PermissionError: The AWS Access Key Id you provided does not exist in our records.
# Remove each variable if it is set; variables that are absent are ignored.
for credential_var in aws_default_config:
    os.environ.pop(credential_var, None)

In [3]:
# Verbosity level handed to the logging set-up.
verbose = 1

# Directory the intermediate and final polygon files are read from and written to.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile"

# Area limits applied by the area filter.
min_polygon_size = 4500  # 5 pixels
max_polygon_size = math.inf  # no upper limit

# Mask files used by the filtering steps. The urban and major rivers masks are
# left as empty strings in this run.
land_sea_mask_fp = "../../../WaterbodiesTests/GOaS_v1_20211214/goas_v01.shp"
urban_mask_fp = ""
major_rivers_mask_fp = ""

# Method used to split large polygons, and the threshold passed along with it.
handle_large_polygons = "erode-dilate-v2"
pp_test_threshold = 0.005

In [4]:
# Set up logger.
# Logging is configured through the project's `logging_setup` helper using the
# `verbose` parameter; `_log` is the logger used for this notebook's own messages.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths.
# Normalise `output_directory` to a plain string in case it was supplied as a
# path object; this is a no-op when it is already a string, so the cell can be
# re-run safely.
output_directory = str(output_directory)

In [6]:
# Read the primary and secondary threshold polygons (merged at dataset
# boundaries) from the output directory.
_log.info("Loading primary and secondary threshold polygons...")

# Primary threshold polygons.
primary_threshold_polygons_fp = os.path.join(
    output_directory,
    "primary_threshold_polygons_merged_at_ds_boundaries.parquet",
)
primary_threshold_polygons = gpd.read_parquet(primary_threshold_polygons_fp)

# Secondary threshold polygons.
secondary_threshold_polygons_fp = os.path.join(
    output_directory,
    "secondary_threshold_polygons_merged_at_ds_boundaries.parquet",
)
secondary_threshold_polygons = gpd.read_parquet(secondary_threshold_polygons_fp)

# Report how many polygons were loaded for each threshold.
_log.info(f"Primary threshold polygons count {len(primary_threshold_polygons)}.")
_log.info(f"Secondary threshold polygons count {len(secondary_threshold_polygons)}.")

[2023-10-05 17:34:02,674] {7015612.py:2} INFO - Loading primary and secondary threshold polygons...
[2023-10-05 17:34:03,461] {7015612.py:13} INFO - Primary threshold polygons count 42588.
[2023-10-05 17:34:03,461] {7015612.py:14} INFO - Secondary threshold polygons count 81450.


In [7]:
# Filter both polygon sets by area using the configured minimum and maximum
# polygon sizes. The filter returns the primary and secondary results as a pair.
area_filter_result = deafrica_waterbodies.filters.filter_by_area(
    primary_threshold_polygons=primary_threshold_polygons,
    secondary_threshold_polygons=secondary_threshold_polygons,
    min_polygon_size=min_polygon_size,
    max_polygon_size=max_polygon_size,
)
area_filtered_primary_threshold_polygons, area_filtered_secondary_threshold_polygons = area_filter_result

# Persist the area filtered primary threshold polygons.
area_filtered_primary_threshold_polygons_fp = os.path.join(
    output_directory,
    "area_filtered_primary_threshold_polygons.parquet",
)
area_filtered_primary_threshold_polygons.to_parquet(area_filtered_primary_threshold_polygons_fp)
_log.info(f"Area filtered primary threshold polygons written to {area_filtered_primary_threshold_polygons_fp}")

# Persist the area filtered secondary threshold polygons.
area_filtered_secondary_threshold_polygons_fp = os.path.join(
    output_directory,
    "area_filtered_secondary_threshold_polygons.parquet",
)
area_filtered_secondary_threshold_polygons.to_parquet(area_filtered_secondary_threshold_polygons_fp)
_log.info(f"Area filtered secondary threshold polygons written to {area_filtered_secondary_threshold_polygons_fp}")

[2023-10-05 17:34:03,467] {filters.py:126} INFO - Filtering primary threshold polygons by minimum area 4500 and max area inf...
[2023-10-05 17:34:03,479] {filters.py:137} INFO - Filtered out 30951 primary threshold polygons.
[2023-10-05 17:34:03,480] {filters.py:144} INFO - Filtering secondary threshold polygons by max area inf...
[2023-10-05 17:34:03,503] {filters.py:150} INFO - Filtered out 0 secondary threshold polygons.
[2023-10-05 17:34:03,848] {383104105.py:15} INFO - Area filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/area_filtered_primary_threshold_polygons.parquet
[2023-10-05 17:34:04,166] {383104105.py:21} INFO - Area filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/area_filtered_secondary_threshold_polygons.parquet


In [8]:
# Filter the area filtered polygons using the land/sea mask so that ocean
# polygons are removed from both the primary and secondary sets.
land_sea_filter_result = deafrica_waterbodies.filters.filter_using_land_sea_mask(
    primary_threshold_polygons=area_filtered_primary_threshold_polygons,
    secondary_threshold_polygons=area_filtered_secondary_threshold_polygons,
    land_sea_mask_fp=land_sea_mask_fp,
)
inland_primary_threshold_polygons, inland_secondary_threshold_polygons = land_sea_filter_result

# Persist the inland primary threshold polygons.
inland_primary_threshold_polygons_fp = os.path.join(
    output_directory,
    "inland_primary_threshold_polygons.parquet",
)
inland_primary_threshold_polygons.to_parquet(inland_primary_threshold_polygons_fp)
_log.info(f"Ocean filtered primary threshold polygons written to {inland_primary_threshold_polygons_fp}")

# Persist the inland secondary threshold polygons.
inland_secondary_threshold_polygons_fp = os.path.join(
    output_directory,
    "inland_secondary_threshold_polygons.parquet",
)
inland_secondary_threshold_polygons.to_parquet(inland_secondary_threshold_polygons_fp)
_log.info(f"Ocean filtered secondary threshold polygons written to {inland_secondary_threshold_polygons_fp}")

[2023-10-05 17:34:04,172] {filters.py:188} INFO - Filtering out ocean polygons from the primary and secondary threshold waterbody polygons...
[2023-10-05 17:34:47,222] {filters.py:204} INFO - Filtered out 107 primary threshold polygons.
[2023-10-05 17:34:53,671] {filters.py:215} INFO - Filtered out 107 secondary threshold polygons.
[2023-10-05 17:34:54,030] {3083387701.py:12} INFO - Ocean filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/inland_primary_threshold_polygons.parquet
[2023-10-05 17:34:54,329] {3083387701.py:16} INFO - Ocean filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/inland_secondary_threshold_polygons.parquet


In [9]:
# Filter the inland polygons using the urban (CBD) mask. The filter returns the
# primary and secondary results as a pair.
(
    cbd_filtered_primary_threshold_polygons,
    cbd_filtered_secondary_threshold_polygons,
) = deafrica_waterbodies.filters.filter_using_urban_mask(
    primary_threshold_polygons=inland_primary_threshold_polygons,
    secondary_threshold_polygons=inland_secondary_threshold_polygons,
    urban_mask_fp=urban_mask_fp,
)

# Persist the CBD filtered primary threshold polygons.
cbd_filtered_primary_threshold_polygons_fp = os.path.join(
    output_directory,
    "cbd_filtered_primary_threshold_polygons.parquet",
)
cbd_filtered_primary_threshold_polygons.to_parquet(cbd_filtered_primary_threshold_polygons_fp)
_log.info(f"CBDs filtered primary threshold polygons written to {cbd_filtered_primary_threshold_polygons_fp}")

# Persist the CBD filtered secondary threshold polygons.
cbd_filtered_secondary_threshold_polygons_fp = os.path.join(
    output_directory,
    "cbd_filtered_secondary_threshold_polygons.parquet",
)
cbd_filtered_secondary_threshold_polygons.to_parquet(cbd_filtered_secondary_threshold_polygons_fp)
_log.info(f"CBDs filtered secondary threshold polygons written to {cbd_filtered_secondary_threshold_polygons_fp}")

[2023-10-05 17:34:54,336] {filters.py:291} INFO - Skipping filtering out CBDs step.
[2023-10-05 17:34:54,624] {1718929370.py:11} INFO - CBDs filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/cbd_filtered_primary_threshold_polygons.parquet
[2023-10-05 17:34:54,952] {1718929370.py:17} INFO - CBDs filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/cbd_filtered_secondary_threshold_polygons.parquet


In [10]:
%%time
# Combine the CBD filtered primary and secondary threshold polygons into a
# single set of waterbody polygons.
merged_polygons = deafrica_waterbodies.filters.merge_primary_and_secondary_threshold_polygons(
    primary_threshold_polygons=cbd_filtered_primary_threshold_polygons,
    secondary_threshold_polygons=cbd_filtered_secondary_threshold_polygons,
)

# Persist the merged polygons.
merged_polygons_fp = os.path.join(
    output_directory,
    "merged_polygons.parquet",
)
merged_polygons.to_parquet(merged_polygons_fp)
_log.info(f"Merged waterbody polygons written to {merged_polygons_fp}")

[2023-10-05 17:34:54,959] {filters.py:316} INFO - Merging the primary threshold and secondary threshold polygons...
[2023-10-05 17:35:15,157] {filters.py:337} INFO - Waterbody polygons count after merge: 11530.
[2023-10-05 17:35:15,500] {<timed exec>:9} INFO - Merged waterbody polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/merged_polygons.parquet
CPU times: user 20.3 s, sys: 27.6 ms, total: 20.3 s
Wall time: 20.5 s


In [11]:
# Filter the merged waterbody polygons using the major rivers mask.
major_rivers_filtered_polygons = deafrica_waterbodies.filters.filter_using_major_rivers_mask(
    waterbody_polygons=merged_polygons,
    major_rivers_mask_fp=major_rivers_mask_fp,
)

# Persist the major rivers filtered polygons.
major_rivers_filtered_polygons_fp = os.path.join(
    output_directory,
    "major_rivers_filtered_polygons.parquet",
)
major_rivers_filtered_polygons.to_parquet(major_rivers_filtered_polygons_fp)
_log.info(f"Major rivers filtered polygons written to {major_rivers_filtered_polygons_fp}")

[2023-10-05 17:35:15,507] {filters.py:381} INFO - Skipping filtering out major rivers polygons step.
[2023-10-05 17:35:15,844] {3945903002.py:7} INFO - Major rivers filtered polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/major_rivers_filtered_polygons.parquet


In [12]:
# Split large polygons using the configured method and threshold.
large_polygons_handled = deafrica_waterbodies.filters.split_large_polygons(
    waterbody_polygons=major_rivers_filtered_polygons,
    pp_thresh=pp_test_threshold,
    method=handle_large_polygons,
)
_log.info(f"Waterbody polygons count after splitting large polygons {len(large_polygons_handled)}.")

# Persist the polygons after large polygon handling.
large_polygons_handled_fp = os.path.join(
    output_directory,
    "large_polygons_handled.parquet",
)
large_polygons_handled.to_parquet(large_polygons_handled_fp)
_log.info(f"Waterbodies with large polygons handled written to {large_polygons_handled_fp}")

[2023-10-05 17:35:15,863] {filters.py:458} INFO - Splitting large polygons using the `erode-dilate-v2` method, using the threshold 0.005.
[2023-10-05 17:37:13,815] {1033945788.py:7} INFO - Waterbody polygons count after splitting large polygons 11782.
[2023-10-05 17:37:14,156] {1033945788.py:11} INFO - Waterbodies with large polygons handled written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/large_polygons_handled.parquet


In [13]:
# Reapply the size filtering, just to check that all of the split and filtered
# waterbodies are still in the size range we want.
# There is no separate secondary polygon set at this stage, so None is passed
# and the second element of the returned pair is discarded. It is bound to a
# named throwaway rather than `_`, because assigning to `_` in IPython/Jupyter
# shadows the built-in "last output" variable for the rest of the session.
area_filtered_large_polygons_handled, _unused_secondary_polygons = deafrica_waterbodies.filters.filter_by_area(
    primary_threshold_polygons=large_polygons_handled,
    secondary_threshold_polygons=None,
    min_polygon_size=min_polygon_size,
    max_polygon_size=max_polygon_size,
)

# Persist the re-filtered polygons.
area_filtered_large_polygons_handled_fp = os.path.join(
    output_directory,
    "area_filtered_large_polygons_handled.parquet",
)
area_filtered_large_polygons_handled.to_parquet(area_filtered_large_polygons_handled_fp)
_log.info(f"Area filtered polygons written to {area_filtered_large_polygons_handled_fp}")

[2023-10-05 17:37:14,162] {filters.py:126} INFO - Filtering primary threshold polygons by minimum area 4500 and max area inf...
[2023-10-05 17:37:14,171] {filters.py:137} INFO - Filtered out 82 primary threshold polygons.
[2023-10-05 17:37:14,499] {2615340852.py:11} INFO - Area filtered polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/area_filtered_large_polygons_handled.parquet


In [14]:
# Build the final output: a GeoDataFrame holding only the geometry column of
# the area filtered polygons, with the same CRS.
filtered_polygons_fp = os.path.join(output_directory, "filtered_polygons.parquet")
filtered_polygons = gpd.GeoDataFrame(
    geometry=area_filtered_large_polygons_handled["geometry"],
    crs=area_filtered_large_polygons_handled.crs,
)

# Persist the final filtered waterbody polygons.
filtered_polygons.to_parquet(filtered_polygons_fp)
_log.info(f"Filtered waterbody polygons written to {filtered_polygons_fp}")

[2023-10-05 17:37:14,814] {983404294.py:5} INFO - Filtered waterbody polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile/filtered_polygons.parquet
