In [1]:
import logging
import math
import os

import click
import geopandas as gpd
from deafrica_waterbodies.cli.logs import logging_setup
from deafrica_waterbodies.filters import (
    filter_by_area,
    filter_using_land_sea_mask,
    filter_using_major_rivers_mask,
    filter_using_urban_mask,
    merge_primary_and_secondary_threshold_polygons,
    split_large_polygons,
)

In [2]:
# Default AWS configuration for the Analysis Sandbox, which is set through
# environment variables. Only the keys (the variable names) are used below.
aws_default_config = {
    # "AWS_NO_SIGN_REQUEST": "YES",
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# To access a public bucket, the AWS credentials must be removed from the
# environment variables, otherwise the following error occurs:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
# `pop` with a default removes the variable when it is set and does nothing
# when it is not, so this cell is safe to re-run.
for key in aws_default_config:
    os.environ.pop(key, None)

In [3]:
# --- Run configuration -------------------------------------------------------
# Verbosity value handed to `logging_setup`.
verbose = 1

# Location the threshold polygons are read from and all outputs are written to.
output_directory = "s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2"

# Polygon area limits handed to `filter_by_area`.
min_polygon_size = 4500  # 5 pixels
max_polygon_size = math.inf  # no upper bound

# Mask file paths for the land/sea, urban and major-rivers filters.
# With an empty string the corresponding filter logs that the step is skipped.
land_sea_mask_fp = ""
urban_mask_fp = ""
major_rivers_mask_fp = ""

# Method handed to `split_large_polygons`. With "nothing" the polygons are not
# split; the function reports 'erode-dilate-v1' and 'erode-dilate-v2' as the
# available splitting methods.
handle_large_polygons = "nothing"
pp_test_threshold = 0.005  # handed to `split_large_polygons` as `pp_thresh`

In [4]:
# Set up logging through the project's `logging_setup` helper, passing the
# `verbose` setting, then create the module-level logger `_log` used for the
# progress messages in this notebook.
# NOTE(review): presumably `verbose` selects the log level — confirm in
# deafrica_waterbodies.cli.logs.
logging_setup(verbose=verbose)
_log = logging.getLogger(__name__)

In [5]:
# Support pathlib paths: coerce the output directory to `str` so that a
# `pathlib.Path` value is accepted as well as a plain string.
output_directory = str(output_directory)

In [6]:
# Read the primary and secondary threshold polygons from the parquet files in
# the output directory.
_log.info("Loading primary and secondary threshold polygons...")

# Primary threshold polygons.
primary_threshold_polygons_fp = os.path.join(
    output_directory,
    "primary_threshold_polygons_merged_at_ds_boundaries.parquet",
)
primary_threshold_polygons = gpd.read_parquet(primary_threshold_polygons_fp)

# Secondary threshold polygons.
secondary_threshold_polygons_fp = os.path.join(
    output_directory,
    "secondary_threshold_polygons_merged_at_ds_boundaries.parquet",
)
secondary_threshold_polygons = gpd.read_parquet(secondary_threshold_polygons_fp)

# Report how many polygons were loaded for each threshold.
_log.info(f"Primary threshold polygons count {len(primary_threshold_polygons)}.")
_log.info(f"Secondary threshold polygons count {len(secondary_threshold_polygons)}.")

[2023-10-06 18:47:20,768] {7015612.py:2} INFO - Loading primary and secondary threshold polygons...
[2023-10-06 18:47:21,486] {7015612.py:13} INFO - Primary threshold polygons count 42750.
[2023-10-06 18:47:21,487] {7015612.py:14} INFO - Secondary threshold polygons count 81635.


In [7]:
# Filter both polygon sets by area, using the configured size limits.
area_filtered_primary_threshold_polygons, area_filtered_secondary_threshold_polygons = filter_by_area(
    primary_threshold_polygons=primary_threshold_polygons,
    secondary_threshold_polygons=secondary_threshold_polygons,
    min_polygon_size=min_polygon_size,
    max_polygon_size=max_polygon_size,
)

# Output locations for the two area-filtered polygon sets.
area_filtered_primary_threshold_polygons_fp = os.path.join(
    output_directory,
    "area_filtered_primary_threshold_polygons.parquet",
)
area_filtered_secondary_threshold_polygons_fp = os.path.join(
    output_directory,
    "area_filtered_secondary_threshold_polygons.parquet",
)

# Write each set to parquet and log where it went (primary first).
area_filtered_primary_threshold_polygons.to_parquet(area_filtered_primary_threshold_polygons_fp)
_log.info(f"Area filtered primary threshold polygons written to {area_filtered_primary_threshold_polygons_fp}")

area_filtered_secondary_threshold_polygons.to_parquet(area_filtered_secondary_threshold_polygons_fp)
_log.info(f"Area filtered secondary threshold polygons written to {area_filtered_secondary_threshold_polygons_fp}")

[2023-10-06 18:47:21,493] {filters.py:127} INFO - Filtering primary threshold polygons by minimum area 4500 and max area inf...
[2023-10-06 18:47:21,506] {filters.py:139} INFO - Filtered out 31040 primary threshold polygons.
[2023-10-06 18:47:21,506] {filters.py:146} INFO - Filtering secondary threshold polygons by max area inf...
[2023-10-06 18:47:21,530] {filters.py:153} INFO - Filtered out 0 secondary threshold polygons.
[2023-10-06 18:47:21,837] {87560456.py:15} INFO - Area filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/area_filtered_primary_threshold_polygons.parquet
[2023-10-06 18:47:22,393] {87560456.py:21} INFO - Area filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/area_filtered_secondary_threshold_polygons.parquet


In [8]:
# Run the land/sea mask filter on the area-filtered polygon sets.
inland_primary_threshold_polygons, inland_secondary_threshold_polygons = filter_using_land_sea_mask(
    primary_threshold_polygons=area_filtered_primary_threshold_polygons,
    secondary_threshold_polygons=area_filtered_secondary_threshold_polygons,
    land_sea_mask_fp=land_sea_mask_fp,
)

# Output locations for the two resulting polygon sets.
inland_primary_threshold_polygons_fp = os.path.join(
    output_directory,
    "inland_primary_threshold_polygons.parquet",
)
inland_secondary_threshold_polygons_fp = os.path.join(
    output_directory,
    "inland_secondary_threshold_polygons.parquet",
)

# Write each set to parquet and log where it went (primary first).
inland_primary_threshold_polygons.to_parquet(inland_primary_threshold_polygons_fp)
_log.info(f"Ocean filtered primary threshold polygons written to {inland_primary_threshold_polygons_fp}")

inland_secondary_threshold_polygons.to_parquet(inland_secondary_threshold_polygons_fp)
_log.info(f"Ocean filtered secondary threshold polygons written to {inland_secondary_threshold_polygons_fp}")

[2023-10-06 18:47:22,399] {filters.py:225} INFO - Skipping filtering out ocean polygons step.
[2023-10-06 18:47:22,743] {2722900615.py:12} INFO - Ocean filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/inland_primary_threshold_polygons.parquet
[2023-10-06 18:47:23,347] {2722900615.py:16} INFO - Ocean filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/inland_secondary_threshold_polygons.parquet


In [9]:
# Run the urban (CBD) mask filter on the polygon sets from the land/sea step.
cbd_filtered_primary_threshold_polygons, cbd_filtered_secondary_threshold_polygons = filter_using_urban_mask(
    primary_threshold_polygons=inland_primary_threshold_polygons,
    secondary_threshold_polygons=inland_secondary_threshold_polygons,
    urban_mask_fp=urban_mask_fp,
)

# Output locations for the two resulting polygon sets.
cbd_filtered_primary_threshold_polygons_fp = os.path.join(
    output_directory,
    "cbd_filtered_primary_threshold_polygons.parquet",
)
cbd_filtered_secondary_threshold_polygons_fp = os.path.join(
    output_directory,
    "cbd_filtered_secondary_threshold_polygons.parquet",
)

# Write each set to parquet and log where it went (primary first).
cbd_filtered_primary_threshold_polygons.to_parquet(cbd_filtered_primary_threshold_polygons_fp)
_log.info(f"CBDs filtered primary threshold polygons written to {cbd_filtered_primary_threshold_polygons_fp}")

cbd_filtered_secondary_threshold_polygons.to_parquet(cbd_filtered_secondary_threshold_polygons_fp)
_log.info(f"CBDs filtered secondary threshold polygons written to {cbd_filtered_secondary_threshold_polygons_fp}")

[2023-10-06 18:47:23,353] {filters.py:294} INFO - Skipping filtering out CBDs step.
[2023-10-06 18:47:23,662] {2340062003.py:15} INFO - CBDs filtered primary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/cbd_filtered_primary_threshold_polygons.parquet
[2023-10-06 18:47:24,170] {2340062003.py:21} INFO - CBDs filtered secondary threshold polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/cbd_filtered_secondary_threshold_polygons.parquet


In [10]:
%%time
# Merge the primary and secondary threshold polygons.
merged_polygons = merge_primary_and_secondary_threshold_polygons(
    primary_threshold_polygons=cbd_filtered_primary_threshold_polygons,
    secondary_threshold_polygons=cbd_filtered_secondary_threshold_polygons,
)

merged_polygons_fp = os.path.join(output_directory, "merged_polygons.parquet")
merged_polygons.to_parquet(merged_polygons_fp)
_log.info(f"Merged waterbody polygons written to {merged_polygons_fp}")

[2023-10-06 18:47:24,176] {filters.py:319} INFO - Merging the primary threshold and secondary threshold polygons...
[2023-10-06 18:49:10,808] {filters.py:340} INFO - Waterbody polygons count after merge: 7164.
[2023-10-06 18:49:11,201] {<timed exec>:9} INFO - Merged waterbody polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/merged_polygons.parquet
CPU times: user 1min 46s, sys: 463 ms, total: 1min 46s
Wall time: 1min 47s


In [11]:
# Run the major-rivers mask filter on the merged waterbody polygons and write
# the result to parquet.
major_rivers_filtered_polygons_fp = os.path.join(
    output_directory,
    "major_rivers_filtered_polygons.parquet",
)

major_rivers_filtered_polygons = filter_using_major_rivers_mask(
    waterbody_polygons=merged_polygons,
    major_rivers_mask_fp=major_rivers_mask_fp,
)

major_rivers_filtered_polygons.to_parquet(major_rivers_filtered_polygons_fp)
_log.info(f"Major rivers filtered polygons written to {major_rivers_filtered_polygons_fp}")

[2023-10-06 18:49:11,209] {filters.py:384} INFO - Skipping filtering out major rivers polygons step.
[2023-10-06 18:49:11,615] {1532569761.py:7} INFO - Major rivers filtered polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/major_rivers_filtered_polygons.parquet


In [12]:
# Handle large polygons with the configured method and threshold; the method
# name is also embedded in the output file name.
large_polygons_handled_fp = os.path.join(
    output_directory,
    f"large_polygons_handled_{handle_large_polygons}.parquet",
)

large_polygons_handled = split_large_polygons(
    waterbody_polygons=major_rivers_filtered_polygons,
    pp_thresh=pp_test_threshold,
    method=handle_large_polygons,
)
_log.info(f"Waterbody polygons count after splitting large polygons {len(large_polygons_handled)}.")

# Write the result to parquet.
large_polygons_handled.to_parquet(large_polygons_handled_fp)
_log.info(f"Waterbodies with large polygons handled written to {large_polygons_handled_fp}")

[2023-10-06 18:49:11,639] {filters.py:458} INFO - You have chosen not to split large polygons. If you meant to use this option, please select one of the following methods: ['erode-dilate-v1', 'erode-dilate-v2'].
[2023-10-06 18:49:11,640] {2125941619.py:5} INFO - Waterbody polygons count after splitting large polygons 7164.
[2023-10-06 18:49:12,032] {2125941619.py:9} INFO - Waterbodies with large polygons handled written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/large_polygons_handled_nothing.parquet


In [13]:
# Apply the area filter a second time so that every polygon remaining after the
# large-polygon handling is still within the configured size range.
# Only one polygon set is passed (the secondary slot is None), so the second
# element of the returned pair is discarded.
area_filtered_large_polygons_handled_fp = os.path.join(
    output_directory,
    "area_filtered_large_polygons_handled.parquet",
)

area_filtered_large_polygons_handled, _ = filter_by_area(
    primary_threshold_polygons=large_polygons_handled,
    secondary_threshold_polygons=None,
    min_polygon_size=min_polygon_size,
    max_polygon_size=max_polygon_size,
)

# Write the result to parquet.
area_filtered_large_polygons_handled.to_parquet(area_filtered_large_polygons_handled_fp)
_log.info(f"Area filtered polygons written to {area_filtered_large_polygons_handled_fp}")

[2023-10-06 18:49:12,042] {filters.py:127} INFO - Filtering primary threshold polygons by minimum area 4500 and max area inf...
[2023-10-06 18:49:12,054] {filters.py:139} INFO - Filtered out 621 primary threshold polygons.
[2023-10-06 18:49:12,466] {1618256413.py:12} INFO - Area filtered polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/area_filtered_large_polygons_handled.parquet


In [14]:
# Build the final GeoDataFrame from the "geometry" column alone, carrying over
# the CRS of the area-filtered polygons, and write it to parquet.
filtered_polygons_fp = os.path.join(
    output_directory,
    "filtered_polygons.parquet",
)

filtered_polygons = gpd.GeoDataFrame(
    geometry=area_filtered_large_polygons_handled["geometry"],
    crs=area_filtered_large_polygons_handled.crs,
)

filtered_polygons.to_parquet(filtered_polygons_fp)
_log.info(f"Filtered waterbody polygons written to {filtered_polygons_fp}")

[2023-10-06 18:49:12,873] {1197084677.py:7} INFO - Filtered waterbody polygons written to s3://deafrica-waterbodies-dev/test_out_dir/0-0-1/shapefile2/filtered_polygons.parquet
