# Check the shapefile

Before running conflux over the shapefile, check it for null geometries, invalid geometries, and overlaps.
A shapefile with these issues may still run through conflux, but it helps to check first.

In [None]:
import geopandas as gpd
import s3fs
from shapely.validation import explain_validity
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
import folium

In [None]:
# set up S3 filesystem
# anon=True requests anonymous (unsigned) access, so no AWS credentials are used.
# NOTE(review): `s3` is not referenced by any of the cells visible here — confirm
# whether it is still needed.
s3 = s3fs.S3FileSystem(anon=True) 

### Read in shapefile

In [None]:
# S3 location of the shapefile to check
s3_shapefile_path = "s3://dea-public-data-dev/projects/WIT/BWS_MDBA_ANAE_WIT_05_2022/shp_file/ANAEv3_WIT_clean19042022/ANAEv3_WIT_clean.shp"

# read the shapefile with the fiona engine
shapefile = gpd.read_file(s3_shapefile_path, engine="fiona")

# report how many rows were read as a quick sanity check
n_rows = len(shapefile)
print(f"✅ Shapefile loaded. Number of rows: {n_rows}")

In [None]:
# preview the first few rows of the loaded shapefile
shapefile.head()

In [None]:
# show the coordinate reference system the shapefile was loaded with
shapefile.crs

In [None]:
# check UID is unique
unique_id_column_name = "UID"
# Series.is_unique is True only when no value repeats; repeated missing
# values also count as duplicates, so rows without a UID are not overlooked.
shapefile[unique_id_column_name].is_unique

### Check for null geometries, i.e. rows where the geometry is null

In [None]:
# collect the rows that have no geometry at all
missing_geometry = shapefile.geometry.isnull()
null_geoms = shapefile[missing_geometry]

# report the index labels of the affected rows, if any
if null_geoms.empty:
    print("✅ No null geometries found.")
else:
    print(f"❗ Rows with null geometries: {null_geoms.index.tolist()}")

In [None]:
# keep only the rows that actually have a geometry
has_geometry = shapefile.geometry.notnull()
shapefile_cleaned = shapefile[has_geometry]

# report how many rows were dropped
n_removed = len(shapefile) - len(shapefile_cleaned)
print(f"Removed {n_removed} rows with null geometries.")

### Check for invalid geometries, e.g. self-intersecting geometries

In [None]:
# select the rows whose geometry fails the validity check
validity_mask = shapefile_cleaned.is_valid
invalid_geoms = shapefile_cleaned[~validity_mask]

if invalid_geoms.empty:
    print("✅ All remaining geometries are valid.")
else:
    print(f"❗ Found {len(invalid_geoms)} invalid geometries.")

    # list each invalid row by UID together with shapely's explanation
    for uid, geom in zip(invalid_geoms["UID"], invalid_geoms.geometry):
        reason = explain_validity(geom)
        print(f" - UID {uid} is invalid: {reason}")

In [None]:
# keep only the valid geometries (just for analysis!!)
is_valid_mask = shapefile_cleaned.is_valid
shapefile_cleaned = shapefile_cleaned[is_valid_mask]

n_remaining = len(shapefile_cleaned)
print(f"Remaining rows after cleaning: {n_remaining}")

### Check for intersecting polygons
Null and invalid geometries are removed first because they can make the intersection tests fail or give misleading results.

In [None]:
# Find pairs of polygons in the cleaned shapefile that overlap, i.e. that
# intersect by more than just touching.
#
# The spatial index reports candidates as *row positions*, not index labels.
# After the null/invalid rows were dropped, the labels of shapefile_cleaned no
# longer line up with row positions, so everything below works in positions.

# build spatial index for fast lookup
sindex = shapefile_cleaned.sindex

# UIDs in row order, so a row position maps directly to its UID
uids = shapefile_cleaned["UID"].tolist()

overlap_pairs = set()

# loop through each geometry (by position) and compare it to potential overlaps
for pos, geom in enumerate(shapefile_cleaned.geometry):
    if geom is None or geom.is_empty:
        continue  # skip if geometry is missing or empty

    # get possible matches using bounding boxes (returns row positions)
    for match_pos in sindex.intersection(geom.bounds):
        if match_pos <= pos:
            continue  # skip self, and pairs already tested from the other side

        other_geom = shapefile_cleaned.geometry.iloc[match_pos]
        if other_geom is None or other_geom.is_empty:
            continue

        # bounding boxes only give candidates; confirm a real overlap
        if geom.intersects(other_geom) and not geom.touches(other_geom):
            # record a sorted tuple to prevent duplicate pairings (e.g., (a, b) vs (b, a))
            overlap_pairs.add(tuple(sorted((uids[pos], uids[match_pos]))))

#  results
if overlap_pairs:
    print(f"❗ Found {len(overlap_pairs)} overlapping UID pairs.")
    for pair in sorted(overlap_pairs):
        print(f" - {pair[0]} overlaps with {pair[1]}")
else:
    print("✅ No overlapping geometries found.")

Run the same check on the uncleaned shapefile to see what happens.

In [None]:
# Same overlap check as for the cleaned data, run on the shapefile as loaded.
# Null and empty geometries are skipped below, but invalid geometries are still
# tested, so the results here may be unreliable.
#
# The spatial index reports candidates as *row positions*, so the loop works in
# positions throughout instead of assuming index labels equal positions.

# build spatial index for fast lookup
sindex = shapefile.sindex

# UIDs in row order, so a row position maps directly to its UID
uids = shapefile["UID"].tolist()

overlap_pairs = set()

# loop through each geometry (by position) and compare it to potential overlaps
for pos, geom in enumerate(shapefile.geometry):
    if geom is None or geom.is_empty:
        continue  # skip if geometry is missing or empty

    # get possible matches using bounding boxes (returns row positions)
    for match_pos in sindex.intersection(geom.bounds):
        if match_pos <= pos:
            continue  # skip self, and pairs already tested from the other side

        other_geom = shapefile.geometry.iloc[match_pos]
        if other_geom is None or other_geom.is_empty:
            continue

        # bounding boxes only give candidates; confirm a real overlap
        if geom.intersects(other_geom) and not geom.touches(other_geom):
            # record a sorted tuple to prevent duplicate pairings (e.g., (a, b) vs (b, a))
            overlap_pairs.add(tuple(sorted((uids[pos], uids[match_pos]))))

#  results
if overlap_pairs:
    print(f"❗ Found {len(overlap_pairs)} overlapping UID pairs.")
    for pair in sorted(overlap_pairs):
        print(f" - {pair[0]} overlaps with {pair[1]}")
else:
    print("✅ No overlapping geometries found.")