# Topology checking with geonurse

## Import libraries

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [79]:
from pprint import pprint

In [2]:
import geonurse
import geonurse.tools.topology as topology

In [20]:
import geopandas as gpd
import shapely
import shapely.wkt
import shapely.geometry

## Create SparkSession

In [4]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook under the application name 'geonurse'.
spark = (
    SparkSession.builder
        .appName('geonurse')
        .getOrCreate()
)

# Bare trailing expression so the notebook displays the session object.
spark

## Polygon/MultiPolygon

### Features with duplicated coordinates on exterior

In [80]:
# Test data file with the polygons used for the exterior-ring checks.
exterior_duplicates_path = '../tests/data/topology/test_data_polygon_exterior_duplicates.geojson'

# Load the file through geonurse and request the 'shapely' geometry
# representation. The RDD is cached because it is evaluated several times
# below (once per action on the filtered RDD, plus its own count).
exterior_duplicates_rdd = (
    geonurse.read_file(spark, exterior_duplicates_path).geometries('shapely').cache()
)

# Keep only the geometries flagged by topology.has_exterior_duplicates.
exterior_duplicates = exterior_duplicates_rdd.filter(topology.has_exterior_duplicates)

# Show the flagged geometries as WKT strings.
pprint(exterior_duplicates.map(lambda geom: geom.wkt).collect())

# Size of the source RDD versus the flagged subset.
print("Input rdd length: {}".format(exterior_duplicates_rdd.count()))
print("Filtered rdd length: {}".format(exterior_duplicates.count()))

['POLYGON ((0 0, 0 3, 3 3, 3 0, 3 0, 0 0))',
 'POLYGON ((0 0, 0 3, 3 3, 3 3, 3 0, 3 0, 0 0))',
 'POLYGON ((0 0, 0 3, 3 3, 3 0, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))',
 'POLYGON ((0 0, 0 3, 3 3, 3 3, 3 0, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))',
 'POLYGON ((5 0, 5 3, 8 3, 8 0, 8 0, 5 0))',
 'POLYGON ((5 0, 5 3, 8 3, 8 3, 8 0, 8 0, 5 0))',
 'POLYGON ((5 0, 5 3, 8 3, 8 0, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1))',
 'POLYGON ((5 0, 5 3, 8 3, 8 3, 8 0, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 3 0, 0 0)), ((5 0, 5 3, 8 3, 8 0, 5 0)))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 0 0)), ((5 0, 5 3, 8 3, 8 0, 8 0, 5 0)))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1)), '
 '((5 0, 5 3, 8 3, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1)))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1)), ((5 0, '
 '5 3, 8 3, 8 0, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1)))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 3 0, 0 0)), ((5 0, 5 3, 8 3, 8 0, 

#### Get duplicated coordinates

In [81]:
# For each flagged geometry, pass the result of
# topology._duplicated_exterior_coordinates_list to shapely's MultiPoint so
# the repeated coordinates can be printed as WKT.
duplicated_exterior_coordinates = exterior_duplicates.map(
    lambda geom: shapely.geometry.MultiPoint(
        topology._duplicated_exterior_coordinates_list(geom)
    )
)

pprint(duplicated_exterior_coordinates.map(lambda points: points.wkt).collect())

# Note: the "Input" figure counts the cached source RDD
# (exterior_duplicates_rdd), not exterior_duplicates, which feeds the map above.
print("Input rdd length: {}".format(exterior_duplicates_rdd.count()))
print("Filtered rdd length: {}".format(duplicated_exterior_coordinates.count()))

['MULTIPOINT (3 0)',
 'MULTIPOINT (3 3, 3 0)',
 'MULTIPOINT (3 0)',
 'MULTIPOINT (3 3, 3 0)',
 'MULTIPOINT (8 0)',
 'MULTIPOINT (8 3, 8 0)',
 'MULTIPOINT (8 0)',
 'MULTIPOINT (8 3, 8 0)',
 'MULTIPOINT (3 0)',
 'MULTIPOINT (8 0)',
 'MULTIPOINT (3 0)',
 'MULTIPOINT (8 0)',
 'MULTIPOINT (3 0, 8 0)',
 'MULTIPOINT (3 0, 8 0)',
 'MULTIPOINT (3 3, 3 0)',
 'MULTIPOINT (8 3, 8 0)',
 'MULTIPOINT (3 3, 3 0)',
 'MULTIPOINT (8 3, 8 0)',
 'MULTIPOINT (3 3, 3 0, 8 3, 8 0)',
 'MULTIPOINT (3 3, 3 0, 8 3, 8 0)']
Input rdd length: 26
Filtered rdd length: 20


### Features with interiors

In [83]:
# Reuse the RDD loaded for the exterior-duplicates check and keep only the
# geometries for which topology.has_interior is true.
geoms_with_interiors = exterior_duplicates_rdd.filter(topology.has_interior)

# Show the selected geometries as WKT strings.
pprint(geoms_with_interiors.map(lambda geom: geom.wkt).collect())

# Size of the source RDD versus the selected subset.
print("Input rdd length: {}".format(exterior_duplicates_rdd.count()))
print("Filtered rdd length: {}".format(geoms_with_interiors.count()))

['POLYGON ((0 0, 0 3, 3 3, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))',
 'POLYGON ((0 0, 0 3, 3 3, 3 0, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))',
 'POLYGON ((0 0, 0 3, 3 3, 3 3, 3 0, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))',
 'POLYGON ((5 0, 5 3, 8 3, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1))',
 'POLYGON ((5 0, 5 3, 8 3, 8 0, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1))',
 'POLYGON ((5 0, 5 3, 8 3, 8 3, 8 0, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1)), '
 '((5 0, 5 3, 8 3, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1)))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1)), ((5 0, '
 '5 3, 8 3, 8 0, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1)))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1)), '
 '((5 0, 5 3, 8 3, 8 0, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1)))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1)), ((5 0, '
 '5 3, 8 3, 8 0, 5 0), (6 1, 6 2, 7 2, 7 1, 6 1)))',
 'MULTIPOLYG

### Features with duplicated coordinates on interior(s)

In [84]:
# Test data file with the polygons used for the interior-ring checks.
interior_duplicates_path = '../tests/data/topology/test_data_polygon_interior_duplicates.geojson'

# Load the file through geonurse and request the 'shapely' geometry
# representation. The RDD is cached because several actions below read it.
interior_duplicates_rdd = (
    geonurse.read_file(spark, interior_duplicates_path).geometries('shapely').cache()
)

# Keep only the geometries flagged by topology.has_interior_duplicates.
interior_duplicates = interior_duplicates_rdd.filter(topology.has_interior_duplicates)

# Show the flagged geometries as WKT strings.
pprint(interior_duplicates.map(lambda geom: geom.wkt).collect())

# Size of the source RDD versus the flagged subset.
print("Input rdd length: {}".format(interior_duplicates_rdd.count()))
print("Filtered rdd length: {}".format(interior_duplicates.count()))

['POLYGON ((0 0, 0 3, 3 3, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 2 1, 1 1))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 0 0)), ((5 0, 5 3, 8 3, 8 0, 5 0), (6 1, '
 '6 2, 7 2, 7 1, 7 1, 6 1)))',
 'MULTIPOLYGON (((0 0, 0 3, 3 3, 3 0, 0 0), (1 1, 1 2, 2 2, 2 1, 2 1, 1 1)), '
 '((5 0, 5 3, 8 3, 8 0, 5 0), (6 1, 6 2, 7 2, 7 2, 7 1, 7 1, 6 1)))',
 'POLYGON ((0 0, 0 3, 5 3, 5 0, 0 0), (1 1, 1 2, 2 2, 2 1, 2 1, 1 1), (3 1, 3 '
 '2, 4 2, 4 2, 4 1, 4 1, 3 1))']
Input rdd length: 8
Filtered rdd length: 4


#### Get duplicated coordinates

In [86]:
# For each flagged geometry, pass the result of
# topology._duplicated_interior_coordinates_list to shapely's MultiPoint so
# the repeated interior coordinates can be printed as WKT.
duplicated_interior_coordinates = interior_duplicates.map(
    lambda geom: shapely.geometry.MultiPoint(
        topology._duplicated_interior_coordinates_list(geom)
    )
)

pprint(duplicated_interior_coordinates.map(lambda points: points.wkt).collect())

# Note: the "Input" figure counts the cached source RDD
# (interior_duplicates_rdd), not interior_duplicates, which feeds the map above.
print("Input rdd length: {}".format(interior_duplicates_rdd.count()))
print("Filtered rdd length: {}".format(duplicated_interior_coordinates.count()))

['MULTIPOINT (2 1)',
 'MULTIPOINT (7 1)',
 'MULTIPOINT (2 1, 7 2, 7 1)',
 'MULTIPOINT (2 1, 4 2, 4 1)']
Input rdd length: 8
Filtered rdd length: 4


## LineString/MultiLineString

### Duplicated coordinates along features

In [88]:
# Test data file with the (multi)linestrings used for the duplicate checks.
linestring_duplicates_path = '../tests/data/topology/test_data_linestring_duplicates.geojson'

# Load the file through geonurse and request the 'shapely' geometry
# representation. The RDD is cached because several actions below read it.
linestring_duplicates_rdd = (
    geonurse.read_file(spark, linestring_duplicates_path).geometries('shapely').cache()
)

# Keep only the geometries flagged by topology.has_duplicates.
duplicated_coordinates = linestring_duplicates_rdd.filter(topology.has_duplicates)

# Show the flagged geometries as WKT strings.
pprint(duplicated_coordinates.map(lambda geom: geom.wkt).collect())

# Size of the source RDD versus the flagged subset.
print("Input rdd length: {}".format(linestring_duplicates_rdd.count()))
print("Filtered rdd length: {}".format(duplicated_coordinates.count()))

['LINESTRING (0 0, 0 3, 3 3, 3 3, 3 0)',
 'LINESTRING (0 0, 0 3, 3 3, 3 3, 3 0, 0 0)',
 'MULTILINESTRING ((0 0, 0 3, 3 3, 3 3, 3 0), (4 0, 4 3, 7 3, 7 0))',
 'MULTILINESTRING ((0 0, 0 3, 3 3, 3 3, 3 0, 0 0), (4 0, 4 3, 7 3, 7 3, 7 0))',
 'MULTILINESTRING ((0 0, 0 3, 0 3, 3 3, 3 3, 3 0), (4 0, 4 3, 4 3, 7 3, 7 3, 7 '
 '0, 4 0))']
Input rdd length: 8
Filtered rdd length: 5


#### Get duplicated coordinates

In [90]:
# Collect the repeated coordinates of every flagged (multi)linestring and wrap
# them in a shapely MultiPoint so they can be printed as WKT.
#
# The result gets its own name. This cell previously rebound
# `duplicated_interior_coordinates`, which overwrote the polygon-interior RDD
# built earlier in the notebook and mislabelled linestring data as "interior".
duplicated_linestring_coordinates = (
    duplicated_coordinates
        .map(lambda x: topology._duplicated_coordinates_list(x))
        .map(lambda x: shapely.geometry.MultiPoint(x))
)

pprint(duplicated_linestring_coordinates.map(lambda x: x.wkt).collect())

# Note: the "Input" figure counts the cached source RDD
# (linestring_duplicates_rdd), not duplicated_coordinates, which feeds the map.
print("Input rdd length: {}".format(linestring_duplicates_rdd.count()))
print("Filtered rdd length: {}".format(duplicated_linestring_coordinates.count()))

['MULTIPOINT (3 3)',
 'MULTIPOINT (3 3)',
 'MULTIPOINT (3 3)',
 'MULTIPOINT (3 3, 7 3)',
 'MULTIPOINT (0 3, 3 3, 4 3, 7 3)']
Input rdd length: 8
Filtered rdd length: 5
