Skip to content

Commit

Permalink
Add test data covering different native (geoarrow-based) encodings (#204)
Browse files Browse the repository at this point in the history

* Add test data covering different native (geoarrow-based) encodings

* Update test_data/generate_test_data.py

Co-authored-by: Even Rouault <even.rouault@spatialys.com>

* fix geometry type

* add .csv extension

* add null values

* rename csv files

* add back csv files

* properly specify mask when creating the Arrow data

---------

Co-authored-by: Even Rouault <even.rouault@spatialys.com>
Co-authored-by: Chris Holmes <chomie@gmail.com>
  • Loading branch information
3 people committed May 28, 2024
1 parent 4f0a1f0 commit dced61c
Show file tree
Hide file tree
Showing 19 changed files with 248 additions and 0 deletions.
Binary file added test_data/data-linestring-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-linestring-encoding_wkb.parquet
Binary file not shown.
4 changes: 4 additions & 0 deletions test_data/data-linestring-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"col","geometry"
0,"LINESTRING (30 10, 10 30, 40 40)"
1,"LINESTRING EMPTY"
2,
Binary file not shown.
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-multilinestring-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"MULTILINESTRING ((30 10, 10 30, 40 40))"
1,"MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))"
2,"MULTILINESTRING EMPTY"
3,
Binary file added test_data/data-multipoint-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-multipoint-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-multipoint-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"MULTIPOINT ((30 10))"
1,"MULTIPOINT ((10 40), (40 30), (20 20), (30 10))"
2,"MULTIPOINT EMPTY"
3,
Binary file not shown.
Binary file added test_data/data-multipolygon-encoding_wkb.parquet
Binary file not shown.
6 changes: 6 additions & 0 deletions test_data/data-multipolygon-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"col","geometry"
0,"MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))"
1,"MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))"
2,"MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))"
3,"MULTIPOLYGON EMPTY"
4,
Binary file added test_data/data-point-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-point-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-point-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"POINT (30 10)"
1,"POINT EMPTY"
2,
3,"POINT (40 40)"
Binary file added test_data/data-polygon-encoding_native.parquet
Binary file not shown.
Binary file added test_data/data-polygon-encoding_wkb.parquet
Binary file not shown.
5 changes: 5 additions & 0 deletions test_data/data-polygon-wkt.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"col","geometry"
0,"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"
1,"POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))"
2,"POLYGON EMPTY"
3,
218 changes: 218 additions & 0 deletions test_data/generate_test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
"""
Generates example data using pyarrow by running `python generate_test_data.py`.

For every geometry type handled below, the script writes three files next to
itself: a CSV file with the geometries as WKT, a Parquet file with WKB-encoded
geometries and a Parquet file using the native (GeoArrow-based) encoding.

You can print the metadata of a generated file with:

.. code-block:: python

    >>> import json, pprint, pyarrow.parquet as pq
    >>> pprint.pprint(json.loads(pq.read_schema("data-point-encoding_wkb.parquet").metadata[b"geo"]))
"""
import json
import pathlib
import copy

import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.csv import write_csv

from shapely import from_wkt, to_wkb


# Directory that contains this script; every generated file is written here.
HERE = pathlib.Path(__file__).parent


# Skeleton of the "geo" schema metadata attached to the Parquet files.
# write_encoding_files() deep-copies it, fills in "geometry_types" and, for
# the native files, overrides "encoding".
metadata_template = {
    "version": "1.1.0",
    "primary_column": "geometry",
    "columns": {
        "geometry": {"encoding": "WKB", "geometry_types": []},
    },
}


## Various geometry types with WKB and native (GeoArrow-based) encodings

def write_encoding_files(geometries_wkt, geometries_geoarrow, geometry_type):
    """
    Write the three test files for one geometry type into ``HERE``.

    The files are ``data-<type>-wkt.csv``, ``data-<type>-encoding_wkb.parquet``
    and ``data-<type>-encoding_native.parquet``, where ``<type>`` is
    ``geometry_type`` in lower case. Each file has an integer ``col`` column
    (the row index) and a ``geometry`` column.

    Parameters
    ----------
    geometries_wkt
        Sequence of geometries as WKT strings. Written as-is to the CSV file
        and converted with shapely (``from_wkt`` then ``to_wkb``) for the WKB
        Parquet file.
    geometries_geoarrow
        Values for the ``geometry`` column of the native Parquet file; the
        callers in this script pass a ``pyarrow`` array.
    geometry_type
        Geometry type name such as ``"Point"``. Stored in the metadata's
        ``geometry_types`` list; its lower-cased form is used in the file
        names and as the ``encoding`` of the native file.
    """
    type_slug = geometry_type.lower()
    row_ids = range(len(geometries_wkt))

    # Plain-text reference copy of the geometries.
    wkt_table = pa.table({"col": row_ids, "geometry": geometries_wkt})
    write_csv(wkt_table, HERE / f"data-{type_slug}-wkt.csv")

    # WKB encoding: the template already declares "WKB", so only the
    # geometry type has to be filled in.
    wkb_column = to_wkb(from_wkt(geometries_wkt))
    wkb_table = pa.table({"col": row_ids, "geometry": wkb_column})
    geo_metadata = copy.deepcopy(metadata_template)
    geo_metadata["columns"]["geometry"]["geometry_types"] = [geometry_type]
    wkb_table = wkb_table.replace_schema_metadata(
        {"geo": json.dumps(geo_metadata)}
    )
    pq.write_table(wkb_table, HERE / f"data-{type_slug}-encoding_wkb.parquet")

    # Native (geoarrow) encoding: same metadata, except that the encoding is
    # the lower-cased geometry type name.
    native_table = pa.table({"col": row_ids, "geometry": geometries_geoarrow})
    geo_metadata["columns"]["geometry"]["encoding"] = type_slug
    native_table = native_table.replace_schema_metadata(
        {"geo": json.dumps(geo_metadata)}
    )
    pq.write_table(
        native_table, HERE / f"data-{type_slug}-encoding_native.parquet"
    )


# Point: a regular point, an empty point, a null value and another point.

geometries_wkt = ["POINT (30 10)", "POINT EMPTY", None, "POINT (40 40)"]

# A native point is a struct of two non-nullable float64 fields, "x" and "y".
point_type = pa.struct(
    [pa.field(axis, pa.float64(), nullable=False) for axis in ("x", "y")]
)
geometries = pa.array(
    [
        (30, 10),
        (float("nan"), float("nan")),  # POINT EMPTY
        (float("nan"), float("nan")),  # placeholder for the null row (masked)
        (40, 40),
    ],
    type=point_type,
    # True marks a null row: exactly the rows whose WKT is None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
)

write_encoding_files(geometries_wkt, geometries, geometry_type="Point")

# LineString: a regular linestring, an empty linestring and a null value.

geometries_wkt = ["LINESTRING (30 10, 10 30, 40 40)", "LINESTRING EMPTY", None]

# A native linestring is a list of non-nullable points named "vertices".
linestring_type = pa.list_(pa.field("vertices", point_type, nullable=False))
geometries = pa.array(
    [
        [(30, 10), (10, 30), (40, 40)],
        [],  # LINESTRING EMPTY
        [],  # placeholder for the null row (masked)
    ],
    type=linestring_type,
    # True marks a null row: exactly the rows whose WKT is None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
)

write_encoding_files(geometries_wkt, geometries, geometry_type="LineString")

# Polygon: a polygon without holes, a polygon with one hole, an empty polygon
# and a null value.

geometries_wkt = [
    "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))",
    "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))",
    "POLYGON EMPTY",
    None,
]

# A native polygon is a list of non-nullable "rings", each ring being a list
# of non-nullable points named "vertices".
polygon_type = pa.list_(
    pa.field(
        "rings",
        pa.list_(pa.field("vertices", point_type, nullable=False)),
        nullable=False,
    )
)
geometries = pa.array(
    [
        [
            [(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)],
        ],
        [
            [(35, 10), (45, 45), (15, 40), (10, 20), (35, 10)],
            [(20, 30), (35, 35), (30, 20), (20, 30)],
        ],
        [],  # POLYGON EMPTY
        [],  # placeholder for the null row (masked)
    ],
    type=polygon_type,
    # True marks a null row: exactly the rows whose WKT is None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
)

write_encoding_files(geometries_wkt, geometries, geometry_type="Polygon")

# MultiPoint: a single-point multipoint, a four-point multipoint, an empty
# multipoint and a null value.

geometries_wkt = [
    "MULTIPOINT ((30 10))",
    "MULTIPOINT ((10 40), (40 30), (20 20), (30 10))",
    "MULTIPOINT EMPTY",
    None,
]

# A native multipoint is a list of non-nullable points named "points".
multipoint_type = pa.list_(pa.field("points", point_type, nullable=False))
geometries = pa.array(
    [
        [(30, 10)],
        [(10, 40), (40, 30), (20, 20), (30, 10)],
        [],  # MULTIPOINT EMPTY
        [],  # placeholder for the null row (masked)
    ],
    type=multipoint_type,
    # True marks a null row: exactly the rows whose WKT is None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPoint")

# MultiLineString: one with a single linestring, one with two linestrings, an
# empty multilinestring and a null value.

geometries_wkt = [
    "MULTILINESTRING ((30 10, 10 30, 40 40))",
    "MULTILINESTRING ((10 10, 20 20, 10 40), (40 40, 30 30, 40 20, 30 10))",
    "MULTILINESTRING EMPTY",
    None,
]

# A native multilinestring is a list of non-nullable linestrings named
# "linestrings".
multilinestring_type = pa.list_(
    pa.field("linestrings", linestring_type, nullable=False)
)
geometries = pa.array(
    [
        [
            [(30, 10), (10, 30), (40, 40)],
        ],
        [
            [(10, 10), (20, 20), (10, 40)],
            [(40, 40), (30, 30), (40, 20), (30, 10)],
        ],
        [],  # MULTILINESTRING EMPTY
        [],  # placeholder for the null row (masked)
    ],
    type=multilinestring_type,
    # True marks a null row: exactly the rows whose WKT is None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
)

write_encoding_files(
    geometries_wkt, geometries, geometry_type="MultiLineString"
)

# MultiPolygon: one with a single polygon, one with two polygons, one with two
# polygons of which the second has a hole, an empty multipolygon and a null
# value.

geometries_wkt = [
    "MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))",
    "MULTIPOLYGON (((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 20, 5 10, 15 5)))",
    "MULTIPOLYGON (((40 40, 20 45, 45 30, 40 40)), ((20 35, 10 30, 10 10, 30 5, 45 20, 20 35), (30 20, 20 15, 20 25, 30 20)))",
    "MULTIPOLYGON EMPTY",
    None,
]

# A native multipolygon is a list of non-nullable polygons named "polygons".
multipolygon_type = pa.list_(pa.field("polygons", polygon_type, nullable=False))
geometries = pa.array(
    [
        [
            [[(30, 10), (40, 40), (20, 40), (10, 20), (30, 10)]],
        ],
        [
            [[(30, 20), (45, 40), (10, 40), (30, 20)]],
            [[(15, 5), (40, 10), (10, 20), (5, 10), (15, 5)]],
        ],
        [
            [[(40, 40), (20, 45), (45, 30), (40, 40)]],
            [
                [(20, 35), (10, 30), (10, 10), (30, 5), (45, 20), (20, 35)],
                [(30, 20), (20, 15), (20, 25), (30, 20)],
            ],
        ],
        [],  # MULTIPOLYGON EMPTY
        [],  # placeholder for the null row (masked)
    ],
    type=multipolygon_type,
    # True marks a null row: exactly the rows whose WKT is None.
    mask=np.array([wkt is None for wkt in geometries_wkt]),
)

write_encoding_files(geometries_wkt, geometries, geometry_type="MultiPolygon")

0 comments on commit dced61c

Please sign in to comment.