In [1]:
%%time
import geopandas as gpd
import numpy as np
from shapely.geometry import box

CPU times: user 843 ms, sys: 100 ms, total: 943 ms
Wall time: 555 ms


In [2]:
%%time
# Sampling area: a rectangle covering x in [-180, 180] and y in [-90, 90].
# The polygon is bound to `bbox` rather than `box` so the imported shapely
# `box` factory is not overwritten; otherwise re-running this cell fails
# because `box` would then be a Polygon, not a callable.
bbox = box(-180, -90, 180, 90)
data = {"geometry": [bbox.wkt]}

# Round-trip through WKT and merge into a single geometry.
# NOTE(review): newer GeoPandas releases deprecate `unary_union` in favour of
# `union_all()` — confirm against the installed version before switching.
poly = gpd.GeoSeries.from_wkt(data["geometry"]).unary_union

# Bounds are reused by the sampling cell to draw random coordinates.
x_min, y_min, x_max, y_max = poly.bounds
# Recorded run:
# CPU times: user 1.28 ms, sys: 1.09 ms, total: 2.37 ms
# Wall time: 1.48 ms

CPU times: user 1.28 ms, sys: 1.09 ms, total: 2.37 ms
Wall time: 1.48 ms


In [3]:
%%time

n = 10_000_000

# Seeded generator so that Restart Kernel -> Run All regenerates the same
# sample (the legacy global np.random.uniform call was unseeded).
rng = np.random.default_rng(42)

# Draw n uniform x and y coordinates inside the bounds of `poly`.
x = rng.uniform(x_min, x_max, n)
y = rng.uniform(y_min, y_max, n)
# Recorded run (measured with the unseeded np.random.uniform version):
# CPU times: user 150 ms, sys: 30.1 ms, total: 180 ms
# Wall time: 179 ms

CPU times: user 150 ms, sys: 30.1 ms, total: 180 ms
Wall time: 179 ms


In [4]:
%%time
# Turn the coordinate arrays into point geometries, then keep only the
# points that `within(poly)` reports as inside the polygon.
all_points = gpd.GeoSeries(gpd.points_from_xy(x, y))
points = all_points.loc[all_points.within(poly)]
# Recorded run:
# CPU times: user 3.39 s, sys: 387 ms, total: 3.78 s
# Wall time: 3.78 s

CPU times: user 3.39 s, sys: 387 ms, total: 3.78 s
Wall time: 3.78 s


In [5]:
%%time
# Build the GeoDataFrame that every export cell writes out; only the filtered
# `points` series is passed, as the geometry.
gdf = gpd.GeoDataFrame(geometry=points)
# Recorded run:
# CPU times: user 406 ms, sys: 108 ms, total: 513 ms
# Wall time: 512 ms

CPU times: user 406 ms, sys: 108 ms, total: 513 ms
Wall time: 512 ms


In [7]:
%%time
# GeoParquet export: brotli-compressed, index not written.
parquet_options = {"index": False, "compression": "brotli"}
gdf.to_parquet("export.parquet", **parquet_options)
# Recorded run:
# CPU times: user 28.8 s, sys: 2.2 s, total: 31 s
# Wall time: 31.5 s


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



CPU times: user 28.8 s, sys: 2.2 s, total: 31 s
Wall time: 31.5 s


In [8]:
%%time
# GeoPackage export: one layer named "layer", index not written.
gdf.to_file(
    "export.gpkg",
    driver="GPKG",
    layer="layer",
    index=False,
)
# Recorded run:
# CPU times: user 11min 7s, sys: 2min 28s, total: 13min 36s
# Wall time: 13min 37s

CPU times: user 11min 7s, sys: 2min 28s, total: 13min 36s
Wall time: 13min 37s


In [9]:
%%time
# FlatGeobuf export, index not written.
# NOTE(review): spatial_index="NO" is forwarded to the driver; presumably it
# skips building the FlatGeobuf spatial index — confirm against the driver docs.
gdf.to_file(
    "export.fgb",
    driver="FlatGeobuf",
    spatial_index="NO",
    index=False,
)
# Recorded run:
# CPU times: user 7min 55s, sys: 5.25 s, total: 8min
# Wall time: 8min 6s

CPU times: user 7min 55s, sys: 5.25 s, total: 8min
Wall time: 8min 6s


In [10]:
%%time
# GeoJSON export, index not written.
gdf.to_file(
    "export.geojson",
    driver="GeoJSON",
    index=False,
)
# Recorded run:
# CPU times: user 9min 47s, sys: 8.96 s, total: 9min 56s
# Wall time: 10min

CPU times: user 9min 47s, sys: 8.96 s, total: 9min 56s
Wall time: 10min


In [11]:
%%time
# ESRI Shapefile export, index not written.
gdf.to_file(
    "export.shp",
    driver="ESRI Shapefile",
    index=False,
)
# Recorded run:
# CPU times: user 8min 29s, sys: 1min, total: 9min 29s
# Wall time: 9min 30s

CPU times: user 8min 29s, sys: 1min, total: 9min 29s
Wall time: 9min 30s


In [15]:
import math


def convert_size(size):
    """Format a byte count as a human-readable string with 1024-based units.

    Args:
        size: Number of bytes (int or float).

    Returns:
        A string such as ``"1.5 KB"``: the value scaled down by powers of
        1024, rounded to two decimals, followed by the unit label. Values
        below 1024 (including zero, fractions and negatives) are reported
        in ``B``; values past the largest unit stay expressed in ``ZB``.
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB")
    value = float(size)
    index = 0
    # Step down by 1024 instead of computing floor(log(size, 1024)):
    #  - no dependence on floating-point log rounding,
    #  - fractional sizes in (0, 1) no longer yield index -1 (which selected "ZB"),
    #  - the index is clamped, so huge sizes no longer raise IndexError.
    # Dividing by a power of two is exact, so ordinary results are unchanged.
    while value >= 1024 and index < len(units) - 1:
        value /= 1024
        index += 1

    return f"{round(value, 2)} {units[index]}"

In [16]:
%%time
import os

# The shapefile export is spread over several sidecar files; its size is the
# sum of the four listed here.
shapefile_parts = ("export.shp", "export.shx", "export.cpg", "export.dbf")

# On-disk size of each export, in bytes.
geo_parquet = os.path.getsize("export.parquet")
geo_package = os.path.getsize("export.gpkg")
flat_geobuf = os.path.getsize("export.fgb")
geojson = os.path.getsize("export.geojson")
shapefile = sum(os.path.getsize(part) for part in shapefile_parts)

# Print one "name=size" line per format, in the order measured above.
size_report = (
    ("geo_parquet", geo_parquet),
    ("geo_package", geo_package),
    ("flat_geobuf", flat_geobuf),
    ("geojson", geojson),
    ("shapefile", shapefile),
)
for label, byte_count in size_report:
    print(f"{label}={convert_size(byte_count)}")

# Recorded run:
# geo_parquet=150.0 MB
# geo_package=903.49 MB
# flat_geobuf=610.35 MB
# geojson=1.26 GB
# shapefile=457.76 MB

# CPU times: user 748 µs, sys: 396 µs, total: 1.14 ms
# Wall time: 884 µs

geo_parquet=150.0 MB
geo_package=903.49 MB
flat_geobuf=610.35 MB
geojson=1.26 GB
shapefile=457.76 MB
CPU times: user 748 µs, sys: 396 µs, total: 1.14 ms
Wall time: 884 µs


In [18]:
%%time
# GeoParquet: read the export back, requesting only the "geometry" column,
# and display the first rows to check the round trip.
geo_parquet_gdf = gpd.read_parquet("export.parquet", columns=["geometry"])
geo_parquet_gdf.head()
# Recorded run:
# CPU times: user 4.66 s, sys: 960 ms, total: 5.62 s
# Wall time: 5.67 s
#    geometry
# 0  POINT (84.21447 -60.10291)
# 1  POINT (-72.92868 53.36820)
# 2  POINT (-64.87842 44.19289)
# 3  POINT (-179.93497 -72.37761)
# 4  POINT (26.41531 3.05427)

CPU times: user 4.66 s, sys: 960 ms, total: 5.62 s
Wall time: 5.67 s


Unnamed: 0,geometry
0,POINT (84.21447 -60.10291)
1,POINT (-72.92868 53.36820)
2,POINT (-64.87842 44.19289)
3,POINT (-179.93497 -72.37761)
4,POINT (26.41531 3.05427)


In [19]:
%%time
# GeoPackage: read the export back and display the first rows to check the
# round trip.
geo_package_gdf = gpd.read_file("export.gpkg")
geo_package_gdf.head()
# Recorded run:
# CPU times: user 3min 8s, sys: 3.86 s, total: 3min 12s
# Wall time: 3min 12s
#    geometry
# 0  POINT (84.21447 -60.10291)
# 1  POINT (-72.92868 53.36820)
# 2  POINT (-64.87842 44.19289)
# 3  POINT (-179.93497 -72.37761)
# 4  POINT (26.41531 3.05427)

CPU times: user 3min 8s, sys: 3.86 s, total: 3min 12s
Wall time: 3min 12s


Unnamed: 0,geometry
0,POINT (84.21447 -60.10291)
1,POINT (-72.92868 53.36820)
2,POINT (-64.87842 44.19289)
3,POINT (-179.93497 -72.37761)
4,POINT (26.41531 3.05427)


In [20]:
%%time
# FlatGeobuf: read the export back and display the first rows to check the
# round trip.
flat_geobuf_gdf = gpd.read_file("export.fgb")
flat_geobuf_gdf.head()
# Recorded run:
# CPU times: user 2min 54s, sys: 2.9 s, total: 2min 57s
# Wall time: 2min 57s
#    geometry
# 0  POINT (84.21447 -60.10291)
# 1  POINT (-72.92868 53.36820)
# 2  POINT (-64.87842 44.19289)
# 3  POINT (-179.93497 -72.37761)
# 4  POINT (26.41531 3.05427)

CPU times: user 2min 54s, sys: 2.9 s, total: 2min 57s
Wall time: 2min 57s


Unnamed: 0,geometry
0,POINT (84.21447 -60.10291)
1,POINT (-72.92868 53.36820)
2,POINT (-64.87842 44.19289)
3,POINT (-179.93497 -72.37761)
4,POINT (26.41531 3.05427)


In [21]:
%%time
# GeoJSON: read the export back and display the first rows to check the
# round trip.
geojson_gdf = gpd.read_file("export.geojson")
geojson_gdf.head()
# Recorded run:
# CPU times: user 3min 57s, sys: 3.74 s, total: 4min 1s
# Wall time: 4min 1s
#    geometry
# 0  POINT (84.21447 -60.10291)
# 1  POINT (-72.92868 53.36820)
# 2  POINT (-64.87842 44.19289)
# 3  POINT (-179.93497 -72.37761)
# 4  POINT (26.41531 3.05427)

CPU times: user 3min 57s, sys: 3.74 s, total: 4min 1s
Wall time: 4min 1s


Unnamed: 0,geometry
0,POINT (84.21447 -60.10291)
1,POINT (-72.92868 53.36820)
2,POINT (-64.87842 44.19289)
3,POINT (-179.93497 -72.37761)
4,POINT (26.41531 3.05427)


In [22]:
%%time
# Shapefile: read the export back and display the first rows to check the
# round trip. In the recorded output this format comes back with an extra
# FID column next to the geometry.
shapefile_gdf = gpd.read_file("export.shp")
shapefile_gdf.head()
# Recorded run:
# CPU times: user 3min 40s, sys: 4.54 s, total: 3min 45s
# Wall time: 3min 46s
#    FID  geometry
# 0  0    POINT (84.21447 -60.10291)
# 1  1    POINT (-72.92868 53.36820)
# 2  2    POINT (-64.87842 44.19289)
# 3  3    POINT (-179.93497 -72.37761)
# 4  4    POINT (26.41531 3.05427)

CPU times: user 3min 40s, sys: 4.54 s, total: 3min 45s
Wall time: 3min 46s


Unnamed: 0,FID,geometry
0,0,POINT (84.21447 -60.10291)
1,1,POINT (-72.92868 53.36820)
2,2,POINT (-64.87842 44.19289)
3,3,POINT (-179.93497 -72.37761)
4,4,POINT (26.41531 3.05427)


In [23]:
%%time
# Row count of the frame read back from GeoParquet.
len(geo_parquet_gdf)
# Recorded run:
# CPU times: user 11 µs, sys: 2 µs, total: 13 µs
# Wall time: 16.2 µs
# Result: 10000000

CPU times: user 11 µs, sys: 2 µs, total: 13 µs
Wall time: 16.2 µs


10000000

In [24]:
%%time
# Row count of the frame read back from GeoPackage.
len(geo_package_gdf)
# Recorded run:
# CPU times: user 13 µs, sys: 1e+03 ns, total: 14 µs
# Wall time: 17.2 µs
# Result: 10000000

CPU times: user 13 µs, sys: 1e+03 ns, total: 14 µs
Wall time: 17.2 µs


10000000

In [27]:
%%time
# Row count of the frame read back from FlatGeobuf.
len(flat_geobuf_gdf)
# Recorded run:
# CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
# Wall time: 13.8 µs
# Result: 10000000

CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
Wall time: 13.8 µs


10000000

In [28]:
%%time
# Row count of the frame read back from GeoJSON.
len(geojson_gdf)
# Recorded run:
# CPU times: user 11 µs, sys: 1e+03 ns, total: 12 µs
# Wall time: 15.3 µs
# Result: 10000000

CPU times: user 11 µs, sys: 1e+03 ns, total: 12 µs
Wall time: 15.3 µs


10000000

In [29]:
%%time
# Row count of the frame read back from the Shapefile.
len(shapefile_gdf)
# Recorded run:
# CPU times: user 12 µs, sys: 1e+03 ns, total: 13 µs
# Wall time: 32.9 µs
# Result: 10000000

CPU times: user 12 µs, sys: 1e+03 ns, total: 13 µs
Wall time: 32.9 µs


10000000