In [1]:
import pandas as pd
import dask_geopandas as dgp
from shapely import wkb
import matplotlib.pyplot as plt
import geopandas as gpd

# This notebook adds Hilbert, Morton, and Geohash spatial partitioning to the dataset.

In [2]:
# Input: a cleaned copy of the dataset from which every row whose bounding box
# reaches the global mins/maxs (e.g. -180, -90, 180, 90) has been removed.
# See dgpd_bbox_clean.ipynb for how that file was produced.
input_file = "3mil_no_global_bounds.parquet"

In [4]:
# Load the cleaned parquet file as a dask-geopandas frame.
# todo -- reading fails unless gather_spatial_partitions=False is passed explicitly.
# NOTE(review): presumably the file carries no usable spatial-partition metadata
# yet -- confirm the actual cause before removing the flag.
ddf = dgp.read_parquet(
    input_file,
    gather_spatial_partitions=False,
)

In [5]:
# Preview a few rows to sanity-check the columns (geometry plus the
# minx/miny/maxx/maxy bounding-box columns) before partitioning.
ddf.head()

Unnamed: 0,GranuleUR,StartTime,EndTime,ConceptId,CollectionConceptId,CoordinateSystem,DayNight,EntryTitle,MetadataFormat,NativeId,...,ProductionDate,RevisionDate,RevisionId,Size,CloudCover,geometry,minx,miny,maxx,maxy
0,09bebccf-f8e5-4198-92cf-95ce697046b7,2023-01-01,2023-01-01 00:59:59.085,G2782615398-LARC_ASDC,C1584977040-LARC_ASDC,CARTESIAN,UNSPECIFIED,SatCORPS CERES GEO Edition 4 Meteosat-11 North...,umm-json,09bebccf-f8e5-4198-92cf-95ce697046b7,...,2023-10-12 20:22:56.000,2024-08-01 03:49:28.061,3,51.672165,,"POLYGON ((-50 0, 60 0, 60 60, -50 60, -50 0))",-50.0,0.0,60.0,60.0
1,1_modisa_l1_a2023001000000.l1a_lac,2023-01-01,2023-01-01 00:04:59.000,G2576055627-OB_DAAC,C1570116979-OB_DAAC,GEODETIC,DAY,Aqua MODIS Level-1 Data,umm-json,5d0032d4aa56049abd3c68504fd6d76f3cd1ca1d,...,2023-01-01 01:40:53.893,2023-01-01 01:55:20.790,1,,,"POLYGON ((-145.11024 1.71275, -165.72551 -1.26...",-165.72551,-18.98281,-140.73468,1.71275
2,1_modisa_l1_geo_aqua_modis.20230101t000000.geo...,2023-01-01,2023-01-01 00:04:59.000,G2614304214-OB_DAAC,C2526537408-OB_DAAC,GEODETIC,DAY,"Aqua MODIS Geolocation Product Data, version 1",umm-json,f82f0796553b32a3bc27e1b67c0487d9e9e841da,...,2023-02-18 15:37:40.240,2023-02-18 15:50:32.896,1,,,"POLYGON ((-145.11024 1.71275, -165.72551 -1.26...",-165.72551,-18.98281,-140.73468,1.71275
3,1f0160ca-33d3-41c7-89fb-042fde741034,2023-01-01,2023-01-01 23:59:59.000,G2602313637-LARC_ASDC,C2128176689-LARC_ASDC,CARTESIAN,UNSPECIFIED,DSCOVR EPIC Level 2 EPICAERUV-Fast,umm-json,1f0160ca-33d3-41c7-89fb-042fde741034,...,2023-02-03 14:22:39.000,2023-02-05 10:24:24.814,2,172.60641,,"POLYGON ((0 89.0871, -88.59808 45.09003, -88.9...",-88.933213,-89.087097,88.933213,89.087097
4,2016.2_viirsn_l2_sst3_nrt_snpp_viirs.20230101t...,2023-01-01,2023-01-01 00:05:58.000,G2576154860-OB_DAAC,C1658475737-OB_DAAC,GEODETIC,NIGHT,Suomi-NPP VIIRS Regional Triple-window Sea Sur...,umm-json,86f6b5e6ec29fefef3333bd819e48019dade254a,...,2023-01-01 08:34:39.213,2023-01-01 08:43:13.405,1,,0.0,"POLYGON ((6.12589 0.00717, 33.35773 -4.21387, ...",6.12589,-4.21387,38.71748,20.72994


In [6]:
# Compute the spatial extent of each existing partition (stored on the frame
# itself) so it can be compared against the shuffled layouts produced later.
ddf.calculate_spatial_partitions()

partition_extents = ddf.spatial_partitions
print(f"Number of spatial partitions: {len(partition_extents)}")
print(partition_extents)

Number of spatial partitions: 5
0    POLYGON ((-180 -90, -180 90, 180 90, 180 -90, ...
1    POLYGON ((-180 -90, -180 90, 180 90, 180 -90, ...
2    POLYGON ((-180 -90, -180 90, 180 90, 180 -90, ...
3    POLYGON ((-180 -90, -180 90, 180 90, 180 -90, ...
4    POLYGON ((-180 -90, -180 90, 180 90, 180 -90, ...
dtype: geometry


In [7]:
# dask-geopandas offers three methods for spatial shuffle; each is a way to
# represent two-dimensional objects in one-dimensional space:
#   - "hilbert" and "morton": distance along a space-filling curve, i.e. a line
#     that passes through every point in space
#   - "geohash" (not the same as geohashing): subdivides space into buckets of
#     grid shape
#
# Read more here: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html

# todo -- finetune number of partitions
# One shuffled copy of `ddf` per method, produced in the order hilbert, morton, geohash.
hilbert5, morton5, geohash5 = (
    ddf.spatial_shuffle(by=method, npartitions=5)
    for method in ("hilbert", "morton", "geohash")
)

In [13]:
# Show the partition extents of each shuffled frame under its own heading so
# the three layouts can be compared side by side.
for heading, shuffled in (
    ("Hilbert spatial partitions:", hilbert5),
    ("Morton spatial partitions:", morton5),
    ("Geohash spatial partitions:", geohash5),
):
    print(heading)
    print(shuffled.spatial_partitions)

Hilbert spatial partitions:
0    POLYGON ((-180 -90, -180 90, 180 90, 180 -90, ...
1    POLYGON ((-117.24001 -87.92, -180 -86.613, -18...
2    POLYGON ((108.96493 -89.92861, -109.32123 -89....
3    POLYGON ((179.85704 -87.91135, 91.585 -86.6, 1...
4    POLYGON ((-113.02219 -90, -179.76422 -89.96562...
dtype: geometry
Morton spatial partitions:
0    POLYGON ((-180 -90, -180 89.875, -0.125 89.875...
1    POLYGON ((-180 -90, -180 90, 180 90, 180 -90, ...
2    POLYGON ((108.96493 -89.92861, 0.125 -89.875, ...
3    POLYGON ((-179.872 -89.549, -179.989 -89.547, ...
4    POLYGON ((179.999 -85.98228, 148.70906 -84.998...
dtype: geometry
Geohash spatial partitions:
0    POLYGON ((-180 -90, -180 86.414, -76.04083 88....
1    POLYGON ((-180 -90, -180 90, 179.9 90, 179.995...
2    POLYGON ((-180 -90, -180 89.641, -179.99991 89...
3    POLYGON ((1.77296 -90, -179.99 -89.99, -180 -8...
4    POLYGON ((0.125 -89.875, -180 0, -180 90, 180 ...
dtype: geometry


In [8]:
# Persist the Hilbert-shuffled frame. The target is written as a directory of
# parquet files, with each partition stored as one file in that directory.
hilbert_output = f"{input_file}_hilbert"
hilbert5.to_parquet(hilbert_output)

In [9]:
# Persist the Morton-shuffled frame as a directory of parquet files
# (one file per partition), named after the input file.
morton_output = f"{input_file}_morton"
morton5.to_parquet(morton_output)

In [10]:
# Persist the Geohash-shuffled frame as a directory of parquet files
# (one file per partition), named after the input file.
geohash_output = f"{input_file}_geohash"
geohash5.to_parquet(geohash_output)