# Map row group spatial metadata/statistics

In [53]:
import sys

import geopandas as gpd
import lonboard # these won't render on GitHub
import pyarrow.parquet as pq
from shapely import wkb
from shapely.geometry import Polygon

sys.path.append('../analyze')
import info

In [100]:
def get_parquet_row_count(file_path):
    """Return the total number of rows stored in a Parquet file.

    The count is taken from the file's footer metadata, so no column data
    is read or decoded.

    Parameters
    ----------
    file_path : str or path-like
        Location of the Parquet file.

    Returns
    -------
    int
        Total number of rows across all row groups.
    """
    # The footer already records the file-wide row count; reading every row
    # group only to count its rows is unnecessary I/O.
    return pq.read_metadata(file_path).num_rows

def get_parquet_row_group_row_count(parquet_file, row_group_id):
    """Return the number of rows in a single row group of a Parquet file.

    Parameters
    ----------
    parquet_file : pyarrow.parquet.ParquetFile
        An already opened Parquet file.
    row_group_id : int
        Zero-based index of the row group to inspect.

    Returns
    -------
    int
        Number of rows recorded for that row group.
    """
    # Each row group's row count is stored in the file metadata, so there is
    # no need to read a column from disk just to measure its length.
    return parquet_file.metadata.row_group(row_group_id).num_rows

In [101]:
# Source granule file; its total row count is the baseline for the filtering done further down
pq_path = '../test_lpcloud_data/single_file/lpcloud_starttime.parquet'
total_row_count = get_parquet_row_count(pq_path)
print(f'Rows in parquet file: {total_row_count}')

Rows in parquet file: 964695


In [94]:
# Row group statistics for the source file, from the project-local `info` module
# (imported after '../analyze' is appended to sys.path). Later cells index the
# result as group_stats['Group_<n>']['geometry']['min' | 'max'].
# NOTE(review): the meaning of the second positional argument (False) is not
# visible in this notebook — confirm against info.get_parquet_group_stats.
group_stats = info.get_parquet_group_stats(pq_path, False)

Example of min and max geometry metadata for a row group

In [95]:
# Decode the WKB-encoded min and max geometry statistics of one row group and show both
gid = 0
min_shape, max_shape = (
    wkb.loads(group_stats[f'Group_{gid}']['geometry'][bound])
    for bound in ('min', 'max')
)
print(min_shape, max_shape, sep='\n')

POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90))
POLYGON ((-127.00027778 57.99972222, -125.99972222 57.99972222, -125.99972222 59.00027778, -127.00027778 59.00027778, -127.00027778 57.99972222))


Collect min or max geometry metadata for all row groups into a geodataframe

In [96]:
# One decoded 'min' geometry per row group, in row group order
group_ids = list(range(len(group_stats)))
geometries = [
    wkb.loads(group_stats[f'Group_{group_id}']['geometry']['min'])
    for group_id in group_ids
]

In [97]:
# One row per row group: its id plus the decoded geometry statistic, tagged as EPSG:4326
rg_gdf = gpd.GeoDataFrame(
    data={'group_id': group_ids},
    geometry=geometries,
    crs="EPSG:4326",
)
rg_gdf

Unnamed: 0,group_id,geometry
0,0,"POLYGON ((-180 -90, 180 -90, 180 90, -180 90, ..."
1,1,"POLYGON ((-108.64062 40.66909, -102.50259 40.6..."
2,2,"POLYGON ((6 10.96574, 10.83852 10.96574, 10.83..."
3,3,"POLYGON ((-100.21875 14.07468, -94.92781 14.07..."
4,4,"POLYGON ((-117.65625 24.36216, -112.35619 24.3..."
5,5,"POLYGON ((-180 -90, 180 -90, 180 90, -180 90, ..."
6,6,"POLYGON ((-180 -90, 180 -90, 180 90, -180 90, ..."
7,7,"POLYGON ((-180 -90, 180 -90, 180 90, -180 90, ..."


Global granules make visualization difficult. Remove them from the granule data, then check the row group metadata of that new file.

Note that doing this in Geopandas may run into memory constraints for larger files. Solutions would include batching the reading & writing in Geopandas, splitting the task using dask-geopandas, or doing the operation within DuckDB.

In [98]:
# A geometry object with global extent we can use for excluding matches.
# NOTE(review): index 7 is hard-coded — it is the last of the eight row groups
# listed in the table above, and `geometries` is reassigned by a later cell, so
# this lookup depends on both the data layout and the cell execution order.
str(geometries[7])

'POLYGON ((-180 -90, 180 -90, 180 90, -180 90, -180 -90))'

In [69]:
# This will read the entire parquet file into memory.
# The result is filtered on its `.geometry` column in the following cell.
pq_gdf = gpd.read_parquet(pq_path)

In [70]:
# Create a new geodataframe that excludes global granules.
# The global-extent polygon is built explicitly rather than looked up as
# geometries[7]: `geometries` is reassigned by a later cell, so the lookup only
# works for one particular execution order and row group layout.
# The vertex order is kept identical to the global polygon printed above, in
# case the `!=` comparison is sensitive to coordinate order.
global_extent = Polygon([(-180, -90), (180, -90), (180, 90), (-180, 90), (-180, -90)])
pq_gdf_sub = pq_gdf[pq_gdf.geometry != global_extent]

In [71]:
# Table dimensions before (first line) and after (second line) dropping the global granules
print(pq_gdf.shape, pq_gdf_sub.shape, sep='\n')

(964695, 4)
(833018, 4)


In [76]:
# Write the filtered granules to a new file, passing a row group size taken
# from the source file so the two files can be compared.
# NOTE(review): the size is read from row group 1 (the second group), which
# requires the source file to have at least two row groups — confirm that this
# index is intentional.
out_pq_path = '../test_lpcloud_data/single_file/lpcloud_starttime_sub.parquet'
row_group_size = get_parquet_row_group_row_count(pq.ParquetFile(pq_path),1)
pq_gdf_sub.to_parquet(out_pq_path, row_group_size=row_group_size)

Run group stats again, this time on the file that excludes global granules

In [107]:
# Recompute the row group statistics, this time for the file without global granules,
# and decode one geometry statistic per row group
group_stats = info.get_parquet_group_stats(out_pq_path, False)
group_ids = list(range(len(group_stats)))
geometries = [
    wkb.loads(group_stats[f'Group_{group_id}']['geometry']['min'])  # also try max
    for group_id in group_ids
]

Collect all the (minimum) bounding boxes into a geodataframe

In [108]:
# One row per row group of the filtered file, tagged as EPSG:4326.
# NOTE(review): 'rg_gdf_aub' looks like a typo for 'rg_gdf_sub'; the name is
# also used by the lonboard.viz cell, so rename both cells together.
rg_gdf_aub = gpd.GeoDataFrame(
    data={'group_id': group_ids},
    geometry=geometries,
    crs="EPSG:4326",
)
rg_gdf_aub

Unnamed: 0,group_id,geometry
0,0,"POLYGON ((34.74051 45.5806, 40.95589 45.5806, ..."
1,1,"POLYGON ((103.33645 29.62608, 109.33861 29.626..."
2,2,"POLYGON ((-98.90105 35.97371, -92.49903 35.973..."
3,3,"POLYGON ((80.56327 12.43794, 85.79968 12.43794..."
4,4,"POLYGON ((-62.59673 -17.56367, -57.36228 -17.5..."
5,5,"POLYGON ((112.45622 -21.77249, 117.79999 -21.7..."
6,6,"POLYGON ((-65.70419 -21.98913, -60.31323 -21.9..."


TODO:  
- visualize both min and max boxes
- switch to static maps

In [106]:
# Show the row group geometries of the filtered file on a lonboard map
# (the import cell notes that these won't render on GitHub).
lonboard.viz(rg_gdf_aub)

Map(basemap_style=<CartoBasemap.DarkMatter: 'https://basemaps.cartocdn.com/gl/dark-matter-gl-style/style.json'…