# Bucket Brigade

Bucket Brigade is a system for taking a large Parquet file and breaking it up spatially across multiple files.

In [None]:
import os
from collections import Counter
import json

import pandas as pd
import geopandas as gpd
#import dask_geopandas
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
from shapely import wkb, wkt
import pygeohash as pgh

import ipywidgets as widgets
from IPython.display import display
import tkinter as tk
from tkinter import filedialog


## What is a GeoHash?

Let's take a second to look at a GeoHash and what it looks like.

In [None]:
# Encode the same coordinate twice: once with the library's default
# precision and once with an explicit precision of 5.
sample_point = {"latitude": 42.6, "longitude": -5.6}
print(pgh.encode(**sample_point))
print(pgh.encode(**sample_point, precision=5))

# Decode a geohash string back into a coordinate.
print(pgh.decode(geohash='ezs42'))

# Approximate distance between two geohashes; the result is divided by 1000
# and labelled "km" (presumably the library reports metres - confirm).
approx_distance = pgh.geohash_approximate_distance(geohash_1='bcd3u', geohash_2='bc83n')
print(approx_distance / 1000, "km")

# NOTE: get_adjacent was not found in the installed library, so the call
# below stays disabled.
#print(pgh.get_adjacent(geohash='kd3ybyu', direction='right'))

## Settings

Pick a file and then read it in using GeoPandas

In [None]:
# Path of the source GeoParquet file that will be split into buckets; edit
# this to point at the file to process.
# NOTE(review): the path starts with "~" - presumably the reader expands it to
# the home directory; confirm with the installed GeoPandas version.
selected_file = "~/src/project/cmr-bigstac-prototype/bigstac/scripts_explore/3mil_no_global_bounds.parquet"

# Read the GeoParquet file
# gdf is the GeoDataFrame that the later cells iterate over.
gdf = gpd.read_parquet(selected_file)

## Some utility functions

We are going to use these helpers later on.

In [None]:
# some utilities
def write_string_to_file(filename, content):
    """Write ``content`` to ``filename``, replacing any existing file.

    This is a best-effort helper: an I/O failure is reported on stdout
    and is not re-raised to the caller.

    Args:
        filename: Path of the file to create or overwrite.
        content: Text to write.
    """
    try:
        with open(filename, mode='w') as out:
            out.write(content)
    except OSError as err:  # IOError is an alias of OSError in Python 3
        print(f"An error occurred while writing to the file {filename}: {err}")

def make_geo_box(sub:str, details:dict) -> None:
    """Ensure the bucket directory ``data/<sub>`` exists and holds ``info.json``.

    The directory is created under the current working directory.
    ``info.json`` is written only when it is not already present, so the
    details kept for a bucket are those of the first call that wrote the file.
    A directory that already exists but has no ``info.json`` still receives one.

    Args:
        sub: Bucket name, used as the directory name under ``data``.
        details: JSON-serialisable description of the bucket.
    """
    data = os.path.join(os.getcwd(), "data", sub)
    # exist_ok avoids the check-then-create race and lets execution reach
    # the info.json check even when the directory was created beforehand.
    os.makedirs(data, exist_ok=True)
    info_path = os.path.join(data, "info.json")
    if not os.path.exists(info_path):
        write_string_to_file(info_path, json.dumps(details))

## Break things up

Here we will go through all the rows in a Parquet file. For each row we will use the bounding box of
the row and calculate the GeoHash for its two corners. We will then use the two GeoHashes to create a
'hash code' for the bucket to store rows in.

Using the lowest precision of 1 will give us 32 grid cells. From these 32 cells we could have up to 1024 buckets, one for every combination of two bounding box GeoHash codes.

Create a GeoDataFrame for every bucket and concat the record to it.

Each bucket will also have a file called info.json that will contain the details of the bucket.

In [None]:
# Assign every row of gdf to a bucket named "<hash1>-<hash2>", where the two
# parts are the precision-1 geohashes of the min and max corners of the row's
# bounding box. Results:
#   counter      - number of rows assigned to each bucket in this run
#   parquet_data - bucket name -> GeoDataFrame holding that bucket's rows
#                  (appended to the bucket's parquet file when one already
#                  exists on disk)
#   geometry     - the last geometry that was bucketed (drawn by the plot cell)
counter = Counter()
parquet_data = {}

# Stop after this many rows; defined once because it never changes.
limit_records = 500000
data_root = os.path.join(os.getcwd(), "data")

# Row positions (0-based, in gdf order) collected per bucket. Slicing gdf once
# per bucket afterwards avoids rebuilding a GeoDataFrame for every single row.
bucket_rows = {}
processed = 0

for position, shape in enumerate(gdf['geometry']):
    # A missing or empty geometry has no bounding box to hash.
    if shape is None or shape.is_empty:
        continue
    geometry = shape

    minx, miny, maxx, maxy = geometry.bounds

    # bounds are (minx, miny, maxx, maxy); x is the longitude and y is the
    # latitude, so the latitude arguments must come from the y values.
    hash1 = pgh.encode(latitude=miny, longitude=minx, precision=1)
    hash2 = pgh.encode(latitude=maxy, longitude=maxx, precision=1)
    bucket = f"{hash1}-{hash2}"

    if bucket not in bucket_rows:
        # First row seen for this bucket: make sure its directory and
        # info.json exist. Later rows would not change either.
        distance = pgh.geohash_approximate_distance(geohash_1=hash1, geohash_2=hash2)
        details = {'hash1': hash1,
                   'hash2': hash2,
                   'hash': bucket,
                   'distance': distance,
                   'bounds': geometry.bounds}
        make_geo_box(bucket, details)
        bucket_rows[bucket] = []

    bucket_rows[bucket].append(position)
    counter[bucket] += 1
    processed += 1

    # Count rows, not buckets: len(counter) is only the number of distinct buckets.
    if processed >= limit_records:
        print(f"breaking after {limit_records}")
        break
    elif processed % 10000 == 0:
        print(f"{processed} records processed")

# maybe change this to instead open and write the file in one function so that
# nothing is in memory.
for bucket, positions in bucket_rows.items():
    selected = gdf.iloc[positions]
    bucket_file = os.path.join(data_root, bucket, f"{bucket}.parquet")
    if os.path.exists(bucket_file):
        print(f"reading {bucket} from disk")
        # Load the bucket's own file (not the source file) and append to it.
        existing = gpd.read_parquet(bucket_file)
        combined = pd.concat([existing, selected], ignore_index=True)
    else:
        print(f"creating a new dataframe as {bucket}")
        combined = selected.reset_index(drop=True)
    parquet_data[bucket] = gpd.GeoDataFrame(combined)


In [None]:
# Persist every in-memory bucket to data/<bucket>/<bucket>.parquet,
# reporting progress as "<n> of <total>".
total = len(parquet_data)
for c, (key, frame) in enumerate(parquet_data.items(), start=1):
    print(f"writing {key} to disk. {c} of {total}")
    frame.to_parquet(f"{os.getcwd()}/data/{key}/{key}.parquet")

In [None]:
data = f"{os.getcwd()}/data"

# Create the data root first, then one sub-directory per bucket name in
# counter; anything that already exists is left alone.
for folder in [data, *(f"{data}/{bucket_name}" for bucket_name in counter)]:
    if not os.path.exists(folder):
        os.makedirs(folder)

Let's have a look at one of these boxes, just so we know what we are dealing with.

In [None]:
# Draw the geometry left over from the last iteration of the bucketing loop.
polygon = geometry

# Single axes on a figure created with figsize=(10, 10).
fig, ax = plt.subplots(figsize=(10, 10))

# Exterior ring coordinates: outline first, then a translucent fill on top.
ring_x, ring_y = polygon.exterior.xy
ax.plot(ring_x, ring_y)
ax.fill(ring_x, ring_y, alpha=0.3)

# Same scale on both axes so the shape is not stretched.
ax.set_aspect('equal')

# Title and axis captions.
ax.set_title("Polygon Visualization")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

# Render the figure.
plt.show()
