In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import box
from keplergl import KeplerGl
from pyspark.sql import SparkSession
from pyspark import SparkConf
from sedona.spark import SedonaContext

# Maven coordinates resolved at session start-up: Sedona and its GeoTools
# wrapper, plus the Iceberg Spark runtime and the Iceberg AWS bundle.
maven_packages = ",".join([
    "org.apache.sedona:sedona-spark-3.5_2.12:1.7.2",
    "org.datasyslab:geotools-wrapper:1.7.2-28.5",
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.2",
    "org.apache.iceberg:iceberg-aws-bundle:1.4.2",
])

# The default session catalog is routed through Iceberg's SparkSessionCatalog
# with the Glue catalog implementation; Spark itself runs locally ("local[2]").
spark_conf = (
    SparkConf()
    .set("spark.jars.packages", maven_packages)
    .set("spark.jars.repositories", "https://artifacts.unidata.ucar.edu/repository/unidata-all")
    .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .set("spark.sql.catalog.spark_catalog.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .set("spark.master", "local[2]")
)

# getOrCreate() hands back an already-running session if the kernel has one
spark_session = (
    SparkSession.builder
    .appName("ChipViewer")
    .config(conf=spark_conf)
    .getOrCreate()
)

# Wrap the session so Sedona's spatial types and functions are registered
spark = SedonaContext.create(spark_session)

# Read chip metadata for a single geohash cell from the Iceberg table.
# ORDER BY makes the LIMIT deterministic: without it Spark is free to return
# any 20 matching rows, so a rerun could display a different set of chips.
chips_df = spark.sql("""
    SELECT
        chip_id,
        datetime,
        geohash,
        is_complete,
        cloud_coverage,
        created_at
    FROM sentinel.chips_raw
    WHERE geohash = '6wht19'
    ORDER BY datetime, chip_id
    LIMIT 20
""").toPandas()


# Extract chip bounds from chip_id (format: chip_x_y)
def get_chip_bounds(chip_id, chip_size_degrees=0.023):
    """Return the bounding box encoded in a chip id, or None if it is malformed.

    The id is split on underscores; the second and third fields are read as
    integer grid indices ``x`` and ``y``. The box starts at
    ``(-180 + x * chip_size_degrees, -90 + y * chip_size_degrees)`` and spans
    ``chip_size_degrees`` along both axes.

    Args:
        chip_id: Identifier of the form ``chip_<x>_<y>``. Extra trailing
            fields are ignored.
        chip_size_degrees: Edge length of one grid cell. Defaults to 0.023,
            the value that was previously hard-coded.

    Returns:
        A shapely box, or None when ``chip_id`` is not a string, has fewer
        than three underscore-separated fields, or has non-integer indices.
    """
    # Missing ids arrive from pandas as None/NaN; treat them like any other
    # malformed id so the caller's dropna() can discard the row.
    if not isinstance(chip_id, str):
        return None

    parts = chip_id.split('_')
    if len(parts) < 3:
        return None

    try:
        x, y = int(parts[1]), int(parts[2])
    except ValueError:
        # Non-numeric grid index: malformed id, not a reason to abort the apply()
        return None

    minx = -180 + (x * chip_size_degrees)
    miny = -90 + (y * chip_size_degrees)
    maxx = minx + chip_size_degrees
    maxy = miny + chip_size_degrees
    return box(minx, miny, maxx, maxy)

# Create GeoDataFrame.
# The chip boxes are expressed in degrees measured from (-180, -90), i.e.
# longitude/latitude, so the CRS is declared explicitly rather than left
# undefined. Rows whose chip_id could not be parsed have no geometry and are
# dropped.
chips_df['geometry'] = chips_df['chip_id'].apply(get_chip_bounds)
chips_gdf = gpd.GeoDataFrame(
    chips_df.dropna(subset=['geometry']),
    geometry='geometry',
    crs='EPSG:4326',
)

# Convert datetime to string for Kepler
chips_gdf['datetime_str'] = chips_gdf['datetime'].astype(str)
# NOTE(review): missing cloud coverage is replaced by 0, which is
# indistinguishable from a genuinely cloud-free chip -- confirm this is intended.
chips_gdf['cloud_coverage'] = chips_gdf['cloud_coverage'].fillna(0)

print(f"Loaded {len(chips_gdf)} chips")
print(f"Date range: {chips_gdf['datetime'].min()} to {chips_gdf['datetime'].max()}")
print(f"Cloud coverage range: {chips_gdf['cloud_coverage'].min():.1f}% to {chips_gdf['cloud_coverage'].max():.1f}%")

# Build the Kepler widget and register the chip frame under a dataset name
map_1 = KeplerGl(height=600)
map_1.add_data(chips_gdf, 'sentinel_chips')

# A bare final expression makes the notebook render the widget
map_1


25/08/14 12:10:01 WARN UDTRegistration: Cannot register UDT for org.geotools.coverage.grid.GridCoverage2D, which is already registered.
25/08/14 12:10:01 WARN SimpleFunctionRegistry: The function rs_union_aggr replaced a previously registered function.
25/08/14 12:10:01 WARN UDTRegistration: Cannot register UDT for org.locationtech.jts.geom.Geometry, which is already registered.
25/08/14 12:10:01 WARN UDTRegistration: Cannot register UDT for org.apache.sedona.common.geometryObjects.Geography, which is already registered.
25/08/14 12:10:01 WARN UDTRegistration: Cannot register UDT for org.locationtech.jts.index.SpatialIndex, which is already registered.
25/08/14 12:10:01 WARN SimpleFunctionRegistry: The function st_envelope_aggr replaced a previously registered function.
25/08/14 12:10:01 WARN SimpleFunctionRegistry: The function st_intersection_aggr replaced a previously registered function.
25/08/14 12:10:01 WARN SimpleFunctionRegistry: The function st_union_aggr replaced a previously

Loaded 8 chips
Date range: 2023-01-02 08:25:38.546000 to 2023-01-22 08:25:32.146000
Cloud coverage range: 4.0% to 8.9%
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'sentinel_chips': {'index': [0, 1, 2, 3, 4, 5, 6, 7], 'columns': ['chip_id', 'datetime', 'geoha…

In [4]:
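# NOTE: importing `jupyter` does not enable the keplergl widget extension; the
# extension has to be enabled in the Jupyter environment itself (see the kepler.gl Jupyter user guide).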
import jupyter
from keplergl import KeplerGl

# Restrict the frame to the attributes worth showing on the map and make sure
# cloud coverage is a plain float column.
display_cols = ['chip_id', 'geohash', 'is_complete', 'cloud_coverage', 'geometry']
chips_display = chips_gdf[display_cols].assign(
    cloud_coverage=lambda frame: frame['cloud_coverage'].astype(float)
)

# Fresh widget holding only the trimmed frame, exported as a standalone page
map_1 = KeplerGl(height=600)
map_1.add_data(data=chips_display, name='sentinel_chips')
map_1.save_to_html(file_name='chip_viewer.html')

# Text dump of each chip's bounding box as a sanity check that does not
# depend on the widget rendering.
print("\nChip locations:")
for chip_id, geom in zip(chips_gdf['chip_id'], chips_gdf['geometry']):
    minx, miny, maxx, maxy = geom.bounds
    print(f"{chip_id}: ({minx:.3f}, {miny:.3f}) to ({maxx:.3f}, {maxy:.3f})")


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to chip_viewer.html!

Chip locations:
chip_5169_3462: (-61.113, -10.374) to (-61.090, -10.351)
chip_5169_3462: (-61.113, -10.374) to (-61.090, -10.351)
chip_5169_3462: (-61.113, -10.374) to (-61.090, -10.351)
chip_5169_3462: (-61.113, -10.374) to (-61.090, -10.351)
chip_5169_3462: (-61.113, -10.374) to (-61.090, -10.351)
chip_5169_3462: (-61.113, -10.374) to (-61.090, -10.351)
chip_5169_3462: (-61.113, -10.374) to (-61.090, -10.351)
chip_5169_3462: (-61.113, -10.374) to (-61.090, -10.351)


In [6]:
import time, rasterio
import numpy as np

# GDAL options handed to rasterio.Env for every benchmark in this cell
rasterio_env = dict(
    GDAL_CACHEMAX=512,
    CPL_VSIL_CURL_CACHE_SIZE=200000000,
    GDAL_HTTP_MULTIPLEX='YES',
    GDAL_DISABLE_READDIR_ON_OPEN='EMPTY_DIR',
)
print(f"Using rasterio config: {rasterio_env}")

# B02 GeoTIFFs of three scenes in the sentinel-cogs bucket
cog_root = 's3://sentinel-cogs/sentinel-s2-l2a-cogs'
urls = [
    f'{cog_root}/{grid_path}/2023/1/{scene_id}/B02.tif'
    for grid_path, scene_id in (
        ('20/L/PN', 'S2A_20LPN_20230102_0_L2A'),
        ('20/L/PP', 'S2A_20LPP_20230102_0_L2A'),
        ('20/L/QP', 'S2A_20LQP_20230112_0_L2A'),
    )
]

# Edge lengths (pixels) of the square windows to read: multiples of 256
sizes = [256 * factor for factor in (1, 2, 4, 6, 8, 12)]

# Benchmark windowed reads of increasing size under the configured GDAL env.
#
# NOTE(review): every window is anchored at (0, 0) and read three times in a
# row, so later reads are most likely served from GDAL's cache rather than the
# network. The reported average therefore mixes cold and warm reads -- treat
# it as indicative only.
with rasterio.Env(**rasterio_env):
    print("\n=== With Production Rasterio Config ===")

    for url in urls:
        scene_name = url.split('/')[-2]
        print(f"\nTesting {scene_name}:")

        # Keep the dataset open for all reads of this scene
        with rasterio.open(url) as src:
            for size in sizes:
                times = []
                nbytes = 0
                for _ in range(3):
                    # perf_counter is monotonic and high-resolution; time.time
                    # can jump with clock adjustments and is too coarse for
                    # sub-millisecond reads on some platforms.
                    start = time.perf_counter()
                    try:
                        data = src.read(1, window=((0, size), (0, size)))
                    except Exception as e:
                        print(f"  {size}x{size}: ERROR - {e}")
                        break
                    times.append(time.perf_counter() - start)
                    # Size of what was actually returned, instead of assuming
                    # 2 bytes per pixel
                    nbytes = data.nbytes

                if times:
                    avg_time = np.mean(times)
                    mb_size = nbytes / 1024**2
                    # Number of 256x256 chips that fit in the window
                    chips_covered = (size // 256) ** 2
                    print(f"  {size}x{size}: {avg_time:.3f}s, {mb_size:.1f}MB, {chips_covered:2d} chips, {avg_time/chips_covered:.4f}s/chip")

# Also test connection overhead: open the dataset once per read versus once
# for all reads.
print("\n=== Connection Overhead Test ===")
with rasterio.Env(**rasterio_env):
    url = urls[0]
    n_reads = 10

    # Both arms read the same sequence of 256x256 windows so that the only
    # difference between them is how often the dataset is opened. Previously
    # one arm re-read a single window while the other read ten distinct ones.
    # NOTE(review): the second arm may still benefit from caches warmed by the
    # first; swap the order to gauge how much that matters.
    windows = [((i * 256, (i + 1) * 256), (0, 256)) for i in range(n_reads)]

    # Multiple connections (current approach)
    start = time.perf_counter()
    for window in windows:
        with rasterio.open(url) as src:
            data = src.read(1, window=window)
    multi_conn_time = time.perf_counter() - start

    # Single connection (optimized approach)
    start = time.perf_counter()
    with rasterio.open(url) as src:
        for window in windows:
            data = src.read(1, window=window)
    single_conn_time = time.perf_counter() - start

    print(f"{n_reads}x 256x256 reads:")
    print(f"  Multiple connections: {multi_conn_time:.3f}s ({multi_conn_time/n_reads:.4f}s per read)")
    print(f"  Single connection: {single_conn_time:.3f}s ({single_conn_time/n_reads:.4f}s per read)")
    print(f"  Connection reuse speedup: {multi_conn_time/single_conn_time:.1f}x")


Using rasterio config: {'GDAL_CACHEMAX': 512, 'CPL_VSIL_CURL_CACHE_SIZE': 200000000, 'GDAL_HTTP_MULTIPLEX': 'YES', 'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR'}

=== With Production Rasterio Config ===

Testing S2A_20LPN_20230102_0_L2A:
  256x256: 0.063s, 0.1MB,  1 chips, 0.0627s/chip
  512x512: 0.000s, 0.5MB,  4 chips, 0.0001s/chip
  1024x1024: 0.001s, 2.0MB, 16 chips, 0.0001s/chip
  1536x1536: 0.414s, 4.5MB, 36 chips, 0.0115s/chip
  2048x2048: 0.137s, 8.0MB, 64 chips, 0.0021s/chip
  3072x3072: 0.707s, 18.0MB, 144 chips, 0.0049s/chip

Testing S2A_20LPP_20230102_0_L2A:
  256x256: 0.068s, 0.1MB,  1 chips, 0.0678s/chip
  512x512: 0.000s, 0.5MB,  4 chips, 0.0001s/chip
  1024x1024: 0.001s, 2.0MB, 16 chips, 0.0000s/chip
  1536x1536: 0.204s, 4.5MB, 36 chips, 0.0057s/chip
  2048x2048: 0.124s, 8.0MB, 64 chips, 0.0019s/chip
  3072x3072: 0.228s, 18.0MB, 144 chips, 0.0016s/chip

Testing S2A_20LQP_20230112_0_L2A:
  256x256: 0.131s, 0.1MB,  1 chips, 0.1307s/chip
  512x512: 0.001s, 0.5MB,  4 chips, 0

In [11]:
import time, rasterio, math
import numpy as np
from rasterio.warp import transform_bounds

# GDAL options handed to rasterio.Env for the tile-size benchmark
rasterio_env = dict(
    GDAL_CACHEMAX=512,
    CPL_VSIL_CURL_CACHE_SIZE=200000000,
    GDAL_HTTP_MULTIPLEX='YES',
    GDAL_DISABLE_READDIR_ON_OPEN='EMPTY_DIR',
)

# Single scene used for the tile-size benchmark
url = (
    's3://sentinel-cogs/sentinel-s2-l2a-cogs/20/L/PN/2023/1/'
    'S2A_20LPN_20230102_0_L2A/B02.tif'
)

# Tile-size benchmark: time random square window reads of several sizes and
# report the cost per 256x256 chip.
READS_PER_SIZE = 5        # timed reads per tile size
PROCESSED_CHIP_MB = 0.6   # memory estimate for one processed chip, in MB
RANDOM_SEED = 42          # fixed seed so reruns sample the same windows

tile_rng = np.random.default_rng(RANDOM_SEED)

with rasterio.Env(**rasterio_env):
    with rasterio.open(url) as src:
        # (chips per side, window edge in pixels)
        tile_sizes = [
            (1, 256),   # 1x1 = individual chips
            (2, 512),   # 2x2 = 4 chips
            (3, 768),   # 3x3 = 9 chips
            (4, 1024),  # 4x4 = 16 chips
            (6, 1536),  # 6x6 = 36 chips
            (8, 2048),  # 8x8 = 64 chips
        ]

        print("=== Tile Size Optimization ===")
        results = {}

        for tile_factor, pixel_size in tile_sizes:
            chips_per_tile = tile_factor * tile_factor
            print(f"\n{tile_factor}x{tile_factor} tiles ({pixel_size}x{pixel_size}, {chips_per_tile} chips):")

            # Largest valid top-left offset. Zero means the tile fits exactly
            # once, which is still a valid read; only a negative value means
            # the tile is larger than the image.
            max_x = src.width - pixel_size
            max_y = src.height - pixel_size
            if max_x < 0 or max_y < 0:
                print("  Tile too large for image")
                continue

            times = []
            nbytes = 0
            for _ in range(READS_PER_SIZE):
                # integers() excludes the upper bound, hence the +1 so that the
                # last valid offset can be drawn as well
                x = int(tile_rng.integers(0, max_x + 1))
                y = int(tile_rng.integers(0, max_y + 1))

                # perf_counter: monotonic, high-resolution timer for benchmarks
                start = time.perf_counter()
                data = src.read(1, window=((y, y + pixel_size), (x, x + pixel_size)))
                times.append(time.perf_counter() - start)
                # Size of what was actually read, instead of assuming 2 bytes/pixel
                nbytes = data.nbytes

            avg_time = np.mean(times)
            per_chip_time = avg_time / chips_per_tile
            mb_size = nbytes / 1024**2

            print(f"  Avg time: {avg_time:.4f}s")
            print(f"  Per chip: {per_chip_time:.4f}s")
            print(f"  Data size: {mb_size:.1f}MB")
            print(f"  Throughput: {mb_size/avg_time:.1f}MB/s")

            results[tile_factor] = {
                'per_chip_time': per_chip_time,
                'total_time': avg_time,
                'chips_per_tile': chips_per_tile,
                'mb_size': mb_size
            }

        # Find optimal
        if results:
            print("\n=== Summary ===")
            best_tile = min(results, key=lambda k: results[k]['per_chip_time'])

            # Speedups are relative to the smallest tile that was measured
            # (1x1 whenever it ran), so a missing 1x1 entry no longer raises.
            baseline_per_chip = results[min(results)]['per_chip_time']

            for tile_factor in sorted(results):
                r = results[tile_factor]
                speedup = baseline_per_chip / r['per_chip_time']
                print(f"{tile_factor}x{tile_factor}: {r['per_chip_time']:.4f}s/chip, {speedup:.1f}x speedup, {r['mb_size']:.1f}MB")

            print(f"\nOptimal tile size: {best_tile}x{best_tile} ({results[best_tile]['chips_per_tile']} chips)")

            # Memory consideration for SageMaker
            print("\n=== Memory Analysis ===")
            for tile_factor in sorted(results):
                r = results[tile_factor]
                memory_per_batch = r['chips_per_tile'] * PROCESSED_CHIP_MB
                print(f"{tile_factor}x{tile_factor}: {memory_per_batch:.1f}MB per batch")


=== Tile Size Optimization ===

1x1 tiles (256x256, 1 chips):
  Avg time: 0.4044s
  Per chip: 0.4044s
  Data size: 0.1MB
  Throughput: 0.3MB/s

2x2 tiles (512x512, 4 chips):
  Avg time: 0.2784s
  Per chip: 0.0696s
  Data size: 0.5MB
  Throughput: 1.8MB/s

3x3 tiles (768x768, 9 chips):
  Avg time: 0.2917s
  Per chip: 0.0324s
  Data size: 1.1MB
  Throughput: 3.9MB/s

4x4 tiles (1024x1024, 16 chips):
  Avg time: 0.2924s
  Per chip: 0.0183s
  Data size: 2.0MB
  Throughput: 6.8MB/s

6x6 tiles (1536x1536, 36 chips):
  Avg time: 0.4473s
  Per chip: 0.0124s
  Data size: 4.5MB
  Throughput: 10.1MB/s

8x8 tiles (2048x2048, 64 chips):
  Avg time: 0.8072s
  Per chip: 0.0126s
  Data size: 8.0MB
  Throughput: 9.9MB/s

=== Summary ===
1x1: 0.4044s/chip, 1.0x speedup, 0.1MB
2x2: 0.0696s/chip, 5.8x speedup, 0.5MB
3x3: 0.0324s/chip, 12.5x speedup, 1.1MB
4x4: 0.0183s/chip, 22.1x speedup, 2.0MB
6x6: 0.0124s/chip, 32.5x speedup, 4.5MB
8x8: 0.0126s/chip, 32.1x speedup, 8.0MB

Optimal tile size: 6x6 (36 chip