# Setup

In [2]:
import pandas as pd
import geopandas as gpd
import rasterio as rio
import concurrent.futures
import threading
import os
from tqdm import tqdm

# Source

## Download

### Retrieve geotiff paths

In [55]:
# Build the list of orthomosaic URLs for the Upper Partridge RGB surveys.
# The MPG Aerial Survey Manifest is a Google Sheet published as CSV.
sheet = pd.read_csv(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vTgk2SSxgZ76UTYOlmf5UtjF1q59ZpKXCNH1Mn3rzbxQ_f6MvoTUirnUIKdcKxd-NIZk-MN8bsG3SP6/pub?output=csv"
)

# Keep only the rows whose sensor is 'RGB' and whose site is 'Upper Partridge'
upper_partridge = sheet[
    sheet["sensor"].eq("RGB") & sheet["site"].eq("Upper Partridge")
]

# Distinct, non-null values of the orthomosaic column (the cloud storage paths)
cloud_storage_paths = upper_partridge["orthomosaic"].dropna().unique()

### Download geotiffs

In [None]:
# Download every URL in cloud_storage_paths with wget into ../data/raster/geotiffs
# (currently disabled: uncomment the loop below to run the download)
# for url in cloud_storage_paths:
#     filename = url.split('/')[-1]
#     os.system(f'wget -q {url} -O ../data/raster/geotiffs/{filename}')

## Local resources

### Geotiff

In [3]:
# Local geotiff array: one path per entry of ../data/raster/geotiffs, skipping
# .DS_Store files. os.listdir returns entries in arbitrary order, so the list
# is sorted to process the surveys in a reproducible order on every run.
geotiffs = sorted(
    f'../data/raster/geotiffs/{x}'
    for x in os.listdir('../data/raster/geotiffs')
    if not x.endswith('.DS_Store')
)

In [4]:
# Local ground truth file: path of the GeoJSON that the tile-cropping cell
# reads with geopandas and filters on its 'Presence' column.
groundtruth = '../data/vector/groundtruth.geojson'

# Functions

In [5]:
# see https://colab.research.google.com/drive/15LFRMVOfEiF__FswVqTQstZC5ocC6Ur0?usp=sharing for cropping functions

# Define the function that creates a bounding box and returns its min and max coordinates
def bbox_side_len(point, side_len=10):
    """Return the corners of a square of side ``side_len`` centred on ``point``.

    ``point`` must expose ``x``, ``y`` and ``is_empty``. The result is the
    tuple ``(minx, miny, maxx, maxy)``; a ``None`` or empty point yields
    ``(None, None, None, None)`` so callers can detect and skip it.
    """
    if point is not None and not point.is_empty:
        radius = side_len / 2
        centre_x, centre_y = point.x, point.y
        return (
            centre_x - radius,
            centre_y - radius,
            centre_x + radius,
            centre_y + radius,
        )
    # Invalid point: one None per expected coordinate
    return (None,) * 4

def process_bbox(raster_path, bbox, output_dir, read_lock, write_lock, idx):
    """Crop one bounding box out of a raster and save it as ``{output_dir}/{idx}.tif``.

    Parameters
    ----------
    raster_path
        Path of the source raster; opened with rasterio on every call.
    bbox
        ``(minx, miny, maxx, maxy)`` tuple, as produced by ``bbox_side_len``.
        A tuple whose first element is ``None`` marks an invalid point.
    output_dir
        Directory that receives the tile; created if it does not exist.
    read_lock, write_lock
        Context managers held around the raster read and the tile write
        respectively (``do_tiling`` passes ``threading.Lock`` objects).
    idx
        Identifier used as the stem of the output file name.

    Returns
    -------
    str
        ``"Success"`` when the tile was written, otherwise a short reason.
        Exceptions raised while opening, reading or writing are caught and
        returned as ``"Error: <message>"`` instead of propagating.
    """
    minx, miny, maxx, maxy = bbox
    if minx is None:
        return "Invalid point"

    try:
        with rio.open(raster_path) as src:
            # Skip boxes that do not overlap the raster at all
            raster_bounds = src.bounds
            if (minx >= raster_bounds.right or maxx <= raster_bounds.left or
                miny >= raster_bounds.top or maxy <= raster_bounds.bottom):
                return "Bounding box outside raster extent"

            # Clip the window to the raster so that, for boxes straddling an
            # edge, the pixels read and the transform written below describe
            # the same area (an unclipped window can start at a negative
            # offset, which would shift the georeferencing of the tile).
            window = src.window(minx, miny, maxx, maxy).crop(src.height, src.width)

            # Ensure the window has a valid size
            if window.width < 1 or window.height < 1:
                return "Resulting window too small"

            with read_lock:
                src_array = src.read(window=window)

            # If the read array is empty, skip this bbox
            if src_array.size == 0:
                return "Empty array read"

            # Reuse the source profile, adjusted to the tile's size and origin
            profile = src.profile.copy()
            profile.update({
                "height": src_array.shape[1],
                "width": src_array.shape[2],
                "transform": rio.windows.transform(window, src.transform)
            })

            output_file = f"{output_dir}/{idx}.tif"
            # exist_ok avoids the check-then-create race between worker
            # threads that would otherwise surface as a spurious error.
            os.makedirs(output_dir, exist_ok=True)

            with write_lock:
                with rio.open(output_file, "w", **profile) as dst:
                    dst.write(src_array)

        return "Success"
    except Exception as e:
        return f"Error: {str(e)}"

def do_tiling(raster_path, points_gdf, output_dir, num_workers=4, side_len=10):
    """Crop a square tile around every point of ``points_gdf`` from one raster.

    Parameters
    ----------
    raster_path
        Path of the raster to crop; forwarded to ``process_bbox``.
    points_gdf
        Object exposing ``idx`` and ``geometry`` iterables of equal length
        (presumably a GeoDataFrame with an ``idx`` column; confirm with the
        ground-truth schema). Each ``idx`` becomes the tile's file name.
    output_dir
        Directory in which ``process_bbox`` writes the ``{idx}.tif`` tiles.
    num_workers
        ``max_workers`` of the thread pool running the crops.
    side_len
        Side of each square tile, in the units expected by ``bbox_side_len``.

    Returns
    -------
    None
        A progress bar is shown while cropping and a summary of the failed
        tiles (at most ten lines) is printed afterwards.
    """
    # Pair each point's bounding box with its identifier; invalid points
    # (bbox of Nones) are dropped before any work is scheduled.
    valid_bbox_data = [
        (bbox, idx)
        for bbox, idx in (
            (bbox_side_len(point, side_len=side_len), idx)
            for idx, point in zip(points_gdf.idx, points_gdf.geometry)
        )
        if bbox[0] is not None
    ]

    read_lock = threading.Lock()
    write_lock = threading.Lock()

    # The context manager closes the bar even if the run is interrupted.
    with tqdm(total=len(valid_bbox_data), desc="Processing bounding boxes", unit="bbox") as pbar:

        def process_and_update(bbox_info):
            bbox, idx = bbox_info
            result = process_bbox(raster_path, bbox, output_dir, read_lock, write_lock, idx)
            pbar.update(1)
            return idx, result

        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            # executor.map yields results in input order, so the error report
            # is deterministic and needs no list shared between threads.
            results = list(executor.map(process_and_update, valid_bbox_data))

    # Keep the tile identifier with each failure so it can be traced back
    errors = [f"tile {idx}: {result}" for idx, result in results if result != "Success"]

    if errors:
        print(f"\nEncountered {len(errors)} errors:")
        for error in errors[:10]:  # Print first 10 errors
            print(f"  - {error}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more.")

# Crop Tiles

In [None]:
# Side length of each square crop, passed to do_tiling as side_len.
# NOTE(review): the value is applied in the raster CRS units, so "meters"
# assumes a metric projected CRS; confirm for every survey.
target_crop_sz_meters = 5

# os.cpu_count() returns None when the count cannot be determined;
# fall back to a single worker so the message and the pool size stay valid.
num_cores = os.cpu_count() or 1
print(f'Using {num_cores} cores')

In [None]:
# Crop tiles for each geotiff
# The ground-truth points are the same for every survey, so the GeoJSON is
# read once here instead of being re-parsed on every loop iteration.
groundtruth_gdf = gpd.read_file(groundtruth)

for geotiff in geotiffs:
    # Survey name = file name up to the first '.', without the '-visible' suffix
    survey_name = os.path.basename(geotiff).split('.')[0].replace('-visible', '')

    # Set up output directories
    output_base = f'../data/raster/tiles/{survey_name}'
    for label in ['presence', 'absence']:
        os.makedirs(f'{output_base}/{label}', exist_ok=True)

    # Read the raster CRS and align the points with it
    with rio.open(geotiff) as src:
        raster_crs = src.crs
        print(f"Raster CRS: {raster_crs}")

    print(f"GeoJSON CRS: {groundtruth_gdf.crs}")

    # Reproject into a per-raster frame; the frame read above is never
    # modified, so every raster starts from the original coordinates.
    if groundtruth_gdf.crs != raster_crs:
        print(f"Reprojecting from {groundtruth_gdf.crs} to {raster_crs}")
        gdf = groundtruth_gdf.to_crs(raster_crs)
    else:
        gdf = groundtruth_gdf

    # Process both presence and absence points
    for presence_val in [1, 0]:
        label = 'presence' if presence_val == 1 else 'absence'
        points = gdf[gdf['Presence'] == presence_val]
        output_dir = f'{output_base}/{label}'

        do_tiling(geotiff, points, output_dir,
                  num_workers=num_cores, side_len=target_crop_sz_meters)

# Export

In [17]:
# Archive the whole tiles directory recursively (-r) and quietly (-q).
# Note that the archive itself is written inside the directory being zipped.
!zip -rq ../data/raster/tiles/mpg-horses_tiles.zip ../data/raster/tiles