In [1]:
# Use autoreload to automatically reload modules
%load_ext autoreload
%autoreload 2
import rootutils
root_path = rootutils.find_root()

In [None]:
import pandas as pd
import numpy as np
from shapely import wkt
from scipy.spatial.distance import cdist
import geopandas as gpd

def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate distance between two points on Earth in km
    """
    R = 6371  # Earth radius in km
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    return R * c

def filter_by_distance(df_tile, min_distance_km=5):
    """
    Keep points that are at least min_distance_km apart, preserving original indices.
    Prioritizes larger areas (size_pixels).
    """
    if len(df_tile) == 0:
        return df_tile
    
    # Sort by size_pixels descending to prioritize larger areas
    df_sorted = df_tile.sort_values('size_pixels', ascending=False).reset_index(drop=False)
    
    coords = df_sorted[['centroid_y', 'centroid_x']].values
    original_indices = df_sorted['index'].tolist()
    keep_positions = []
    remaining_positions = list(range(len(df_sorted)))
    
    while remaining_positions:
        current_pos = remaining_positions[0]
        keep_positions.append(current_pos)
        
        current_coords = coords[current_pos:current_pos+1]
        
        distances = []
        for rem_pos in remaining_positions:
            dist = haversine_distance(
                current_coords[0, 0], current_coords[0, 1],
                coords[rem_pos, 0], coords[rem_pos, 1]
            )
            distances.append(dist)
        
        remaining_positions = [
            remaining_positions[i] 
            for i, dist in enumerate(distances) 
            if dist >= min_distance_km
        ]
    
    keep_original_indices = [original_indices[pos] for pos in keep_positions]
    return df_tile.loc[keep_original_indices]

Tile 00N_050W_radd_alerts: 80 -> 45 samples
Tile 00N_060W_radd_alerts: 209 -> 114 samples
Tile 00N_070W_radd_alerts: 39 -> 33 samples
Tile 00N_080W_radd_alerts: 44 -> 35 samples
Tile 10N_060W_radd_alerts: 4 -> 4 samples
Tile 10N_070W_radd_alerts: 38 -> 33 samples
Tile 10N_080W_radd_alerts: 67 -> 49 samples
Tile 10S_060W_radd_alerts: 565 -> 165 samples
Tile 10S_070W_radd_alerts: 1440 -> 256 samples
Tile 20S_060W_radd_alerts: 180 -> 57 samples
Tile 20S_070W_radd_alerts: 10 -> 4 samples

Total: 2676 -> 795 samples


Unnamed: 0.1,Unnamed: 0,geometry,component_id,size_pixels,area_hectares,earliest_alert,latest_alert,duration_days,centroid_x,centroid_y,confidence,tile_name,date_raster_file
3,3,b'\x01\x01\x00\x00\x00\xec^\xe8\xdacYF\xc0\xca...,25,1220,15.031620,2020-05-27,2020-06-25,29,-44.698360,-5.696953,high,00N_050W_radd_alerts,sample_11_dates.tif
60,60,b'\x01\x01\x00\x00\x00\xf0\x9b9\xaf\x9d\x82H\x...,674,533,6.567093,2021-08-29,2021-09-28,30,-49.020437,-2.491951,high,00N_050W_radd_alerts,sample_23_dates.tif
7,7,b'\x01\x01\x00\x00\x00f\xef4\xed\x0cHF\xc0\x81...,264,508,6.259068,2020-01-04,2020-01-28,24,-44.562894,-5.742744,high,00N_050W_radd_alerts,sample_11_dates.tif
59,59,b'\x01\x01\x00\x00\x00\xd9\x1d\x17\xa1\x98\x8d...,380,373,4.595733,2024-11-17,2024-11-29,12,-49.106220,-2.468518,high,00N_050W_radd_alerts,sample_23_dates.tif
70,70,b'\x01\x01\x00\x00\x00\xae\xdc\xb3Z\nrH\xc0\xb...,537,344,4.238424,2020-07-05,2020-07-17,12,-48.890941,-3.735393,high,00N_050W_radd_alerts,sample_25_dates.tif
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2546,2547,b'\x01\x01\x00\x00\x00Q2%|\xbc?L\xc0\x14\xb8\x...,141,101,1.244421,2020-08-09,2020-08-09,0,-56.497940,-23.596298,high,20S_060W_radd_alerts,sample_33_dates.tif
2669,2670,b'\x01\x01\x00\x00\x00VT9<\xe6\xacO\xc0?\xa2k\...,172,450,5.544450,2020-09-28,2020-10-22,24,-63.350776,-20.649194,high,20S_070W_radd_alerts,sample_37_dates.tif
2674,2675,b'\x01\x01\x00\x00\x00R\xd5n\x8fM\xadO\xc0PC;\...,372,146,1.798866,2020-10-04,2020-10-28,24,-63.353929,-20.704417,high,20S_070W_radd_alerts,sample_37_dates.tif
2666,2667,"b""\x01\x01\x00\x00\x00\xf3\xd0{\xa4C\xe0O\xc0\...",624,112,1.379952,2020-10-03,2020-10-28,25,-63.752064,-21.886446,high,20S_070W_radd_alerts,sample_01_dates.tif


In [10]:
# Load your CSV
df = pd.read_csv(root_path / 'data/processed/radd/south_america_combined_clean.csv')

# Process each tile
cleaned_dfs = []
for tile_name in df['tile_name'].unique():
    df_tile = df[df['tile_name'] == tile_name].copy()
    df_filtered = filter_by_distance(df_tile, min_distance_km=5)
    cleaned_dfs.append(df_filtered)
    print(f"Tile {tile_name}: {len(df_tile)} -> {len(df_filtered)} samples")

# Combine all tiles back together
df_cleaned = pd.concat(cleaned_dfs, ignore_index=False)

# Convert to GeoDataFrame using centroid coordinates
gdf_cleaned = gpd.GeoDataFrame(
    df_cleaned, 
    geometry=gpd.points_from_xy(df_cleaned.centroid_x, df_cleaned.centroid_y),
    crs='EPSG:4326'
)

# write to parquet
gdf_cleaned.to_parquet(root_path / 'data/processed/radd/south_america_combined_clean_sampled.parquet')

print(f"\nTotal: {len(df)} -> {len(gdf_cleaned)} samples")

gdf_cleaned

Tile 00N_050W_radd_alerts: 80 -> 45 samples
Tile 00N_060W_radd_alerts: 209 -> 114 samples
Tile 00N_070W_radd_alerts: 39 -> 33 samples
Tile 00N_080W_radd_alerts: 44 -> 35 samples
Tile 10N_060W_radd_alerts: 4 -> 4 samples
Tile 10N_070W_radd_alerts: 38 -> 33 samples
Tile 10N_080W_radd_alerts: 67 -> 49 samples
Tile 10S_060W_radd_alerts: 565 -> 165 samples
Tile 10S_070W_radd_alerts: 1440 -> 256 samples
Tile 20S_060W_radd_alerts: 180 -> 57 samples
Tile 20S_070W_radd_alerts: 10 -> 4 samples

Total: 2676 -> 795 samples


Unnamed: 0.1,Unnamed: 0,geometry,component_id,size_pixels,area_hectares,earliest_alert,latest_alert,duration_days,centroid_x,centroid_y,confidence,tile_name,date_raster_file
3,3,POINT (-44.69836 -5.69695),25,1220,15.031620,2020-05-27,2020-06-25,29,-44.698360,-5.696953,high,00N_050W_radd_alerts,sample_11_dates.tif
60,60,POINT (-49.02044 -2.49195),674,533,6.567093,2021-08-29,2021-09-28,30,-49.020437,-2.491951,high,00N_050W_radd_alerts,sample_23_dates.tif
7,7,POINT (-44.56289 -5.74274),264,508,6.259068,2020-01-04,2020-01-28,24,-44.562894,-5.742744,high,00N_050W_radd_alerts,sample_11_dates.tif
59,59,POINT (-49.10622 -2.46852),380,373,4.595733,2024-11-17,2024-11-29,12,-49.106220,-2.468518,high,00N_050W_radd_alerts,sample_23_dates.tif
70,70,POINT (-48.89094 -3.73539),537,344,4.238424,2020-07-05,2020-07-17,12,-48.890941,-3.735393,high,00N_050W_radd_alerts,sample_25_dates.tif
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2546,2547,POINT (-56.49794 -23.5963),141,101,1.244421,2020-08-09,2020-08-09,0,-56.497940,-23.596298,high,20S_060W_radd_alerts,sample_33_dates.tif
2669,2670,POINT (-63.35078 -20.64919),172,450,5.544450,2020-09-28,2020-10-22,24,-63.350776,-20.649194,high,20S_070W_radd_alerts,sample_37_dates.tif
2674,2675,POINT (-63.35393 -20.70442),372,146,1.798866,2020-10-04,2020-10-28,24,-63.353929,-20.704417,high,20S_070W_radd_alerts,sample_37_dates.tif
2666,2667,POINT (-63.75206 -21.88645),624,112,1.379952,2020-10-03,2020-10-28,25,-63.752064,-21.886446,high,20S_070W_radd_alerts,sample_01_dates.tif


In [8]:
negative_df = pd.read_parquet(root_path / 'data/processed/radd/negative_samples_sequential.parquet')

# rename columns lat lon to centroid_y centroid_x
negative_df = negative_df.rename(columns={'lat': 'centroid_y', 'lon': 'centroid_x'})

# Filter negative_df to keep only rows where positive_event_id is in the retained indices
retained_positive_ids = df_cleaned.index.tolist()
negative_df_filtered = negative_df[negative_df['positive_event_id'].isin(retained_positive_ids)]

print(f"Negative samples: {len(negative_df)} -> {len(negative_df_filtered)}")

# Convert to GeoDataFrame using centroid coordinates
gdf_negative = gpd.GeoDataFrame(
    negative_df_filtered, 
    geometry=gpd.points_from_xy(negative_df_filtered.centroid_x, negative_df_filtered.centroid_y),
    crs='EPSG:4326'
)

# Save as GeoPackage
gdf_negative.to_parquet(root_path / 'data/processed/radd/negative_samples_sequential_sampled.parquet')

negative_df_filtered

Negative samples: 2676 -> 794


Unnamed: 0,centroid_y,centroid_x,forest_fraction,earliest_alert,latest_alert,safety_buffer_months,is_deforestation,attempt_number,positive_event_id,positive_lat,positive_lon,positive_area_hectares,positive_duration_days,tile_id,geometry
0,-8.93585,-47.11885,0.339517,2021-08-29,2021-09-22,2,False,5,0,-8.150868,-49.487678,1.342989,24,00N_050W,"b""\x01\x01\x00\x00\x00\x97\x90\x0fz6\x8fG\xc0W..."
1,-2.62135,-49.45165,0.823114,2025-05-16,2025-06-09,2,False,5,1,-8.475450,-49.603209,3.844152,24,00N_050W,b'\x01\x01\x00\x00\x00\x84\x9e\xcd\xaa\xcf\xb9...
2,-5.03155,-49.44135,0.560891,2025-06-21,2025-07-15,2,False,5,2,-8.481419,-49.554126,1.281384,24,00N_050W,b'\x01\x01\x00\x00\x00x\x0b$(~\xb8H\xc0U\xc1\x...
3,-8.05495,-47.82575,0.267312,2020-05-27,2020-06-25,2,False,1,3,-5.696953,-44.698360,15.031620,29,00N_050W,b'\x01\x01\x00\x00\x00\x04V\x0e-\xb2\xe9G\xc0\...
4,-2.83135,-42.77795,1.000000,2022-11-06,2022-11-18,2,False,2,4,-5.717239,-44.836128,1.306026,12,00N_050W,b'\x01\x01\x00\x00\x00+\xf6\x97\xdd\x93cE\xc0\...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2664,-23.46795,-58.33565,0.317011,2021-10-27,2021-11-08,2,False,2,2665,-22.801068,-57.072580,1.872792,12,20S_060W,b'\x01\x01\x00\x00\x00\x82sF\x94\xf6*M\xc0\xc8...
2665,-20.14365,-59.34705,0.488323,2021-07-23,2021-08-16,2,False,4,2666,-22.889063,-57.015969,3.043287,24,20S_060W,"b'\x01\x01\x00\x00\x00\xd5\th""l\xacM\xc0\x06\x..."
2666,-25.88785,-64.65255,0.630448,2020-10-03,2020-10-28,2,False,1,2667,-21.886446,-63.752064,1.379952,25,20S_070W,b'\x01\x01\x00\x00\x00O@\x13a\xc3)P\xc0\xec\xc...
2668,-24.46745,-63.35915,0.933223,2020-09-28,2020-10-22,2,False,2,2669,-20.638933,-63.340133,2.377953,24,20S_070W,b'\x01\x01\x00\x00\x00\xe0-\x90\xa0\xf8\xadO\x...
