In [5]:
# Use autoreload to automatically reload modules
# (mode 2: every imported module is reloaded before each cell executes,
# so edits to project .py files are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2
import rootutils
# Base directory that the data paths in the cells below are built from
# (`root_path / "data/processed/..."`).
# NOTE(review): presumably the repository root located by rootutils' default
# marker-file search — confirm which marker file this project relies on.
root_path = rootutils.find_root()

In [6]:
import pandas as pd
import numpy as np
from shapely import wkt
from scipy.spatial.distance import cdist
import geopandas as gpd

def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate distance between two points on Earth in km

    Uses the haversine formula on a sphere of radius 6371 km.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude / longitude of the second point(s), in degrees.
        Inputs are combined with NumPy broadcasting, so a single point can be
        compared against an array of points in one call.

    Returns
    -------
    float or numpy.ndarray
        Great-circle distance(s) in kilometres.
    """
    R = 6371  # Earth radius in km

    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make arcsin return NaN; clamp it to the valid domain first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c

def filter_by_distance(df_tile, min_distance_km=5):
    """
    Keep points that are at least min_distance_km apart, preserving original indices.
    Prioritizes larger areas (size_pixels).

    Greedy selection: rows are visited in order of decreasing ``size_pixels``.
    Each visited row is kept, and every not-yet-visited row whose haversine
    distance to it is below ``min_distance_km`` is discarded.

    Parameters
    ----------
    df_tile : pandas.DataFrame
        Must contain the columns ``size_pixels``, ``centroid_y`` (used as
        latitude, degrees) and ``centroid_x`` (used as longitude, degrees).
    min_distance_km : float, default 5
        Minimum allowed distance in km between any two kept rows.
        A value <= 0 keeps every row.

    Returns
    -------
    pandas.DataFrame
        The kept rows with their original index labels, ordered by decreasing
        ``size_pixels``. An empty input is returned unchanged.
    """
    if len(df_tile) == 0:
        return df_tile

    # Sort by size_pixels descending to prioritize larger areas. The original
    # index is left in place (no reset_index), so the result keeps its labels
    # even when the index is named, non-unique, or a column is called "index".
    df_sorted = df_tile.sort_values('size_pixels', ascending=False)

    lats = df_sorted['centroid_y'].to_numpy()
    lons = df_sorted['centroid_x'].to_numpy()

    keep_positions = []
    remaining = np.arange(len(df_sorted))

    while remaining.size:
        current = remaining[0]
        keep_positions.append(current)

        # Drop the current point explicitly rather than relying on its
        # zero self-distance being < min_distance_km; otherwise a threshold
        # of 0 (or less) would never shrink `remaining` and loop forever.
        candidates = remaining[1:]

        # One vectorised call: distance from the kept point to every candidate.
        distances = haversine_distance(
            lats[current], lons[current],
            lats[candidates], lons[candidates],
        )

        # NaN distances compare False here, so rows with missing coordinates
        # are dropped from the candidates.
        remaining = candidates[distances >= min_distance_km]

    # Positional selection keeps exactly the chosen rows (no duplication when
    # index labels repeat) together with their original index labels.
    return df_sorted.iloc[keep_positions]

In [None]:
# NOTE(review): this whole cell is commented out, so none of it runs on
# "Restart & Run All"; the printed output kept below it comes from an earlier run.
# As written it filters with min_distance_km=5 and writes
# `south_america_combined_clean_sampled.parquet`, whereas the next cell reads
# `south_america_combined_clean_sampled_15.parquet` — presumably produced by a
# variant of this code (different threshold and filename). Confirm, and record
# how that file was generated.

# # Load your CSV with proper index handling
# df = pd.read_csv(root_path / 'data/processed/radd/south_america_combined_clean.csv', index_col=0)

# # Process each tile
# cleaned_dfs = []
# for tile_name in df['tile_name'].unique():
#     df_tile = df[df['tile_name'] == tile_name].copy()
#     df_filtered = filter_by_distance(df_tile, min_distance_km=5)
#     cleaned_dfs.append(df_filtered)
#     print(f"Tile {tile_name}: {len(df_tile)} -> {len(df_filtered)} samples")

# # Combine all tiles back together
# df_cleaned = pd.concat(cleaned_dfs, ignore_index=False)

# # Store retained indices before any transformations
# retained_positive_ids = df_cleaned.index.tolist()

# # Convert to GeoDataFrame using centroid coordinates
# gdf_cleaned = gpd.GeoDataFrame(
#     df_cleaned, 
#     geometry=gpd.points_from_xy(df_cleaned.centroid_x, df_cleaned.centroid_y),
#     crs='EPSG:4326'
# )

# # Sort by index
# gdf_cleaned = gdf_cleaned.sort_index()

# # Write to parquet
# gdf_cleaned.to_parquet(root_path / 'data/processed/radd/south_america_combined_clean_sampled.parquet')

# print(f"\nTotal: {len(df)} -> {len(gdf_cleaned)} samples")

# gdf_cleaned

Tile 00N_050W_radd_alerts: 80 -> 45 samples
Tile 00N_060W_radd_alerts: 209 -> 114 samples
Tile 00N_070W_radd_alerts: 39 -> 33 samples
Tile 00N_080W_radd_alerts: 44 -> 35 samples
Tile 10N_060W_radd_alerts: 4 -> 4 samples
Tile 10N_070W_radd_alerts: 38 -> 33 samples
Tile 10N_080W_radd_alerts: 67 -> 49 samples
Tile 10S_060W_radd_alerts: 565 -> 165 samples
Tile 10S_070W_radd_alerts: 1440 -> 256 samples
Tile 20S_060W_radd_alerts: 180 -> 57 samples
Tile 20S_070W_radd_alerts: 10 -> 4 samples

Total: 2676 -> 795 samples


Unnamed: 0,geometry,component_id,size_pixels,area_hectares,earliest_alert,latest_alert,duration_days,centroid_x,centroid_y,confidence,tile_name,date_raster_file
0,POINT (-49.48768 -8.15087),32,109,1.342989,2021-08-29,2021-09-22,24,-49.487678,-8.150868,high,00N_050W_radd_alerts,sample_07_dates.tif
1,POINT (-49.60321 -8.47545),111,312,3.844152,2025-05-16,2025-06-09,24,-49.603209,-8.475450,high,00N_050W_radd_alerts,sample_07_dates.tif
2,POINT (-49.55413 -8.48142),128,104,1.281384,2025-06-21,2025-07-15,24,-49.554126,-8.481419,high,00N_050W_radd_alerts,sample_07_dates.tif
3,POINT (-44.69836 -5.69695),25,1220,15.031620,2020-05-27,2020-06-25,29,-44.698360,-5.696953,high,00N_050W_radd_alerts,sample_11_dates.tif
4,POINT (-44.83613 -5.71724),145,106,1.306026,2022-11-06,2022-11-18,12,-44.836128,-5.717239,high,00N_050W_radd_alerts,sample_11_dates.tif
...,...,...,...,...,...,...,...,...,...,...,...,...
2666,POINT (-57.01597 -22.88906),2340,247,3.043287,2021-07-23,2021-08-16,24,-57.015969,-22.889063,high,20S_060W_radd_alerts,sample_36_dates.tif
2667,POINT (-63.75206 -21.88645),624,112,1.379952,2020-10-03,2020-10-28,25,-63.752064,-21.886446,high,20S_070W_radd_alerts,sample_01_dates.tif
2668,POINT (-64.13058 -22.33156),12,110,1.355310,2021-10-22,2021-11-16,25,-64.130581,-22.331562,high,20S_070W_radd_alerts,sample_28_dates.tif
2670,POINT (-63.35078 -20.64919),172,450,5.544450,2020-09-28,2020-10-22,24,-63.350776,-20.649194,high,20S_070W_radd_alerts,sample_37_dates.tif


In [24]:
# Load the sampled positive events. The next cell uses this frame's index
# labels to select the matching negative samples.
# (Read with plain pandas, so the geometry column is shown as raw WKB bytes.)
df = pd.read_parquet(
    root_path.joinpath("data/processed/radd/south_america_combined_clean_sampled_15.parquet")
)
retained_positive_ids = df.index.to_list()
df

Unnamed: 0,geometry,component_id,size_pixels,area_hectares,earliest_alert,latest_alert,duration_days,centroid_x,centroid_y,confidence,tile_name,date_raster_file
0,b'\x01\x01\x00\x00\x00F}\x92;l\xbeH\xc0\x9e\x0...,32,109,1.342989,2021-08-29,2021-09-22,24,-49.487678,-8.150868,high,00N_050W_radd_alerts,sample_07_dates.tif
1,b'\x01\x01\x00\x00\x00\x91\xd3\xd7\xf35\xcdH\x...,111,312,3.844152,2025-05-16,2025-06-09,24,-49.603209,-8.475450,high,00N_050W_radd_alerts,sample_07_dates.tif
2,b'\x01\x01\x00\x00\x00\x80\xee\xcb\x99\xed\xc6...,128,104,1.281384,2025-06-21,2025-07-15,24,-49.554126,-8.481419,high,00N_050W_radd_alerts,sample_07_dates.tif
3,b'\x01\x01\x00\x00\x00\xd3jH\xdccYF\xc0we\x17\...,25,1220,15.031620,2020-05-27,2020-06-25,29,-44.698360,-5.696953,high,00N_050W_radd_alerts,sample_11_dates.tif
4,b'\x01\x01\x00\x00\x00\x8c\xa2\x07>\x06kF\xc0\...,145,106,1.306026,2022-11-06,2022-11-18,12,-44.836128,-5.717239,high,00N_050W_radd_alerts,sample_11_dates.tif
...,...,...,...,...,...,...,...,...,...,...,...,...
2669,b'\x01\x01\x00\x00\x00+\xa5gz\x89\xabO\xc0\xce...,137,193,2.377953,2020-09-28,2020-10-22,24,-63.340133,-20.638933,high,20S_070W_radd_alerts,sample_37_dates.tif
2670,b'\x01\x01\x00\x00\x00a\x1c\\:\xe6\xacO\xc0j\x...,172,450,5.544450,2020-09-28,2020-10-22,24,-63.350776,-20.649194,high,20S_070W_radd_alerts,sample_37_dates.tif
2673,b'\x01\x01\x00\x00\x00\x81\x93m\xe0\x0e\xacO\x...,265,182,2.242422,2020-10-10,2020-11-09,30,-63.344204,-20.675970,high,20S_070W_radd_alerts,sample_37_dates.tif
2675,b'\x01\x01\x00\x00\x00\x91\r\xa4\x8bM\xadO\xc0...,372,146,1.798866,2020-10-04,2020-10-28,24,-63.353929,-20.704417,high,20S_070W_radd_alerts,sample_37_dates.tif


In [25]:
# Load negative samples and rename their coordinate / tile columns to the
# names used by the positive table (centroid_y, centroid_x, tile_name).
negative_df = pd.read_parquet(
    root_path / 'data/processed/radd/negative_samples_sequential.parquet'
).rename(columns={'lat': 'centroid_y', 'lon': 'centroid_x',
                  'tile_id': 'tile_name'})

# Filter negative_df to keep only rows where positive_event_id is in the retained indices.
# `.copy()` makes the result an independent frame, so the columns added below
# are never written into a view of `negative_df`.
negative_df_filtered = negative_df[
    negative_df['positive_event_id'].isin(retained_positive_ids)
].copy()

print(f"Negative samples: {len(negative_df)} -> {len(negative_df_filtered)}")

# Convert to GeoDataFrame (points built from centroid_x = lon, centroid_y = lat)
gdf_negative = gpd.GeoDataFrame(
    negative_df_filtered,
    geometry=gpd.points_from_xy(negative_df_filtered.centroid_x, negative_df_filtered.centroid_y),
    crs='EPSG:4326'
)

# Parse the alert dates, then add duration_days as the whole-day difference
gdf_negative['earliest_alert'] = pd.to_datetime(gdf_negative['earliest_alert'])
gdf_negative['latest_alert'] = pd.to_datetime(gdf_negative['latest_alert'])
gdf_negative['duration_days'] = (gdf_negative['latest_alert'] - gdf_negative['earliest_alert']).dt.days

# Index each negative by the index label of the positive event it is paired
# with, and drop the index name.
# NOTE(review): assumes one negative per positive_event_id; otherwise the
# resulting index is not unique — confirm.
gdf_negative = gdf_negative.set_index('positive_event_id')
gdf_negative.index.name = None

# NOTE(review): in the displayed outputs the negatives' tile_name looks like
# '00N_050W' while the positives use '00N_050W_radd_alerts' — confirm whether
# downstream code joins or groups on tile_name across both tables.

# Save to parquet
gdf_negative.to_parquet(root_path / 'data/processed/radd/negative_samples_sequential_sampled_15.parquet')

gdf_negative

Negative samples: 2676 -> 1408


Unnamed: 0,centroid_y,centroid_x,forest_fraction,earliest_alert,latest_alert,safety_buffer_months,is_deforestation,attempt_number,positive_lat,positive_lon,positive_area_hectares,positive_duration_days,tile_name,geometry,duration_days
0,-8.93585,-47.11885,0.339517,2021-08-29,2021-09-22,2,False,5,-8.150868,-49.487678,1.342989,24,00N_050W,POINT (-47.11885 -8.93585),24
1,-2.62135,-49.45165,0.823114,2025-05-16,2025-06-09,2,False,5,-8.475450,-49.603209,3.844152,24,00N_050W,POINT (-49.45165 -2.62135),24
2,-5.03155,-49.44135,0.560891,2025-06-21,2025-07-15,2,False,5,-8.481419,-49.554126,1.281384,24,00N_050W,POINT (-49.44135 -5.03155),24
3,-8.05495,-47.82575,0.267312,2020-05-27,2020-06-25,2,False,1,-5.696953,-44.698360,15.031620,29,00N_050W,POINT (-47.82575 -8.05495),29
4,-2.83135,-42.77795,1.000000,2022-11-06,2022-11-18,2,False,2,-5.717239,-44.836128,1.306026,12,00N_050W,POINT (-42.77795 -2.83135),12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2669,-24.46745,-63.35915,0.933223,2020-09-28,2020-10-22,2,False,2,-20.638933,-63.340133,2.377953,24,20S_070W,POINT (-63.35915 -24.46745),24
2670,-27.67665,-60.78455,0.237898,2020-09-28,2020-10-22,2,False,3,-20.649194,-63.350776,5.544450,24,20S_070W,POINT (-60.78455 -27.67665),24
2673,-23.74985,-65.00385,0.895994,2020-10-10,2020-11-09,2,False,1,-20.675970,-63.344204,2.242422,30,20S_070W,POINT (-65.00385 -23.74985),30
2675,-28.77165,-60.04075,0.687660,2020-10-04,2020-10-28,2,False,2,-20.704417,-63.353929,1.798866,24,20S_070W,POINT (-60.04075 -28.77165),24
