In [None]:
import os
import numpy as np
from datetime import datetime, timedelta
import h5py

# Build every (earlier patch, later patch) pair whose dates are at most
# `max_pair_gap_days` apart and stream the pairs, together with a boolean
# "event happened between the two dates" label, into an HDF5 file.
#
# NOTE(review): this cell uses `patches`, `patch_dates` and
# `load_geojson_dates`, which are defined in another cell of this notebook;
# that cell has to be executed first.

# Create a directory for intermediate storage
os.makedirs('../Datasets/Testing/Processed/HDF5-TemporalPairs', exist_ok=True)

max_pair_gap_days = 30  # Maximum number of days between the two patches of a pair
chunk_size = 10  # Number of anchor (first-of-pair) patches handled per HDF5 write

# Load the event dates once. They do not change while the loops run, so
# re-reading and re-parsing the GeoJSON file for every candidate pair is
# pure loop-invariant work.
known_event_dates = load_geojson_dates()

# Process in smaller chunks and save to HDF5
with h5py.File('../Datasets/Testing/Processed/HDF5-TemporalPairs/pairs.h5', 'w') as f:
    # Create resizable datasets that start empty and grow along axis 0.
    # NOTE(review): no dtype is given for 'pairs', so h5py falls back to its
    # default dtype -- confirm that matches the dtype of the patches.
    f.create_dataset('pairs', shape=(0, 2, *patches[0].shape), maxshape=(None, 2, *patches[0].shape), chunks=True)
    f.create_dataset('labels', shape=(0,), maxshape=(None,), dtype=bool)

    pair_count = 0

    for i in range(0, len(patches)-1, chunk_size):
        chunk_pairs = []
        chunk_labels = []

        # The last patch can never be the first element of a pair.
        chunk_end = min(i + chunk_size, len(patches)-1)
        for j in range(i, chunk_end):
            date1 = patch_dates[j]
            for k in range(j+1, len(patches)):
                date2 = patch_dates[k]
                if (date2 - date1).astype('timedelta64[D]').astype(int) <= max_pair_gap_days:
                    chunk_pairs.append([patches[j], patches[k]])
                    # Label is True when any event date lies in [date1, date2].
                    has_event = any(date1 <= event_date <= date2 for event_date in known_event_dates)
                    chunk_labels.append(has_event)

        if chunk_pairs:
            # Grow both datasets by the number of pairs collected in this chunk
            new_size = pair_count + len(chunk_pairs)
            f['pairs'].resize(new_size, axis=0)
            f['labels'].resize(new_size, axis=0)

            # Store chunk in the freshly added tail of each dataset
            f['pairs'][pair_count:new_size] = chunk_pairs
            f['labels'][pair_count:new_size] = chunk_labels

            pair_count = new_size

print(f"Total pairs saved: {pair_count}")

In [None]:
import random
from sklearn.model_selection import train_test_split

# Build in-memory temporal pairs with binary event labels and split them
# 70% / 15% / 15% into train / validation / test sets.
#
# NOTE(review): this cell uses `patches`, `patch_dates` and
# `load_geojson_dates`, which are defined in another cell of this notebook;
# that cell has to be executed first.

# Create temporal pairs
temporal_pairs = []
labels = []
max_time_diff = 30  # Maximum days between image pairs

# Load the event dates once instead of re-reading the GeoJSON file for
# every candidate pair inside the nested loops.
split_event_dates = load_geojson_dates()

for i, (date1, patch1) in enumerate(zip(patch_dates[:-1], patches[:-1])):
    # Only look at patches that come after position i, so each unordered
    # pair is considered exactly once.
    for date2, patch2 in zip(patch_dates[i+1:], patches[i+1:]):
        time_diff = (date2 - date1).astype('timedelta64[D]').astype(int)
        if time_diff <= max_time_diff:
            temporal_pairs.append((patch1, patch2))
            # Binary label: 1 if deforestation event exists between dates
            has_event = any(date1 <= event_date <= date2 for event_date in split_event_dates)
            labels.append(has_event)

# Fail early with a clear message; otherwise the split below fails with a
# less helpful error and the percentage print would divide by zero.
if not temporal_pairs:
    raise ValueError(
        f"No temporal pairs found within {max_time_diff} days of each other; nothing to split."
    )

# Convert to numpy arrays
X = np.array(temporal_pairs)
y = np.array(labels)

# Split data: 70% train, 15% validation, 15% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

n_positive = int(y.sum())  # count of True labels

print(f"Training pairs: {len(X_train)}")
print(f"Validation pairs: {len(X_val)}")
print(f"Testing pairs: {len(X_test)}")
print(f"Positive samples: {n_positive}/{len(y)} ({n_positive/len(y)*100:.2f}%)")

In [None]:
import os
import numpy as np
import json
from datetime import datetime
from glob import glob
from pathlib import Path

def load_geojson_dates(print_loading=False):
    """Load the event dates from the newest GeoJSON sample file.

    Looks for ``*.geojson`` files in ``../Datasets/Testing/Samples/`` and
    reads the one with the greatest ``os.path.getctime`` value. For every
    feature the ``properties.img_date`` value is parsed as ``YYYY-MM-DD``.
    Entries that cannot be parsed (wrong format, or a non-string value such
    as JSON ``null``) are reported on stdout and skipped.

    Args:
        print_loading: When truthy, print the path of the file being read.

    Returns:
        list[datetime]: The parsed event dates in ascending order.

    Raises:
        FileNotFoundError: If no ``.geojson`` file is found.
        KeyError: If the file has no ``features`` key, or a feature lacks
            ``properties`` / ``img_date``.
    """
    # Load the most recent sampled events file
    sample_files = glob('../Datasets/Testing/Samples/*.geojson')
    if not sample_files:
        raise FileNotFoundError("No .geojson files found in Testing/Samples/")
    # NOTE(review): getctime is creation time on Windows but metadata-change
    # time on Unix -- confirm this is the intended notion of "most recent".
    latest_file = max(sample_files, key=os.path.getctime)
    if print_loading:
        print(f"Loading events from {latest_file}")

    # Be explicit about the encoding instead of relying on the platform
    # default, which is not UTF-8 everywhere.
    with open(latest_file, encoding='utf-8') as f:
        data = json.load(f)

    # Extract dates and convert to datetime objects
    event_dates = []
    for feature in data['features']:
        date_str = feature['properties']['img_date']
        try:
            event_dates.append(datetime.strptime(date_str, '%Y-%m-%d'))
        except (TypeError, ValueError):
            # ValueError: string in the wrong format.
            # TypeError: value is not a string at all (e.g. JSON null).
            # Either way the entry is reported and skipped.
            print(f"Date format error in {date_str}")

    return sorted(event_dates)

def get_tile_date(patch_file_path):
    """Return the acquisition date encoded in a patch file's tile directory name.

    The tile name is taken from the directory two levels above the patch
    file. It is split on ``_`` and the first eight characters of the third
    field are parsed as ``YYYYMMDD``.

    Args:
        patch_file_path: Path (str or Path) of a patch file.

    Returns:
        datetime: The parsed date.

    Raises:
        ValueError: If the tile name has fewer than three ``_``-separated
            fields, or the date portion is not a valid ``YYYYMMDD`` date.
    """
    # The tile directory is the grandparent of the patch file.
    tile_name = Path(patch_file_path).parent.parent.name
    fields = tile_name.split('_')

    if len(fields) >= 3:
        stamp = fields[2][:8]
        try:
            parsed = datetime.strptime(stamp, '%Y%m%d')
        except ValueError:
            raise ValueError(f"Invalid date format in tile name: {stamp}")
        return parsed

    raise ValueError(f"Unexpected tile name format: {tile_name}")


def _patch_sort_key(patch_path):
    """Return the (plot number, patch number) sort key for a patch file path.

    Handles ``PLOT-XXXXX`` (single patch, patch number 0) and
    ``PLOT-XXXXX_PX`` (multi-patch) file stems.
    """
    filename = patch_path.stem  # Get filename without extension
    if '_P' in filename:
        # For multi-patch files (PLOT-XXXXX_PX)
        base_num = int(filename.split('-')[1].split('_')[0])
        patch_num = int(filename.split('_P')[1])
        return (base_num, patch_num)
    # For single patch files (PLOT-XXXXX)
    return (int(filename.split('-')[1]), 0)


def load_and_sort_patches():
    """Load every ``.npy`` patch under the tile directories, ordered by date.

    Scans ``../Datasets/Testing/Tiles`` for directories matching ``S2*``,
    recursively collects their ``.npy`` files, loads each one with
    ``np.load`` and derives its date with ``get_tile_date``. Files that fail
    to load (or whose date cannot be derived) are reported and skipped.

    The result is ordered by date. Patches sharing the same date keep the
    order in which they were loaded (tile directories in sorted path order,
    then plot number / patch number within a directory).

    Returns:
        tuple: ``(patches_sorted, patch_dates_sorted)`` -- a numpy array of
        the loaded patches and a ``datetime64`` numpy array of their dates,
        both in the same date-ascending order.

    Raises:
        FileNotFoundError: If no ``S2*`` directory exists under the base path.
        RuntimeError: If no patch could be loaded at all.
    """
    patches = []
    patch_dates = []

    # Use pathlib for better path handling
    base_path = Path('../Datasets/Testing/Tiles')
    print(f"Looking for tiles in: {base_path}")

    # glob() yields entries in arbitrary, filesystem-dependent order; sort
    # so that repeated runs process the directories in the same order.
    tile_dirs = sorted(base_path.glob('S2*'))

    if not tile_dirs:
        raise FileNotFoundError(f"No tile directories found in {base_path}")

    print(f"Found {len(tile_dirs)} tile directories")

    for tile_dir in tile_dirs:
        print(f"\nProcessing directory: {tile_dir}")

        # Find both single and multi-patch files (any .npy below the tile dir)
        patch_files = list(tile_dir.rglob('*.npy'))

        if not patch_files:
            print(f"No patches found in {tile_dir}")
            continue

        print(f"Found {len(patch_files)} patches in {tile_dir}")

        # Print first few file paths to verify
        print("Sample file paths:")
        for f in patch_files[:3]:
            print(f"  {f}")

        for patch_file in sorted(patch_files, key=_patch_sort_key):
            try:
                patch = np.load(patch_file)
                patch_date = get_tile_date(patch_file)
                print(f"Loaded patch from {patch_file} with date {patch_date}")
                # Append both only after both steps succeeded so the two
                # lists always stay aligned.
                patch_dates.append(patch_date)
                patches.append(patch)
            except Exception as e:
                # Best effort: report the problem and move on to the next file.
                print(f"Error loading {patch_file}: {e}")
                continue

    if not patches:
        print("\nDebug information:")
        print(f"Total tile directories found: {len(tile_dirs)}")
        print("Tile directory paths:")
        for td in tile_dirs:
            print(f"  {td}")
        raise RuntimeError("No patches loaded successfully.")

    # Convert patch_dates to numpy datetime64 for sorting
    patch_dates_np = np.array(patch_dates, dtype='datetime64')
    patches_np = np.array(patches)

    # Sort patches by date. A stable sort is required here: the default
    # argsort algorithm is not stable and could scramble the plot/patch
    # ordering established above among patches that share a date.
    sorted_indices = np.argsort(patch_dates_np, kind='stable')
    patches_sorted = patches_np[sorted_indices]
    patch_dates_sorted = patch_dates_np[sorted_indices]

    print(f"\nTotal patches loaded: {len(patches_sorted)}")
    return patches_sorted, patch_dates_sorted

# Read the event dates (announcing which file is used) and enumerate the
# tile directories, then load all patches ordered by acquisition date.
event_dates = load_geojson_dates(True)
base_path = Path('../Datasets/Testing/Tiles')
tile_dirs = [tile_dir for tile_dir in base_path.glob('S2*')]
patches, patch_dates = load_and_sort_patches()

# Quick sanity check: show the earliest few patch dates.
print("First 5 patch dates:")
for date in patch_dates[0:5]:
    print(date)