# Setup Required Libraries and Paths
Import required libraries (os, numpy, json, datetime, h5py, glob) and set up paths to data directories.

In [7]:
import os
import numpy as np
import json
from datetime import datetime, timedelta
import h5py
from glob import glob

# Dataset root, followed by the two input sub-folders used by this notebook
data_dir = '../Datasets/Testing/'
samples_data_dir = '../Datasets/Testing/Samples/'
processed_data_dir = '../Datasets/Testing/Processed/'

# Ensure the intermediate storage folder exists (no-op when it is already there)
os.makedirs(
    os.path.join(data_dir, 'HDF5-TemporalPairs'),
    exist_ok=True,
)

# Load and Process GeoJSON Data
Load the latest GeoJSON file from the samples directory and extract the event dates. Include error handling for missing files.

In [8]:
# Load and Process GeoJSON Data

# Function to load the latest GeoJSON file and extract event dates
def load_geojson_dates(samples_data_dir, print_loading=False):
    """
    Load event dates from the most recent GeoJSON file in a directory.

    Args:
        samples_data_dir (str): Directory searched for '*.geojson' files
        print_loading (bool): When True, print the path of the selected file

    Returns:
        dict: Maps each plot ID (the feature's 'name' property with 'PLOT-'
            removed and surrounding whitespace stripped) to the feature's
            'img_date' property parsed as a datetime (time part is midnight)

    Raises:
        FileNotFoundError: If the directory contains no .geojson files
        KeyError: If 'features', 'properties', 'name' or 'img_date' is missing
        ValueError: If an 'img_date' value is not formatted as YYYY-MM-DD
    """
    sample_files = glob(os.path.join(samples_data_dir, '*.geojson'))
    if not sample_files:
        raise FileNotFoundError("No .geojson files found in the Samples directory.")

    # NOTE: getctime is the creation time on Windows but the time of the last
    # metadata change on Unix, so "latest" is platform dependent.
    latest_file = max(sample_files, key=os.path.getctime)
    if print_loading:
        print(f"Loading events from {latest_file}")

    # GeoJSON is UTF-8 by specification; without an explicit encoding the
    # platform locale would be used and non-ASCII properties could fail to decode.
    with open(latest_file, encoding='utf-8') as f:
        data = json.load(f)

    # Extract dates and convert to datetime objects
    event_dates = {}
    for feature in data['features']:
        properties = feature['properties']
        # Extract and format plot ID (remove 'PLOT-' prefix)
        plot_id = properties['name'].replace('PLOT-', '').strip()
        event_dates[plot_id] = datetime.strptime(properties['img_date'], '%Y-%m-%d')

    return event_dates

# Read the event dates from the newest GeoJSON in the samples directory,
# printing which file was chosen
event_dates = load_geojson_dates(samples_data_dir, print_loading=True)

Loading events from ../Datasets/Testing/Samples/sampled_events_20241216_153135.geojson


# Load Image Patches with Dates
Load image patches from the processed directory and their corresponding dates. Organize them in a sorted temporal sequence.

In [9]:
def load_image_patches_with_dates(processed_data_dir):
    """
    Load image patches and the timestamps encoded in their file names.

    File names are expected to look like "20180726T084009_PLOT-00001.npy":
    a '%Y%m%dT%H%M%S' timestamp, an underscore, then the plot part. Only the
    first underscore separates the two, so plot IDs may contain underscores.

    Args:
        processed_data_dir (str): Path to directory containing processed .npy files

    Returns:
        dict: Dictionary mapping plot IDs (plot part with 'PLOT-' removed) to
            lists of (patch, date) tuples sorted by ascending date

    Raises:
        ValueError: If a file name contains no underscore or its timestamp
            part does not match '%Y%m%dT%H%M%S'
    """
    # Dictionary to store patches by plot ID
    plot_data = {}

    for file in sorted(glob(os.path.join(processed_data_dir, '*.npy'))):
        basename = os.path.basename(file)
        # Drop only the trailing extension, never a '.npy' inside the name
        stem, _extension = os.path.splitext(basename)

        # Split "20180726T084009_PLOT-00001" at the first underscore only
        datetime_str, separator, plot_part = stem.partition('_')
        if not separator:
            raise ValueError(
                f"Unexpected patch file name {basename!r}: "
                "expected '<YYYYmmddTHHMMSS>_PLOT-<id>.npy'"
            )
        plot_id = plot_part.replace('PLOT-', '')

        # Format: "20180726T084009" -> datetime
        date = datetime.strptime(datetime_str, '%Y%m%dT%H%M%S')

        # Load the patch data only after the name parsed successfully
        patch = np.load(file)

        plot_data.setdefault(plot_id, []).append((patch, date))

    # Sort patches by date for each plot
    for patches in plot_data.values():
        patches.sort(key=lambda item: item[1])

    return plot_data

# Build the per-plot, date-sorted (patch, date) lists from the processed directory
plot_data = load_image_patches_with_dates(processed_data_dir)

# Create Temporal Pairs Function
Implement a function that finds the closest pre-event and post-event images for each deforestation event date, respecting the max_days_diff constraint.

In [10]:
def organize_temporal_pairs(plot_data, event_dates, max_days_diff=30):
    """
    Organize temporal pairs around deforestation events

    For every plot that has both an event date and image patches, pick the
    latest image dated strictly before the event and the earliest image dated
    strictly after it. Images dated exactly at the event date are ignored.
    The patch lists do not need to be pre-sorted.

    Args:
        plot_data (dict): Dictionary mapping plot IDs to lists of (patch, date) tuples
        event_dates (dict): Dictionary mapping plot IDs to event dates
        max_days_diff (int): Maximum allowed whole days between the pre- and
            post-event image (compared against timedelta.days, so a partial
            extra day is not counted)

    Returns:
        list: One dict per accepted pair with the keys 'plot_id', 'pre_image',
            'post_image', 'pre_date', 'post_date' and 'event_date'
    """
    temporal_pairs = []

    for plot_id, event_date in event_dates.items():
        if plot_id not in plot_data:
            continue

        # Find closest pre-event and post-event images in a single pass
        pre_event = None
        post_event = None
        for patch, date in plot_data[plot_id]:
            if date < event_date:
                # '>=' keeps the last of equally dated candidates, matching
                # what a scan over a date-sorted list would select
                if pre_event is None or date >= pre_event[1]:
                    pre_event = (patch, date)
            elif date > event_date:
                # strict '<' keeps the first of equally dated candidates
                if post_event is None or date < post_event[1]:
                    post_event = (patch, date)

        # A pair needs an image on both sides of the event
        if pre_event is None or post_event is None:
            continue

        time_diff = (post_event[1] - pre_event[1]).days
        if time_diff > max_days_diff:
            continue

        temporal_pairs.append({
            'plot_id': plot_id,
            'pre_image': pre_event[0],
            'post_image': post_event[0],
            'pre_date': pre_event[1],
            'post_date': post_event[1],
            'event_date': event_date
        })

    return temporal_pairs

# Pair up pre-/post-event images for every plot, using the default 30-day limit
temporal_pairs = organize_temporal_pairs(plot_data, event_dates)

# Save Organized Pairs to HDF5
Save the organized temporal pairs to an HDF5 file with proper chunking and compression. Include metadata about pair relationships.

In [11]:
def save_pairs_to_hdf5(temporal_pairs, output_dir):
    """
    Write temporal pairs to '<output_dir>/HDF5-TemporalPairs/pairs.h5'.

    The file is opened in 'w' mode. File-level attributes hold a description,
    the number of pairs and the creation time. Each pair becomes a group
    'temporal_pairs/pair_<idx>' with gzip-compressed 'pre_image' and
    'post_image' datasets, plus 'plot_id' and the three dates stored as
    string attributes.
    """
    # Make sure the destination folder is present before opening the file
    target_dir = os.path.join(output_dir, 'HDF5-TemporalPairs')
    os.makedirs(target_dir, exist_ok=True)
    hdf5_path = os.path.join(target_dir, 'pairs.h5')

    timestamp_format = '%Y-%m-%dT%H:%M:%S'

    with h5py.File(hdf5_path, 'w') as f:
        # File-level metadata
        f.attrs['description'] = 'Temporal pairs of satellite images with event dates'
        f.attrs['total_pairs'] = len(temporal_pairs)
        f.attrs['creation_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        root = f.create_group('temporal_pairs')

        for idx, pair_data in enumerate(temporal_pairs):
            group = root.create_group(f'pair_{idx}')

            # Both images are stored the same way: auto-chunked and gzip-compressed
            for image_key in ('pre_image', 'post_image'):
                group.create_dataset(
                    image_key,
                    data=pair_data[image_key],
                    chunks=True,
                    compression='gzip',
                )

            # Per-pair metadata; dates are serialized as formatted strings
            group.attrs['plot_id'] = pair_data['plot_id']
            for date_key in ('pre_date', 'post_date', 'event_date'):
                group.attrs[date_key] = pair_data[date_key].strftime(timestamp_format)

# Write the pairs to <data_dir>/HDF5-TemporalPairs/pairs.h5
save_pairs_to_hdf5(temporal_pairs, data_dir)

# Validate Results
Check temporal relationships in saved pairs, verify event dates fall between image pairs, and ensure max_days_diff is respected.

In [12]:
# Validate Results

import h5py

# Load the organized temporal pairs from HDF5 file
def load_pairs_from_hdf5(data_dir):
    """
    Read temporal pairs back from '<data_dir>/HDF5-TemporalPairs/pairs.h5'.

    Returns a list with one dict per 'pair_<i>' group (i from 0 up to the
    file's 'total_pairs' attribute). Each dict holds 'plot_id', the two images
    as numpy arrays, and 'pre_date', 'post_date' and 'event_date' parsed back
    into datetime objects.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S'
    source_path = os.path.join(data_dir, 'HDF5-TemporalPairs', 'pairs.h5')

    loaded = []
    with h5py.File(source_path, 'r') as f:
        pair_count = f.attrs['total_pairs']
        root = f['temporal_pairs']

        for index in range(pair_count):
            group = root[f'pair_{index}']
            attrs = group.attrs

            record = {'plot_id': attrs['plot_id']}
            # Copy the datasets into memory while the file is still open
            for image_key in ('pre_image', 'post_image'):
                record[image_key] = np.array(group[image_key])
            # Dates were stored as formatted strings
            for date_key in ('pre_date', 'post_date', 'event_date'):
                record[date_key] = datetime.strptime(attrs[date_key], timestamp_format)

            loaded.append(record)

    return loaded

def validate_pairs(pair_data, max_days_diff=30):
    """
    Check each temporal pair and sort it into a valid or invalid bucket.

    A pair is invalid when its event date does not lie strictly between the
    pre- and post-image dates, or when the whole-day gap between the two
    images exceeds max_days_diff. Invalid entries carry an 'issues' list, and
    the summary counts how often each check failed.
    """
    summary = {
        'total_pairs': len(pair_data),
        'invalid_time_diff': 0,
        'invalid_event_date': 0,
    }
    accepted = []
    rejected = []

    for entry in pair_data:
        before = entry['pre_date']
        event = entry['event_date']
        after = entry['post_date']
        problems = []

        # Temporal order: pre image < event < post image
        if not before < event < after:
            problems.append('Event date not between image dates')
            summary['invalid_event_date'] += 1

        # Gap between the two images, in whole days
        span_days = (after - before).days
        if span_days > max_days_diff:
            problems.append(
                f'Time difference ({span_days} days) exceeds maximum ({max_days_diff} days)'
            )
            summary['invalid_time_diff'] += 1

        report = {
            'plot_id': entry['plot_id'],
            'pre_date': before,
            'post_date': after,
            'event_date': event,
            'time_diff': span_days,
        }

        if problems:
            report['issues'] = problems
            rejected.append(report)
        else:
            accepted.append(report)

    return {
        'valid_pairs': accepted,
        'invalid_pairs': rejected,
        'validation_summary': summary,
    }

# Read the saved pairs back from disk and run the checks on them
pair_data = load_pairs_from_hdf5(data_dir)
validation_results = validate_pairs(pair_data)

# Headline counts, one per line
print("\n".join([
    "Validation Summary:",
    f"Total pairs: {validation_results['validation_summary']['total_pairs']}",
    f"Valid pairs: {len(validation_results['valid_pairs'])}",
    f"Invalid pairs: {len(validation_results['invalid_pairs'])}",
    f"Invalid time difference: {validation_results['validation_summary']['invalid_time_diff']}",
    f"Invalid event date sequence: {validation_results['validation_summary']['invalid_event_date']}",
]))

# Per-pair breakdown, shown only when at least one pair failed a check
if validation_results['invalid_pairs']:
    print("\nInvalid Pairs Details:")
    for pair in validation_results['invalid_pairs']:
        print("\n".join([
            f"\nPlot ID: {pair['plot_id']}",
            f"Issues: {', '.join(pair['issues'])}",
            f"Pre-date: {pair['pre_date']}",
            f"Event date: {pair['event_date']}",
            f"Post-date: {pair['post_date']}",
            f"Time difference: {pair['time_diff']} days",
        ]))

Validation Summary:
Total pairs: 49
Valid pairs: 49
Invalid pairs: 0
Invalid time difference: 0
Invalid event date sequence: 0
