In [5]:
import ee
import time

# Connect this kernel to Earth Engine under the notebook's Cloud project.
# A failure is only reported here (not re-raised), so execution continues
# past this cell either way.
try:
    ee.Initialize(project='ee-rodneyroberts')
except Exception as init_error:
    print(f"✗ Earth Engine initialization failed: {init_error}")
else:
    print("✓ Earth Engine initialized successfully for project: ee-rodneyroberts")

✓ Earth Engine initialized successfully for project: ee-rodneyroberts


In [6]:
# --- Study area -------------------------------------------------------------
# Rectangle bounds, in the order they are handed to ee.Geometry.Rectangle.
_ROI_BOUNDS = [-120.5, 36.5, -119.5, 37.5]
ROI = ee.Geometry.Rectangle(_ROI_BOUNDS)

# --- Acquisition period (passed to the collection's date filter) -------------
START_DATE, END_DATE = '2019-01-01', '2025-07-31'

# --- CLOUDY_PIXEL_PERCENTAGE cut-offs ----------------------------------------
CLOUDY_THRESHOLD = 20  # scenes strictly above this % are treated as "cloudy"
CLEAR_THRESHOLD = 1    # scenes strictly below this % are treated as "clear"

# --- Pairing -----------------------------------------------------------------
# Number of days searched on EACH side of a cloudy scene for a clear partner.
SEARCH_WINDOW_DAYS = 45

# --- Export ------------------------------------------------------------------
EXPORT_FOLDER = 'cloud_removal_dataset_california'  # Google Drive folder name
MAX_PAIRS_TO_EXPORT = 1_500                         # upper bound on pairs per run

In [7]:
def _nearest_in_time(candidates, target_millis):
    """Return the image of ``candidates`` acquired closest to ``target_millis``.

    Each candidate is tagged with the absolute difference between its
    ``system:time_start`` and ``target_millis``; the collection is then sorted
    on that tag so ``first()`` yields the smallest gap (before OR after the
    target), rather than simply the earliest image.

    Args:
        candidates: ee.ImageCollection to pick from.
        target_millis: Reference timestamp in epoch milliseconds.

    Returns:
        The (server-side) first element of the gap-sorted collection.
    """
    def _tag_gap(image):
        gap = ee.Number(image.get('system:time_start')).subtract(target_millis).abs()
        return image.set('pair_time_gap_ms', gap)

    return candidates.map(_tag_gap).sort('pair_time_gap_ms').first()


def create_cloud_removal_dataset(roi, start_date, end_date, folder_name, max_pairs):
    """Export cloudy/clear Sentinel-2 RGB image pairs over ``roi`` to Google Drive.

    Scenes of ``COPERNICUS/S2_SR_HARMONIZED`` that intersect ``roi`` in the
    given date range are split by ``CLOUDY_PIXEL_PERCENTAGE`` into "cloudy"
    (> ``CLOUDY_THRESHOLD``, and < 80) and "clear" (< ``CLEAR_THRESHOLD``).
    For every cloudy scene, the clear scene closest in time within
    +/- ``SEARCH_WINDOW_DAYS`` days - and on the same MGRS tile when the
    cloudy scene carries an ``MGRS_TILE`` property - is chosen as its partner.
    Two Drive export tasks (bands B4, B3, B2) are started per pair, described
    as ``cloudy_<i>`` and ``clear_<i>``.

    Reads the module-level constants ``CLOUDY_THRESHOLD``, ``CLEAR_THRESHOLD``
    and ``SEARCH_WINDOW_DAYS``.

    Args:
        roi: ee.Geometry used to filter, clip and bound the exports.
        start_date: Start of the acquisition period (passed to ``filterDate``).
        end_date: End of the acquisition period (passed to ``filterDate``).
        folder_name: Google Drive folder the exports are written to.
        max_pairs: Stop after this many pairs have been found.

    Returns:
        None. Progress and a final summary are printed.
    """
    collection = (
        ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
        .filterBounds(roi)
        .filterDate(start_date, end_date)
        # Discard almost fully clouded scenes up front.
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 80))
    )

    cloudy_images = collection.filter(ee.Filter.gt('CLOUDY_PIXEL_PERCENTAGE', CLOUDY_THRESHOLD))
    clear_images = collection.filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', CLEAR_THRESHOLD))

    print("Searching for image pairs...")
    # One round trip pulls the metadata of (at most) 1500 cloudy scenes client-side.
    cloudy_list = cloudy_images.limit(1500).getInfo()['features']

    tasks = []          # every task whose start() succeeded
    pair_count = 0      # pairs found so far; also drives the export file index
    started_pairs = 0   # pairs for which BOTH export tasks actually started

    for cloudy_feature in cloudy_list:
        if pair_count >= max_pairs:
            break

        properties = cloudy_feature['properties']
        cloudy_image = ee.Image(cloudy_feature['id'])
        cloudy_millis = properties['system:time_start']
        cloudy_date = ee.Date(cloudy_millis)

        candidates = clear_images.filterDate(
            cloudy_date.advance(-SEARCH_WINDOW_DAYS, 'day'),
            cloudy_date.advance(SEARCH_WINDOW_DAYS, 'day'),
        )

        # The ROI can span several Sentinel-2 tiles. A partner from another
        # tile would not cover the same ground as the cloudy scene, so keep
        # only candidates from the same tile when that property is available.
        mgrs_tile = properties.get('MGRS_TILE')
        if mgrs_tile is not None:
            candidates = candidates.filter(ee.Filter.eq('MGRS_TILE', mgrs_tile))

        nearest_clear = _nearest_in_time(candidates, cloudy_millis)

        # An empty candidate set evaluates to nothing, which getInfo() reports
        # as a falsy value; skip cloudy scenes without a partner.
        if not nearest_clear.getInfo():
            continue

        pair_count += 1
        cloudy_id_short = cloudy_feature['id'].split('/')[-1]
        print(f"  -> Found pair {pair_count}. Starting export tasks for: {cloudy_id_short}")

        try:
            cloudy_export = cloudy_image.clip(roi).select(['B4', 'B3', 'B2'])
            clear_export = nearest_clear.clip(roi).select(['B4', 'B3', 'B2'])

            cloudy_task = ee.batch.Export.image.toDrive(
                image=cloudy_export,
                description=f'cloudy_{pair_count-1}',
                folder=folder_name,
                scale=10,
                region=roi,
                maxPixels=1e9
            )
            clear_task = ee.batch.Export.image.toDrive(
                image=clear_export,
                description=f'clear_{pair_count-1}',
                folder=folder_name,
                scale=10,
                region=roi,
                maxPixels=1e9
            )

            # Register each task as soon as it has started so the summary stays
            # truthful even if the second start() fails.
            cloudy_task.start()
            tasks.append(cloudy_task)
            clear_task.start()
            tasks.append(clear_task)
            started_pairs += 1

        except Exception as export_error:
            print(f"    └─ Error during export: {export_error}")

    print(f"\n All done! A total of {len(tasks)} export tasks ({started_pairs} pairs) have been started.")

In [8]:
# Launch the pairing + export run with the notebook-level configuration.
# Every execution of this cell starts a new batch of Drive export tasks.
run_settings = {
    'roi': ROI,
    'start_date': START_DATE,
    'end_date': END_DATE,
    'folder_name': EXPORT_FOLDER,
    'max_pairs': MAX_PAIRS_TO_EXPORT,
}
create_cloud_removal_dataset(**run_settings)

Searching for image pairs...
  -> Found pair 1. Starting export tasks for: 20190107T184741_20190107T185039_T10SGF
  -> Found pair 2. Starting export tasks for: 20190107T184741_20190107T185039_T11SKA
  -> Found pair 3. Starting export tasks for: 20190110T185741_20190110T185950_T10SGF
  -> Found pair 4. Starting export tasks for: 20190110T185741_20190110T185950_T10SGG
  -> Found pair 5. Starting export tasks for: 20190110T185741_20190110T185950_T11SKB
  -> Found pair 6. Starting export tasks for: 20190112T184739_20190112T184850_T10SGF
  -> Found pair 7. Starting export tasks for: 20190112T184739_20190112T184850_T11SKA
  -> Found pair 8. Starting export tasks for: 20190122T184659_20190122T185050_T10SGF
  -> Found pair 9. Starting export tasks for: 20190122T184659_20190122T185050_T10SGG
  -> Found pair 10. Starting export tasks for: 20190122T184659_20190122T185050_T11SKA
  -> Found pair 11. Starting export tasks for: 20190122T184659_20190122T185050_T11SKB
  -> Found pair 12. Starting expor