In [1]:
import glob
import os
import shutil
import time
from datetime import datetime, timedelta

import ee
import pandas as pd
import requests



In [2]:
# ==========================================
# 1. INITIALIZATION & CONFIGURATION
# ==========================================

# Initialize Earth Engine against a single, explicit Cloud project.
EE_PROJECT = 'vae-wgan'

try:
    ee.Initialize(project=EE_PROJECT)
except Exception as e:
    # Initialization failed (typically: no usable cached credentials).
    # Surface the reason instead of discarding it, run the interactive
    # authentication flow, then retry with the SAME project so that both
    # code paths leave the session in an identical state.
    print(f"Earth Engine initialization failed: {e}")
    print("Authenticating Earth Engine...")
    ee.Authenticate()
    ee.Initialize(project=EE_PROJECT)

In [3]:
# ---- Input / output locations ----
CSV_FILE = "./modis_2024_Brazil.csv"  # path to the input CSV; change to your actual file
OUTPUT_DIR = 'modis_dataset_brazil'
FIRE_DIR, NORMAL_DIR = (
    os.path.join(OUTPUT_DIR, subfolder)
    for subfolder in ('fire_anomalies', 'normal_reference')
)

# ---- Image geometry ----
IMG_SIZE = 64   # edge length of each exported image in pixels (VAE input is 64x64)
SCALE = 500     # metres per pixel (MODIS resolution used for the crop)
ROI_RADIUS = IMG_SIZE * SCALE / 2  # half of the crop's side length, in metres

# ---- Data source: MODIS Terra surface reflectance ----
MODIS_COLLECTION = 'MODIS/061/MOD09GA'

# Make sure both output folders exist before anything tries to write to them.
for _folder in (FIRE_DIR, NORMAL_DIR):
    os.makedirs(_folder, exist_ok=True)

In [4]:
# ==========================================
# 2. HELPER FUNCTIONS
# ==========================================

def get_cloud_percentage(image, region):
    """
    Estimate how cloudy `image` is inside `region`, as a percentage (0-100).

    The estimate is the mean, over `region`, of a 0/1 mask derived from the
    'state_1km' QA band. Any failure (or an empty result) is reported as
    100, i.e. the image is treated as fully cloudy.

    Args:
        image: image exposing a 'state_1km' band.
        region: geometry over which the cloudy fraction is averaged.

    Returns:
        Cloudy percentage; 100 when the reduction yields no value and
        100.0 when any step raises.
    """
    try:
        # The two lowest bits of 'state_1km' encode the cloud state:
        # 00 = clear, 01 = cloudy, 10 = mixed, 11 = not set.
        # AND-ing with 3 (binary 11) keeps just those two bits.
        cloud_bits = image.select('state_1km').bitwiseAnd(3)

        # 1 where the pixel is cloudy or mixed, 0 otherwise.
        cloudy_mask = cloud_bits.eq(1).Or(cloud_bits.eq(2))

        # Mean of a 0/1 mask == fraction of flagged pixels in the region.
        region_mean = cloudy_mask.reduceRegion(
            reducer=ee.Reducer.mean(),
            geometry=region,
            scale=SCALE,
            maxPixels=1e9
        )
        fraction = region_mean.get('state_1km').getInfo()

        if fraction is None:
            return 100
        return fraction * 100
    except Exception as err:
        print(f"Error checking clouds: {err}")
        return 100.0  # be conservative: an unreadable image counts as cloudy

In [5]:
def get_clear_image(lat, lon, target_date_str, search_window_days=14):
    """
    Find a nearly cloud-free image (<10% cloudy pixels) close to a target date.

    Candidates come from MODIS_COLLECTION within +/- `search_window_days` of
    `target_date_str` at the given location. They are ordered by how close
    their acquisition date is to the target, so the nearest dates are tried
    first and the candidate cap drops the farthest dates rather than the
    latest ones. Each candidate is checked with get_cloud_percentage().

    Args:
        lat: latitude of the point of interest (degrees).
        lon: longitude of the point of interest (degrees).
        target_date_str: desired date, formatted 'YYYY-MM-DD'.
        search_window_days: half-width of the search window, in days.

    Returns:
        (ee.Image, 'YYYY-MM-DD') for the first candidate under the cloud
        threshold, or (None, None) if there are no candidates or none passes.

    Raises:
        ValueError: if `target_date_str` is not in 'YYYY-MM-DD' format.
    """
    max_candidates = 20     # cap on per-image cloud checks (one round-trip each)
    cloud_limit_pct = 10.0  # strict threshold: accept only < 10% clouds

    target_dt = datetime.strptime(target_date_str, '%Y-%m-%d')
    point = ee.Geometry.Point([lon, lat])
    region = point.buffer(ROI_RADIUS).bounds()

    window = timedelta(days=search_window_days)
    start_search = (target_dt - window).strftime('%Y-%m-%d')
    end_search = (target_dt + window).strftime('%Y-%m-%d')

    target_ee = ee.Date(target_date_str)

    def _tag_distance(img):
        # Server-side: attach |acquisition date - target date| in days so the
        # collection can be sorted by proximity to the target.
        days_off = img.date().difference(target_ee, 'day').abs()
        return img.set('days_from_target', days_off)

    collection = (ee.ImageCollection(MODIS_COLLECTION)
                  .filterDate(start_search, end_search)
                  .filterBounds(point)
                  .map(_tag_distance)
                  .sort('days_from_target'))

    # One round-trip to learn how many candidates exist.
    available = collection.size().getInfo()
    if available == 0:
        return None, None

    # The list length is already known client-side; no second size() call.
    count = min(available, max_candidates)
    ee_list = collection.toList(count)

    # Client-side iteration: each cloud check needs its own getInfo().
    for i in range(count):
        img = ee.Image(ee_list.get(i))

        cloud_pct = get_cloud_percentage(img, region)

        if cloud_pct < cloud_limit_pct:
            date_found = ee.Date(img.get('system:time_start')).format('YYYY-MM-dd').getInfo()
            return img, date_found

    return None, None

In [6]:
def download_image(ee_image, region, output_path, timeout=30):
    """
    Render `ee_image` over `region` as a PNG thumbnail and save it to disk.

    The image is written to a temporary '<output_path>.part' file and only
    renamed to `output_path` once fully written, so an interrupted run never
    leaves a truncated PNG that a later resume would mistake for a finished
    download.

    Args:
        ee_image: object exposing getThumbURL(params) (an ee.Image).
        region: region passed through to getThumbURL as 'region'.
        output_path: destination file path for the PNG.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        True if a new file was written to `output_path`; False if the file
        already existed, the server answered with a non-200 status, or any
        error occurred (errors are printed, not raised).
    """
    if os.path.exists(output_path):
        print(f"Skipping (Exists): {os.path.basename(output_path)}")
        return False

    # Visualization: Band 7 (SWIR-Heat), Band 2 (NIR-Veg), Band 1 (Red)
    vis_params = {
        'min': -100.0,
        'max': 8000.0,
        'bands': ['sur_refl_b07', 'sur_refl_b02', 'sur_refl_b01'],
    }

    partial_path = output_path + '.part'

    try:
        url = ee_image.getThumbURL({
            'region': region,
            'dimensions': f'{IMG_SIZE}x{IMG_SIZE}',
            'format': 'png',
            **vis_params
        })

        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download (Status {response.status_code})")
            return False

        with open(partial_path, 'wb') as f:
            f.write(response.content)
        # Atomic rename: the final name only ever points at a complete file.
        os.replace(partial_path, output_path)

        print(f"Saved: {os.path.basename(output_path)}") # Optional: quiet mode
        return True

    except Exception as e:
        print(f"Download Error: {e}")
        return False

    finally:
        # Never leave a half-written temp file behind (e.g. after Ctrl+C).
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass

In [7]:
def get_existing_indices(directory, prefix):
    """
    Collect the numeric indices of images already present in `directory`.

    Files are expected to be named '{prefix}_{index}_{date}.png'. The index is
    taken from the second underscore-separated field of the file name; files
    whose second field is not an integer are ignored.

    Args:
        directory: folder to scan.
        prefix: file-name prefix to look for (e.g. "fire").

    Returns:
        A set of int indices found in matching file names.
    """
    pattern = os.path.join(directory, f"{prefix}_*.png")
    found = set()

    for path in glob.glob(pattern):
        fields = os.path.basename(path).split('_')
        if len(fields) < 2:
            continue
        try:
            found.add(int(fields[1]))
        except ValueError:
            # Second field is not a number: not one of our files.
            pass

    return found

In [8]:
def _download_new_file(ee_image, region, output_path):
    """Run download_image() and report whether it produced a new file at output_path."""
    already_there = os.path.exists(output_path)
    download_image(ee_image, region, output_path)
    return (not already_there) and os.path.exists(output_path)


def main():
    """
    Download a fire image and a cloud-free reference image for each CSV row.

    Steps:
      1. Read CSV_FILE and keep rows whose confidence is above 80 (column
         'confidence' or, failing that, 'confidence_perc').
      2. Shuffle deterministically and renumber rows; the row number is the
         ID embedded in every output file name.
      3. Scan FIRE_DIR / NORMAL_DIR once to learn which IDs already exist.
      4. For every row still missing a file, download the fire-date image
         and/or a clear image from roughly one year earlier.

    A download is only counted (and only announced as saved) when the output
    file actually appears on disk. Per-row errors are printed and the loop
    moves on; Ctrl+C stops the loop and still prints the summary.

    Returns:
        None.
    """
    # 1. Load Data
    if not os.path.exists(CSV_FILE):
        print(f"ERROR: File {CSV_FILE} not found.")
        return

    df = pd.read_csv(CSV_FILE)

    # Filter Confidence > 80
    if 'confidence' in df.columns:
        df = df[df['confidence'] > 80]
    elif 'confidence_perc' in df.columns:
        df = df[df['confidence_perc'] > 80]

    # Shuffle with a fixed seed, then renumber 0, 1, 2, ...
    # The row number is written into file names and used to resume, so the
    # order MUST be identical on every run (hence random_state=42).
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Total target images to download: {len(df)}")

    # 2. FAST RESUME: Scan folders once
    print("Scanning existing files to skip...")
    existing_fire = get_existing_indices(FIRE_DIR, "fire")
    existing_normal = get_existing_indices(NORMAL_DIR, "normal")

    print(f"Found {len(existing_fire)} existing Fire images.")
    print(f"Found {len(existing_normal)} existing Normal images.")

    print("\nStarting download process... (Press Ctrl+C to stop)")

    success_fire = 0
    success_normal = 0

    for index, row in df.iterrows():
        try:
            # --- INSTANT SKIP CHECK ---
            # Both files already present: no Earth Engine calls, no delay.
            if (index in existing_fire) and (index in existing_normal):
                continue

            lat = row['latitude']
            lon = row['longitude']
            fire_date = row['acq_date']
            fire_dt = datetime.strptime(fire_date, '%Y-%m-%d')

            # Same crop for both images of the pair; build it once.
            point = ee.Geometry.Point([lon, lat])
            region = point.buffer(ROI_RADIUS).bounds()

            # --- 1. DOWNLOAD FIRE ANOMALY ---
            if index not in existing_fire:
                next_day = (fire_dt + timedelta(days=1)).strftime('%Y-%m-%d')
                fire_img = (ee.ImageCollection(MODIS_COLLECTION)
                            .filterDate(fire_date, next_day)
                            .filterBounds(point)
                            .first())

                fire_out = os.path.join(FIRE_DIR, f"fire_{index}_{fire_date}.png")

                # Whether an image exists for that day is not checked up
                # front (that would cost a round-trip); a failed download is
                # detected by the missing output file instead.
                if _download_new_file(fire_img, region, fire_out):
                    success_fire += 1
                else:
                    print(f"[{index}] Fire image not saved")

                # Rate limit slightly after a download
                time.sleep(0.1)

            # --- 2. DOWNLOAD NORMAL REFERENCE ---
            if index not in existing_normal:
                ideal_date = (fire_dt - timedelta(days=365)).strftime('%Y-%m-%d')

                # Only NOW do we ask Earth Engine for a cloud-free date
                normal_img, normal_date_found = get_clear_image(lat, lon, ideal_date)

                if normal_img is not None:
                    normal_out = os.path.join(NORMAL_DIR, f"normal_{index}_{normal_date_found}.png")
                    if _download_new_file(normal_img, region, normal_out):
                        success_normal += 1
                        print(f"[{index}] Downloaded Pair (Fire: {fire_date} | Normal: {normal_date_found})")
                    else:
                        print(f"[{index}] Normal image not saved")
                else:
                    print(f"[{index}] Skipped Normal: Too cloudy")

                # Rate limit after an API-heavy search
                time.sleep(0.5)

        except KeyboardInterrupt:
            print("\nStopping download...")
            break
        except Exception as e:
            print(f"[{index}] Error: {e}")
            continue

    print(f"\nSession Finished.")
    print(f"New Fire Images: {success_fire}")
    print(f"New Normal Images: {success_normal}")

In [9]:
# Entry point: start the download run when this cell (or the file, if exported
# as a script) is executed directly rather than imported as a module.
if __name__ == "__main__":
    main()

Total target images to download: 191698
Scanning existing files to skip...
Found 6738 existing Fire images.
Found 6402 existing Normal images.

Starting download process... (Press Ctrl+C to stop)
[10] Skipped Normal: Too cloudy
[29] Skipped Normal: Too cloudy
[37] Skipped Normal: Too cloudy
[38] Skipped Normal: Too cloudy
[62] Skipped Normal: Too cloudy
[128] Skipped Normal: Too cloudy
[136] Skipped Normal: Too cloudy
[158] Skipped Normal: Too cloudy
[171] Skipped Normal: Too cloudy
[185] Skipped Normal: Too cloudy
[205] Skipped Normal: Too cloudy
[213] Skipped Normal: Too cloudy
[215] Skipped Normal: Too cloudy
[252] Skipped Normal: Too cloudy
[279] Skipped Normal: Too cloudy
[295] Skipped Normal: Too cloudy
[322] Skipped Normal: Too cloudy
[332] Skipped Normal: Too cloudy
[346] Skipped Normal: Too cloudy
[351] Skipped Normal: Too cloudy
[379] Skipped Normal: Too cloudy
[387] Skipped Normal: Too cloudy
[446] Skipped Normal: Too cloudy
[453] Skipped Normal: Too cloudy
[484] Skipped Nor

In [None]:
# Archive the dataset folder named by OUTPUT_DIR into '<OUTPUT_DIR>.zip' in the
# current working directory. Using OUTPUT_DIR (instead of a hard-coded absolute
# path) keeps this cell in sync with the configuration cell, and
# shutil.make_archive avoids depending on an external `zip` binary.
# The cell displays the path of the archive that was created.
shutil.make_archive(OUTPUT_DIR, 'zip', root_dir='.', base_dir=OUTPUT_DIR)

In [None]:
import shutil
# shutil.rmtree("/content/modis_dataset")