In [1]:
import os
import glob
import shutil
import numpy as np
from PIL import Image
from tqdm import tqdm  # Progress bar

In [10]:
# ================= CONFIGURATION =================
# Root of the dataset. This is a relative path, so it is resolved against the
# kernel's current working directory.
BASE_DIR = 'modis_dataset_brazil'
# Folder scanned for "fire" PNGs (globbed as *.png by main()).
FIRE_DIR = os.path.join(BASE_DIR, 'fire_anomalies')
# Folder scanned for "normal" PNGs that are paired with fire images by index.
NORMAL_DIR = os.path.join(BASE_DIR, 'normal_reference')
# Rejected files are moved into per-reason subfolders under this directory
# (see move_to_quarantine()).
QUARANTINE_DIR = os.path.join(BASE_DIR, 'quarantine')

In [11]:
# Thresholds used by is_image_bad(). All statistics are computed on the image
# after conversion to 8-bit RGB, so pixel values range from 0 to 255.

# NOTE: despite the name, this is compared against the *standard deviation*
# (np.std) of all channel values, not the variance. A std dev below 5 means the
# image is close to a single solid colour.
MIN_VARIANCE = 5.0
# Mean channel value above 230 (out of 255): image is almost entirely white
# (treated as solid cloud / glare).
MAX_MEAN_BRIGHTNESS = 230
# Mean channel value below 5: image is almost entirely black.
MIN_MEAN_BRIGHTNESS = 5
# Fraction of pixels that are exactly (0, 0, 0); strictly more than 50% marks
# the image as a border / no-data tile.
BLACK_PIXEL_THRESHOLD = 0.5

In [12]:
# Ensure the quarantine root exists up front, even if no file ends up being
# quarantined; exist_ok=True makes re-running this cell harmless.
os.makedirs(QUARANTINE_DIR, exist_ok=True)

In [13]:
def get_index_from_filename(filename):
    """Extract the integer index from a name shaped like 'prefix_123_date.png'.

    The index is the second underscore-separated field of the name.

    Args:
        filename: File base name (not a full path) to parse.

    Returns:
        The index as an int, or None when the name has fewer than two
        underscore-separated fields, the second field is not an integer, or
        `filename` is not a str.
    """
    try:
        return int(filename.split('_')[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only "the name does not match the pattern" failures map to None.
        # A bare `except:` would also swallow KeyboardInterrupt / SystemExit,
        # making a long scan impossible to interrupt cleanly.
        return None

In [14]:
def is_image_bad(filepath):
    """
    Decide whether an image file should be rejected.

    The checks run in this order and the first one that fires wins:
    empty file, mostly pure-black pixels, near-uniform colour, too bright,
    too dark. Any exception raised along the way (unreadable file, decode
    error, ...) is reported as a corrupted image.

    Returns (True, reason) for a bad image and (False, None) for a good one.
    """
    try:
        # A zero-byte file cannot contain an image; skip decoding entirely.
        if os.path.getsize(filepath) == 0:
            return True, "Empty File (0kb)"

        # Decode to an RGB array; np.array copies the pixels, so the file
        # handle can be released before the statistics are computed.
        with Image.open(filepath) as source_image:
            pixels = np.array(source_image.convert('RGB'))

        # Share of pixels whose three channels are all exactly zero.
        black_mask = np.all(pixels == [0, 0, 0], axis=2)
        pixel_count = pixels.shape[0] * pixels.shape[1]
        black_share = np.sum(black_mask) / pixel_count
        if black_share > BLACK_PIXEL_THRESHOLD:
            return True, "Mostly Black/No Data"

        # Spread of all channel values; a tiny spread means a flat colour.
        # (MIN_VARIANCE is applied to the standard deviation.)
        spread = np.std(pixels)
        if spread < MIN_VARIANCE:
            return True, "Solid Color / Low Variance"

        # Overall brightness: only the extremes at either end are rejected.
        brightness = np.mean(pixels)
        if brightness > MAX_MEAN_BRIGHTNESS:
            return True, "Too Bright / Solid Cloud"
        if brightness < MIN_MEAN_BRIGHTNESS:
            return True, "Too Dark / Empty"

    except Exception as error:
        return True, f"Corrupted: {str(error)}"

    return False, None

In [15]:
def move_to_quarantine(filepath, reason):
    """Move a file into a per-reason subfolder of QUARANTINE_DIR.

    The subfolder name is the part of `reason` before the first ':' with
    spaces and '/' replaced by '_' (e.g. "Corrupted: bad header" ->
    "Corrupted"). The subfolder is created if missing.

    If a file with the same name is already present in that subfolder (for
    example a fire image and a normal image that share a base name), the new
    file gets a numeric suffix ("name_1.png", "name_2.png", ...) instead of
    overwriting the earlier one.

    Args:
        filepath: Path of the file to move.
        reason: Human-readable rejection reason; also selects the subfolder.

    Returns:
        None.
    """
    filename = os.path.basename(filepath)
    # Create subfolder for specific reason to help you review
    reason_clean = reason.split(':')[0].replace(" ", "_").replace("/", "_")
    dest_folder = os.path.join(QUARANTINE_DIR, reason_clean)
    os.makedirs(dest_folder, exist_ok=True)

    # Never clobber a previously quarantined file: shutil.move onto an
    # existing path can silently replace it (or fail, depending on platform).
    dest_path = os.path.join(dest_folder, filename)
    stem, ext = os.path.splitext(filename)
    attempt = 0
    while os.path.exists(dest_path):
        attempt += 1
        dest_path = os.path.join(dest_folder, f"{stem}_{attempt}{ext}")

    shutil.move(filepath, dest_path)

In [16]:
def main():
    """Clean the dataset: quality check, pair matching, optional orphan cleanup.

    Step 1 runs is_image_bad() on every PNG in FIRE_DIR and NORMAL_DIR and
    quarantines the ones it rejects. Step 2 re-lists both folders, keys each
    file by the index from get_index_from_filename(), and reports files whose
    index has no counterpart in the other folder. Step 3 asks on stdin whether
    those unpaired files should be quarantined too.

    Returns:
        None. Results are printed and files are moved on disk.
    """
    print(f"Scanning dataset in: {BASE_DIR}")

    # --- STEP 1: IMAGE QUALITY CHECK ---
    print("\n--- Step 1: Checking Image Quality ---")
    all_files = glob.glob(os.path.join(FIRE_DIR, "*.png")) + \
                glob.glob(os.path.join(NORMAL_DIR, "*.png"))

    bad_count = 0
    for filepath in tqdm(all_files):
        is_bad, reason = is_image_bad(filepath)
        if is_bad:
            move_to_quarantine(filepath, reason)
            bad_count += 1

    print(f"Moved {bad_count} bad images to quarantine.")

    # --- STEP 2: PAIR MATCHING ---
    print("\n--- Step 2: Checking for Missing Pairs ---")
    # Re-list the folders: step 1 may have moved files out of them.
    fire_files = glob.glob(os.path.join(FIRE_DIR, "*.png"))
    normal_files = glob.glob(os.path.join(NORMAL_DIR, "*.png"))

    # {index: filepath} per folder, plus how many names had no usable index.
    fire_map, fire_skipped = _map_files_by_index(fire_files)
    normal_map, normal_skipped = _map_files_by_index(normal_files)

    # Files without a parseable index cannot be paired; say so instead of
    # dropping them silently.
    skipped = fire_skipped + normal_skipped
    if skipped:
        print(f"Warning: {skipped} file(s) have no parseable index and were "
              "excluded from pair matching.")

    # Orphans: an index present in one folder but not in the other.
    orphan_fire = [path for idx, path in fire_map.items() if idx not in normal_map]
    orphan_normal = [path for idx, path in normal_map.items() if idx not in fire_map]

    print(f"Found {len(orphan_fire)} Fire images without a Normal pair.")
    print(f"Found {len(orphan_normal)} Normal images without a Fire pair.")

    # --- STEP 3: OPTIONAL CLEANUP ---
    if orphan_fire or orphan_normal:
        ans = input("\nDo you want to move these unpaired 'orphans' to quarantine? (y/n): ")
        # Tolerate surrounding whitespace and a spelled-out "yes"; anything
        # else keeps the files in place.
        if ans.strip().lower() in ('y', 'yes'):
            count = 0
            for f in orphan_fire:
                move_to_quarantine(f, "Unpaired_Fire")
                count += 1
            for f in orphan_normal:
                move_to_quarantine(f, "Unpaired_Normal")
                count += 1
            print(f"Moved {count} unpaired images.")
        else:
            print("Orphans kept in dataset.")

    print("\nCleaning Complete!")
    print(f"Check the '{QUARANTINE_DIR}' folder to review removed files.")


def _map_files_by_index(paths):
    """Key each path by its filename index; return (index -> path, count of names with no index)."""
    index_map = {}
    skipped = 0
    for path in paths:
        idx = get_index_from_filename(os.path.basename(path))
        if idx is None:
            skipped += 1
        else:
            # A repeated index keeps the last path seen, as a dict
            # comprehension over the same list would.
            index_map[idx] = path
    return index_map, skipped

In [18]:
# Run the cleanup only when this code is executed directly (notebook cell or
# script), not when it is imported as a module.
if __name__ == "__main__":
    main()

Scanning dataset in: modis_dataset_brazil

--- Step 1: Checking Image Quality ---


100%|██████████| 18597/18597 [00:27<00:00, 668.83it/s]


Moved 0 bad images to quarantine.

--- Step 2: Checking for Missing Pairs ---
Found 0 Fire images without a Normal pair.
Found 0 Normal images without a Fire pair.

Cleaning Complete!
Check the 'modis_dataset_brazil\quarantine' folder to review removed files.
