In [4]:
import boto3
import os

# Create S3 client
s3 = boto3.client(
    's3',
    endpoint_url='https://data.source.coop',
    aws_access_key_id='',
    aws_secret_access_key=''
)

bucket = 'nasa'
prefix = 'marine-debris/labels/'  # where your .geojson files are
local_dir = 'labels'

# Make local folder
os.makedirs(local_dir, exist_ok=True)

# List and download all label files
response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
for obj in response.get('Contents', []):
    key = obj['Key']
    filename = os.path.join(local_dir, os.path.basename(key))
    print(f"Downloading {key} → {filename}")
    s3.download_file(bucket, key, filename)


Downloading marine-debris/labels/20160928_153233_0e16_16816-29821-16.geojson → labels\20160928_153233_0e16_16816-29821-16.geojson
Downloading marine-debris/labels/20160928_153233_0e16_16816-29821-16.npy → labels\20160928_153233_0e16_16816-29821-16.npy
Downloading marine-debris/labels/20160928_153233_0e16_16816-29824-16.geojson → labels\20160928_153233_0e16_16816-29824-16.geojson
Downloading marine-debris/labels/20160928_153233_0e16_16816-29824-16.npy → labels\20160928_153233_0e16_16816-29824-16.npy
Downloading marine-debris/labels/20160928_153233_0e16_16816-29825-16.geojson → labels\20160928_153233_0e16_16816-29825-16.geojson
Downloading marine-debris/labels/20160928_153233_0e16_16816-29825-16.npy → labels\20160928_153233_0e16_16816-29825-16.npy
Downloading marine-debris/labels/20160928_153233_0e16_16816-29828-16.geojson → labels\20160928_153233_0e16_16816-29828-16.geojson
Downloading marine-debris/labels/20160928_153233_0e16_16816-29828-16.npy → labels\20160928_153233_0e16_16816-29828

In [8]:
import os
print(os.getcwd())


c:\Users\nayee\OneDrive\Desktop\python\marine debris


Takes NASA’s raw format (.npy satellite images + .geojson bounding boxes)

Converts .npy → .jpg

Reads .geojson and converts bounding boxes into YOLO label format

Stores them neatly in /images/nasa and /labels/nasa

Takes your RoboFlow dataset (already YOLO format)

Copies it into /images/roboflow and /labels/roboflow inside the merged dataset

Takes your SeaClear dataset (already YOLO format)

Copies it into /images/seaclear and /labels/seaclear inside the merged dataset

Outputs one clean, unified YOLO dataset (final_dataset/)

Every image + label across all 3 datasets is in the same format

Ready to train in YOLOv5, YOLOv8, or similar

Basically, this code is data preprocessing + dataset merging so you can train one model with satellite + drone + underwater images all together.

If you skip this step, NASA’s data stays in a w

In [25]:
import os
import json
import shutil
import numpy as np
from PIL import Image
from glob import glob
from tqdm import tqdm

# =============================
# CONFIGURE YOUR PATHS
# =============================
NASA_DATA_DIR = r"C:\Users\nayee\OneDrive\Desktop\python\marine debris"
ROBOFLOW_DIR = r"C:\Users\nayee\Downloads\roboflow data"
SEACLEAR_DIR = r"C:\Users\nayee\OneDrive\Desktop\python\seaclear_yolo"
OUTPUT_DIR = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset"

# Classes mapping (edit if needed)
CLASSES = ["plastic", "algae", "sargassum", "wood", "other"]

# =============================
# HELPER FUNCTIONS
# =============================
def convert_geojson_to_yolo(geojson_path, img_width, img_height):
    """Convert geojson bounding boxes to YOLO format."""
    with open(geojson_path, 'r') as f:
        data = json.load(f)

    yolo_labels = []
    for feature in data.get("features", []):
        props = feature.get("properties", {})
        cls_name = props.get("class", "other").lower()
        if cls_name not in CLASSES:
            cls_name = "other"
        cls_id = CLASSES.index(cls_name)

        # bbox in geojson: [min_lon, min_lat, max_lon, max_lat] (pixel coords expected)
        coords = feature["geometry"]["coordinates"][0]
        xs = [p[0] for p in coords]
        ys = [p[1] for p in coords]
        xmin, xmax = min(xs), max(xs)
        ymin, ymax = min(ys), max(ys)

        # Convert to YOLO (normalized)
        x_center = ((xmin + xmax) / 2) / img_width
        y_center = ((ymin + ymax) / 2) / img_height
        box_width = (xmax - xmin) / img_width
        box_height = (ymax - ymin) / img_height

        yolo_labels.append(f"{cls_id} {x_center} {y_center} {box_width} {box_height}")

    return yolo_labels


def process_nasa_dataset():
    """Convert NASA npy+geojson to YOLO and save in OUTPUT_DIR."""
    print("Processing NASA dataset...")
    nasa_img_dir = os.path.join(OUTPUT_DIR, "images", "nasa")
    nasa_lbl_dir = os.path.join(OUTPUT_DIR, "labels", "nasa")
    os.makedirs(nasa_img_dir, exist_ok=True)
    os.makedirs(nasa_lbl_dir, exist_ok=True)

    geojson_files = glob(os.path.join(NASA_DATA_DIR, "labels", "*.geojson"))

    for gj_path in tqdm(geojson_files):
        base_name = os.path.splitext(os.path.basename(gj_path))[0]
        npy_path = gj_path.replace(".geojson", ".npy")

        if not os.path.exists(npy_path):
            continue

        # Load npy image
        img_array = np.load(npy_path)
        img = Image.fromarray(img_array.astype(np.uint8))
        img_width, img_height = img.size

        # Save image as JPG
        img_out_path = os.path.join(nasa_img_dir, base_name + ".jpg")
        img.save(img_out_path)

        # Convert labels
        yolo_labels = convert_geojson_to_yolo(gj_path, img_width, img_height)
        lbl_out_path = os.path.join(nasa_lbl_dir, base_name + ".txt")
        with open(lbl_out_path, "w") as f:
            f.write("\n".join(yolo_labels))


def merge_yolo_datasets(src_dir, name):
    """Copy existing YOLO dataset into OUTPUT_DIR/images/name and OUTPUT_DIR/labels/name."""
    img_out_dir = os.path.join(OUTPUT_DIR, "images", name)
    lbl_out_dir = os.path.join(OUTPUT_DIR, "labels", name)
    os.makedirs(img_out_dir, exist_ok=True)
    os.makedirs(lbl_out_dir, exist_ok=True)

    img_files = glob(os.path.join(src_dir, "**", "images", "**", "*.*"), recursive=True)
    lbl_files = glob(os.path.join(src_dir, "**", "labels", "**", "*.txt"), recursive=True)

    for img in tqdm(img_files, desc=f"Merging images from {name}"):
        shutil.copy(img, img_out_dir)

    for lbl in tqdm(lbl_files, desc=f"Merging labels from {name}"):
        shutil.copy(lbl, lbl_out_dir)


# =============================
# MAIN
# =============================
if __name__ == "__main__":
    # 1. Process NASA
    process_nasa_dataset()

    # 2. Merge RoboFlow
    merge_yolo_datasets(ROBOFLOW_DIR, "roboflow")

    # 3. Merge SeaClear
    merge_yolo_datasets(SEACLEAR_DIR, "seaclear")

    print(f"✅ All datasets merged into: {OUTPUT_DIR}")
    print("Now you can create a data.yaml for YOLO training.")


Processing NASA dataset...


100%|██████████| 500/500 [00:06<00:00, 79.55it/s] 
Merging images from roboflow: 100%|██████████| 1725/1725 [00:10<00:00, 157.43it/s]
Merging labels from roboflow: 100%|██████████| 1725/1725 [00:07<00:00, 221.99it/s]
Merging images from seaclear: 0it [00:00, ?it/s]
Merging labels from seaclear: 0it [00:00, ?it/s]

✅ All datasets merged into: C:\Users\nayee\OneDrive\Desktop\python\final_dataset
Now you can create a data.yaml for YOLO training.





In [10]:
import json
import os
import shutil
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# === CONFIGURATION ===
SEACLEAR_DIR = r"C:\Users\nayee\Downloads\sea clear data\Seaclear Marine Debris Dataset"
OUTPUT_DIR = r"C:\Users\nayee\OneDrive\Desktop\python\seaclear_yolo"
TRAIN_SPLIT = 0.8  # 80% train / 20% val

# Paths
JSON_PATH = os.path.join(SEACLEAR_DIR, "dataset.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Create YOLO folders
for split in ["train", "val"]:
    os.makedirs(os.path.join(OUTPUT_DIR, split, "images"), exist_ok=True)
    os.makedirs(os.path.join(OUTPUT_DIR, split, "labels"), exist_ok=True)

# === Load COCO annotations ===
with open(JSON_PATH, "r") as f:
    coco = json.load(f)

images_info = {img["id"]: img for img in coco["images"]}

# Map category IDs to YOLO IDs (0-based)
cat_id_to_yolo = {cat["id"]: idx for idx, cat in enumerate(coco["categories"])}

# Group annotations by image
image_annotations = {}
for ann in coco["annotations"]:
    img_id = ann["image_id"]
    if img_id not in image_annotations:
        image_annotations[img_id] = []
    image_annotations[img_id].append(ann)

# === Prepare train/val split ===
all_image_ids = list(images_info.keys())
train_ids, val_ids = train_test_split(all_image_ids, train_size=TRAIN_SPLIT, random_state=42)

def process_images(ids_list, split):
    for img_id in tqdm(ids_list, desc=f"Processing {split}"):
        img_info = images_info[img_id]
        file_name = img_info["file_name"]  # relative path
        img_w, img_h = img_info["width"], img_info["height"]

        # Find source image
        src_img_path = os.path.join(SEACLEAR_DIR, file_name)
        if not os.path.exists(src_img_path):
            continue  # skip if missing

        # Destination paths
        dst_img_path = os.path.join(OUTPUT_DIR, split, "images", os.path.basename(file_name))
        dst_label_path = os.path.join(OUTPUT_DIR, split, "labels", os.path.splitext(os.path.basename(file_name))[0] + ".txt")

        # Copy image
        shutil.copy2(src_img_path, dst_img_path)

        # Write YOLO labels
        with open(dst_label_path, "w") as label_file:
            if img_id in image_annotations:
                for ann in image_annotations[img_id]:
                    cat_id = ann["category_id"]
                    yolo_id = cat_id_to_yolo[cat_id]
                    x, y, w, h = ann["bbox"]  # COCO format
                    # Convert to YOLO format
                    x_center = (x + w / 2) / img_w
                    y_center = (y + h / 2) / img_h
                    w_norm = w / img_w
                    h_norm = h / img_h
                    label_file.write(f"{yolo_id} {x_center:.6f} {y_center:.6f} {w_norm:.6f} {h_norm:.6f}\n")

# Process train and val sets
process_images(train_ids, "train")
process_images(val_ids, "val")

print(f"✅ Conversion complete! YOLO dataset saved to: {OUTPUT_DIR}")


Processing train: 100%|██████████| 6888/6888 [00:00<00:00, 43342.74it/s]
Processing val: 100%|██████████| 1722/1722 [00:00<00:00, 49202.90it/s]

✅ Conversion complete! YOLO dataset saved to: C:\Users\nayee\OneDrive\Desktop\python\seaclear_yolo





In [22]:
merge_yolo_datasets(r"C:\Users\nayee\OneDrive\Desktop\python\seaclear_yolo", "seaclear")


Merging seaclear dataset...


Merging images from seaclear: 0it [00:00, ?it/s]
Merging labels from seaclear: 0it [00:00, ?it/s]


In [27]:
if __name__ == "__main__":
    # 1. Process NASA
    process_nasa_dataset()

    # 2. Merge RoboFlow
    merge_yolo_datasets(ROBOFLOW_DIR, "roboflow")

    # 3. Merge SeaClear train and val separately
    merge_yolo_datasets(r"C:\Users\nayee\OneDrive\Desktop\python\seaclear_yolo\train", "seaclear")
    merge_yolo_datasets(r"C:\Users\nayee\OneDrive\Desktop\python\seaclear_yolo\val", "seaclear")


    print(f"✅ All datasets merged into: {OUTPUT_DIR}")
    print("Now you can create a data.yaml for YOLO training.")


Processing NASA dataset...


100%|██████████| 500/500 [00:01<00:00, 395.51it/s]
Merging images from roboflow: 100%|██████████| 1725/1725 [00:01<00:00, 876.94it/s]
Merging labels from roboflow: 100%|██████████| 1725/1725 [00:01<00:00, 1083.29it/s]
Merging images from seaclear: 0it [00:00, ?it/s]
Merging labels from seaclear: 0it [00:00, ?it/s]
Merging images from seaclear: 0it [00:00, ?it/s]
Merging labels from seaclear: 0it [00:00, ?it/s]

✅ All datasets merged into: C:\Users\nayee\OneDrive\Desktop\python\final_dataset
Now you can create a data.yaml for YOLO training.





In [28]:
def merge_yolo_datasets(src_dir, name):
    img_out_dir = os.path.join(OUTPUT_DIR, "images", name)
    lbl_out_dir = os.path.join(OUTPUT_DIR, "labels", name)
    os.makedirs(img_out_dir, exist_ok=True)
    os.makedirs(lbl_out_dir, exist_ok=True)

    # Find all images recursively in the src_dir (any folder)
    img_files = glob(os.path.join(src_dir, "**", "*.*"), recursive=True)
    img_files = [f for f in img_files if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

    # Copy images
    for img in tqdm(img_files, desc=f"Merging images from {name}"):
        shutil.copy(img, img_out_dir)

    # If you have labels for SeaClear in YOLO txt format in same folder or subfolders, uncomment this and adjust:
    # lbl_files = glob(os.path.join(src_dir, "**", "*.txt"), recursive=True)
    # for lbl in tqdm(lbl_files, desc=f"Merging labels from {name}"):
    #     shutil.copy(lbl, lbl_out_dir)


In [29]:
import os
import shutil
from glob import glob
from tqdm import tqdm

def merge_yolo_datasets(src_dir, name):
    """Copy existing YOLO dataset images and labels from src_dir (recursive) into final dataset."""
    img_out_dir = os.path.join(OUTPUT_DIR, "images", name)
    lbl_out_dir = os.path.join(OUTPUT_DIR, "labels", name)
    os.makedirs(img_out_dir, exist_ok=True)
    os.makedirs(lbl_out_dir, exist_ok=True)

    # Recursively find all image files (jpg, jpeg, png)
    img_files = glob(os.path.join(src_dir, "**", "*.*"), recursive=True)
    img_files = [f for f in img_files if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

    # Recursively find all label files (*.txt)
    lbl_files = glob(os.path.join(src_dir, "**", "*.txt"), recursive=True)

    for img in tqdm(img_files, desc=f"Merging images from {name}"):
        shutil.copy(img, img_out_dir)

    for lbl in tqdm(lbl_files, desc=f"Merging labels from {name}"):
        shutil.copy(lbl, lbl_out_dir)


In [36]:
import json
import os
import shutil
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# === CONFIGURATION ===
SEACLEAR_DIR = r"C:\Users\nayee\Downloads\sea clear data\Seaclear Marine Debris Dataset"
OUTPUT_DIR = r"C:\Users\nayee\OneDrive\Desktop\python\seaclear_yolo"
TRAIN_SPLIT = 0.8  # 80% train / 20% val

# Paths
JSON_PATH = os.path.join(SEACLEAR_DIR, "dataset.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Create YOLO folders for train and val splits
for split in ["train", "val"]:
    os.makedirs(os.path.join(OUTPUT_DIR, split, "images"), exist_ok=True)
    os.makedirs(os.path.join(OUTPUT_DIR, split, "labels"), exist_ok=True)

# === Load COCO annotations ===
with open(JSON_PATH, "r") as f:
    coco = json.load(f)

images_info = {img["id"]: img for img in coco["images"]}

# Map category IDs to YOLO IDs (0-based)
cat_id_to_yolo = {cat["id"]: idx for idx, cat in enumerate(coco["categories"])}

# Group annotations by image
image_annotations = {}
for ann in coco["annotations"]:
    img_id = ann["image_id"]
    if img_id not in image_annotations:
        image_annotations[img_id] = []
    image_annotations[img_id].append(ann)

# === Prepare train/val split ===
all_image_ids = list(images_info.keys())
train_ids, val_ids = train_test_split(all_image_ids, train_size=TRAIN_SPLIT, random_state=42)

def process_images(ids_list, split):
    for img_id in tqdm(ids_list, desc=f"Processing {split}"):
        img_info = images_info[img_id]
        file_name = img_info["file_name"]  # relative path with nested folders
        img_w, img_h = img_info["width"], img_info["height"]

        # Correctly join base directory with nested subfolders and image filename
        src_img_path = os.path.join(SEACLEAR_DIR, *file_name.split('/'))

        if not os.path.exists(src_img_path):
            # Try to search image recursively inside SEACLEAR_DIR
            found = False
            for root, dirs, files in os.walk(SEACLEAR_DIR):
                if os.path.basename(file_name) in files:
                    src_img_path = os.path.join(root, os.path.basename(file_name))
                    found = True
                    break
            if not found:
                print(f"Missing image: {os.path.join(SEACLEAR_DIR, file_name)}")
                continue  # <--- valid here inside the for-loop

        # Destination paths for images and labels
        dst_img_path = os.path.join(OUTPUT_DIR, split, "images", os.path.basename(file_name))
        dst_label_path = os.path.join(OUTPUT_DIR, split, "labels", os.path.splitext(os.path.basename(file_name))[0] + ".txt")

        # Copy image
        shutil.copy2(src_img_path, dst_img_path)

        # Write YOLO labels in text file
        with open(dst_label_path, "w") as label_file:
            if img_id in image_annotations:
                for ann in image_annotations[img_id]:
                    cat_id = ann["category_id"]
                    yolo_id = cat_id_to_yolo[cat_id]
                    x, y, w, h = ann["bbox"]  # COCO format: [top-left x, y, width, height]
                    # Convert bbox to YOLO format (normalized center x,y and width,height)
                    x_center = (x + w / 2) / img_w
                    y_center = (y + h / 2) / img_h
                    w_norm = w / img_w
                    h_norm = h / img_h
                    label_file.write(f"{yolo_id} {x_center:.6f} {y_center:.6f} {w_norm:.6f} {h_norm:.6f}\n")

# Process train and val splits
process_images(train_ids, "train")
process_images(val_ids, "val")

print(f"✅ Conversion complete! YOLO dataset saved to: {OUTPUT_DIR}")


Processing train: 100%|██████████| 6888/6888 [03:24<00:00, 33.73it/s]
Processing val: 100%|██████████| 1722/1722 [00:36<00:00, 47.67it/s]

✅ Conversion complete! YOLO dataset saved to: C:\Users\nayee\OneDrive\Desktop\python\seaclear_yolo





In [3]:
import os
import shutil

# Paths for source datasets
NASA_IMAGES = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset\images\nasa"
NASA_LABELS = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset\labels\nasa"

ROBOFLOW_IMAGES = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset\images\roboflow"
ROBOFLOW_LABELS = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset\labels\roboflow"

SEACLEAR_TRAIN_IMAGES = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset\seaclear_train\images"
SEACLEAR_TRAIN_LABELS = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset\seaclear_train\labels"

SEACLEAR_VAL_IMAGES = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset\seaclear_val\images"
SEACLEAR_VAL_LABELS = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset\seaclear_val\labels"

# Output folders
OUTPUT_ROOT = r"C:\Users\nayee\OneDrive\Desktop\python\final_dataset_split"
TRAIN_IMAGES_OUT = os.path.join(OUTPUT_ROOT, "train", "images")
TRAIN_LABELS_OUT = os.path.join(OUTPUT_ROOT, "train", "labels")
VAL_IMAGES_OUT = os.path.join(OUTPUT_ROOT, "val", "images")
VAL_LABELS_OUT = os.path.join(OUTPUT_ROOT, "val", "labels")

# Create output dirs if not exist
for folder in [TRAIN_IMAGES_OUT, TRAIN_LABELS_OUT, VAL_IMAGES_OUT, VAL_LABELS_OUT]:
    os.makedirs(folder, exist_ok=True)

def copy_files(src_img_dir, src_lbl_dir, dst_img_dir, dst_lbl_dir):
    for filename in os.listdir(src_img_dir):
        if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
            # Copy image
            shutil.copy2(os.path.join(src_img_dir, filename), os.path.join(dst_img_dir, filename))
            # Copy label if exists
            label_name = os.path.splitext(filename)[0] + ".txt"
            label_path = os.path.join(src_lbl_dir, label_name)
            if os.path.exists(label_path):
                shutil.copy2(label_path, os.path.join(dst_lbl_dir, label_name))
            else:
                print(f"Warning: Label file not found for image {filename}")

# Copy train data: nasa + roboflow + seaclear_train
copy_files(NASA_IMAGES, NASA_LABELS, TRAIN_IMAGES_OUT, TRAIN_LABELS_OUT)
copy_files(ROBOFLOW_IMAGES, ROBOFLOW_LABELS, TRAIN_IMAGES_OUT, TRAIN_LABELS_OUT)
copy_files(SEACLEAR_TRAIN_IMAGES, SEACLEAR_TRAIN_LABELS, TRAIN_IMAGES_OUT, TRAIN_LABELS_OUT)

# Copy val data: seaclear_val
copy_files(SEACLEAR_VAL_IMAGES, SEACLEAR_VAL_LABELS, VAL_IMAGES_OUT, VAL_LABELS_OUT)

print("✅ Merging complete!")


✅ Merging complete!
