In [2]:
import json
import os

import numpy as np
import pandas as pd
from PIL import Image
from pylabel import importer, exporter, dataset
from sklearn.model_selection import train_test_split

### NWPU VHR-10 Dataset

In [None]:
# Locations of the NWPU images and of their per-image annotation text files
images_dir = "/home/gridsan/manderson/ovdsat/data/nwpu/positive_image_set/"
annotations_dir = "/home/gridsan/manderson/ovdsat/data/nwpu/ground_truth/"

# Class names in category-id order; the first entry gets id 1
nwpu_class_names = [
    "airplane",
    "ship",
    "storage tank",
    "baseball diamond",
    "tennis court",
    "basketball court",
    "ground track field",
    "harbor",
    "bridge",
    "vehicle",
]

# Skeleton of the COCO document: "images" and "annotations" start empty and
# are appended to by the processing loop that follows.
coco_data = {
    "info": {
        "description": "NWPU VHR-10 Dataset",
        "version": "1.0",
        "year": 2025,
        "contributor": "NWPU",
        "date_created": "2025-03-09"
    },
    "licenses": [],
    "images": [],
    "annotations": [],
    "categories": [
        {"id": class_id, "name": class_name}
        for class_id, class_name in enumerate(nwpu_class_names, start=1)
    ]
}

# Corner-style box -> COCO-style box
def convert_bbox(x1, y1, x2, y2):
    """Turn a corner box into a COCO box.

    Args:
        x1, y1: coordinates of the first corner.
        x2, y2: coordinates of the opposite corner.

    Returns:
        A list ``[x1, y1, x2 - x1, y2 - y1]``, i.e. ``[x, y, width, height]``.
    """
    box_width = x2 - x1
    box_height = y2 - y1
    return [x1, y1, box_width, box_height]

# Process images and annotations.
# Ids are handed out sequentially from 0, following the sorted order of the
# annotation files; an image only receives an id if its file exists.
image_id = 0
annotation_id = 0
annotation_suffix = ".txt"
missing_image_files = []   # images named by an annotation file but not on disk
malformed_line_count = 0   # non-blank annotation lines without exactly 5 fields

for annotation_file in sorted(os.listdir(annotations_dir)):
    if not annotation_file.endswith(annotation_suffix):
        continue

    # Swap only the trailing extension: str.replace would also rewrite a
    # ".txt" that happens to occur earlier in the file name.
    image_file = annotation_file[:-len(annotation_suffix)] + ".jpg"
    image_path = os.path.join(images_dir, image_file)

    if not os.path.exists(image_path):
        missing_image_files.append(image_file)
        continue  # No image, so its annotations cannot be used

    # Only the dimensions are needed; the context manager closes the file
    with Image.open(image_path) as img:
        width, height = img.size

    coco_data["images"].append({
        "id": image_id,
        "file_name": image_file,
        "width": width,
        "height": height
    })

    # Each annotation line is reduced to "x1,y1,x2,y2,category" by dropping
    # the parentheses before it is split on commas.
    with open(os.path.join(annotations_dir, annotation_file), "r") as f:
        for line in f:
            line = line.strip().replace("(", "").replace(")", "")
            if not line:
                continue  # Blank line

            parts = line.split(",")
            if len(parts) != 5:
                malformed_line_count += 1
                continue  # Skip invalid lines, but remember that we did

            # int() tolerates surrounding whitespace; a non-numeric field
            # raises ValueError, exactly as before.
            x1, y1, x2, y2, category_id = map(int, parts)

            bbox = convert_bbox(x1, y1, x2, y2)

            coco_data["annotations"].append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": category_id,
                "bbox": bbox,
                "area": bbox[2] * bbox[3],  # width * height
                "iscrowd": 0
            })

            annotation_id += 1

    image_id += 1

# Surface anything that was dropped instead of skipping it silently
if missing_image_files:
    print(f"Skipped {len(missing_image_files)} annotation file(s) with no matching image: {missing_image_files}")
if malformed_line_count:
    print(f"Skipped {malformed_line_count} malformed annotation line(s)")

# Persist the assembled COCO document as JSON indented by 4 spaces
output_file = "/home/gridsan/manderson/ovdsat/data/nwpu/nwpu_vhr10_coco.json"
with open(output_file, mode="w") as coco_fp:
    json.dump(coco_data, fp=coco_fp, indent=4)

print(f"COCO annotations saved to {output_file}")


In [35]:
# Load the COCO JSON at `output_file` through pylabel and keep its `df`
# attribute, which the split cell below indexes by 'cat_id' and 'img_id'.
# NOTE(review): this rebinds `dataset`, which the import cell bound to the
# pylabel `dataset` module, so that module is no longer reachable by that name.
# NOTE(review): `output_file` is also reassigned by the SIMD cell further down;
# make sure it still points at the NWPU JSON when this cell runs.
dataset = importer.ImportCoco(output_file)
df = dataset.df

In [36]:
# Create data splits
save_dir = '/home/gridsan/manderson/ovdsat/data/nwpu'
SPLIT_SEED = 42   # base seed; split M draws from RandomState(SPLIT_SEED + M)
ft_percent = 0.1  # share of the non-few-shot images written to the fine-tune validation file


def _export_sorted_coco(frame, coco_path):
    """Export ``frame`` to ``coco_path`` as COCO JSON with categories sorted by id.

    The frame is re-indexed from 0, wrapped in a pylabel Dataset and written
    with ``ExportToCoco(cat_id_index=0)``. The file is then rewritten in place
    with its ``categories`` list sorted by ``id`` and the path is printed.

    Args:
        frame: DataFrame of annotation rows to export.
        coco_path: destination path of the JSON file.
    """
    pylabel_ds = importer.Dataset(frame.reset_index(drop=True))
    exporter.Export(pylabel_ds).ExportToCoco(output_path=coco_path, cat_id_index=0)

    with open(coco_path, 'r') as f:
        coco = json.load(f)
    coco['categories'] = sorted(coco['categories'], key=lambda cat: cat['id'])
    with open(coco_path, 'w') as f:
        json.dump(coco, f, indent=4)
    print(f'Saved {coco_path}')


for M in [1, 2, 3, 4, 5]:
    # One generator per split: every draw below advances it, so the draws are
    # independent of each other yet identical on every re-run of this cell.
    rng = np.random.RandomState(SPLIT_SEED + M)

    val_file = f'{save_dir}/val_coco-{M}.json'
    finetune_val_file = f'{save_dir}/train_coco_finetune_val-{M}.json'

    # Images used by ANY of the N-shot subsets of this split; it accumulates
    # across N so that none of them leaks into the validation files.
    img_ids = set()

    for N in [5, 10, 30]:
        train_file = f'{save_dir}/train_coco_subset_N{N}-{M}.json'

        # Up to N annotation rows per category (fewer if the category is smaller)
        N_samples_list = [
            group.sample(n=min(N, len(group)), random_state=rng)
            for _, group in df.groupby('cat_id')
        ]
        N_samples_df = pd.concat(N_samples_list, ignore_index=True)

        img_ids.update(N_samples_df['img_id'].tolist())
        print(len(img_ids))

        _export_sorted_coco(N_samples_df, train_file)

    # Remaining do not overlap with the N-shots
    remaining_df = df[~df['img_id'].isin(img_ids)]
    remaining_img_ids = remaining_df['img_id'].unique()

    # Split the remaining images into validation (1 - ft_percent) and
    # fine-tune validation (ft_percent); splitting by image id keeps all
    # annotations of an image in the same file.
    val_ids, ft_ids = train_test_split(
        remaining_img_ids, test_size=ft_percent, random_state=rng
    )

    _export_sorted_coco(df[df['img_id'].isin(val_ids)], val_file)
    _export_sorted_coco(df[df['img_id'].isin(ft_ids)], finetune_val_file)

  df = df.replace(r"^\s*$", np.nan, regex=True)


47


Exporting to COCO file...: 100%|██████████| 50/50 [00:00<00:00, 1505.98it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N5-1.json
132


Exporting to COCO file...: 100%|██████████| 100/100 [00:00<00:00, 1631.55it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N10-1.json
293


Exporting to COCO file...: 100%|██████████| 300/300 [00:00<00:00, 1979.01it/s]


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N30-1.json


Exporting to COCO file...: 100%|██████████| 1382/1382 [00:00<00:00, 2200.33it/s]
Exporting to COCO file...: 100%|██████████| 131/131 [00:00<00:00, 2219.79it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_finetune_val-1.json
Saved /home/gridsan/manderson/ovdsat/data/nwpu/val_coco-1.json
48


Exporting to COCO file...: 100%|██████████| 50/50 [00:00<00:00, 1636.48it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N5-2.json
128


Exporting to COCO file...: 100%|██████████| 100/100 [00:00<00:00, 1788.37it/s]


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N10-2.json
297


  df = df.replace(r"^\s*$", np.nan, regex=True)
Exporting to COCO file...: 100%|██████████| 300/300 [00:00<00:00, 1947.93it/s]


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N30-2.json


Exporting to COCO file...: 100%|██████████| 1376/1376 [00:00<00:00, 1771.87it/s]
Exporting to COCO file...: 100%|██████████| 179/179 [00:00<00:00, 2320.32it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_finetune_val-2.json
Saved /home/gridsan/manderson/ovdsat/data/nwpu/val_coco-2.json
48


Exporting to COCO file...: 100%|██████████| 50/50 [00:00<00:00, 1641.02it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N5-3.json
126


Exporting to COCO file...: 100%|██████████| 100/100 [00:00<00:00, 1813.67it/s]


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N10-3.json
291


  df = df.replace(r"^\s*$", np.nan, regex=True)
Exporting to COCO file...: 100%|██████████| 300/300 [00:00<00:00, 1968.08it/s]


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N30-3.json


Exporting to COCO file...: 100%|██████████| 1323/1323 [00:00<00:00, 2924.19it/s]
Exporting to COCO file...: 100%|██████████| 147/147 [00:00<00:00, 3186.06it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_finetune_val-3.json
Saved /home/gridsan/manderson/ovdsat/data/nwpu/val_coco-3.json
49


Exporting to COCO file...: 100%|██████████| 50/50 [00:00<00:00, 373.65it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N5-4.json
126


Exporting to COCO file...: 100%|██████████| 100/100 [00:00<00:00, 2512.27it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N10-4.json
279


Exporting to COCO file...: 100%|██████████| 300/300 [00:00<00:00, 2756.03it/s]


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N30-4.json


Exporting to COCO file...: 100%|██████████| 1377/1377 [00:00<00:00, 3057.64it/s]
Exporting to COCO file...: 100%|██████████| 177/177 [00:00<00:00, 2305.24it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_finetune_val-4.json
Saved /home/gridsan/manderson/ovdsat/data/nwpu/val_coco-4.json
47


Exporting to COCO file...: 100%|██████████| 50/50 [00:00<00:00, 2189.32it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N5-5.json
128


Exporting to COCO file...: 100%|██████████| 100/100 [00:00<00:00, 1795.36it/s]
  df = df.replace(r"^\s*$", np.nan, regex=True)


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N10-5.json
299


Exporting to COCO file...: 100%|██████████| 300/300 [00:00<00:00, 2273.57it/s]


Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_subset_N30-5.json


Exporting to COCO file...: 100%|██████████| 1271/1271 [00:00<00:00, 2498.43it/s]
Exporting to COCO file...: 100%|██████████| 146/146 [00:00<00:00, 3211.73it/s]

Saved /home/gridsan/manderson/ovdsat/data/nwpu/train_coco_finetune_val-5.json
Saved /home/gridsan/manderson/ovdsat/data/nwpu/val_coco-5.json





### SIMD

In [3]:
def merge_coco_json(json_files, output_file):
    """Merge several COCO JSON files into a single, id-consistent COCO file.

    Plain concatenation yields an invalid file when the inputs overlap:
    the category list is repeated once per input, shared images appear several
    times, and annotation ids collide when the inputs number them alike.
    This merge therefore:

    * takes ``info`` from the first file only (``None`` if it has none);
    * keeps each license, image and category once (first occurrence wins;
      images and categories are matched by ``id``, licenses by content);
    * drops annotations whose content (everything except ``id``) was already
      merged, and gives a fresh id to any annotation whose id is missing or
      already taken. Other annotation ids are preserved.

    Args:
        json_files: iterable of paths to COCO JSON files; each must contain
            "images", "annotations" and "categories".
        output_file: path the merged JSON is written to (4-space indent).

    Raises:
        KeyError: if an input lacks "images", "annotations" or "categories".
        ValueError: if two inputs use the same image id for different
            ``file_name`` values, or the same category id for different
            ``name`` values; such files cannot be merged without remapping.
    """
    merged_data = {
        "info": None,
        "licenses": [],
        "images": [],
        "annotations": [],
        "categories": []
    }

    images_by_id = {}
    categories_by_id = {}
    seen_licenses = set()
    seen_annotations = set()     # serialized annotation content, id excluded
    used_annotation_ids = set()
    next_annotation_id = 0       # candidate for the next freshly assigned id

    for idx, json_file in enumerate(json_files):
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Take 'info' from the first file only
        if idx == 0 and "info" in data:
            merged_data["info"] = data["info"]

        for license_entry in data.get("licenses", []):
            license_key = json.dumps(license_entry, sort_keys=True)
            if license_key not in seen_licenses:
                seen_licenses.add(license_key)
                merged_data["licenses"].append(license_entry)

        for image in data["images"]:
            known = images_by_id.setdefault(image["id"], image)
            if known is image:
                merged_data["images"].append(image)
            elif known.get("file_name") != image.get("file_name"):
                raise ValueError(
                    f"Image id {image['id']} refers to different files: "
                    f"{known.get('file_name')!r} vs {image.get('file_name')!r} (in {json_file})"
                )

        for category in data["categories"]:
            known = categories_by_id.setdefault(category["id"], category)
            if known is category:
                merged_data["categories"].append(category)
            elif known.get("name") != category.get("name"):
                raise ValueError(
                    f"Category id {category['id']} refers to different names: "
                    f"{known.get('name')!r} vs {category.get('name')!r} (in {json_file})"
                )

        for annotation in data["annotations"]:
            content_key = json.dumps(
                {key: value for key, value in annotation.items() if key != "id"},
                sort_keys=True,
            )
            if content_key in seen_annotations:
                continue  # Same object already merged from another file
            seen_annotations.add(content_key)

            annotation_id = annotation.get("id")
            if annotation_id is None or annotation_id in used_annotation_ids:
                # Missing or colliding id: assign the smallest unused integer
                while next_annotation_id in used_annotation_ids:
                    next_annotation_id += 1
                annotation_id = next_annotation_id
                annotation = {**annotation, "id": annotation_id}
            used_annotation_ids.add(annotation_id)
            merged_data["annotations"].append(annotation)

    # Save the merged JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, indent=4)

    print(f"Merged COCO JSON saved as {output_file}")

In [4]:
# Merge the SIMD fine-tune validation file with the N5, N10 and N30 training
# subsets. Order matters for one thing only: merge_coco_json copies "info"
# from the first path in the list.
json_files = [
    "/home/gridsan/manderson/ovdsat/data/simd/train_coco_finetune_val.json",
    "/home/gridsan/manderson/ovdsat/data/simd/train_coco_subset_N5.json",
    "/home/gridsan/manderson/ovdsat/data/simd/train_coco_subset_N10.json",
    "/home/gridsan/manderson/ovdsat/data/simd/train_coco_subset_N30.json",
]
output_file = "/home/gridsan/manderson/ovdsat/data/simd/simd_coco.json"
merge_coco_json(json_files=json_files, output_file=output_file)

Merged COCO JSON saved as /home/gridsan/manderson/ovdsat/data/simd/simd_coco.json
