In [None]:
import json
import random
import os
import urllib.request
import zipfile
import shutil

def download_coco(destination_folder, urls=None):
    """Download COCO 2017 zip archives and extract them into a folder.

    Each archive is stored as ``<key>.zip`` inside *destination_folder* and
    then extracted in place. An archive that already exists on disk is not
    downloaded again, but it is always (re-)extracted.

    Args:
        destination_folder: Directory that receives the zip files and their
            extracted contents. Created if it does not exist.
        urls: Optional mapping of archive key -> download URL. When omitted,
            the COCO 2017 train/val/test image archives and the train/val
            annotation archive are used.

    Raises:
        urllib.error.URLError: If an archive cannot be fetched.
        urllib.error.ContentTooShortError: If a download is truncated.
        zipfile.BadZipFile: If an archive on disk is not a valid zip file.
    """
    if urls is None:
        urls = {
            "train": "http://images.cocodataset.org/zips/train2017.zip",
            "val": "http://images.cocodataset.org/zips/val2017.zip",
            "test": "http://images.cocodataset.org/zips/test2017.zip",
            "annotations": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
        }

    os.makedirs(destination_folder, exist_ok=True)

    for key, url in urls.items():
        zip_path = os.path.join(destination_folder, f"{key}.zip")
        if not os.path.exists(zip_path):
            print(f"Downloading {key}...")
            # Download under a temporary name and promote it only once the
            # transfer has completed. Otherwise an interrupted download would
            # leave a truncated "<key>.zip" that the existence check above
            # mistakes for a finished archive on the next run.
            partial_path = zip_path + ".part"
            try:
                urllib.request.urlretrieve(url, partial_path)
            except BaseException:
                # BaseException so that Ctrl-C also cleans up; always re-raised.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise
            os.replace(partial_path, zip_path)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)

    print("COCO dataset downloaded and extracted.")

def create_coco_subset(original_json_path, output_json_path, image_folder, output_image_folder, fraction=0.5, seed=None):
    """Write a random subset of a COCO-style dataset (JSON plus image files).

    A random sample of the entries under ``"images"`` is chosen, the
    annotations whose ``image_id`` belongs to a chosen image are kept, the
    result is written to *output_json_path*, and the chosen image files are
    copied from *image_folder* to *output_image_folder*.

    Args:
        original_json_path: Path of the source COCO JSON file.
        output_json_path: Path of the subset JSON file to write. Its parent
            directory is created if needed.
        image_folder: Directory containing the source images, addressed by
            each image entry's ``file_name``.
        output_image_folder: Directory that receives the copied images.
            Created if needed.
        fraction: Share of images to keep, between 0 and 1 inclusive. The
            number of images kept is ``int(len(images) * fraction)``.
        seed: Optional seed for a private random generator, making the
            selection reproducible. When ``None`` the module-level ``random``
            state is used.

    Raises:
        ValueError: If *fraction* is outside ``[0, 1]``.
        KeyError: If the source JSON has no ``"images"`` key.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    # Load the original COCO dataset
    with open(original_json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    images = coco_data['images']
    # Tolerate files that list images only and carry no 'annotations' key.
    annotations = coco_data.get('annotations', [])

    # Select a random subset of images
    rng = random if seed is None else random.Random(seed)
    selected_images = rng.sample(images, int(len(images) * fraction))
    selected_image_ids = {img['id'] for img in selected_images}

    # Keep only the annotations that belong to a selected image
    selected_annotations = [ann for ann in annotations if ann['image_id'] in selected_image_ids]

    subset_coco_data = {
        "info": coco_data.get("info", {}),
        # 'licenses' and 'categories' are lists in COCO files, so default to [].
        "licenses": coco_data.get("licenses", []),
        "categories": coco_data.get("categories", []),
        "images": selected_images,
        "annotations": selected_annotations
    }

    # Save subset dataset, creating the target directory first so a nested
    # output path does not fail with FileNotFoundError.
    output_dir = os.path.dirname(output_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(subset_coco_data, f, indent=4)

    # Copy selected images to new folder
    os.makedirs(output_image_folder, exist_ok=True)
    missing_count = 0
    for img in selected_images:
        src_path = os.path.join(image_folder, img['file_name'])
        dst_path = os.path.join(output_image_folder, img['file_name'])
        if os.path.exists(src_path):
            # file_name may contain a sub-directory component.
            os.makedirs(os.path.dirname(dst_path), exist_ok=True)
            shutil.copy(src_path, dst_path)
        else:
            missing_count += 1

    # Copying stays best-effort, but the gap between JSON and files is reported.
    if missing_count:
        print(f"Warning: {missing_count} selected images were not found in {image_folder} and were not copied.")

    print(f"Subset saved to {output_json_path} with {len(selected_images)} images and {len(selected_annotations)} annotations.")

# Example usage: download COCO 2017, then write a 50% subset of every split
# for which an instance-annotation file is available.
datasets = ['train', 'val', 'test']
download_folder = '/path/to/coco/'
original_base_path = os.path.join(download_folder, 'annotations/')
output_base_path = '/path/to/coco_subset/'

# Guarded so that importing this file does not trigger the download.
if __name__ == "__main__":
    download_coco(download_folder)

    # The subset JSON files are written into an 'annotations' sub-folder, which
    # must exist before create_coco_subset opens its output file.
    os.makedirs(os.path.join(output_base_path, 'annotations'), exist_ok=True)

    for dataset in datasets:
        annotation_path = os.path.join(original_base_path, f'instances_{dataset}2017.json')
        # Only the train/val annotation archive is downloaded above, so a split
        # may have no instances file; skip it instead of crashing mid-run.
        if not os.path.exists(annotation_path):
            print(f"Skipping {dataset}: annotation file {annotation_path} not found.")
            continue

        create_coco_subset(
            original_json_path=annotation_path,
            output_json_path=os.path.join(output_base_path, f'annotations/instances_{dataset}_subset.json'),
            image_folder=os.path.join(download_folder, f'{dataset}2017/'),
            output_image_folder=os.path.join(output_base_path, f'images/{dataset}2017_subset/')
        )
