In [4]:
import os
import json
import xmltodict
from tqdm import tqdm
from PIL import Image
import argparse

In [10]:
def get_image_ids(txt_file):
    """Collect the image ids listed in an image-set text file.

    Every non-blank line contributes its first whitespace-separated token;
    any further columns on that line are ignored.

    Args:
        txt_file: Path of the text file to read.

    Returns:
        set[str]: The distinct ids found in the file (empty if the file has
        no non-blank lines).
    """
    image_ids = set()
    with open(txt_file, 'r') as f:
        for line in f:
            fields = line.split()
            # A blank or whitespace-only line has no first token; indexing it
            # would raise IndexError, so skip it.
            if fields:
                image_ids.add(fields[0])
    return image_ids

def get_categories(voc_dir):
    """Build the COCO category list from every XML annotation in a directory.

    Each file in ``voc_dir`` whose name ends in ``.xml`` is parsed with
    xmltodict, and the ``name`` of every ``object`` and of every ``part``
    nested inside an object is collected.

    Args:
        voc_dir: Directory containing the VOC XML annotation files.

    Returns:
        list[dict]: ``{'id': int, 'name': str}`` entries. Names are sorted and
        ids are assigned from 1 in that order.
    """
    classes = set()
    for xml_file in tqdm(os.listdir(voc_dir), desc="Extracting classes"):
        if not xml_file.endswith('.xml'):
            continue
        with open(os.path.join(voc_dir, xml_file), 'r') as f:
            xml_data = xmltodict.parse(f.read())

        objects = xml_data['annotation'].get('object')
        if objects is None:
            # Annotation without any <object>: nothing to collect. Wrapping
            # None in a list would crash on obj['name'] below.
            continue
        # xmltodict yields a dict for a single child and a list for repeated
        # children; normalize to a list.
        if not isinstance(objects, list):
            objects = [objects]

        for obj in objects:
            classes.add(obj['name'])
            parts = obj.get('part')
            if not parts:
                continue
            if not isinstance(parts, list):
                parts = [parts]
            for part in parts:
                classes.add(part['name'])

    return [{'id': i + 1, 'name': cls} for i, cls in enumerate(sorted(classes))]

def _as_list(node):
    """Normalize an xmltodict child node to a list (absent -> [], single -> [node])."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _coco_annotation(ann_id, img_id, category_id, bndbox):
    """Build one COCO annotation dict from a VOC ``bndbox`` mapping."""
    xmin = round(float(bndbox['xmin']))
    ymin = round(float(bndbox['ymin']))
    xmax = round(float(bndbox['xmax']))
    ymax = round(float(bndbox['ymax']))
    box_width = xmax - xmin
    box_height = ymax - ymin
    return {
        'id': ann_id,
        'image_id': img_id,
        'category_id': category_id,
        # COCO boxes are [x, y, width, height] rather than two corners.
        'bbox': [xmin, ymin, box_width, box_height],
        'area': box_width * box_height,
        'iscrowd': 0
    }


def convert_voc_to_coco(voc_dir, output_json, img_dir, image_ids):
    """Convert the VOC XML annotations of selected images into one COCO JSON file.

    For each ``.xml`` file in ``voc_dir`` whose ``filename`` stem is in
    ``image_ids`` and whose image exists in ``img_dir``, one ``images`` entry
    is written, plus one ``annotations`` entry per object and per object part.
    Annotations whose image is not selected or is missing on disk are skipped
    silently.

    Args:
        voc_dir: Directory containing the VOC XML annotation files.
        output_json: Path of the COCO JSON file to write. Its parent directory
            is created if it does not exist.
        img_dir: Directory containing the image files named by the annotations.
        image_ids: Container of image ids (file names without extension) to
            include.

    Returns:
        None. The result is written to ``output_json``.
    """
    categories = get_categories(voc_dir)
    class_map = {cat['name']: cat['id'] for cat in categories}

    coco_data = {
        'images': [],
        'annotations': [],
        'categories': categories
    }

    ann_id = 1

    # os.listdir order is arbitrary; sorting makes the image ids (positions in
    # the listing) reproducible across runs and machines.
    xml_files = sorted(os.listdir(voc_dir))

    for img_id, xml_file in enumerate(tqdm(xml_files, desc="Converting to COCO format")):
        if not xml_file.endswith('.xml'):
            continue

        with open(os.path.join(voc_dir, xml_file), 'r') as f:
            annotation = xmltodict.parse(f.read())['annotation']

        img_filename = annotation['filename']
        if os.path.splitext(img_filename)[0] not in image_ids:
            continue

        img_path = os.path.join(img_dir, img_filename)
        if not os.path.exists(img_path):
            continue

        # Context manager closes the underlying file; otherwise one handle
        # per image stays open until garbage collection.
        with Image.open(img_path) as img:
            img_width, img_height = img.size

        coco_data['images'].append({
            'id': img_id,
            'file_name': img_filename,
            'width': img_width,
            'height': img_height
        })

        # _as_list maps a missing <object>/<part> to [], so annotations
        # without objects yield an image entry and no annotations.
        for obj in _as_list(annotation.get('object')):
            coco_data['annotations'].append(
                _coco_annotation(ann_id, img_id, class_map[obj['name']], obj['bndbox'])
            )
            ann_id += 1

            for part in _as_list(obj.get('part')):
                coco_data['annotations'].append(
                    _coco_annotation(ann_id, img_id, class_map[part['name']], part['bndbox'])
                )
                ann_id += 1

    out_dir = os.path.dirname(output_json)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_json, 'w') as f:
        json.dump(coco_data, f, indent=4)

In [12]:
# Command-line entry point, kept for reference. It is currently disabled in
# favor of the hard-coded paths further down.
#
# if __name__ == '__main__':
#     parser = argparse.ArgumentParser(description="Convert PASCAL VOC annotations to COCO format with train/val split")
#     parser.add_argument('--voc_dir', type=str, required=True, help="Directory with VOC XML files")
#     parser.add_argument('--train_txt', type=str, required=True, help="File with train image ids")
#     parser.add_argument('--val_txt', type=str, required=True, help="File with val image ids")
#     parser.add_argument('--output_train_json', type=str, required=True, help="Output COCO JSON file for train set")
#     parser.add_argument('--output_val_json', type=str, required=True, help="Output COCO JSON file for val set")
#     parser.add_argument('--img_dir', type=str, required=True, help="Directory with images")
#
#     args = parser.parse_args()
#
#     train_image_ids = get_image_ids(args.train_txt)
#     val_image_ids = get_image_ids(args.val_txt)
#
#     convert_voc_to_coco(args.voc_dir, args.output_train_json, args.img_dir, train_image_ids)
#     convert_voc_to_coco(args.voc_dir, args.output_val_json, args.img_dir, val_image_ids)

# Inputs: VOC annotations, images, and the image-set files defining each split.
voc_dir = '/data2/eranario/data/PASCAL-VOC-2012/VOCdevkit/VOC2012/Annotations'
img_dir = '/data2/eranario/data/PASCAL-VOC-2012/VOCdevkit/VOC2012/JPEGImages'
train_txt = '/data2/eranario/data/PASCAL-VOC-2012/VOCdevkit/VOC2012/ImageSets/Layout/train.txt'
val_txt = '/data2/eranario/data/PASCAL-VOC-2012/VOCdevkit/VOC2012/ImageSets/Layout/val.txt'

# Outputs: one COCO JSON file per split.
output_train_json = '/data2/eranario/data/PASCAL-VOC-2012_COCO/Annotations/train.json'
output_val_json = '/data2/eranario/data/PASCAL-VOC-2012_COCO/Annotations/val.json'

train_image_ids = get_image_ids(train_txt)
val_image_ids = get_image_ids(val_txt)

# Convert the train split first, then the val split.
for split_ids, split_json in (
    (train_image_ids, output_train_json),
    (val_image_ids, output_val_json),
):
    convert_voc_to_coco(voc_dir, split_json, img_dir, split_ids)

Extracting classes: 100%|██████████| 17125/17125 [00:05<00:00, 3058.84it/s]
Converting to COCO format: 100%|██████████| 17125/17125 [00:05<00:00, 2931.26it/s]
Extracting classes: 100%|██████████| 17125/17125 [00:05<00:00, 3069.25it/s]
Converting to COCO format: 100%|██████████| 17125/17125 [00:05<00:00, 2977.16it/s]
