In [1]:
from PIL import Image

import pandas as pd
import json
import os

In [14]:
def get_coco_tpl() -> dict:
    """Return a fresh, empty COCO-style annotation skeleton.

    The skeleton carries the dataset ``info`` block, a single placeholder
    license (id 1), a single ``lesion`` category (id 1) and empty ``images``
    and ``annotations`` lists for the caller to fill in.

    Returns:
        A newly built dict on every call, so callers may mutate it freely.
    """
    dataset_info = dict(
        description="HAM10000 Dataset",
        url="",
        version="1.0",
        year=2024,
        contributor="",
        date_created="",
    )
    placeholder_license = dict(id=1, name="", url="")
    lesion_category = dict(id=1, name="lesion", supercategory="none")

    # Keys are inserted in the order they should appear in the dumped JSON.
    template = {}
    template["info"] = dataset_info
    template["licenses"] = [placeholder_license]
    template["images"] = []
    template["annotations"] = []
    template["categories"] = [lesion_category]
    return template

In [15]:
# Load the extended HAM10000 metadata table; build_coco reads it through `df`.
csv_file = os.path.join("data", "HAM10000_metadata_ext.csv")
df = pd.read_csv(filepath_or_buffer=csv_file)

In [16]:
def build_coco(prefix: str, data_dir: str = "data", metadata: "pd.DataFrame | None" = None) -> None:
    """Write a COCO-format annotation file for one image split.

    For every metadata row whose image file
    ``<data_dir>/<prefix>_images/<image_id>.jpg`` exists on disk, one image
    entry and one bounding-box annotation (category 1) are emitted. The result
    is saved to ``<data_dir>/<prefix>_coco_annotations.json``.

    Args:
        prefix: Split name used to build the image folder and output file
            names (the notebook calls this with "train" and "test").
        data_dir: Directory holding the ``<prefix>_images`` folder and
            receiving the output JSON. Defaults to "data".
        metadata: Frame with the columns ``image_id``, ``left``, ``top``,
            ``right`` and ``bottom``. Defaults to the notebook-level ``df``.

    Notes:
        * Only the first row seen for a given ``image_id`` is used; later rows
          with the same id are skipped.
        * If an image cannot be opened, the error is printed and the entry
          keeps the fallback size of 200x150 instead of aborting the run.
    """
    if metadata is None:
        # Fall back to the frame loaded at notebook level.
        metadata = df

    images_folder = os.path.join(data_dir, f"{prefix}_images")
    coco_format = get_coco_tpl()
    image_id_mapping = {}  # dataset image_id (str) -> sequential COCO image id
    annotation_id = 1

    for _, row in metadata.iterrows():
        image_id = row["image_id"]

        # Each image is exported once; checking this first avoids a needless
        # filesystem lookup for repeated ids.
        if image_id in image_id_mapping:
            continue

        image_path = os.path.join(images_folder, f"{image_id}.jpg")
        if not os.path.exists(image_path):
            # The row belongs to another split (or the file is missing).
            continue

        coco_image_id = len(image_id_mapping) + 1
        image_id_mapping[image_id] = coco_image_id

        image_entry = {
            "id": coco_image_id,
            "width": 200,   # fallback, overwritten below when the image opens
            "height": 150,  # fallback, overwritten below when the image opens
            "file_name": f"{image_id}.jpg",
            "license": 1,
            "flickr_url": "",
            "coco_url": "",
            "date_captured": ""
        }

        try:
            with Image.open(image_path) as img:
                image_entry["width"], image_entry["height"] = img.size
        except Exception as e:
            # Best effort: keep the entry with the fallback size and report.
            print(f"Error opening image {image_path}: {e}")

        # COCO boxes are [x, y, width, height]; the metadata stores corners.
        box_width = row["right"] - row["left"]
        box_height = row["bottom"] - row["top"]

        annotation_entry = {
            "id": annotation_id,
            "image_id": coco_image_id,
            "category_id": 1,
            "bbox": [row["left"], row["top"], box_width, box_height],
            "area": box_width * box_height,
            "segmentation": [],
            "iscrowd": 0
        }
        annotation_id += 1

        coco_format["annotations"].append(annotation_entry)
        coco_format["images"].append(image_entry)

    print(f"Entries count: {len(coco_format['annotations'])}")

    output_file = os.path.join(data_dir, f"{prefix}_coco_annotations.json")

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(coco_format, f, indent=4)

    print(f"COCO format annotations saved to {output_file}")

In [17]:
# Export one annotation file per split, train first and then test.
for split in ("train", "test"):
    build_coco(split)

Entries count: 8513
COCO format annotations saved to data\train_coco_annotations.json
Entries count: 1502
COCO format annotations saved to data\test_coco_annotations.json
