# Transformer un dataset au format COCO en un dataset au format YOLO

## Function definitions

In [64]:
def cocoToYolo(dir_path):
    """Convert a COCO segmentation dataset into YOLOv8 segmentation format.

    Reads ``annotations.json`` from ``dir_path`` and (re)creates
    ``<dir_path>/yolo`` with two subdirectories:

    * ``images/`` -- every exported image, decoded and re-encoded by OpenCV,
      saved under its COCO ``file_name`` with ``/`` replaced by ``_``;
    * ``labels/`` -- one ``.txt`` file per exported image (same base name as
      the image), one line per annotation:
      ``<class_index> x1 y1 x2 y2 ...`` where the coordinates are divided by
      the width / height of the decoded image.

    Class indices are the positions of the categories in the ``categories``
    list of the annotation file.

    Args:
        dir_path: Directory containing ``annotations.json`` and the images it
            references (``file_name`` is resolved relative to this directory).

    Notes:
        * Any existing ``<dir_path>/yolo`` directory is deleted first.
        * Only the first polygon of each annotation is exported; additional
          polygons of a multi-part annotation are ignored.
        * Annotations without a usable polygon (RLE/dict segmentation, empty
          segmentation, odd coordinate count, fewer than three points) are
          skipped.
        * Images that OpenCV cannot read are skipped and reported in a single
          warning at the end.
        * Images that end up with no exported annotation are not copied.
    """
    import os
    import json
    import shutil
    import warnings
    from collections import defaultdict

    import cv2
    import numpy as np
    from tqdm import tqdm

    # Load the COCO annotations (JSON is UTF-8 by specification).
    with open(os.path.join(dir_path, 'annotations.json'), encoding='utf-8') as f:
        data = json.load(f)

    # COCO category id -> contiguous zero-based YOLO class index.
    class_map = {category['id']: i for i, category in enumerate(data['categories'])}

    # COCO image id -> image file name (relative to dir_path).
    image_map = {image['id']: image['file_name'] for image in data['images']}

    # COCO image id -> list of (class index, flat polygon [x1, y1, x2, y2, ...]).
    segs = defaultdict(list)
    for annotation in data['annotations']:
        segmentation = annotation.get('segmentation')
        # RLE masks are stored as a dict, not a list of polygons: nothing to
        # export as a polygon, so skip them instead of crashing.
        if not isinstance(segmentation, list) or not segmentation:
            continue

        # NOTE: only the first polygon is kept (see docstring).
        polygon = segmentation[0]
        if not isinstance(polygon, (list, tuple)):
            continue
        # A polygon needs an even number of coordinates and at least 3 points.
        if len(polygon) < 6 or len(polygon) % 2:
            continue

        class_id = class_map[annotation['category_id']]
        segs[annotation['image_id']].append((class_id, polygon))

    # Start from a clean output tree.
    yolo_dir = os.path.join(dir_path, 'yolo')
    if os.path.exists(yolo_dir):
        shutil.rmtree(yolo_dir)
    yolo_img_dir = os.path.join(yolo_dir, 'images')
    yolo_label_dir = os.path.join(yolo_dir, 'labels')
    os.makedirs(yolo_img_dir)
    os.makedirs(yolo_label_dir)

    unreadable = []
    for image_id, seg in tqdm(segs.items()):
        file_name = image_map[image_id]

        # cv2.imread returns None (no exception) when the file is missing or
        # cannot be decoded.
        img = cv2.imread(os.path.join(dir_path, file_name))
        if img is None:
            unreadable.append(file_name)
            continue
        img_h, img_w = img.shape[:2]

        # Flatten sub-directories into the file name so that all images and
        # labels live in a single folder each.
        flat_name = file_name.replace('/', '_')

        # Swap only the real extension for '.txt'; a substring replace of
        # 'jpg' would also rewrite directory names and miss other extensions,
        # leaving image and label names out of sync.
        label_name = os.path.splitext(flat_name)[0] + '.txt'

        with open(os.path.join(yolo_label_dir, label_name), 'w') as f:
            for class_id, polygon in seg:
                # Normalise x by the image width and y by the image height.
                points = np.array(polygon, dtype=float).reshape(-1, 2)
                points[:, 0] /= img_w
                points[:, 1] /= img_h
                coords = ' '.join(str(p) for p in points.reshape(-1))
                f.write(f'{class_id} {coords}\n')

        # Save the image next to its label set.
        cv2.imwrite(os.path.join(yolo_img_dir, flat_name), img)

    if unreadable:
        warnings.warn(
            f'{len(unreadable)} image(s) could not be read and were skipped: '
            + ', '.join(unreadable)
        )

    print('Done!')



In [83]:
# Split the dataset into train, validation and test subsets.
#
# Author's note: earlier attempts that opened the directories, shuffled the
# images and copied them into the target folders left images and labels
# mismatched (even when shuffling a combined image/label list), so this relies
# on the split-folders module suggested in a Stack Overflow answer. Suggestions
# for a better tool to do this split are welcome.

def split_dataset(dir_path, train_ratio, test_ratio, val_ratio):
    """Split the dataset under ``dir_path`` with the split-folders package.

    The split is written into ``dir_path`` itself (``output=dir_path``) with a
    fixed seed of 1337, so repeated runs on the same input are reproducible.

    Args:
        dir_path: Directory whose sub-folders are split; it is also used as
            the output directory.
        train_ratio: Fraction of the files assigned to the training set.
        test_ratio: Fraction of the files assigned to the test set.
        val_ratio: Fraction of the files assigned to the validation set.
    """
    import splitfolders

    # splitfolders.ratio expects its tuple ordered as (train, val, test);
    # passing the arguments in signature order would swap the test and
    # validation fractions.
    splitfolders.ratio(
        dir_path,
        output=dir_path,
        seed=1337,
        ratio=(train_ratio, val_ratio, test_ratio),
    )

    

## Exemple d'utilisation

In [79]:
# Convert the COCO dataset stored under ./Notebooks/TACO/data/; the converted
# images and labels are written to the 'yolo' sub-directory of that path.
cocoToYolo('./Notebooks/TACO/data/')

100%|██████████| 1500/1500 [04:49<00:00,  5.19it/s]

Done!





In [81]:
%pip install split-folders

DEPRECATION: Loading egg at /home/xaelee/Documents/default/lib/python3.11/site-packages/pycocotools-2.0-py3.11-linux-x86_64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330
Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [82]:
# Split the converted dataset; the positional arguments are, in order,
# train_ratio=0.7, test_ratio=0.2 and val_ratio=0.1.
split_dataset('./Notebooks/TACO/data/yolo', 0.7, 0.2, 0.1)

Copying files: 3000 files [00:09, 301.17 files/s] 
