In [3]:
import os
import json
import requests
import base64
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import uuid

In [5]:
class PrepareYoloData:
    """Write a DataFrame of images and bounding-box labels to disk as a COCO-style dataset.

    Images are converted to RGB and saved as ``.jpg`` files under
    ``<output_dir>/images/<split>/``; one ``instances_<split>.json`` annotation
    file per split is written under ``<output_dir>/annotations/``.
    """

    # process_df's return value always references these two splits, so their
    # directories and annotation files are created even when the data has no
    # rows for them.
    _REQUIRED_SPLITS = ("train", "val")

    def _extract_unique_categories(self, df):
        """
        Extract unique category labels from the DataFrame.

        Args:
            df: DataFrame with an "annotations" column containing lists of label dictionaries

        Returns:
            Dictionary mapping category names to category IDs. IDs start at 1
            and follow the sorted order of the category names.

        Raises:
            ValueError: If a cell is not a list, or an item is not a dict with a "category" key.
        """
        unique_labels = set()
        for labels in df["annotations"]:
            if not isinstance(labels, list):
                raise ValueError(f"Label is not a list of dicts: {labels}")
            for label in labels:
                if isinstance(label, dict) and "category" in label:
                    unique_labels.add(label["category"])
                else:
                    raise ValueError(f"Unrecognized label format: {label}, expecting dict with 'category' and 'bbox'")

        return {cat: i + 1 for i, cat in enumerate(sorted(unique_labels))}

    @staticmethod
    def _bbox_to_coco(bbox, label_type):
        """
        Convert one bounding box to COCO ``(x, y, width, height)`` floats.

        Args:
            bbox: Four numbers, interpreted according to ``label_type``
            label_type: "xyxy" for [x1, y1, x2, y2] or "xywh" for [x, y, w, h]

        Returns:
            Tuple ``(x, y, w, h)`` of Python floats, or None when ``label_type``
            is not supported.

        Raises:
            TypeError, ValueError: If ``bbox`` is not a sequence of exactly four numbers.
        """
        # Values are coerced to plain floats so that numpy scalars coming out of
        # a DataFrame never reach json.dump, which cannot serialize numpy integers.
        if label_type == "xyxy":
            x1, y1, x2, y2 = (float(v) for v in bbox)
            return x1, y1, x2 - x1, y2 - y1
        if label_type == "xywh":
            x, y, w, h = (float(v) for v in bbox)
            return x, y, w, h
        return None

    @staticmethod
    def _load_image(image_data, image_type):
        """
        Open one image from a URL, a file path or a base64 string.

        Returns:
            The opened PIL image, or None after printing the reason when the
            image cannot be loaded or ``image_type`` is not supported.
        """
        if image_type == "url":
            try:
                response = requests.get(image_data, timeout=10)
                # Surface HTTP errors directly instead of trying to decode an
                # error page as an image.
                response.raise_for_status()
                return Image.open(BytesIO(response.content))
            except Exception as e:
                print(f"Error downloading image from URL {image_data}: {e}")
                return None
        if image_type == "image_path":
            try:
                return Image.open(image_data)
            except Exception as e:
                print(f"Error opening image from path {image_data}: {e}")
                return None
        if image_type == "base64":
            try:
                return Image.open(BytesIO(base64.b64decode(image_data)))
            except Exception as e:
                print(f"Error decoding base64 image: {e}")
                return None
        print(f"Unsupported image type: {image_type}")
        return None

    def process_df(self, df, output_dir="yolo_dataset", category_mapping=None):
        """
        Convert a DataFrame with image and label information to COCO format.

        Args:
            df: DataFrame with columns ["image", "annotations", "image_type", "label_type", "split"].
                "annotations" is a list of dicts, each containing "category" and "bbox".
                "image_type" is one of "url", "image_path", "base64".
                "label_type" is "xyxy" or "xywh".
            output_dir: Directory to save the dataset
            category_mapping: Optional dictionary mapping label names to category IDs.
                When omitted it is derived from the "annotations" column.

        Returns:
            Dictionary containing paths to the created dataset files. The keys
            "train_images", "val_images", "train_annotations" and
            "val_annotations" are always present; any other split found in
            ``df`` adds "<split>_images" and "<split>_annotations".
        """
        # Splits present in the data; rows whose split is missing are skipped below.
        splits = list(df["split"].dropna().unique())
        for required in self._REQUIRED_SPLITS:
            if required not in splits:
                splits.append(required)

        # Create output directories
        os.makedirs(output_dir, exist_ok=True)
        for split in splits:
            os.makedirs(os.path.join(output_dir, "images", split), exist_ok=True)
        os.makedirs(os.path.join(output_dir, "annotations"), exist_ok=True)

        # One COCO structure per split
        coco_data = {
            split: {"images": [], "annotations": [], "categories": []} for split in splits
        }

        # Create category list if not provided
        if category_mapping is None:
            category_mapping = self._extract_unique_categories(df)

        # Populate categories in COCO format
        for category_name, category_id in category_mapping.items():
            for split in splits:
                coco_data[split]["categories"].append({
                    "id": category_id,
                    "name": category_name,
                    "supercategory": "none"
                })

        # Process each row in the DataFrame
        annotation_id = 1
        rows = tqdm(df.iterrows(), total=len(df), desc="Processing images")
        # The image id is the row position rather than the index label: index
        # labels may repeat (e.g. after pd.concat without ignore_index) or be
        # non-integers, which would overwrite files or break the file name format.
        for image_id, (_, row) in enumerate(rows):
            split = row["split"]
            if split not in coco_data:
                continue

            img = self._load_image(row["image"], row["image_type"])
            if img is None:
                continue

            file_name = f"{image_id:012d}.jpg"
            img_path = os.path.join(output_dir, "images", split, file_name)

            # Save image
            img = img.convert("RGB")
            img.save(img_path)

            # Get image dimensions
            width, height = img.size

            # Add image info to COCO format
            coco_data[split]["images"].append({
                "id": image_id,
                "file_name": file_name,
                "width": width,
                "height": height,
                "date_captured": "",
                "license": 1,
                "coco_url": "",
                "flickr_url": ""
            })

            # Process labels
            annotations = row["annotations"]
            if not isinstance(annotations, list):
                continue

            for annotation in annotations:
                if not isinstance(annotation, dict):
                    continue

                if "category" not in annotation or "bbox" not in annotation:
                    continue

                category_name = annotation["category"]
                if category_name not in category_mapping:
                    continue

                # A single malformed box is skipped like every other malformed
                # annotation instead of aborting the whole export.
                try:
                    coco_bbox = self._bbox_to_coco(annotation["bbox"], row["label_type"])
                except (TypeError, ValueError) as e:
                    print(f"Skipping malformed bbox {annotation['bbox']}: {e}")
                    continue
                if coco_bbox is None:
                    print(f"Unsupported label type: {row['label_type']}")
                    continue
                x, y, w, h = coco_bbox

                # Add annotation to COCO format; the segmentation is the
                # rectangle outline of the box.
                coco_data[split]["annotations"].append({
                    "id": annotation_id,
                    "image_id": image_id,
                    "category_id": category_mapping[category_name],
                    "bbox": [x, y, w, h],
                    "area": w * h,
                    "segmentation": [[
                        x, y,
                        x + w, y,
                        x + w, y + h,
                        x, y + h
                    ]],
                    "iscrowd": 0
                })
                annotation_id += 1

        # Save one annotation file per split (not only train/val)
        result = {}
        for split in splits:
            json_path = os.path.join(output_dir, "annotations", f"instances_{split}.json")
            with open(json_path, 'w') as f:
                json.dump(coco_data[split], f)
            result[f"{split}_images"] = os.path.join(output_dir, "images", split)
            result[f"{split}_annotations"] = json_path

        return result

In [17]:
"""
Example of using the PrepareYoloData class:

Assuming df_train and df_val are already loaded and have the following columns:
- image_path: str, the path of the image
- annotations: str, the string representation of a list of dicts describing the boxes of the image,
    - item: {
        "left": int,
        "top": int,
        "width": int,
        "height": int,
        "label": str
    }
df_all = pd.concat([df_train.assign(split='train'), df_val.assign(split='val')], ignore_index=True)
df_all['image'] = df_all['image_path']
df_all['image_type'] = 'image_path'
df_all['annotations'] = df_all['annotations'].apply(lambda x_list: [{"category": x['label'], "bbox": [x['left'], x['top'], x['width'], x['height']]} for x in eval(x_list)])
df_all['label_type'] = 'xywh'

PrepareYoloData().process_df(df_all, output_dir="data/custom_dataset")

"""

'\nExample of using the PrepareYoloData class:\n\nAssuming df_train and df_val are already loaded and have the following columns:\n- image_path: str, the path of the image\n- annotations: str, the annotations of the image, which is a list of dicts, \n    - item: {\n        "left": int,\n        "top": int,\n        "width": int,\n        "height": int,\n        "label": str\n    }\ndf_all = pd.concat([df_train, df_val])\ndf_all[\'image\'] = df_all[\'image_path\']\ndf_all[\'image_type\'] = \'image_path\'\ndf_all[\'annotations\'] = df_all[\'annotations\'].apply(lambda x_list: [{"category": x[\'label\'], "bbox": [x[\'left\'], x[\'top\'], x[\'width\'], x[\'height\']]} for x in eval(x_list)])\ndf_all[\'label_type\'] = \'xywh\'\n\nPrepareYoloData().process_df(df_all, output_dir="data/custom_dataset")\n\n'

Load a COCO-format export produced by the Label Studio annotation tool:

In [7]:
# Location of the Label Studio COCO export. Set the COCO_EXPORT_JSON environment
# variable to run this notebook on another machine without editing the cell;
# the default is the original local path.
json_path = os.environ.get(
    "COCO_EXPORT_JSON",
    "/home/lucas/VSCodeProjects/YOLO/data/label-studio/coco-format/project-1-at-2025-12-03-11-42-0ef0a05f/result.json",
)

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# quick inspection
print(type(data))
# JSON scalars (numbers, booleans, null) have no length, so only containers
# and strings are measured.
if isinstance(data, (dict, list, str)):
    print("Number of top-level items:", len(data))

# normalize to a DataFrame for easy inspection (if it's a list of dicts)
try:
    df = pd.json_normalize(data)
    display(df.head())
except Exception as e:
    print("Could not normalize to DataFrame:", e)

<class 'dict'>
Number of top-level items: 4


Unnamed: 0,images,categories,annotations,info.year,info.version,info.description,info.contributor,info.url,info.date_created
0,"[{'width': 1080, 'height': 1350, 'id': 0, 'fil...","[{'id': 0, 'name': 'product'}]","[{'id': 0, 'image_id': 0, 'category_id': 0, 's...",2025,1.0,,Label Studio,,2025-12-03 11:42:14.213494


In [38]:
def _resolve_image_reference(img, img_id):
    """Return ``(image_ref, image_type)`` for one COCO image entry, or raise ValueError if it has no usable reference."""
    file_name = img.get('file_name') or img.get('coco_url') or img.get('flickr_url') or img.get('id')
    if not isinstance(file_name, str):
        raise ValueError(
            f"Image {img_id!r} has no usable 'file_name', 'coco_url' or 'flickr_url' in the export."
        )

    if file_name.startswith(('http://', 'https://')):
        return file_name, 'url'

    # Drop leading "", "." and ".." path segments and re-root the path at "/".
    # Only whole leading segments are removed: str.strip('../') would treat the
    # argument as a character set and also eat the leading dot of a hidden
    # directory or a trailing "." / "/".
    parts = file_name.split('/')
    while parts and parts[0] in ('', '.', '..'):
        parts.pop(0)
    return '/' + '/'.join(parts), 'image_path'


def convert_coco_export_to_dfs(json_path, val_frac=0.2, seed=42, include_unannotated=False):
    """Convert COCO-style export (from Label‑Studio) to train/val DataFrames.

    Args:
        json_path: Path to the exported COCO JSON file.
        val_frac: Fraction of images (between 0 and 1) assigned to the validation split.
        seed: Random seed used to shuffle the images before splitting.
        include_unannotated: When True, images that have no annotation are kept
            with an empty annotation list. By default only images referenced by
            at least one annotation are returned.

    Returns: df_train, df_val, df_all
    Each DataFrame has columns: ["image", "annotations", "image_type", "label_type", "split"]

    Raises:
        ValueError: If ``val_frac`` is outside [0, 1], the file is not a COCO
            dict with 'images' and 'annotations', or an image has no usable
            file reference.
    """
    val_frac = float(val_frac)
    if not 0.0 <= val_frac <= 1.0:
        raise ValueError(f"val_frac must be between 0 and 1, got {val_frac}")

    with open(json_path, 'r', encoding='utf-8') as f:
        exported = json.load(f)

    # Expecting COCO dict with 'images' and 'annotations'
    if not (isinstance(exported, dict) and 'images' in exported and 'annotations' in exported):
        raise ValueError("Unsupported export format. Expected COCO dict with 'images' and 'annotations'.")

    images = {img['id']: img for img in exported['images']}
    categories = {c['id']: c['name'] for c in exported.get('categories', [])}

    # One record per image, keyed by image id, in order of first appearance
    rows = {}

    def row_for(img_id):
        if img_id not in rows:
            image_ref, image_type = _resolve_image_reference(images.get(img_id, {}), img_id)
            rows[img_id] = {
                'image': image_ref,
                'image_type': image_type,
                'annotations': [],
                'label_type': 'xywh'
            }
        return rows[img_id]

    for ann in exported['annotations']:
        # Categories missing from the export fall back to the stringified id
        cat_name = categories.get(ann.get('category_id'), str(ann.get('category_id')))
        row_for(ann['image_id'])['annotations'].append({'category': cat_name, 'bbox': ann.get('bbox', [])})

    if include_unannotated:
        for img_id in images:
            row_for(img_id)

    # Explicit columns keep the schema stable even when the export is empty
    df_all = pd.DataFrame(
        list(rows.values()),
        columns=['image', 'image_type', 'annotations', 'label_type'],
    )

    # Shuffle and split
    df_all = df_all.sample(frac=1, random_state=seed).reset_index(drop=True)
    n_val = int(len(df_all) * val_frac)
    df_val = df_all.iloc[:n_val].copy().reset_index(drop=True)
    df_train = df_all.iloc[n_val:].copy().reset_index(drop=True)

    df_train['split'] = 'train'
    df_val['split'] = 'val'

    # Combine preserving split labels so PrepareYoloData can read them
    df_all = pd.concat([df_train, df_val], ignore_index=True)

    return df_train, df_val, df_all


# Convert the loaded export, holding out the default 20% of images for validation,
# then report the split sizes and preview the training rows.
df_train, df_val, df_all = convert_coco_export_to_dfs(json_path=json_path, val_frac=0.2)
print(f"Total images: {len(df_all)}, train: {len(df_train)}, val: {len(df_val)}")
display(df_train.head(5))

Total images: 11, train: 9, val: 2


Unnamed: 0,image,image_type,annotations,label_type,split
0,/home/lucas/.local/share/label-studio/media/up...,image_path,"[{'category': 'product', 'bbox': [235.96833183...",xywh,train
1,/home/lucas/.local/share/label-studio/media/up...,image_path,"[{'category': 'product', 'bbox': [59.521023137...",xywh,train
2,/home/lucas/.local/share/label-studio/media/up...,image_path,"[{'category': 'product', 'bbox': [70.404172099...",xywh,train
3,/home/lucas/.local/share/label-studio/media/up...,image_path,"[{'category': 'product', 'bbox': [40.482398956...",xywh,train
4,/home/lucas/.local/share/label-studio/media/up...,image_path,"[{'category': 'product', 'bbox': [229.02808677...",xywh,train


In [39]:
# Write the images and COCO annotation files for every split of df_all
PrepareYoloData().process_df(df=df_all, output_dir="../data/custom_dataset")

Processing images: 100%|██████████| 11/11 [00:00<00:00, 13.94it/s]
Processing images: 100%|██████████| 11/11 [00:00<00:00, 13.94it/s]


{'train_images': '../data/custom_dataset/images/train',
 'val_images': '../data/custom_dataset/images/val',
 'train_annotations': '../data/custom_dataset/annotations/instances_train.json',
 'val_annotations': '../data/custom_dataset/annotations/instances_val.json'}