<h3>This notebook contains code used to create bounding boxes around cracks. COCO-format JSON annotation files (one per dataset split) will be created to hold that information.</h3>

In [20]:
import os
import cv2
import json
import numpy as np

In [21]:
def get_paths(path: str) -> list:
    """Return the path of every file found under ``path``, searched recursively.

    Args:
        path: Directory to walk with ``os.walk``.

    Returns:
        A list with one ``os.path.join(root, file)`` string per file. The list
        is empty when ``path`` contains no files (``os.walk`` also yields
        nothing for a directory that does not exist).
    """
    return [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(path)
        for file in files
    ]


# Dataset layout used below: data/<split>/images and data/<split>/masks.
train_dir = os.path.join("data", "train")
valid_dir = os.path.join("data", "valid")
images_dir_train = os.path.join(train_dir, "images")
images_dir_valid = os.path.join(valid_dir, "images")
masks_dir_train = os.path.join(train_dir, "masks")
masks_dir_valid = os.path.join(valid_dir, "masks")
# Every mask file of each split, collected recursively.
mask_paths_train = get_paths(masks_dir_train)
mask_paths_valid = get_paths(masks_dir_valid)

In [22]:
def get_coco_tpl() -> dict:
    """Build a fresh, empty COCO-style skeleton.

    Returns:
        A new dict with empty ``"images"`` and ``"annotations"`` lists and a
        ``"categories"`` list holding the single "crack" category (id 1,
        supercategory "defect"). Every call returns independent objects.
    """
    crack_category = dict(id=1, name="crack", supercategory="defect")
    template = {"images": [], "annotations": []}
    template["categories"] = [crack_category]
    return template

In [23]:
def find_boxes(
    masks_dir: str,
    images_dir: str,
    coco_format: dict,
    image_ext: str = ".jpg",
    min_area: int = 20,
    threshold: int = 127,
) -> None:
    """Append COCO image and annotation entries derived from crack masks.

    Every file in ``masks_dir`` is read as a grayscale mask, binarised, and
    split into 8-connected components. Each component larger than
    ``min_area`` pixels becomes one annotation (bounding box, pixel area and
    contour polygons) attached to the image that shares the mask's file stem.

    Args:
        masks_dir: Directory containing the mask files.
        images_dir: Directory containing the matching images, named
            ``<mask stem><image_ext>``. Masks without a matching image are
            skipped.
        coco_format: Dict with ``"images"`` and ``"annotations"`` lists; it is
            modified in place.
        image_ext: Extension (including the dot) of the image files.
        min_area: A component is kept only if its pixel area is strictly
            greater than this value.
        threshold: Grayscale value above which a mask pixel counts as
            foreground.

    Returns:
        None. Results are appended to ``coco_format`` and the total number of
        annotations is printed.
    """
    # Continue numbering after any entries already present so that repeated
    # calls on the same dict never produce duplicate ids.
    next_image_id = max((entry.get("id", 0) for entry in coco_format["images"]), default=0) + 1
    annotation_id = max((entry.get("id", 0) for entry in coco_format["annotations"]), default=0) + 1
    image_id_mapping = {}

    # os.listdir order is arbitrary; sorting keeps the assigned ids reproducible.
    for mask_filename in sorted(os.listdir(masks_dir)):
        stem = os.path.splitext(mask_filename)[0]
        mask_path = os.path.join(masks_dir, mask_filename)
        image_path = os.path.join(images_dir, f"{stem}{image_ext}")

        if not os.path.exists(image_path):
            continue

        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        # Unreadable masks are skipped before an id is handed out, so they do
        # not leave gaps or dangling ids behind.
        if mask is None:
            continue

        height, width = mask.shape[:2]

        # Register the image once per stem. Masks that share a stem (e.g. the
        # same name with different extensions) reuse the existing image id
        # instead of creating a second, colliding image entry.
        if stem not in image_id_mapping:
            image_id_mapping[stem] = next_image_id
            next_image_id += 1
            coco_format["images"].append({
                "id": image_id_mapping[stem],
                "file_name": os.path.basename(image_path),
                "width": width,
                "height": height,
                "license": 1,
                "flickr_url": "",
                "coco_url": "",
                "date_captured": ""
            })
        image_id = image_id_mapping[stem]

        _, binary_image = cv2.threshold(mask, threshold, 255, cv2.THRESH_BINARY)
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(binary_image, connectivity=8)

        # Label 0 is the background component, so start at 1.
        for j in range(1, num_labels):
            left = int(stats[j, cv2.CC_STAT_LEFT])
            top = int(stats[j, cv2.CC_STAT_TOP])
            box_width = int(stats[j, cv2.CC_STAT_WIDTH])
            box_height = int(stats[j, cv2.CC_STAT_HEIGHT])
            area = int(stats[j, cv2.CC_STAT_AREA])

            # Skip a component whose bounding box spans the entire image
            # (original note: cv2 sometimes reports such a full-frame box).
            if left == 0 and top == 0 and box_width == width and box_height == height:
                continue

            # Tiny components are dropped.
            if area <= min_area:
                continue

            mask_region = (labels == j).astype(np.uint8)
            contours, _ = cv2.findContours(mask_region, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            # A polygon needs at least three points, i.e. six coordinates.
            segmentation = [contour.flatten().tolist() for contour in contours if contour.size >= 6]

            coco_format["annotations"].append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": 1,
                "bbox": [left, top, box_width, box_height],
                "area": area,
                "segmentation": segmentation,
                "iscrowd": 0,
                "label": f"crack {annotation_id}"
            })
            annotation_id += 1

    print(f"Entries count: {len(coco_format['annotations'])}")

In [24]:
def build_coco(images_dir: str, masks_dir: str, prefix: str, output_root: str = "data") -> None:
    """Generate COCO annotations for one dataset split and save them as JSON.

    Args:
        images_dir: Directory holding the split's images.
        masks_dir: Directory holding the split's crack masks.
        prefix: Split name (e.g. "train"); used as the sub-directory of
            ``output_root`` that receives the annotation file.
        output_root: Root directory for the output file.

    Returns:
        None. The annotations are written to
        ``<output_root>/<prefix>/coco_annotations.json`` and the path is printed.
    """
    coco_format = get_coco_tpl()
    # find_boxes takes (masks_dir, images_dir) in the opposite order from this
    # function, so pass by keyword to rule out a silent swap.
    find_boxes(masks_dir=masks_dir, images_dir=images_dir, coco_format=coco_format)
    output_file = os.path.join(output_root, prefix, "coco_annotations.json")

    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Explicit encoding keeps the file independent of the platform default.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(coco_format, f, indent=4)

    print(f"COCO format annotations saved to {output_file}")

In [25]:
# Annotate the training split; the result is saved under data/train.
build_coco(images_dir=images_dir_train, masks_dir=masks_dir_train, prefix="train")

Entries count: 18315
COCO format annotations saved to data\train\coco_annotations.json


In [28]:
# Annotate the validation split; the result is saved under data/valid.
build_coco(images_dir=images_dir_valid, masks_dir=masks_dir_valid, prefix="valid")

Entries count: 3244
COCO format annotations saved to data\valid\coco_annotations.json
