In [25]:
import cv2
import json
from copy import deepcopy
from os.path import basename
from tqdm import tqdm
from detectron2.utils.visualizer import Visualizer
from sklearn.model_selection import train_test_split
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.structures import BoxMode
from typing import List, Tuple
from pathlib import Path
import numpy as np

In [26]:
# autocropping images to minimize unannotated extent 

with open("hotspots_labeled.json", "r") as inf:
    hotspots = json.load(inf)

In [27]:
with open("signs.txt", "r") as sgns:
    signs = [sign.strip() for sign in sgns.readlines()]

In [28]:
autocropped_hotspots = []

outfolder = "/local/ecw/deepscribe-detectron/data/images_cropped"
annotated = "/local/ecw/deepscribe-detectron/data/images_annotated"

for hotspot in tqdm(hotspots):
    # don't keep if there's only one sign!
    # TODO: check thresholds here. 

    if len(hotspot["annotations"]) > 1:
        new_hotspot = deepcopy(hotspot)

        points = [anno["bbox"] for anno in hotspot["annotations"]]
        
        # dealing with negative-valued coordinates
        min_x0 = int(max(0, min([pt[0] for pt in points])))
        max_x1 = int(max([pt[2] for pt in points]))

        min_y0 = int(max(0, min([pt[1] for pt in points])))
        max_y1 = int(max([pt[3] for pt in points]))

        img = cv2.imread(hotspot["file_name"])


        cropped = img[min_y0:max_y1, min_x0:max_x1, :]


        # adjust points - new origin is min_x0, min_y0


        new_hotspot["height"] = cropped.shape[0]
        new_hotspot["width"] = cropped.shape[1]
        new_hotspot["file_name"] = basename(hotspot["file_name"])
        new_hotspot["bbox_mode"] = BoxMode.XYXY_ABS

        for anno in new_hotspot["annotations"]:
            old_bbox = anno["bbox"]

            anno["bbox"] = [
                max(0, old_bbox[0]) - min_x0,
                old_bbox[1] - min_y0,
                old_bbox[2] - min_x0,
                old_bbox[3] - min_y0,
            ]

        

        cv2.imwrite(outfolder + "/" + new_hotspot["file_name"], cropped)
        autocropped_hotspots.append(new_hotspot)
        visualizer = Visualizer(cropped[:, :, ::-1], scale=0.5, metadata={"thing_classes": signs})
        out = visualizer.draw_dataset_dict(new_hotspot)
        cv2.imwrite(annotated + "/" + basename(hotspot["file_name"]), out.get_image()[:, :, ::-1])




100%|██████████| 5010/5010 [33:00<00:00,  2.53it/s]


In [29]:
with open("data/hotspots_all.json", "w") as outf:
    json.dump(autocropped_hotspots, outf)

In [35]:
# produce train-test-validation splits
train_ratio = 0.75
validation_ratio = 0.15
test_ratio=0.1

hotspots_train, hotspots_test = train_test_split(autocropped_hotspots, test_size=1 - train_ratio)

hotspots_test, hotspots_val = train_test_split(hotspots_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

with open("data/hotspots_train.json", "w") as outf:
    json.dump(hotspots_train, outf)

with open("data/hotspots_val.json", "w") as outf:
    json.dump(hotspots_val, outf)

with open("data/hotspots_test.json", "w") as outf:
    json.dump(hotspots_test, outf)



In [38]:
def extract_hotspots(entry: dict) -> Tuple[List[np.ndarray], List[int]]:

    #load image 
    img = cv2.imread(outfolder +"/" + entry["file_name"])

    # get all bboxes

    bboxes = [annotation["bbox"] for annotation in entry["annotations"]]
    category_ids = [annotation["category_id"] for annotation in entry["annotations"]]

    hotspots = []
    for bbox in bboxes:
        hotspot = img[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])]
        hotspots.append(hotspot)

    return hotspots, category_ids

def save_hotspots(dataset: List[dict], outfolder: str):

    for entry in tqdm(dataset):
        hotspots, category_ids = extract_hotspots(entry)
        for i, (hotspot, cat_id) in enumerate(zip(hotspots, category_ids)):
            cv2.imwrite(f"{outfolder}/{Path(entry['file_name']).stem}_{i}_{cat_id}.jpg", hotspot)



In [39]:
save_hotspots(hotspots_train, "/local/ecw/deepscribe-detectron/data/hotspots/train")
save_hotspots(hotspots_test, "/local/ecw/deepscribe-detectron/data/hotspots/test")
save_hotspots(hotspots_val, "/local/ecw/deepscribe-detectron/data/hotspots/val")

100%|██████████| 3740/3740 [05:02<00:00, 12.38it/s]
100%|██████████| 748/748 [01:00<00:00, 12.32it/s]
100%|██████████| 499/499 [00:40<00:00, 12.41it/s]


In [40]:
def ochre_to_xywh(bbox: List) -> List:
    # ochre data provided in  [top left x position, top left y position, bottom right x position, bottom right y position].
    # coco format is [top left x position, top left y position, width, height].

    new_bbox = deepcopy(bbox)
    new_bbox[2] -= new_bbox[0]
    new_bbox[3] -= new_bbox[1]
    return new_bbox

def detectron_to_coco(detectron_dataset: List[dict], sign_list: List[str]):
    coco_json = {"images": [], "annotations": []}

    # add categories
    coco_json["categories"] = [{"id": i, "name": cat} for i, cat in enumerate(sign_list)]

    annotation_ids = 0 

    for i, entry in enumerate(tqdm(detectron_dataset)):
        # get image data
        # assigning image IDs here
        image_data = {"file_name": entry["file_name"], "height":entry["height"], "width": entry["width"], "id": i}
        coco_json["images"].append(image_data)
        # collecting annotations
        for annotation in entry["annotations"]:

            coco_bbox = ochre_to_xywh(annotation["bbox"])
            coco_annotation = {"image_id": i, 
                                "bbox": coco_bbox, 
                                "category_id": annotation["category_id"],
                                "iscrowd": 0,
                                "id": annotation_ids,
                                "area": coco_bbox[2]*coco_bbox[3]}
            coco_json["annotations"].append(coco_annotation)
            annotation_ids += 1

    return coco_json



In [41]:

with open("data/hotspots_train_coco.json", "w") as outf:
    json.dump(detectron_to_coco(hotspots_train, signs), outf)

with open("data/hotspots_val_coco.json", "w") as outf:
    json.dump(detectron_to_coco(hotspots_val, signs), outf)

with open("data/hotspots_test_coco.json", "w") as outf:
    json.dump(detectron_to_coco(hotspots_test, signs), outf)


100%|██████████| 3740/3740 [00:01<00:00, 2411.25it/s]
100%|██████████| 499/499 [00:00<00:00, 4792.06it/s]
100%|██████████| 748/748 [00:00<00:00, 4811.58it/s]
