In [2]:
import json
from sklearn.model_selection import train_test_split
import os
from os.path import basename
import shutil
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog
from typing import Dict, Tuple, List
from tqdm import tqdm
from copy import deepcopy
import cv2
import numpy as np
import pandas as pd
from pathlib import Path

Matplotlib is building the font cache; this may take a moment.


In [3]:
with open("../raw_ochre/imagesWithHotspots_nov_2020.txt", "r") as infile:
    annotations = [line.strip() for line in infile.readlines() if len(line.strip()) > 0]

In [4]:
imagefolder = "/local/ecw/deepscribe-detectron/archive/images_renamed"

In [5]:
dataset = []
# collecting all signs and readings to assign numerical IDs
# counting sign frequencies
sign_ids = list()
sign_frequencies = list()
reading_ids = list()
reading_frequencies = list()
for anno in annotations:

    textname, size, hotspots = anno.split(":")

    uuid =  textname.split("_")[0]

    height, width = [int(val) for val in size.split("x")]

    fname = f"{imagefolder}/{uuid}.jpg"

    assert os.path.exists(fname), f"image file {fname} does not exist!"

    annos = []

    for hotspot in hotspots.split(";"):
        if len(hotspot) > 0:
            classes, coordslist = hotspot.split("~")
            # Sandra Schloen, Nov 2019:
            #Please note: when I created the new hotspots I labeled them with the signName_signUUID 
            # so that hotspot cutouts of the same sign would sort together in a folder. Previously when I
            #  created this image list I used signUUID_signName, but this time I have swapped them to make
            #  the format more consistent with the hotspot labels; that is, the Name now precedes the UUID
            #  in this new image list too. You’ll need to adjust your code accordingly.

            sign,reading = [elem.split("_")[0] for elem in classes.split("/")]

            # assign the sign and reading ID 
    
            if sign not in sign_ids:
                sign_ids.append(sign)
                sign_frequencies.append(1)

            sign_id = sign_ids.index(sign)
            sign_frequencies[sign_id] += 1

            if reading not in reading_ids:
                reading_ids.append(reading)
                reading_frequencies.append(1)

            reading_id = reading_ids.index(reading)
            reading_frequencies[reading_id] += 1

            coords = [float(coord) for coord in coordslist.split(",")]
            # only 1 category for now - just sign or non-sign
            annos.append({"bbox": coords, "bbox_mode":0, "sign":sign, "reading": reading, "sign_id": sign_id, "reading_id": reading_id, "category_id": sign_id})

    dataset.append({"file_name":fname, "height":height, "width":width, "image_id": uuid, "annotations": annos})

In [6]:
# crop images to remove "empty" space of images
autocropped_hotspots = []

outfolder = "/local/ecw/deepscribe-detectron/data_nov_2020/images_cropped"
annotated = "/local/ecw/deepscribe-detectron/data_nov_2020/images_annotated"
hotspots_only = "/local/ecw/deepscribe-detectron/data_nov_2020/images_annotated_hotspots"

for hotspot in tqdm(dataset):
    # don't keep if there's only one sign!
    # TODO: check thresholds here. 

    if len(hotspot["annotations"]) > 1:
        new_hotspot = deepcopy(hotspot)

        points = [anno["bbox"] for anno in hotspot["annotations"]]
        
        # dealing with negative-valued coordinates
        min_x0 = int(max(0, min([pt[0] for pt in points])))
        max_x1 = int(max([pt[2] for pt in points]))

        min_y0 = int(max(0, min([pt[1] for pt in points])))
        max_y1 = int(max([pt[3] for pt in points]))

        img = cv2.imread(hotspot["file_name"])


        cropped = img[min_y0:max_y1, min_x0:max_x1, :]


        # adjust points - new origin is min_x0, min_y0


        new_hotspot["height"] = cropped.shape[0]
        new_hotspot["width"] = cropped.shape[1]
        new_hotspot["file_name"] = basename(hotspot["file_name"])
        new_hotspot["bbox_mode"] = 0

        for anno in new_hotspot["annotations"]:
            old_bbox = anno["bbox"]

            anno["bbox"] = [
                max(0, old_bbox[0]) - min_x0,
                old_bbox[1] - min_y0,
                old_bbox[2] - min_x0,
                old_bbox[3] - min_y0,
            ]

        

        # cv2.imwrite(outfolder + "/" + new_hotspot["file_name"], cropped)
        # autocropped_hotspots.append(new_hotspot)
        # visualizer = Visualizer(cropped[:, :, ::-1], scale=0.5, metadata={"thing_classes": sign_ids})
        # out = visualizer.draw_dataset_dict(new_hotspot)
        # cv2.imwrite(annotated + "/" + basename(hotspot["file_name"]), out.get_image()[:, :, ::-1])
        visualizer = Visualizer(cropped[:, :, ::-1], scale=0.5, metadata={"thing_classes": ["hotspot" for sign in sign_ids]})
        out = visualizer.draw_dataset_dict(new_hotspot)
        cv2.imwrite(hotspots_only + "/" + basename(hotspot["file_name"]), out.get_image()[:, :, ::-1])


  0%|          | 0/5013 [00:00<?, ?it/s]


IndexError: list index out of range

In [22]:
# split into train-test-val
# # produce train-test-validation splits
train_ratio = 0.75
validation_ratio = 0.15
test_ratio=0.1

hotspots_train, hotspots_test = train_test_split(autocropped_hotspots, test_size=1 - train_ratio)

hotspots_test, hotspots_val = train_test_split(hotspots_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

with open("/local/ecw/deepscribe-detectron/data_nov_2020/hotspots_train.json", "w") as outf:
    json.dump(hotspots_train, outf)

with open("/local/ecw/deepscribe-detectron/data_nov_2020/hotspots_val.json", "w") as outf:
    json.dump(hotspots_val, outf)

with open("/local/ecw/deepscribe-detectron/data_nov_2020/hotspots_test.json", "w") as outf:
    json.dump(hotspots_test, outf) 

In [23]:
# save sign list to disk

signs_df = pd.DataFrame({"sign": sign_ids, "frequency":sign_frequencies}) 

signs_df.to_csv("/local/ecw/deepscribe-detectron/data_nov_2020/signs_nov_2020.csv")

In [31]:
def extract_hotspots(entry: dict) -> Tuple[List[np.ndarray], List[int]]:

    #load image 
    img = cv2.imread(outfolder +"/" + entry["file_name"])

    # get all bboxes

    bboxes = [annotation["bbox"] for annotation in entry["annotations"]]
    category_ids = [annotation["category_id"] for annotation in entry["annotations"]]

    hotspots = []
    for bbox in bboxes:
        hotspot = img[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])]
        hotspots.append(hotspot)

    return hotspots, category_ids

def save_hotspots(dataset: List[dict], outfolder: str):

    os.makedirs(outfolder, exist_ok=True)

    for entry in tqdm(dataset):
        hotspots, category_ids = extract_hotspots(entry)
        for i, (hotspot, cat_id) in enumerate(zip(hotspots, category_ids)):
            os.makedirs(f"{outfolder}/{cat_id}", exist_ok=True)

            cv2.imwrite(f"{outfolder}/{cat_id}/{Path(entry['file_name']).stem}_{i}_{cat_id}.jpg", hotspot)


In [32]:
save_hotspots(hotspots_train, "/local/ecw/deepscribe-detectron/data_nov_2020/hotspots/train")
save_hotspots(hotspots_test, "/local/ecw/deepscribe-detectron/data_nov_2020/hotspots/test")
save_hotspots(hotspots_val, "/local/ecw/deepscribe-detectron/data_nov_2020/hotspots/val")

100%|██████████| 3742/3742 [06:18<00:00,  9.88it/s]
100%|██████████| 748/748 [01:26<00:00,  8.62it/s]
100%|██████████| 500/500 [00:44<00:00, 11.18it/s]
