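Convert an ImageJ-created dataset (csv + tif pairs) into JPEG images with a JSON file of bounding-box annotations, then patch the converted images into tiles of a desired size.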

In [None]:
%pip install pillow

from PIL import Image, ImageDraw 
from PIL.TiffTags import TAGS

import os
from typing import Any
import json 

from typing import Any, NewType
import os
import csv
import math
import shutil
import json

# Bounding box as a 4-tuple of ints. The functions below use it both in
# (cx, cy, w, h) form and in (xmin, ymin, xmax, ymax) form.
Box = NewType('Box', tuple[int,int,int,int])

# Root of the raw data; the conversion call below reads its "csv" and "imgs" subfolders.
DATASET_SOURCE = "F:\\dataUtils\\raw_data" # FIXME change accordingly
# Root of the converted dataset (an "imgs" folder plus dataset.json).
OUTPUT_DATASET = "E:\\dataset" # FIXME change accordingly

Math utils

In [3]:
#utils
def CXCYWHtoXYXY(box: Box) -> Box:
    """Convert a (center x, center y, width, height) box to corner form.

    The low corner is offset by floor(size / 2) and the high corner by
    ceil(size / 2), so for integer sizes the converted box spans exactly
    ``width`` by ``height``.

    Returns:
        The box as (xmin, ymin, xmax, ymax).
    """
    center_x, center_y, width, height = box
    low_dx, high_dx = width // 2, (width + 1) // 2
    low_dy, high_dy = height // 2, (height + 1) // 2
    return (center_x - low_dx, center_y - low_dy,
            center_x + high_dx, center_y + high_dy)


def get_res_metadata(img):
    """Collect the resolution metadata of an opened image.

    Tag ids found in ``img.tag`` are translated to names through ``TAGS``
    (ids without a known name are kept unchanged). Each resolution is
    computed from the first entry of its tag, read as a
    (numerator, denominator) pair.

    Returns:
        A dict with the keys "xresolution", "yresolution" and
        "resolutionUnit".

    Raises:
        KeyError: if one of the resolution tags is absent.
    """
    named_tags = {
        TAGS.get(tag_id, tag_id): tag_value
        for tag_id, tag_value in img.tag.items()
    }

    def first_ratio(tag_name):
        # Only the first entry of the tag is used.
        pair = named_tags[tag_name][0]
        return pair[0] / pair[1]

    return {
        "xresolution": first_ratio('XResolution'),
        "yresolution": first_ratio('YResolution'),
        "resolutionUnit": named_tags['ResolutionUnit'],
    }

class ImageUtil:
    """Wrap an opened image and convert areas between resolution units and pixels.

    Both conversions scale by the square of the larger of the X and Y
    resolutions returned by ``get_res_metadata``.
    """

    def __init__(self, img):
        self.img = img
        resolution = get_res_metadata(img)
        self.res = resolution
        self.xppu = float(resolution["xresolution"])
        self.yppu = float(resolution["yresolution"])

    def areaU2P(self, area):
        """Convert an area given in resolution units into an area in pixels."""
        unit_area = float(area)
        pixels_per_unit = max(self.xppu, self.yppu)
        return unit_area * (pixels_per_unit ** 2)

    def areaP2U(self, area):
        """Convert an area given in pixels into an area in resolution units."""
        pixel_area = float(area)
        pixels_per_unit = max(self.xppu, self.yppu)
        return pixel_area / (pixels_per_unit ** 2)

class ImageUtilOpener:
    """Context manager that opens an image file and yields an ``ImageUtil`` for it.

    The underlying image is closed when the ``with`` block exits, and also
    when building the ``ImageUtil`` fails while entering the block.
    """

    def __init__(self, file_name):
        self.file_name = file_name

    def __enter__(self):
        self.img = Image.open(self.file_name)
        try:
            return ImageUtil(self.img)
        except Exception:
            # __exit__ is not invoked when __enter__ raises, so the file
            # handle has to be released here to avoid leaking it.
            self.img.close()
            raise

    def __exit__(self, *args):
        self.img.close()

Read the ImageJ-created csv+tif dataset and convert it to a dict holding
a list of image names and a parallel list of their annotations (category ids and bounding boxes)

In [None]:
def sanitize_annotations(name: str, annotation: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``annotation`` without duplicate or corrupted boxes.

    A box is dropped (and a message printed) when:
      * the same box was already seen; only the first label is kept, and
        an ERROR is printed instead of a WARNING if the labels differ;
      * any of its coordinates is negative.

    Args:
        name: Image name, used only in the printed messages.
        annotation: Dict with parallel lists under "category_id" and
            "boxes". Boxes may be tuples or lists (e.g. after a JSON
            round trip).

    Returns:
        A new dict with the keys "category_id" and "boxes"; the boxes are
        tuples, in first-seen order.
    """
    # dicts keep insertion order, so first-seen order is preserved.
    label_by_box: dict[Box, int] = {}
    for label, raw_box in zip(annotation["category_id"], annotation["boxes"]):
        # Lists are unhashable; normalise to a tuple before using the box as a key.
        bbox = tuple(raw_box)
        if bbox in label_by_box:
            if label_by_box[bbox] == label:
                print(f"WARNING: Duplicate bbox found in {name}: {bbox}")
            else:
                print(f"ERROR: Same bbox with different label found in {name}: {bbox}")
            continue
        if min(bbox) < 0:
            print(f"Corrupted box found in {name}: {bbox}")
            continue

        # TODO check if it's all conditions
        label_by_box[bbox] = label

    return {
        "category_id": list(label_by_box.values()),
        "boxes": list(label_by_box),
    }
        
def convert_annotations(csv_path: str, img_path: str) -> dict[str, Any]:
    """Convert a (csv, image) pair into an annotations dict.

    Each data row of the csv is unpacked as
    (num, label, area, category_id, category_name). The text after the
    last ':' of ``label`` is split on '-' into the (y, x) center. The area
    is converted to pixels using the image resolution, and the box is the
    square around a circle of that area.

    Args:
        csv_path: Path of the csv file; when it does not exist the image
            is treated as having no annotations and is not opened.
        img_path: Path of the image the csv describes.

    Returns:
        A dict with parallel lists under "category_id" and "boxes"
        (boxes as (xmin, ymin, xmax, ymax)). Both lists are empty when the
        csv is missing or any of its rows fails to parse.
    """
    annotations: dict[str, Any] = {"category_id": [], "boxes": []}

    if not os.path.exists(csv_path):
        # No csv: nothing to convert, so skip opening and parsing the image.
        return annotations

    # newline="" is what the csv module asks for when opening files to read.
    with ImageUtilOpener(img_path) as util, open(csv_path, newline="") as data_file:
        try:
            for row_index, (_num, label, area, category_id, _category_name) in enumerate(csv.reader(data_file)):
                if row_index == 0:
                    continue  # skip column titles
                y, x = label.split(':')[-1].split('-')
                area_in_pixels = util.areaU2P(float(area))
                # Assumes the nodule is a circle: its diameter is the side
                # of the square surrounding it.
                side = int(math.sqrt(area_in_pixels / math.pi) * 2)
                annotations["category_id"].append(int(category_id))
                annotations["boxes"].append(CXCYWHtoXYXY((int(x), int(y), side, side)))
        except Exception as inst:
            # Best effort per file: one malformed row invalidates the whole
            # csv, the failure is reported and the image ends up unannotated.
            annotations["boxes"] = []
            annotations["category_id"] = []
            print(type(inst))
            print(inst)
            print(f"{csv_path} Failed")

    return annotations
    
    
def convert_dataset(data_path: str, imgs_path: str, res_path: str, replace_imgs=True):
    """Convert a folder of .tif images and their csv files into a dataset.

    For every ``<name>.tif`` file in ``imgs_path`` the annotations are read
    from ``<name>.tif.csv`` in ``data_path``. Images without any box are
    skipped; the others get consecutive ids starting at 1. The result is
    written to ``<res_path>/dataset.json`` as a dict with parallel lists
    under "imgs" (ids as strings) and "annotations".

    Args:
        data_path: Folder containing the csv files.
        imgs_path: Folder containing the .tif images.
        res_path: Output folder.
        replace_imgs: When true, ``res_path`` is cleared first and each
            kept image is saved as ``<res_path>/imgs/<id>.jpeg``. When
            false, only dataset.json is (re)written.

    NOTE(review): ids follow ``os.listdir`` order, so with
    ``replace_imgs=False`` they only match previously exported JPEGs if
    that order is unchanged - confirm.
    """
    img_names = [
        f.removesuffix(".tif")
        for f in os.listdir(imgs_path)
        if f.endswith(".tif") and os.path.isfile(os.path.join(imgs_path, f))
    ]

    if replace_imgs:
        # Clear previous attempts; failing to do so must not abort the run.
        try:
            shutil.rmtree(res_path)
        except FileNotFoundError:
            pass  # nothing to clear on a first run
        except OSError as err:
            print(f"WARNING: could not fully clear {res_path}: {err}")

    out_imgs_dir = os.path.join(res_path, "imgs")
    os.makedirs(out_imgs_dir, exist_ok=True)

    dataset = {
        "imgs": [],
        "annotations": []
    }
    image_id = 0
    for name in img_names:
        csv_path = os.path.join(data_path, f"{name}.tif.csv")
        img_path = os.path.join(imgs_path, f"{name}.tif")
        annotations = convert_annotations(csv_path, img_path)
        if not annotations["boxes"]:
            continue  # skip background
        image_id += 1
        if replace_imgs:
            # Context manager so the source file handle is released.
            with Image.open(img_path) as img:
                img.save(os.path.join(out_imgs_dir, f"{image_id}.jpeg"), quality=100)

        dataset["imgs"].append(f"{image_id}")
        dataset["annotations"].append(sanitize_annotations(name, annotations))

    with open(os.path.join(res_path, "dataset.json"), "w") as outfile:
        json.dump(dataset, outfile, indent=4)
    print("finished converting")
 
# Build OUTPUT_DATASET\dataset.json from the raw csv and imgs folders. With
# replace_imgs=False the output folder is not cleared and no JPEGs are exported.
convert_dataset(f"{DATASET_SOURCE}\\csv", f"{DATASET_SOURCE}\\imgs", OUTPUT_DATASET, replace_imgs=False)

Split the dataset images into patches of the desired size

In [None]:

def box_area(box: Box):
    """Return the area of an (xmin, ymin, xmax, ymax) box.

    A box whose max coordinate lies before its min coordinate on either
    axis has an area of 0.
    """
    left, top, right, bottom = box
    width = max(0, right - left)
    height = max(0, bottom - top)
    return width * height
def intersection_area(box1: Box, box2: Box) -> int:
    """Return the area shared by two (xmin, ymin, xmax, ymax) boxes.

    The overlap rectangle takes the larger of the two min corners and the
    smaller of the two max corners; ``box_area`` turns a non-overlapping
    (inverted) rectangle into 0.
    """
    left_a, top_a, right_a, bottom_a = box1
    left_b, top_b, right_b, bottom_b = box2
    overlap = (
        max(left_a, left_b),
        max(top_a, top_b),
        min(right_a, right_b),
        min(bottom_a, bottom_b),
    )
    return box_area(overlap)

def crop(img: Image.Image,
         crop_box: Box,
         annots: dict[str, Any],
         crop_tolerance: float,
         erase_not_tolerated: bool = True) -> tuple[Image.Image, dict[str, Any]]:
    """Crop ``img`` to ``crop_box`` and carry over the boxes that fit.

    For every box the fraction of its area lying inside ``crop_box`` is
    computed. A box is kept (clipped and made relative to the crop) when
    that fraction exceeds ``1 - crop_tolerance``. A box that overlaps the
    crop but is not kept is painted over with a filled purple rectangle
    when ``erase_not_tolerated`` is true. Zero-area boxes are skipped.

    Args:
        img: Source image.
        crop_box: Crop region as (xmin, ymin, xmax, ymax).
        annots: Dict with parallel lists under "category_id" and "boxes".
        crop_tolerance: Largest fraction of a box that may be cut off
            while the box is still kept.
        erase_not_tolerated: Whether partially visible, rejected boxes
            are painted over.

    Returns:
        The cropped image and a dict with the keys "category_id" and
        "boxes" holding the kept annotations.
    """
    cropped_img = img.crop(crop_box)
    cropped_annots = {"category_id": [], "boxes": []}

    draw_context = ImageDraw.Draw(cropped_img)
    for label, bbox in zip(annots["category_id"], annots["boxes"]):
        full_area = box_area(bbox)
        if full_area == 0:
            # Degenerate box: the visible fraction would divide by zero.
            continue

        # Box clipped to the crop region, in the crop's own coordinates.
        relative_bbox = (max(crop_box[0], bbox[0]) - crop_box[0],
                         max(crop_box[1], bbox[1]) - crop_box[1],
                         min(crop_box[2], bbox[2]) - crop_box[0],
                         min(crop_box[3], bbox[3]) - crop_box[1])
        visible_fraction = float(intersection_area(bbox, crop_box)) / full_area
        if visible_fraction > 1 - crop_tolerance:
            cropped_annots["category_id"].append(label)
            cropped_annots["boxes"].append(relative_bbox)
        elif visible_fraction > 0 and erase_not_tolerated:
            draw_context.rectangle(relative_bbox, width=1, fill="purple")

    return cropped_img, cropped_annots

def patch_img(img: Image.Image,
          annots: dict[str, Any],
          desired_image_size: int,
          overlap: float,
          crop_tolerance: float) -> tuple[list[Image.Image], list[dict[str, Any]]]:
    """Split an image and its annotations into a grid of patches.

    The image is pasted onto an RGB canvas whose sides are multiples of
    ``desired_image_size``. Patch origins lie on a grid with that step;
    each patch extends ``desired_image_size * (1 + overlap)`` pixels from
    its origin, clipped to the canvas. Patches without any kept annotation
    are dropped and reported with a printed message.

    Args:
        img: Source image.
        annots: Dict with parallel lists under "category_id" and "boxes".
        desired_image_size: Grid step in pixels.
        overlap: Extra patch extent as a fraction of ``desired_image_size``.
        crop_tolerance: Forwarded to ``crop``.

    Returns:
        Two parallel lists: the patch images and their annotation dicts.
    """
    imgs_per_width = math.ceil(float(img.width) / desired_image_size)
    imgs_per_height = math.ceil(float(img.height) / desired_image_size)

    padded_width = imgs_per_width * desired_image_size
    # Height must follow the row count; using the column count truncated
    # portrait images and over-padded landscape ones.
    padded_height = imgs_per_height * desired_image_size

    padded_img = Image.new("RGB", (padded_width, padded_height))
    padded_img.paste(img)

    patched_imgs, patched_annots = [], []

    for row in range(imgs_per_height):
        for col in range(imgs_per_width):
            xmin, ymin = col * desired_image_size, row * desired_image_size
            xmax = int(xmin + desired_image_size * (1 + overlap))
            ymax = int(ymin + desired_image_size * (1 + overlap))
            # Patches on the last column/row cannot extend past the canvas.
            xmax, ymax = min(padded_width, xmax), min(padded_height, ymax)

            crop_box = (xmin, ymin, xmax, ymax)

            cropped_img, cropped_annots = crop(padded_img, crop_box, annots, crop_tolerance)

            if cropped_annots["category_id"]:
                patched_imgs.append(cropped_img)
                patched_annots.append(cropped_annots)
            else:
                print(f"{row} {col} ignored")

    return patched_imgs, patched_annots

def patch_dataset(dataset_root: str,
                  desired_image_size: int = 1024,
                  overlap: float = 0.2,
                  crop_tolerance: float=0.3):
    """Patch every image of a converted dataset and write a new dataset.

    Reads ``<dataset_root>/dataset.json`` and ``<dataset_root>/imgs/<name>.jpeg``,
    splits each image with ``patch_img`` and writes the patches to a
    sibling folder named ``<dataset_root>_patched`` (an ``imgs`` folder
    plus ``dataset.json``). Patches get consecutive ids starting at 1.

    Args:
        dataset_root: Folder of the dataset to patch.
        desired_image_size: Forwarded to ``patch_img``.
        overlap: Forwarded to ``patch_img``.
        crop_tolerance: Forwarded to ``patch_img``.
    """
    # normpath drops any trailing separator, so the suffix lands on the
    # folder name itself and the result is a sibling of dataset_root.
    dataset_root = os.path.normpath(dataset_root)
    annot_path = os.path.join(dataset_root, "dataset.json")
    imgs_dir = os.path.join(dataset_root, "imgs")
    patched_root = dataset_root + "_patched"
    patched_imgs_dir = os.path.join(patched_root, "imgs")
    os.makedirs(patched_imgs_dir, exist_ok=True)

    patched_dataset = {
        "imgs": [],
        "annotations": []
    }

    with open(annot_path, 'r') as annot_file:
        dataset = json.load(annot_file)

    image_id = 1
    for img_name, annotations in zip(dataset["imgs"], dataset["annotations"]):
        # patch_img pastes the source onto its own canvas, so the source
        # file can be closed as soon as it returns.
        with Image.open(os.path.join(imgs_dir, f"{img_name}.jpeg")) as img:
            patched_imgs, patched_annots = patch_img(img, annotations, desired_image_size, overlap, crop_tolerance)
        for patch, annots in zip(patched_imgs, patched_annots):
            patched_dataset["imgs"].append(f"{image_id}")
            patched_dataset["annotations"].append(annots)
            patch.save(os.path.join(patched_imgs_dir, f"{image_id}.jpeg"), quality=100)
            image_id += 1

    with open(os.path.join(patched_root, "dataset.json"), "w") as outfile:
        json.dump(patched_dataset, outfile, indent=4)
        
# Patch the converted dataset with the default tile size and overlap. With
# crop_tolerance=0.9 a box stays in a tile when more than roughly 10% of its area is inside it.
patch_dataset(OUTPUT_DATASET, crop_tolerance=0.9)