In [1]:
import os
import cv2
import math
import random
import json
import pickle
import errno
import datetime
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy.stats import norm
# !pip3 install git+https://github.com/aleju/imgaug
from imgaug import augmenters as iaa

In [2]:


class SVObjectExtractor:
    """Prepare unprocessed data downloaded from Supervisely.

        SuperVisely Object Extractor
        For each raw image containing multiple objects marked by a Point, Bounding Box or Polygon, we create images cropped to the given 'size', with each object centered.
        We create a directory for each class containing those cropped images, and a single annotation file in COCO format.
        You can create a 'counterclass' with random images cropped from the raw images (it is guaranteed that those images won't overlap with objects of other classes).
        You can choose to use 'augmentation' to expand the size of the dataset.

        Final structure:

        root/
         ├───raw_data_directory/
         │
         └───Datasets/
                ├───<DatasetName>
                │       ├───annotation.json
               ...      ├───labelstrength.pickle
                        ├───Class1/
                        ├───Class2/
                       ...
                        └───ClassN/
                        └───Fake/ (created if 'counterclass' is True)
"""

    def __init__(self, raw_path, target_path=None, dataset_name=None, object_size=None, resize_to=None, augmentation=False, counterclass=False):
        """Args:
            raw_path        --      path to raw data
            target_path     --      path to save dataset at. If an existing dataset shall be expanded, give the location of the <Datasetname> directory.
            dataset_name    --      name of the dataset
            object_size     --      size of objects in images (single integer. only sqaures generated)
            resize_to       --      resize cropped images to resize_to (single integer. only squares generated)
            augmentation    --      if True, dataset will 5be expanded by augmentations of the cropped images (default False)
            counterclass    --      if True, a 'Fake' class with random images of size 'size' will be created

        Attributes:
            raw_data_path   --      path to raw data
            dataset_path    --      path to dataset directory
            dataset_name    --      name of the set to be created
            object_size     --      size of cropped images around segmentation
            final_size      --      final size of cropped images
            augmentation    --      boolean whether augmentations of cropped images shall be created or not
            counterclass    --      boolean, whether a fake class should be created or not
            classes         --      list of classes found
            class_to_idx    --      index value for each class
            annotation      --      contains json in coco format for dataset about to be made
            lstrength_anno  --      pickle dictionary with imagename as key and lebelstrength as value
            total_images    --      number of all objects found, will be used as image-name and image-id
            total_annotations   --  number of annotations made for 'self.annotation' json. Will start with 9000000000 to seperate from image-id
            DATA_CREATED    --      boolen whether create_data was alrdy called or not
            DIRS_MADE       --      boolen whether directories are created or not
            SET_ALRDY_EXISTS      --  boolean whether the Dataset directories already exist or not
        """

        if os.path.isdir(raw_path):
            self.raw_data_path = raw_path
            invalid_path = False
            abort = False
        else:
            print("Error. Given path to raw data does not exist.")
            raise ValueError
            invalid_path = True
        if not invalid_path:
            self.dataset_name = dataset_name
            self.DIRS_MADE = False
            self.SET_ALRDY_EXISTS = False

            # some cases to check, to get the right path
            setname_len = len(self.dataset_name)
            if target_path:
                self.dataset_path = target_path
            else:
                self.dataset_path = os.getcwd()

            if self.dataset_name in self.dataset_path[-(setname_len+1):]:
                self.SET_ALRDY_EXISTS = True

            elif 'Datasets' in self.dataset_path[-(len('Datasets')+1):]:
                if os.path.isdir(os.path.join(self.dataset_path,self.dataset_name)):
                    print("A directory for given datasetname already exists. The set will be expanded.")
                    if not self.__proceed("Do you want to expand the existing set?"):
                        raise ValueError
                        abort = True
                    else:
                        self.dataset_path = os.path.join(self.dataset_path,self.dataset_name)
                        self.SET_ALRDY_EXISTS = True
                else:
                    self.dataset_path = os.path.join(self.dataset_path, self.dataset_name)
            else:
                if os.path.isdir(os.path.join(self.dataset_path, 'Datasets', self.dataset_name)):
                    print("A path to given datasetname already exists. The set will be expanded.")
                    if not self.__proceed("Do you want to expand the existing set?"):
                        raise ValueError
                        abort = True
                    else:
                        self.SET_ALRDY_EXISTS = True
                self.dataset_path = os.path.join(self.dataset_path, 'Datasets', self.dataset_name)

            if not abort:
                self.object_size = object_size
                self.final_size = resize_to
                self.augmentation = augmentation
                self.counterclass = counterclass
                self.classes = self.__getClasses(raw_path)
                self.DATA_CREATED = False
                self.annotation = None
                self.lstrength_anno = None
                self.total_images = None
                self.total_annotations = None

                dir_ready = self._make_directories()
                if dir_ready:
                    self._create_data()



    def _make_directories(self):
        """Create a direcotry for each class in self.classes"""

        print("Create directories..")
        if self.SET_ALRDY_EXISTS:
            if self.DATA_CREATED:
                print("Everything is already done.")
                return False

        elif self.DIRS_MADE:
            print("Error. Directories were already created, yet could not be found.")
            return False

        try:
            for label in self.classes:
                os.makedirs(os.path.join(self.dataset_path, label))
        except OSError as e:
            if e.errno != errno.EEXIST:
                print(e)
                raise
                return False
            pass
        self.DIRS_MADE = True
        print("Directories created.")
        return True



    """Creates annotationfile and cropped images of raw image.
    """
    def _create_data(self):
        """Creates annotationfile and cropped images of raw image."""
        print("Process Data...")
        # Annotation in COCO format
        # either load existing annotation file..
        if self.SET_ALRDY_EXISTS:
            print("Searching for existing annotation files..")
            try:
                with open(os.path.join(self.dataset_path, 'annotation.json')) as annotation_file:
                    self.annotation = json.load(annotation_file)
                    found_json = True
                with open(os.path.join(self.dataset_path, 'labelstrength.pickle'), 'rb') as lstrength_anno:
                    self.lstrength_anno = pickle.load(lstrength_anno)
                    found_pickle = True
                if found_json and found_pickle:
                    print("Loading file succeeded.")
                    list(self.lstrength_anno.keys())[-1]
                    self.total_images = int(list(self.lstrength_anno.keys())[-1][:-4])
                    self.total_annotations = self.total_images + 9000000000
                else:
                    print("This should have threw anexception. Either the annotation file or the labelstrength file did not load.")
                    self.SET_ALRDY_EXISTS = False
                    return
            except Exception as e:
                print(e)
                print("We could not find an annotation or an lablestrength file. If the directory is empty continue, otherwise it is not reccomended to proceed")
                if not self.__proceed():
                    return
                self.SET_ALRDY_EXISTS = False

        # ..or create new None
        if not self.SET_ALRDY_EXISTS:
            today = str(datetime.datetime.utcnow())
            categories = []
            for idx, label in enumerate(self.classes):
                categories.append({'supercategory': 'object', 'id': idx, 'name': label})
            self.annotation = {
                'info': {'year': 2020,'version': None,'description': 'Pollenforager Detection','contributor': 'Mara Kortenkamp, Tim Feige','url': 'https://github.com/marakortenkamp/pollen-detection','date_created': today},
                'images': [],
                'annotations': [],
                'licenses': {'id': None,'name': None,'url': None,},
                'category': categories
                }
            self.lstrength_anno = {}
            self.total_images, self.total_annotations = 0, 9000000000

        # find image-annotation pairs
        # go find an image:
        for root, folders, files in os.walk(self.raw_data_path):
            for folder in folders:
                if folder == 'img':
                    for img_root, img_folder, img_files in os.walk(os.path.join(root, folder)):
                        for img_file in img_files:
                            # go find its realted annotation file:
                            found_ann = False
                            for ann_root, ann_folder, ann_files in os.walk(os.path.join(root, 'ann')):
                                if found_ann:
                                    break
                                for ann_file in ann_files:
                                    # found a pair!
                                    if img_file in ann_file:
                                        # create dictionary for current img with all classes as keys and coords of objects in current image as values:
                                        # class_coords { 'class1' : [(x,y),..], 'class2' : [(x,y), (x,y), ..], .. }
                                        class_coords = { i : [] for i in self.classes}
                                        with open(os.path.join(ann_root, ann_file)) as ann_json:
                                            ann_data = json.load(ann_json)
                                        if len(ann_data['objects']):
                                            for obj in ann_data['objects']:
                                                class_coords[obj['classTitle']].append(obj['points']['exterior'][0])
                                        # crop image around objects, get annotation information of each object
                                        img_path = os.path.join(img_root, img_file)
                                        self.__process_image(img_path, class_coords)
                                        # to prevent unnecessary looping:
                                        found_ann = True
                                        break
        # save annotation file
        with open(os.path.join(self.dataset_path, 'annotation.json'), 'w') as cocofp:
            json.dump(self.annotation, cocofp)
        with open(os.path.join(self.dataset_path, 'labelstrength.pickle'), 'wb') as picklefp:
            pickle.dump(self.lstrength_anno, picklefp)

        print("Data created.")



    #--------------------------#
    ##### Helper functions #####
    #--------------------------#

    def __getClasses(self, dir):
        """Return list of classes found in annotation file(s) and sets self.class_to_idx

        Args:
            dir        -- path to raw data

        Note: classnames must be stored in some.json: {.., 'objects': [{.., 'classTitle':<classname>, ..}, ..]}
        """
        classes = []
        if self.counterclass:
            classes.append('Fake')
        for root, dicts, files in os.walk(dir):
            for file in files:
                if (file[-5:] == '.json'):
                    with open(os.path.join(root, file)) as json_file:
                        data = json.load(json_file)
                        try:
                            if len(data['objects']):
                                for object in data['objects']:
                                    if object['classTitle'] == 'est':
                                        print("est location: ", root, file)
                                    if not object['classTitle'] in classes:
                                        classes.append(object['classTitle'])
                        except:
                            pass
        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes



    def __proceed(self, string=None):
        """Ask for userinput"""
        while True:
            if string:
                proceed = input(string + " (Y/N): ")
            else:
                proceed = input("Do you want to continue?(Y/N): ")
            if proceed.upper() == 'Y':
                return True
            elif proceed.upper() == 'N':
                return False



    def __process_image(self, filepath, coord_dic):
        """Creates cropped images for each coord in raw image and updates the COCO file.
        If requested augmented duplicates of those images are made, annotations will be provided.
        If requested, an image around random coords without annotation is created for a fake class.
        """

        img = cv2.imread(filepath, 0)
        if self.counterclass:
            coord_dic = self.__add_rnd_coords(coord_dic)

        for label in self.classes:
            for coord in coord_dic[label]:
                self.__extract_object(img, coord, label)

        return



    def __add_rnd_coords(self, coord_dic):
        """Generates a set of random coordinates. We assume that each class in self.classes is about the same size,
        thus we create as many random coords as a fix class from self.classes has in the current 'coord_dic'.
        We assure that images cropped around random coordinates will not overlap with labeled objects.

        Note: Superviselys points are in (x,y) format, cv2 opens files in (y,x) format, we will save coords in (x,y) for persistence
        """

        if self.classes[0] != 'Fake':
            rnd_size = len(coord_dic[self.classes[0]])
        else:
            rnd_size = len(coord_dic[self.classes[1]])

        offset = self.object_size // 2

        for i in range(9*rnd_size):
            too_close = True
            while too_close:
                # generate rndm coord
                rnd_x = random.randint(0+offset,3990-offset)
                rnd_y = random.randint(0+offset,2999-offset)
                rnd_coord = [rnd_x, rnd_y]
                # interfere with any object?
                next_try = False
                for label in self.classes:
                    if next_try:
                        break
                    for coord in coord_dic[label]:
                        threshold = self.object_size//2
                        too_close = self.__eukl_dist(coord, rnd_coord, threshold)
                        if too_close:
                            next_try = True
                            break

                if not too_close:
                    coord_dic['Fake'].append(rnd_coord)

        return coord_dic



    def __eukl_dist(self, p, q, threshold):
        """Returns boolean whether distance is smaller than threshold"""
        d = math.sqrt(((p[0]-q[0])**2)+((p[1]-q[1])**2))
        return (d <= threshold)



    def __extract_object(self, img, coord, label):
        """Crop image around coord,
           augment cropped image,
           save image in class directorys,
           update self.annotation for none-Fake images

           Note: cv2-Images have format [y,x], our coords have format [x,y]
        """

        final_images = []
        if self.augmentation:
            #for augmentation we crop a larger image, so we can rotate more easily
            object_size = self.object_size+(self.object_size//2)
        else:
            object_size = self.object_size
        raw_img_size = img.shape        # raw_img_size has format (y,x)
        y = coord[1]
        if y == raw_img_size[0]:
            y == raw_img_size[0]-1
        x = coord[0]
        if x == raw_img_size[1]:
            x == raw_img_size[1]-1

        start_y = y - (object_size//2)
        if ( start_y < 0 ):
            start_y = 0
            border_top = (object_size//2)-y
        else:
            border_top = 0

        end_y = start_y + object_size - border_top
        if end_y > raw_img_size[0]:
            border_bottom = end_y-raw_img_size[0]
            end_y = raw_img_size[0]
        else:
            border_bottom = 0

        start_x = x - (object_size//2)
        if ( start_x < 0 ):
            start_x = 0
            border_left = (object_size//2)-x
        else:
            border_left = 0

        end_x = start_x + object_size - border_left
        if end_x > raw_img_size[1]:
            border_right = end_x-raw_img_size[1]
            end_x = raw_img_size[1]
        else:
            border_right = 0

        object = img[start_y:end_y, start_x:end_x]
        object = cv2.copyMakeBorder(object, border_top, border_bottom, border_left, border_right, cv2.BORDER_REPLICATE)

        if self.augmentation:
            final_images = self.__augment_img(object, label)
        else:
            if self.final_size:
                object = cv2.resize(object, (self.final_size,self.final_size))
            final_images.append((object, self.class_to_idx[label]))

        # safe images to storage and update annotation
        for final_image, labelstrength in final_images:
            self.total_images += 1
            filename = str(self.total_images) + '.png'

            #fill pickle file
            self.lstrength_anno[filename] = labelstrength

            #fill coco file
            if label != 'Fake':
                self.total_annotations += 1
                timestmp = str(datetime.datetime.utcnow())
                anno_img = {'id':self.total_images,'width':final_image.shape[0],'height':final_image.shape[1],'file_name':filename,'license':None,'flickr_url':None,'coco_url':None,'date_captured':timestmp}
                anno_anno = {'id':self.total_annotations,'image_id':self.total_images,'category_id':label,'segmentation':[],'area':final_image.size,'bbox':[],'iscrowd':0}
                self.annotation['images'].append(anno_img)
                self.annotation['annotations'].append(anno_anno)

            savefileat = os.path.join(self.dataset_path, label, filename)
            cv2.imwrite(savefileat, final_image)

        return



    """Create altered versions of given image.
        Return set of alterations.
    """
    def __augment_img(self, input_img, label):
        """Create set of alterations of given image."""
        images = []
        final_images =[]

        # apply on input_img
        fliplr = iaa.Fliplr(1)
        flipud = iaa.Flipud(1)

        # apply on input_img and each flip
        rotate60 = iaa.Affine(rotate=(60))
        rotate120 = iaa.Affine(rotate=(120))
        rotate180 = iaa.Rot90(2)
        rotate240 = iaa.Affine(rotate=(-120))
        rotate300 = iaa.Affine(rotate=(-60))

        # apply on input_img, each flip and each rotation
        gauss = iaa.AdditiveGaussianNoise(scale=0.04*255)
        brighter = iaa.Multiply(1.5)
        darker = iaa.Multiply(0.5)
        gaussblur = iaa.GaussianBlur(sigma=(1))

        final_images.append(input_img)
        for img in final_images:
            img_fliplr = fliplr(image=img)
            img_flipud = flipud(image=img)
            images.append(img)
            images.append(img_fliplr)
            images.append(img_flipud)
        final_images = []
        for img in images:
            img_rot60 = rotate60(image=img)
            img_rot120 = rotate120(image=img)
            img_rot180 = rotate180(image=img)
            img_rot240 = rotate240(image=img)
            img_rot300 = rotate300(image=img)
            final_images.append(img)
            final_images.append(img_rot60)
            final_images.append(img_rot120)
            final_images.append(img_rot180)
            final_images.append(img_rot240)
            final_images.append(img_rot300)
        images = []
        for img in final_images:
            img_gauss = gauss(image=img)
            img_bright = brighter(image=img)
            img_dark = darker(image=img)
            img_gblur = gaussblur(image=img)
            images.append(img)
            images.append(img_gauss)
            images.append(img_bright)
            images.append(img_dark)
            images.append(img_gblur)

        top_crop = math.ceil((self.object_size//2)/2)
        right_crop = (self.object_size//2)//2
        bottom_crop = (self.object_size//2)//2
        left_crop = math.ceil((self.object_size//2)/2)
        img_size = input_img.shape

        final_images = []
        cur_label = self.class_to_idx[label]
        for img in images:
            if label == 'Fake':
                label_strength = 0.0
            else:
                # This part will shift the image by some value and adjust the label strength. its like a 2d normal distribution to determ the amount and direction its shifted
                mu, std = 0.0, 10
                normdist = np.random.normal(loc=mu, scale=std, size=(2,))   #random 2d point with normdist prop
                vecnorm = np.linalg.norm(normdist)                          #length of the vector
                nrv = scipy.stats.norm(0, std)
                zero_nrv = nrv.pdf(0.0)
                label_strength = nrv.pdf(max(0, vecnorm - std / 2.0)) / zero_nrv
                img = scipy.ndimage.shift(img, normdist, mode="nearest")
            img = img[top_crop:(img_size[0]-bottom_crop), left_crop:(img_size[1]-right_crop)]
            if self.final_size:
                img = cv2.resize(img, (self.final_size,self.final_size))
            final_images.append((img,label_strength))

        return final_images

In [4]:
# Build the "PollenDataSmall" dataset (64 px objects, with augmentation and a
# 'Fake' counterclass). The paths below are absolute paths on the author's
# machine and have to be adjusted before re-running this cell elsewhere.
BeeData = SVObjectExtractor(
    raw_path="/home/tkf/Desktop/dev/Uni/pollen-detection/Fullsize/",
    target_path="/home/tkf/Desktop/dev/Uni/pollen-detection",
    dataset_name="PollenDataSmall",
    object_size=64,
    augmentation=True,
    counterclass=True,
)

Create directories..
Directories created.
Process Data...
Data created.
