In [2]:
import pdb
import os
from pathlib import Path
import cv2
import json
import numpy as np
import matplotlib.pyplot as plt
import datetime
import random
import math

In [7]:
class DataPreparation:
    """Prepare unprocessed data for training.

    Creates the output directories, crops square sections around the labelled
    points of every raw image, crops equally many sections around random
    points as counter-class ("Fake") samples, and writes ``annotation.json``
    describing the labelled crops.

    Directory Structure:

    basedirectory/
     ├───raw_data_directory/
     │     └───<folder>/
     │           ├───ann/   (one ``<image file name>.json`` per image)
     │           └───img/   (the raw images)
     │
     ├──annotation.json
     │
     └───Data/
           ├───ForagerData/
           │     ├───Fake/
           │     └───Real/
           └───PollenData/
                 ├───Fake/
                 └───Real/
    """

    def __init__(self, pollen_size=50, forager_size=300, raw_img_size=(3000, 4000), source='raw_data/BeesBook/'):
        """Set the major values and, on first use, create directories and data.

        Args:
            pollen_size: Edge length (pixels) of the square pollen crops.
            forager_size: Edge length (pixels) of the square forager crops.
            raw_img_size: ``(height, width)`` of the raw images; used to keep
                crop windows inside the image.
            source: Path of the raw data directory, relative to the current
                working directory (an absolute path also works).

        Attributes set here:
            DATA_CREATED: False until ``create_data`` has finished once.
            BASE_DIR: The working directory at construction time, with a
                trailing slash.
            POLLEN_DIR, FORAGER_DIR, RND_POLLEN_DIR, RND_FORAGER_DIR: Output
                directories for labelled ("Real") and random ("Fake") crops.
            IMG_DIR, ANN_DIR: Names of the image / annotation sub-directories
                inside every raw data folder.

        If no ``Data`` directory exists yet, the output directories are
        created and ``create_data`` is run immediately; otherwise only a hint
        is printed.
        """
        self.DATA_CREATED = False
        self.IMG_POLLEN_SIZE = pollen_size
        self.IMG_FORAGER_SIZE = forager_size
        self.RAW_DATA_DIR = source
        # Stored as a fresh list so instances never share one mutable default.
        self.IMAGE_RAW_SIZE = list(raw_img_size)
        self.BASE_DIR = os.getcwd() + '/'
        self.POLLEN_DIR = 'Data/PollenData/Real/'
        self.FORAGER_DIR = 'Data/ForagerData/Real/'
        self.RND_POLLEN_DIR = 'Data/PollenData/Fake/'
        self.RND_FORAGER_DIR = 'Data/ForagerData/Fake/'
        self.IMG_DIR = 'img/'
        self.ANN_DIR = 'ann/'

        if os.path.isdir(os.path.join(self.BASE_DIR, 'Data')):
            print("Directory already exists.")
            print("If you want to create Data anyway use <Intance>.create_data()")
            return

        print("Creating directories..")
        try:
            self._make_output_dirs()
        except OSError as e:
            print(e)
            return
        print("Directories created.")
        print("Creating Data..")
        self.create_data()

    def _make_output_dirs(self):
        """Create the four output directories below BASE_DIR if missing."""
        for directory in (self.POLLEN_DIR, self.FORAGER_DIR,
                          self.RND_POLLEN_DIR, self.RND_FORAGER_DIR):
            os.makedirs(os.path.join(self.BASE_DIR, directory), exist_ok=True)

    @staticmethod
    def _utc_timestamp():
        """Return the current UTC time as a string without a UTC offset."""
        # Same text format as str(datetime.utcnow()), without the deprecated call.
        return str(datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None))

    @staticmethod
    def _clamp_origin(center, size, limit):
        """Return the start index of a `size`-wide window centred on `center`,
        shifted so that the window stays inside ``[0, limit]``."""
        origin = int(center) - size // 2
        return max(0, min(origin, limit - size))

    def _write_image(self, directory, filename, image):
        """Write `image` to BASE_DIR/directory/filename; report failures."""
        target = os.path.join(self.BASE_DIR, directory, filename)
        try:
            # cv2.imwrite signals most failures by returning False, not raising.
            if not cv2.imwrite(target, image):
                print("Could not write image: " + target)
        except Exception as e:
            # Deliberately best-effort: one bad crop must not abort the run.
            print(e)

    def eukl_dist(self, p1, p2):
        """Tell whether two points are too close to each other.

        Despite the name this returns a boolean, not the distance: True when
        the Euclidean distance between `p1` and `p2` is smaller than the
        larger of the two crop sizes.
        """
        d = math.hypot(p1[0] - p2[0], p1[1] - p2[1])
        return d < max(self.IMG_FORAGER_SIZE, self.IMG_POLLEN_SIZE)

    def create_rnd_coords(self, coords):
        """Create as many random coordinates as there are entries in `coords`.

        Every returned point is at least ``max(forager size, pollen size)``
        away (Euclidean distance between centres) from every labelled point.
        This is a centre-distance check, so crops around random and labelled
        points may still overlap partly along the diagonal.

        Points are returned in the same ``[x, y]`` order as the labelled
        points. Note that the loop keeps drawing until a valid point is found.
        """
        rnd_coords = []

        for _ in coords:
            while True:
                # The sampling ranges are hard-coded; presumably tuned for
                # 3000x4000 raw frames with a region to avoid in the lower
                # part of the image -- TODO confirm.
                rnd_x = random.randint(180, 2635)
                if rnd_x > 2360:
                    if random.randint(0, 1):
                        rnd_y = random.randint(90, 600)
                    else:
                        rnd_y = random.randint(3500, 3950)
                else:
                    rnd_y = random.randint(230, 2675)
                # The local names are swapped relative to image axes: the
                # first entry indexes columns, the second rows (see crop_img).
                candidate = [rnd_y, rnd_x]
                if not any(self.eukl_dist(candidate, coord) for coord in coords):
                    rnd_coords.append(candidate)
                    break
        return rnd_coords

    def crop_img(self, img, coord, size, img_id, anno, anno_id=None, category=None, iscrowd=None):
        """Crop a `size` x `size` section of `img` centred on `coord`.

        Images are indexed ``[y, x]`` while points are given as ``[x, y]``.
        The crop window is shifted where necessary so that it lies completely
        inside the raw image (``IMAGE_RAW_SIZE``) and keeps its full size.

        Args:
            img: The raw image (indexable as ``img[rows, cols]``).
            coord: Centre point as ``[x, y]``.
            size: Edge length of the crop in pixels.
            img_id: Id of the new image; also determines its file name.
            anno: When true, annotation entries are created as well.
            anno_id: Id of the annotation (required when `anno` is true).
            category: Category id stored in the annotation.
            iscrowd: ``iscrowd`` value stored in the annotation.

        Returns:
            With `anno`: ``(crop, filename, image entry, annotation entry,
            next img_id, next anno_id)``; otherwise
            ``(crop, filename, next img_id)``.
        """
        filename = str(img_id).zfill(10) + '.png'
        top = self._clamp_origin(coord[1], size, self.IMAGE_RAW_SIZE[0])
        left = self._clamp_origin(coord[0], size, self.IMAGE_RAW_SIZE[1])
        cropped = img[top:top + size, left:left + size]

        if not anno:
            return cropped, filename, img_id + 1

        anno_img = {'id': img_id, 'width': size, 'height': size, 'file_name': filename,
                    'license': None, 'flickr_url': None, 'coco_url': None,
                    'date_captured': self._utc_timestamp()}
        anno_anno = {'id': anno_id, 'image_id': img_id, 'category_id': category,
                     'segmentation': [], 'area': size**2, 'bbox': [], 'iscrowd': iscrowd}
        return cropped, filename, anno_img, anno_anno, img_id + 1, anno_id + 1

    def create_cropped_imgs(self, filename, coords, img_counter, anno_counter, orig_dir):
        """Create annotated crops around all `coords` of one raw image.

        For every labelled point a forager-sized and a pollen-sized crop are
        written to the "Real" directories, and for each of them a crop of the
        same size around a random point is written (without annotation) to
        the matching "Fake" directory.

        Args:
            filename: Path of the raw image (read as grayscale).
            coords: Labelled points as ``[x, y]``.
            img_counter: Next free image id.
            anno_counter: Next free annotation id.
            orig_dir: Kept for backward compatibility only. Files are written
                via full paths, so the working directory is never changed and
                does not need to be restored.

        Returns:
            ``(image entries, annotation entries, next image id,
            next annotation id)``. An unreadable image yields empty lists and
            unchanged counters.
        """
        img = cv2.imread(filename, 0)
        if img is None:
            # cv2.imread does not raise on a missing or unreadable file.
            print("Could not read image: " + str(filename))
            return ([], [], img_counter, anno_counter)

        anno_images = []
        anno_annos = []
        rnd_coords = self.create_rnd_coords(coords)
        # (crop size, (real dir, fake dir), category id)
        datasets = [(self.IMG_FORAGER_SIZE, (self.FORAGER_DIR, self.RND_FORAGER_DIR), 2),
                    (self.IMG_POLLEN_SIZE, (self.POLLEN_DIR, self.RND_POLLEN_DIR), 1)]

        for coord, rnd_coord in zip(coords, rnd_coords):
            for size, (real_dir, fake_dir), category in datasets:

                # Real
                crop, crop_name, anno_image, anno_anno, img_counter, anno_counter = self.crop_img(
                    img, coord, size, img_counter, True, anno_counter, category, 0)
                anno_images.append(anno_image)
                anno_annos.append(anno_anno)
                self._write_image(real_dir, crop_name, crop)

                # Fake
                crop, crop_name, img_counter = self.crop_img(img, rnd_coord, size, img_counter, False)
                self._write_image(fake_dir, crop_name, crop)

        return (anno_images, anno_annos, img_counter, anno_counter)

    def create_data(self):
        """Create all crops and write ``annotation.json`` into BASE_DIR.

        Walks every folder of the raw data directory, reads each ``*.json``
        file in its annotation sub-directory, takes the first exterior point
        of every object as a labelled point and crops the image of the same
        name from the image sub-directory. Does nothing once DATA_CREATED is
        set.

        Raises:
            FileNotFoundError: If the raw data directory does not exist.
        """
        if self.DATA_CREATED:
            return

        today = self._utc_timestamp()
        # NOTE(review): the COCO format names the category list 'categories'
        # and stores 'licenses' as a list; 'oject' also looks like a typo.
        # Keys and values are left untouched here because consumers of
        # annotation.json are not visible from this file -- confirm and fix.
        annotation = {
            'info': {'year': 2020,'version': None,'description': 'Pollenforager Detection','contributor': 'Mara Kortenkamp, Tim Feige','url': 'https://github.com/marakortenkamp/pollen-detection','date_created': today},
            'images': [],
            'annotations': [],
            'licenses': {'id': None,'name': None,'url': None,},
            'category': [{'supercategory': 'oject','id': 1,'name': 'Pollen'},{'supercategory': 'animal','id': 2,'name': 'Bee'}]
            }
        total_img_counter, total_anno_counter = 0, 9000000000

        raw_root = os.path.join(self.BASE_DIR, self.RAW_DATA_DIR)
        if not os.path.isdir(raw_root):
            raise FileNotFoundError(raw_root)
        # Writing into a missing directory would fail for every crop.
        self._make_output_dirs()

        # Sorted so that image ids are reproducible between runs.
        for folder in sorted(next(os.walk(raw_root))[1]):
            ann_dir = os.path.join(raw_root, folder, self.ANN_DIR)
            img_dir = os.path.join(raw_root, folder, self.IMG_DIR)
            if not os.path.isdir(ann_dir):
                print("No annotation directory found: " + ann_dir)
                continue

            for file in sorted(os.listdir(ann_dir)):
                if not file.endswith('.json'):
                    continue
                with open(os.path.join(ann_dir, file), encoding='utf-8') as json_file:
                    data = json.load(json_file)
                if not data['objects']:
                    continue

                pollen_coords = [obj['points']['exterior'][0] for obj in data['objects']]
                # The annotation file is named '<image file name>.json'.
                image_path = os.path.join(img_dir, Path(file).stem)
                anno_imgs, anno_annos, total_img_counter, total_anno_counter = self.create_cropped_imgs(
                    image_path, pollen_coords, total_img_counter, total_anno_counter, img_dir)
                annotation['images'].extend(anno_imgs)
                annotation['annotations'].extend(anno_annos)

        with open(os.path.join(self.BASE_DIR, 'annotation.json'), 'w') as fp:
            json.dump(annotation, fp)
        print("Data created.")
        self.DATA_CREATED = True

In [9]:
# Instantiating runs __init__ with its default arguments: when no 'Data'
# directory exists in the current working directory, the output directory
# tree is created and create_data() is called right away.
BeeData = DataPreparation()

Creating directories..
Directories created.
Creating Data..
Data created.
