In [58]:
import os
import shutil
from sklearn.model_selection import train_test_split
import random

random.seed(98052)

## split for train and test

In [59]:
# Source and destination locations (placeholders: set both before running)
data_dir = '<raw data directory>'
output_dir = '<augmented data directory>'

# One sub-folder per category is expected under data_dir
sub_folders = [
    'contourbufferstrips',
    'fieldborders',
    'filterstrips',
    'riparian',
    'terraces',
    'waterways',
    'wsb',
]

# Fraction of each category held out for testing (70% training, 30% testing)
RATIO = 0.3

os.makedirs(output_dir, exist_ok=True)

In [6]:
# Split every category folder into <category>_train and <category>_test,
# each holding a 'jpg' and a 'polygon' sub-folder.
for sub_folder in sub_folders:

    image_dir = os.path.join(data_dir, sub_folder)
    train_dir = image_dir + '_train'

    # Re-running this cell after a completed split would otherwise crash,
    # because the raw folder has already been renamed to <category>_train.
    if not os.path.isdir(image_dir) and os.path.isdir(train_dir):
        print("'" + sub_folder + "' has already been split; skipping")
        continue

    raw_jpg = os.path.join(image_dir, 'jpg')
    raw_poly = os.path.join(image_dir, 'polygon')

    # os.walk yields names in arbitrary, filesystem-dependent order. Sorting makes
    # the fixed random_state pick the same test set on every machine.
    file_names = sorted(next(os.walk(raw_jpg))[2])
    train, test = train_test_split(file_names, test_size=RATIO, random_state=98052)

    # List the polygons once, before anything is moved, so that a missing
    # polygon folder fails early instead of after the images were relocated.
    poly_names = next(os.walk(raw_poly))[2]

    test_dir = os.path.join(data_dir, sub_folder + '_test')
    test_jpg = os.path.join(test_dir, 'jpg')
    test_poly = os.path.join(test_dir, 'polygon')
    os.makedirs(test_jpg, exist_ok=True)   # also creates test_dir
    os.makedirs(test_poly, exist_ok=True)

    # move the held-out 30% of the images to the test folder
    for f_test in test:
        shutil.move(os.path.join(raw_jpg, f_test), os.path.join(test_jpg, f_test))

    # move every polygon whose file name starts with the base name of a held-out image
    # (str.startswith accepts a tuple of prefixes)
    test_stems = tuple(os.path.splitext(f_test)[0] for f_test in test)
    for name in poly_names:
        if name.startswith(test_stems):
            shutil.move(os.path.join(raw_poly, name), os.path.join(test_poly, name))

    # sanity check: no file name may remain on both sides after the move
    if set(next(os.walk(raw_jpg))[2]) & set(next(os.walk(test_jpg))[2]):
        print("there is overlap between training jpg and test jpg")

    if set(next(os.walk(raw_poly))[2]) & set(next(os.walk(test_poly))[2]):
        print("there is overlap between training polygon and test polygon")

    # rename raw image folder to training image folder
    os.rename(image_dir, train_dir)

## merge for train and test

In [61]:
# Categories whose per-category train/test folders are merged into the combined dataset
subcategory_folders = [
    'waterways',
    'fieldborders',
    'terraces',
    'wsb',
]

In [62]:
# Layout of the merged dataset: <output_dir>/{train,test}/{jpg,polygon}
input_train = os.path.join(output_dir, 'train')
input_test = os.path.join(output_dir, 'test')

input_train_jpg = os.path.join(input_train, 'jpg')
input_train_poly = os.path.join(input_train, 'polygon')
input_test_jpg = os.path.join(input_test, 'jpg')
input_test_poly = os.path.join(input_test, 'polygon')

# Parents are listed before their children; existing folders are left untouched
for merged_dir in (
    input_train,
    input_test,
    input_train_jpg,
    input_train_poly,
    input_test_jpg,
    input_test_poly,
):
    os.makedirs(merged_dir, exist_ok=True)


In [63]:
# Copy each selected category's training images and polygons into the merged train folders

for folder in subcategory_folders:
    image_dir = os.path.join(data_dir, folder + '_train')

    # (source, destination) pairs: images are copied first, then polygons
    copy_plan = (
        (os.path.join(image_dir, 'jpg'), input_train_jpg),
        (os.path.join(image_dir, 'polygon'), input_train_poly),
    )

    for source_dir, target_dir in copy_plan:
        # only the files directly inside source_dir are copied
        for entry in next(os.walk(source_dir))[2]:
            shutil.copy(os.path.join(source_dir, entry), os.path.join(target_dir, entry))

In [64]:
# Copy each selected category's testing images and polygons into the merged test folders

for folder in subcategory_folders:
    image_dir = os.path.join(data_dir, folder + '_test')

    # (source, destination) pairs: images are copied first, then polygons
    copy_plan = (
        (os.path.join(image_dir, 'jpg'), input_test_jpg),
        (os.path.join(image_dir, 'polygon'), input_test_poly),
    )

    for source_dir, target_dir in copy_plan:
        # only the files directly inside source_dir are copied
        for entry in next(os.walk(source_dir))[2]:
            shutil.copy(os.path.join(source_dir, entry), os.path.join(target_dir, entry))

In [65]:
# Sanity check on the merged dataset: no file name may appear in both the
# training and the testing folders. Each message names the folder pair that
# was actually compared.
if set(next(os.walk(input_train_jpg))[2]) & set(next(os.walk(input_test_jpg))[2]):
    print("there is overlap between training jpg and test jpg")

if set(next(os.walk(input_train_poly))[2]) & set(next(os.walk(input_test_poly))[2]):
    print("there is overlap between training polygon and test polygon")

## data aug

In [66]:
import os
import cv2
from PIL import Image
import numpy as np
import imgaug as ia
import matplotlib.pyplot as plt
from imgaug import augmenters as iaa
from matplotlib.pyplot import imshow
import matplotlib.gridspec as gridspec
%matplotlib inline 

#### define augmentation directories

In [67]:
# Merged training set that the augmentation step reads from
image_location = os.path.join(output_dir, 'train/jpg')
poly_location = os.path.join(output_dir, 'train/polygon')

# The augmentation step builds one mask channel per category, in this order
category_names = [
    'contourbufferstrips',
    'fieldborders',
    'filterstrips',
    'riparian',
    'terraces',
    'waterways',
    'wsb',
]

# Root folder that receives the augmented output
aug_dir = os.path.join(output_dir, 'train_aug')
os.makedirs(aug_dir, exist_ok=True)

#### helper functions

In [68]:
def load_image(image):
    """Read an image file into a uint8 NumPy array.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        numpy.ndarray of dtype uint8 with the pixel data in the image's own
        mode; no colour conversion is applied.
    """
    # The context manager releases the underlying file handle as soon as the
    # pixels have been copied into the array, instead of waiting for garbage collection.
    with Image.open(image) as img:
        return np.asarray(img, dtype='uint8')

In [69]:
def load_mask(poly):
    """Read a mask image as a single-channel (8-bit greyscale) uint8 NumPy array.

    Args:
        poly: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        2-D numpy.ndarray of dtype uint8 obtained after converting the image
        to PIL mode ``'L'``.
    """
    # The context manager releases the underlying file handle as soon as the
    # converted pixels have been copied into the array.
    with Image.open(poly) as img:
        return np.asarray(img.convert('L'), dtype='uint8')

In [70]:
def visualize_imgs(dim_x, dim_y, images):
    """Draw images into a gap-free grid on the current matplotlib figure.

    Args:
        dim_x: Number of grid rows.
        dim_y: Number of grid columns.
        images: Indexable collection of images accepted by ``Axes.imshow``;
            image ``k`` is drawn into grid cell ``k``.
    """
    # The grid fills the whole figure with no spacing between cells
    layout = gridspec.GridSpec(
        dim_x, dim_y,
        top=1., bottom=0., right=1., left=0.,
        hspace=0., wspace=0.,
    )

    for index in range(len(images)):
        axes = plt.subplot(layout[index])
        axes.imshow(images[index])
        # hide the tick marks on both axes
        axes.set_xticks([])
        axes.set_yticks([])

In [71]:
def seq_functions(aug_method):
    """Return the imgaug pipeline registered under ``aug_method``.

    Args:
        aug_method: One of ``'fliplr'``, ``'flipud'``, ``'flip'``,
            ``'rotate45'``, ``'rotate90'`` or ``'rotate135'``.

    Returns:
        The ``iaa.Sequential`` pipeline for the requested method.

    Raises:
        KeyError: If ``aug_method`` is not one of the names above.
    """

    def _rotation(angle):
        # single-step pipeline applying an affine rotation by `angle`
        return iaa.Sequential([iaa.Affine(rotate=angle)], random_order=False)

    # Every pipeline is built on each call; the flips use probability 1.0
    pipelines = {
        'fliplr': iaa.Sequential([iaa.Fliplr(1.0)], random_order=False),     # horizontal flip
        'flipud': iaa.Sequential([iaa.Flipud(1.0)], random_order=False),     # vertical flip
        'flip': iaa.Sequential(
            [iaa.Fliplr(1.0), iaa.Flipud(1.0)],                              # horizontal, then vertical
            random_order=False,
        ),
        'rotate45': _rotation(45),
        'rotate90': _rotation(90),
        'rotate135': _rotation(135),
    }

    return pipelines[aug_method]

In [72]:
def aug_mask_image(_mask, image, num_aug, operation_name):
    """Apply one augmentation to an image and to each of its seven mask channels.

    Args:
        _mask: Array indexed as ``_mask[:, :, k]`` for ``k`` in 0..6; its first
            two dimensions give the mask height and width.
        image: Image array assignable to a ``(height, width, 3)`` uint8 slot.
        num_aug: Number of copies of the image and of each mask channel to augment.
        operation_name: Pipeline name passed to ``seq_functions``.

    Returns:
        Tuple of eight items: the augmented image stack followed by the
        augmented stacks of mask channels 0 to 6, in channel order.
    """
    channel_count = 7  # the return arity is fixed at 1 + 7
    mask_height, mask_width = _mask.shape[0], _mask.shape[1]

    # np.zeros defaults to float64 for the mask stacks; the image stack is uint8
    mask_stacks = [np.zeros((num_aug, mask_height, mask_width)) for _ in range(channel_count)]
    image_stack = np.zeros((num_aug, image.shape[0], image.shape[1], 3), 'uint8')

    for copy_index in range(num_aug):
        for channel, stack in enumerate(mask_stacks):
            stack[copy_index] = _mask[:, :, channel]
        image_stack[copy_index] = image

    # Freeze the pipeline so the image and every mask channel receive the same transform
    augmenter = seq_functions(operation_name).to_deterministic()

    augmented_images = augmenter.augment_images(image_stack)
    augmented_masks = [augmenter.augment_images(stack) for stack in mask_stacks]

    return tuple([augmented_images] + augmented_masks)

#### data aug operations

In [73]:
# For every operation, augment each merged training image together with its
# per-category masks and write the results to <aug_dir>/<operation>/{jpg,polygon}.
operations = ['fliplr', 'flipud', 'flip', 'rotate90']

# Number of augmented copies produced per image and operation
num_aug = 1

for operation_name in operations:

    sub_operation = os.path.join(aug_dir, operation_name)
    output_jpg_path = os.path.join(sub_operation, 'jpg')
    output_poly_path = os.path.join(sub_operation, 'polygon')
    # The output folders do not depend on the image, so create them once per operation
    os.makedirs(output_jpg_path, exist_ok=True)   # also creates sub_operation
    os.makedirs(output_poly_path, exist_ok=True)

    for image_name in next(os.walk(image_location))[2]:
        image = os.path.join(image_location, image_name)
        _img = load_image(image)

        fn_img, ext = os.path.splitext(os.path.basename(image_name))

        # One mask channel per category. The mask of the image's own category
        # (first '_'-separated token of the name) is stored under the image's
        # file name; the others under '<image stem>_<category>.jpg'.
        # NOTE: the mask buffer is fixed at 256x256.
        mask = np.zeros([256, 256, len(category_names)])
        mask_names = []
        for i, category in enumerate(category_names):
            if fn_img.split('_')[0] == category:
                poly = os.path.join(poly_location, image_name)
            else:
                poly = os.path.join(poly_location, fn_img + '_' + category + '.jpg')
            mask[:, :, i] = load_mask(poly)
            mask_names.append(poly)

        # first item: augmented images; remaining items: one stack per mask channel
        images_aug, *masks = aug_mask_image(mask, _img, num_aug, operation_name)

        # Image names must consist of exactly four '_'-separated parts;
        # the operation tag is inserted before the last one.
        fn1, fn2, fn3, ex = image_name.split('_')
        for i in range(num_aug):
            file_name = fn1 + '_' + fn2 + '_' + fn3 + '_' + operation_name + str(i) + '_' + ex
            output_name = os.path.join(output_jpg_path, file_name)
            # load_image returns RGB (PIL) while cv2.imwrite expects BGR; without
            # the conversion the saved files have red and blue swapped.
            cv2.imwrite(output_name, cv2.cvtColor(images_aug[i], cv2.COLOR_RGB2BGR))

        for mask_path, mask_array in zip(mask_names, masks):
            # os.path.basename instead of split('/') so the separator is not hard-coded
            mask_file = os.path.basename(mask_path)
            fn_img, ext = os.path.splitext(mask_file)

            for j in range(num_aug):
                if fn_img.split('_')[-1] == "merged":
                    # mask stored under the image's own name: four parts
                    fn1, fn2, fn3, ex = mask_file.split('_')
                    file_name = fn1 + '_' + fn2 + '_' + fn3 + '_' + operation_name + str(j) + '_' + ex
                else:
                    # mask of another category: five parts, category suffix last
                    fn1, fn2, fn3, fn4, ex = mask_file.split('_')
                    file_name = fn1 + '_' + fn2 + '_' + fn3 + '_' + operation_name + str(j) + '_' + fn4 + '_' + ex
                output_name = os.path.join(output_poly_path, file_name)

                cv2.imwrite(output_name, mask_array[j])

#### merge augmentation

In [74]:
# Gather the per-operation outputs into the shared <aug_dir>/jpg and <aug_dir>/polygon folders

for folder_dir in operations:

    image_dir = os.path.join(aug_dir, folder_dir)

    output_jpg = os.path.join(aug_dir, 'jpg')
    output_poly = os.path.join(aug_dir, 'polygon')
    os.makedirs(output_jpg, exist_ok=True)
    os.makedirs(output_poly, exist_ok=True)

    # (source, destination) pairs: images are copied first, then polygons
    copy_plan = (
        (os.path.join(image_dir, 'jpg'), output_jpg),
        (os.path.join(image_dir, 'polygon'), output_poly),
    )

    for source_dir, target_dir in copy_plan:
        # only the files directly inside source_dir are copied; the target is a folder,
        # so each file keeps its name
        for entry in next(os.walk(source_dir))[2]:
            shutil.copy(os.path.join(source_dir, entry), target_dir)