## new training data
1. make half-sized, 299-aligned images
2. split data
3. augment train

## training data addon
1. generate raw addon data: from original cell images (various sizes), to half-sized, 299-aligned images
2. split raw addon data to train/valid
3. augment train
4. add new addon data to the training data pool; change image file names before merging, if necessary

### preprocess images

In [25]:
import os
import numpy as np
import cv2
from PIL import Image
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed

In [26]:
# Default level lists for hls_trans_smart: HLS_L is compared against the mean
# of HLS channel 1 (lightness), HLS_S against the mean of channel 2 (saturation).
# NOTE(review): the lookup in hls_trans_smart scans from the end of each list,
# so the values are presumably meant to be in ascending order - confirm.
HLS_L = [0.5]
HLS_S = [0.4, 0.5]

In [27]:
def scan_files(directory, prefix=None, postfix=None):
    """Recursively collect the paths of files found under `directory`.

    Filtering rules:
      * a truthy `postfix` keeps only names ending with it (`prefix` is then
        ignored);
      * otherwise a truthy `prefix` keeps only names starting with it;
      * with neither, every file is returned.

    Returns a list of paths in `os.walk` order.
    """
    if postfix:
        def wanted(name):
            return name.endswith(postfix)
    elif prefix:
        def wanted(name):
            return name.startswith(prefix)
    else:
        def wanted(name):
            return True

    return [
        os.path.join(folder, name)
        for folder, _sub_dirs, names in os.walk(directory)
        for name in names
        if wanted(name)
    ]

def _target_level(levels, mean):
    """Return the level that `mean` should be raised to, or None.

    Scans `levels` from the end and stops at the first entry that is not
    greater than `mean`; the result is the last entry seen before stopping.
    For an ascending list this is the smallest level above `mean`.
    """
    target = None
    for level in reversed(levels):
        if not level > mean:
            break
        target = level
    return target


def _raise_channel_mean(hls, channel, levels):
    """Scale `hls[:, :, channel]` in place so its mean reaches the target level."""
    mean = np.average(hls[:, :, channel])
    target = _target_level(levels, mean)
    # A zero mean (all-black image for lightness, pure grey for saturation)
    # cannot be rescaled: target / 0 would fill the channel with NaN/inf.
    if target is None or not mean > 0:
        return
    # Scale, then clamp to the valid [0, 1] range of a float HLS channel.
    hls[:, :, channel] = np.minimum(target / mean * hls[:, :, channel], 1)


def hls_trans_smart(image, HLS_L=HLS_L, HLS_S=HLS_S):
    """Raise the mean lightness and saturation of a BGR image to preset levels.

    The image is converted to float HLS. If the mean lightness is below an
    entry of `HLS_L`, the lightness channel is scaled so that its mean reaches
    that entry (values are clamped to 1). The same is then done for the
    saturation channel with `HLS_S`. Channels whose mean is already at or
    above the highest level, or whose mean is zero, are left untouched.

    Args:
        image: 8-bit BGR image array (as returned by `cv2.imread`).
        HLS_L: lightness levels; only read, never modified.
        HLS_S: saturation levels; only read, never modified.

    Returns:
        A new `uint8` BGR image array.
    """
    # Normalise to [0, 1] floats, then convert BGR -> HLS.
    hlsImg = cv2.cvtColor(image.astype(np.float32) / 255.0, cv2.COLOR_BGR2HLS)

    # 1. adjust lightness (channel 1); 2. adjust saturation (channel 2)
    _raise_channel_mean(hlsImg, 1, HLS_L)
    _raise_channel_mean(hlsImg, 2, HLS_S)

    # HLS -> BGR, then back to 8-bit unsigned values.
    bgrImg = cv2.cvtColor(hlsImg, cv2.COLOR_HLS2BGR)
    return (bgrImg * 255).astype(np.uint8)

# half the image size
def half_image(image_name, save_dir, depth):
    """Save a half-sized copy of `image_name` under `save_dir`.

    The output path keeps the last `depth + 1` components of `image_name`
    (file name plus `depth` parent directories) below `save_dir`. The
    destination directory is created when missing, matching
    `half_and_pad_image`.

    Args:
        image_name: path of the source image.
        save_dir: root directory for the resized copy.
        depth: number of parent directories to reproduce under `save_dir`.
    """
    tokens = image_name.rsplit(os.sep, depth+1)
    image_name_ = os.path.join(save_dir, *tokens[1:])
    target_dir = os.path.dirname(image_name_)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with Image.open(image_name) as image:
        w, h = image.size
        # Keep each side at least 1 px so a 1-px-wide/high input stays valid.
        image.resize((max(1, w//2), max(1, h//2))).save(image_name_)

def _centered_slices(src_len, dst_len):
    """Return `(src_slice, dst_slice)` centring a source axis on a target axis.

    A shorter source is placed in the middle of the target (padding); a source
    that is equal or longer is centre-cropped to the target length.
    """
    if src_len < dst_len:
        start = (dst_len - src_len) // 2
        return slice(None), slice(start, start + src_len)
    start = (src_len - dst_len) // 2
    return slice(start, start + dst_len), slice(None)


# half the image size and pad/crop to size 299
def half_and_pad_image(image_name, save_dir, depth, size, hls):
    """Downsample an image by one pyramid level and fit it to `size` x `size`.

    The image is read with OpenCV, reduced with `cv2.pyrDown`, optionally
    passed through `hls_trans_smart`, then centred on a black square canvas:
    sides shorter than `size` are padded, longer sides are centre-cropped.
    The result is written under `save_dir`, keeping the last `depth + 1`
    components of `image_name`.

    Args:
        image_name: path of the source image.
        save_dir: root directory for the processed copy.
        depth: number of parent directories to reproduce under `save_dir`.
        size: side length in pixels of the square output.
        hls: when truthy, apply `hls_trans_smart` after downsampling.

    Raises:
        IOError: if the source cannot be read or the result cannot be written.
    """
    tokens = image_name.rsplit(os.sep, depth+1)
    image_name_ = os.path.join(save_dir, *tokens[1:])
    os.makedirs(os.path.dirname(image_name_), exist_ok=True)

    image = cv2.imread(image_name)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise IOError("cannot read image: {}".format(image_name))

    # half-size image
    image = cv2.pyrDown(image)

    # change l and s of image
    if hls:
        image = hls_trans_smart(image)

    h, w, _ = image.shape
    # Black canvas with the image's own dtype, so no float64 buffer is built
    # only to be converted back when the file is written.
    new_image = np.zeros((size, size, 3), dtype=image.dtype)
    src_rows, dst_rows = _centered_slices(h, size)
    src_cols, dst_cols = _centered_slices(w, size)
    new_image[dst_rows, dst_cols, :] = image[src_rows, src_cols, :]

    if not cv2.imwrite(image_name_, new_image):
        raise IOError("cannot write image: {}".format(image_name_))
        

def batch_half_image(image_names, save_dir, depth, size, hls):
    """Apply `half_and_pad_image` to every path in `image_names`, in order.

    `save_dir`, `depth`, `size` and `hls` are forwarded unchanged for each
    image. This is the unit of work submitted to a worker by `process`.
    """
    for source_path in image_names:
        half_and_pad_image(source_path, save_dir, depth, size, hls)

In [28]:
def process(cells_dir, cells_dir_half, depth=1, size=299, hls=True):
    """Preprocess every ".bmp" under `cells_dir` in parallel worker processes.

    The file list is cut into batches of 1000 paths; each batch is handed to
    `batch_half_image` in a process pool sized to the CPU count. Progress is
    printed as each batch finishes. A batch that raised is reported instead
    of being silently ignored (the exception aborts the rest of that batch,
    since `batch_half_image` handles its images one after another).

    Args:
        cells_dir: root directory scanned recursively for ".bmp" files.
        cells_dir_half: root directory receiving the processed images.
        depth: number of parent directories reproduced under `cells_dir_half`.
        size: side length of the square output images.
        hls: whether to apply the lightness/saturation adjustment.
    """
    image_names = scan_files(cells_dir, postfix=".bmp")
    print("# images", len(image_names))

    batch_size = 1000
    # The context manager shuts the pool down even if something below raises.
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        tasks = [
            executor.submit(batch_half_image, image_names[i : i+batch_size],
                            cells_dir_half, depth, size, hls)
            for i in range(0, len(image_names), batch_size)
        ]

        job_count = len(tasks)
        for future in as_completed(tasks):
            job_count -= 1
            error = future.exception()
            if error is not None:
                print("One Job Failed (%r), Remaining Job Count: %s" % (error, job_count))
            else:
                print("One Job Done, Remaining Job Count: %s" % (job_count))

In [29]:
# Source tree of cell images and the output tree for the processed copies.
cells_dir = "/home/cnn/Documents/batch6.1_bmp/cells"
cells_dir_half299 = "/home/cnn/Documents/batch6.1_bmp/cells_hls05"
# Pre-create one output sub-directory per top-level entry of the source tree.
for sub_dir in os.listdir(cells_dir):
    os.makedirs(os.path.join(cells_dir_half299, sub_dir), exist_ok=True)

# Runs with the defaults depth=1, size=299, hls=True.
process(cells_dir, cells_dir_half299)

# images 126879
One Job Done, Remaining Job Count: 126
One Job Done, Remaining Job Count: 125
One Job Done, Remaining Job Count: 124
One Job Done, Remaining Job Count: 123
One Job Done, Remaining Job Count: 122
One Job Done, Remaining Job Count: 121
One Job Done, Remaining Job Count: 120
One Job Done, Remaining Job Count: 119
One Job Done, Remaining Job Count: 118
One Job Done, Remaining Job Count: 117
One Job Done, Remaining Job Count: 116
One Job Done, Remaining Job Count: 115
One Job Done, Remaining Job Count: 114
One Job Done, Remaining Job Count: 113
One Job Done, Remaining Job Count: 112
One Job Done, Remaining Job Count: 111
One Job Done, Remaining Job Count: 110
One Job Done, Remaining Job Count: 109
One Job Done, Remaining Job Count: 108
One Job Done, Remaining Job Count: 107
One Job Done, Remaining Job Count: 106
One Job Done, Remaining Job Count: 105
One Job Done, Remaining Job Count: 104
One Job Done, Remaining Job Count: 103
One Job Done, Remaining Job Count: 102
One Job D

### split data to train/valid, randomly

In [8]:
import os
import random
import shutil

In [9]:
def split_train_and_valid(data_path, save_path, split=0.1):
    """Randomly split each class folder of `data_path` into train/valid sets.

    Every sub-directory of `data_path` is treated as one class. Its ".bmp"
    files are shuffled; the first `int(n * split)` go to
    `save_path/valid/<class>` and the rest to `save_path/train/<class>`.
    Files are moved, and each source class directory is then removed
    together with anything left inside it.

    Plain files directly inside `data_path` are ignored. When `data_path` and
    `save_path` are the same directory, existing "train" and "valid" folders
    are not treated as classes, so running the split a second time cannot
    delete the data produced by the first run.

    Args:
        data_path: directory containing one sub-directory per class.
        save_path: directory receiving the "train" and "valid" trees.
        split: fraction of each class moved to the validation set.
    """
    in_place = os.path.realpath(data_path) == os.path.realpath(save_path)

    subdirs = []
    for name in os.listdir(data_path):
        if not os.path.isdir(os.path.join(data_path, name)):
            continue
        # Output folders of an earlier in-place run are not class folders.
        if in_place and name in ("train", "valid"):
            continue
        subdirs.append(name)

    for datadir in ("train", "valid"):
        for subdir in subdirs:
            os.makedirs(os.path.join(save_path, datadir, subdir), exist_ok=True)

    for subdir in subdirs:
        sub_path = os.path.join(data_path, subdir)
        sub_files = [os.path.join(sub_path, f) for f in os.listdir(sub_path) if f.endswith(".bmp")]
        # One shuffle already yields a uniformly random order.
        random.shuffle(sub_files)

        valid_count = int(len(sub_files) * split)
        sub_valid_files = sub_files[:valid_count]
        sub_train_files = sub_files[valid_count:]

        sub_train_path = os.path.join(save_path, "train", subdir)
        for file in sub_train_files:
            shutil.move(file, sub_train_path)

        sub_valid_path = os.path.join(save_path, "valid", subdir)
        for file in sub_valid_files:
            shutil.move(file, sub_valid_path)

        print("{}: split # {} files to train, # {} files to valid".format(subdir, len(sub_train_files), len(sub_valid_files)))

    # Remove the source class directories (now emptied of ".bmp" files).
    for subdir in subdirs:
        shutil.rmtree(os.path.join(data_path, subdir))

In [10]:
# Source and destination are the same directory, so the split happens in
# place: class folders are replaced by train/<class> and valid/<class>.
data_path = "/home/cnn/Documents/batch6.1_bmp/cells_black"
save_path = "/home/cnn/Documents/batch6.1_bmp/cells_black"

# Uses the default split=0.1.
split_train_and_valid(data_path, save_path)

AGC_A: split # 2373 files to train, # 263 files to valid
ACTINO: split # 5411 files to train, # 601 files to valid
MC: split # 7263 files to train, # 807 files to valid
VIRUS: split # 2566 files to train, # 285 files to valid
RC: split # 371 files to train, # 41 files to valid
AGC_B: split # 1178 files to train, # 130 files to valid
EC: split # 569 files to train, # 63 files to valid
LSIL_F: split # 1471 files to train, # 163 files to valid
SC: split # 5493 files to train, # 610 files to valid
HSIL_M: split # 4908 files to train, # 545 files to valid
HSIL_S: split # 17292 files to train, # 1921 files to valid
HSIL_B: split # 3090 files to train, # 343 files to valid
CC: split # 12440 files to train, # 1382 files to valid
ASCUS: split # 5004 files to train, # 556 files to valid
TRI: split # 25273 files to train, # 2808 files to valid
GEC: split # 4821 files to train, # 535 files to valid
FUNGI: split # 4372 files to train, # 485 files to valid
LSIL_E: split # 3038 files to train, # 337 

### split data to train/valid, based on given split

In [30]:
import os
import random
import shutil

In [31]:
def map_name(file_dir):
    """Index the ".bmp" files under `file_dir` by base name.

    Returns a dict mapping each file's name without extension to its full
    path. When several files share a base name, the one found last wins.
    """
    name_map = {}
    for path in scan_files(file_dir, postfix=".bmp"):
        stem, _extension = os.path.splitext(os.path.basename(path))
        name_map[stem] = path
    return name_map

def create_directory(save_path, subdirs):
    """Make sure `save_path/train/<s>` and `save_path/valid/<s>` exist.

    One directory is created per entry `s` of `subdirs` in each of the two
    splits; directories that already exist are left as they are.
    """
    for split_name in ("train", "valid"):
        for class_name in subdirs:
            class_dir = os.path.join(save_path, split_name, class_name)
            os.makedirs(class_dir, exist_ok=True)

def remove_directory(data_path, subdirs):
    """Recursively delete `data_path/<s>` for every entry `s` of `subdirs`."""
    doomed_paths = (os.path.join(data_path, name) for name in subdirs)
    for doomed in doomed_paths:
        shutil.rmtree(doomed)

def get_inter_tokens(file_dir, file_path):
    """List the directory names lying between `file_dir` and `file_path`.

    Both paths are made absolute and split on `os.sep`; the components of the
    file's parent directory that come after the first
    `len(components of file_dir)` entries are returned. A file sitting
    directly in `file_dir` yields an empty list.
    """
    base_depth = len(os.path.abspath(file_dir).split(os.sep))
    parent_dir = os.path.abspath(os.path.dirname(file_path))
    return parent_dir.split(os.sep)[base_depth:]

def arrange_by_template(temp_dir, file_dir):
    """Rearrange the ".bmp" files of `file_dir` to mirror `temp_dir`'s layout.

    For every ".bmp" under `file_dir`, the file with the same base name is
    looked up under `temp_dir`; the file is moved to the same relative
    sub-directory below `file_dir`. Files without a counterpart are reported
    and left where they are. Afterwards the sub-directories that `file_dir`
    had at the start are deleted, which also removes the reported leftovers.

    A starting sub-directory is kept when arranged files were placed inside
    it, so the cleanup never deletes freshly arranged data, and a file that
    already sits in its target directory is not moved again. Plain files
    directly inside `file_dir` are not part of the cleanup.

    Args:
        temp_dir: root of the template tree that defines the layout.
        file_dir: root of the tree that is rearranged in place.
    """
    subdirs = [
        name for name in os.listdir(file_dir)
        if os.path.isdir(os.path.join(file_dir, name))
    ]

    temp_name_map = map_name(temp_dir)
    file_name_map = map_name(file_dir)

    # Top-level directory names under file_dir that now hold arranged files.
    used_roots = set()

    for basename, source in file_name_map.items():
        if basename not in temp_name_map:
            print(basename + " not found in " + temp_dir)
            continue
        tokens = get_inter_tokens(temp_dir, temp_name_map[basename])
        target_dir = os.path.join(file_dir, *tokens)
        os.makedirs(target_dir, exist_ok=True)
        if tokens:
            used_roots.add(tokens[0])
        # shutil.move raises when the file is already inside target_dir.
        if os.path.abspath(os.path.dirname(source)) == os.path.abspath(target_dir):
            continue
        shutil.move(source, target_dir)

    remove_directory(file_dir, [name for name in subdirs if name not in used_roots])

In [32]:
# template_dir supplies the directory layout; the files under source_dir are
# rearranged in place to match it.
template_dir = "/home/cnn/Documents/batch6.1_bmp/train1"
source_dir = "/home/cnn/Documents/batch6.1_bmp/cells_hls05"

arrange_by_template(template_dir, source_dir)

### data augmentation

In [33]:
import os
import cv2
import numpy as np
from PIL import Image

In [34]:
def rotate(image_name):
    """Write 90-, 180- and 270-degree rotated copies next to `image_name`.

    The copies are named `<name without extension>_r<angle>.bmp`. The source
    image is opened in a `with` block so its file handle is released even
    when one of the saves fails.

    NOTE: the outputs end in ".bmp" themselves, so scanning the same
    directory for ".bmp" files again will pick them up as inputs.
    """
    basename = os.path.splitext(image_name)[0]
    with Image.open(image_name) as image:
        for angle in (90, 180, 270):
            image.rotate(angle).save("{}_r{}.bmp".format(basename, angle))
    
def batch_rotate(image_names):
    """Call `rotate` on every path in `image_names`, one after another."""
    for source_path in image_names:
        rotate(source_path)
        
def process(cells_dir):
    """Create rotated copies of every ".bmp" under `cells_dir` in parallel.

    The file list is cut into batches of 1000 paths; each batch is handed to
    `batch_rotate` in a process pool sized to the CPU count. Progress is
    printed as each batch finishes. A batch that raised is reported instead
    of being silently ignored (the exception aborts the rest of that batch,
    since `batch_rotate` handles its images one after another).

    Args:
        cells_dir: root directory scanned recursively for ".bmp" files.
    """
    image_names = scan_files(cells_dir, postfix=".bmp")
    print("# images", len(image_names))

    batch_size = 1000
    # The context manager shuts the pool down even if something below raises.
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        tasks = [
            executor.submit(batch_rotate, image_names[i : i+batch_size])
            for i in range(0, len(image_names), batch_size)
        ]

        job_count = len(tasks)
        for future in as_completed(tasks):
            job_count -= 1
            error = future.exception()
            if error is not None:
                print("One Job Failed (%r), Remaining Job Count: %s" % (error, job_count))
            else:
                print("One Job Done, Remaining Job Count: %s" % (job_count))

In [35]:
# Only the "train" sub-tree is passed in for rotation.
cells_dir = "/home/cnn/Documents/batch6.1_bmp/cells_hls05/train"

# `process` here is the single-argument rotation version from this section.
process(cells_dir)

# images 114198
One Job Done, Remaining Job Count: 114
One Job Done, Remaining Job Count: 113
One Job Done, Remaining Job Count: 112
One Job Done, Remaining Job Count: 111
One Job Done, Remaining Job Count: 110
One Job Done, Remaining Job Count: 109
One Job Done, Remaining Job Count: 108
One Job Done, Remaining Job Count: 107
One Job Done, Remaining Job Count: 106
One Job Done, Remaining Job Count: 105
One Job Done, Remaining Job Count: 104
One Job Done, Remaining Job Count: 103
One Job Done, Remaining Job Count: 102
One Job Done, Remaining Job Count: 101
One Job Done, Remaining Job Count: 100
One Job Done, Remaining Job Count: 99
One Job Done, Remaining Job Count: 98
One Job Done, Remaining Job Count: 97
One Job Done, Remaining Job Count: 96
One Job Done, Remaining Job Count: 95
One Job Done, Remaining Job Count: 94
One Job Done, Remaining Job Count: 93
One Job Done, Remaining Job Count: 92
One Job Done, Remaining Job Count: 91
One Job Done, Remaining Job Count: 90
One Job Done, Remai

### copy files, change filename if a file already exists in target directory

In [36]:
import os
import shutil

In [37]:
def scan_files(directory, prefix=None, postfix=None):
    """Walk `directory` recursively and return the matching file paths.

    When `postfix` is truthy only names ending with it are kept and `prefix`
    is ignored. Otherwise, when `prefix` is truthy, only names starting with
    it are kept. With neither filter, all files are returned. The result
    follows `os.walk` order.
    """
    matches = []
    for folder, _sub_dirs, names in os.walk(directory):
        if postfix:
            kept = [name for name in names if name.endswith(postfix)]
        elif prefix:
            kept = [name for name in names if name.startswith(prefix)]
        else:
            kept = names
        matches.extend(os.path.join(folder, name) for name in kept)
    return matches

def get_inter_tokens(file_dir, file_path):
    """Return the directory components between `file_dir` and `file_path`.

    Both paths are converted to absolute form and split on `os.sep`. The
    result is the tail of the file's parent-directory components, starting
    after as many entries as `file_dir` has. It is empty when the file lies
    directly in `file_dir`.
    """
    root_parts = os.path.abspath(file_dir).split(os.sep)
    parent_parts = os.path.abspath(os.path.dirname(file_path)).split(os.sep)
    skip = len(root_parts)
    return parent_parts[skip:]

def copy_and_addon(src_folder, dst_folder, addon, postfix):
    """Copy files into `dst_folder`, inserting `addon` into each file name.

    Every file under `src_folder` whose name ends with `postfix` is copied to
    the same relative sub-directory of `dst_folder`. The new name is the old
    name without its extension, followed by `addon` and `postfix`. Missing
    destination sub-directories are created, so the copy does not abort
    midway when `dst_folder` lacks part of the source layout.

    Args:
        src_folder: root directory scanned recursively for source files.
        dst_folder: root directory receiving the renamed copies.
        addon: text inserted between the base name and `postfix`.
        postfix: file-name ending used both to select files and to name copies.
    """
    for src_file in scan_files(src_folder, postfix=postfix):
        tokens = get_inter_tokens(src_folder, src_file)
        target_dir = os.path.join(dst_folder, *tokens)
        os.makedirs(target_dir, exist_ok=True)
        basename = os.path.splitext(os.path.basename(src_file))[0] + addon + postfix
        shutil.copy(src_file, os.path.join(target_dir, basename))
    
def move_and_addon(src_folder, dst_folder, addon, postfix):
    """Move files into `dst_folder`, inserting `addon` into each file name.

    Every file under `src_folder` whose name ends with `postfix` is moved to
    the same relative sub-directory of `dst_folder`. The new name is the old
    name without its extension, followed by `addon` and `postfix`. Missing
    destination sub-directories are created, so the move does not stop
    halfway and leave the files split between the two trees.

    Args:
        src_folder: root directory scanned recursively for source files.
        dst_folder: root directory receiving the renamed files.
        addon: text inserted between the base name and `postfix`.
        postfix: file-name ending used both to select files and to name them.
    """
    for src_file in scan_files(src_folder, postfix=postfix):
        tokens = get_inter_tokens(src_folder, src_file)
        target_dir = os.path.join(dst_folder, *tokens)
        os.makedirs(target_dir, exist_ok=True)
        basename = os.path.splitext(os.path.basename(src_file))[0] + addon + postfix
        shutil.move(src_file, os.path.join(target_dir, basename))

In [38]:
# Files ending in `postfix` under src_folder are added to dst_folder; `addon`
# is inserted into each file name before the extension.
src_folder = "/home/cnn/Documents/batch6.1_bmp/cells_hls05"
dst_folder = "/home/cnn/Documents/batch6.1_bmp/train1"
addon = "_hls05"
postfix = ".bmp"


# Copy keeps the source tree intact; the commented-out call moves the files instead.
copy_and_addon(src_folder, dst_folder, addon, postfix)
# move_and_addon(src_folder, dst_folder, addon, postfix)