## new training data
1. make half-sized, 299-aligned images
2. split data
3. augment train

## training data addon
1. generate raw addon data: from original cell images (various sizes), to half-sized, 299-aligned images
2. split raw addon data to train/valid
3. augment train
4. add new addon data to training data pool, change image file name before merging, if necessary

### preprocess images

In [1]:
import os
import numpy as np
import cv2
from PIL import Image
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed

In [25]:
# Target levels used as the default arguments of hls_trans_smart().
# There, an image's mean lightness (L) / saturation (S) is measured on a
# 0..1 scale; when it lies below one of these levels the channel is scaled
# up so that its mean reaches the next level in the list.
# NOTE(review): the lookup in hls_trans_smart() scans from the end of each
# list, so the lists are presumably meant to be in ascending order - confirm.
HLS_L = [0.7]
HLS_S = [0.4, 0.5]

In [26]:
def scan_files(directory, prefix=None, postfix=None):
    """Recursively collect the paths of files found under ``directory``.

    Filtering is applied to the bare file name:

    * a truthy ``postfix`` keeps only names ending with it;
    * otherwise a truthy ``prefix`` keeps only names starting with it;
    * with neither, every file is returned.

    ``postfix`` takes precedence: when both are given, ``prefix`` is ignored.

    Returns a list of paths (each joined with the folder it was found in),
    in the order produced by ``os.walk``.
    """
    def wanted(name):
        if postfix:
            return name.endswith(postfix)
        if prefix:
            return name.startswith(prefix)
        return True

    return [
        os.path.join(folder, name)
        for folder, _dirs, names in os.walk(directory)
        for name in names
        if wanted(name)
    ]

def _lift_channel_mean(hls_img, channel, levels):
    """Scale one channel of ``hls_img`` in place so its mean reaches the next level.

    ``levels`` is scanned from the end for the last entry that does not exceed
    the channel mean; the entry right after it is the target. Nothing happens
    when the mean is already at or above the final level, when ``levels`` is
    empty, or when the mean is zero.
    """
    mean = np.average(hls_img[:, :, channel])

    idx = len(levels) - 1
    while idx != -1 and levels[idx] > mean:
        idx -= 1
    if idx == len(levels) - 1:
        return

    # An all-zero channel (e.g. a black or pure-gray image) cannot be scaled
    # towards a target mean; dividing by it would fill the channel with
    # NaN/inf and corrupt the image after conversion back to BGR.
    if not mean > 0:
        return

    scaled = levels[idx + 1] / mean * hls_img[:, :, channel]
    # Values are kept on the 0..1 scale used by the float HLS image.
    hls_img[:, :, channel] = np.minimum(scaled, 1)


def hls_trans_smart(image, HLS_L=HLS_L, HLS_S=HLS_S):
    """Raise the lightness and saturation of an image up to target levels.

    Args:
        image: array interpreted as BGR by ``cv2.cvtColor``; its values are
            divided by 255, so an 8-bit image is expected.
        HLS_L: target levels for the mean lightness (0..1 scale).
        HLS_S: target levels for the mean saturation (0..1 scale).

    Returns:
        A new ``uint8`` BGR array. A channel whose mean is already at or above
        the last level, or whose mean is zero, is left unchanged.
    """
    # Normalise to 0..1 floats and convert BGR -> HLS.
    hls_img = image.astype(np.float32) / 255.0
    hls_img = cv2.cvtColor(hls_img, cv2.COLOR_BGR2HLS)

    # 1. adjust lightness (channel 1), 2. adjust saturation (channel 2).
    _lift_channel_mean(hls_img, 1, HLS_L)
    _lift_channel_mean(hls_img, 2, HLS_S)

    # Back to BGR and to 8-bit; clip first so that floating point overshoot
    # cannot wrap around in the unsigned cast.
    bgr = cv2.cvtColor(hls_img, cv2.COLOR_HLS2BGR) * 255
    return np.clip(bgr, 0, 255).astype(np.uint8)

# half the image size
def half_image(image_name, save_dir, depth):
    """Save a copy of ``image_name`` resized to half its width and height.

    The output path is ``save_dir`` followed by the last ``depth`` parent
    folders of ``image_name`` and its file name.
    """
    relative_parts = image_name.rsplit(os.sep, depth + 1)[1:]
    target_path = os.path.join(save_dir, *relative_parts)
    # NOTE: the target folder is not created here; it has to exist already.
    with Image.open(image_name) as source:
        width, height = source.size
        halved = source.resize((width // 2, height // 2))
        halved.save(target_path)

# half the image size and pad/crop to size 299
def _centered_span(src_len, dst_len):
    """Return ``(src_start, dst_start, length)`` for centring one axis.

    A source shorter than the target is centred inside it (padding); a source
    at least as long as the target is centre-cropped to the target length.
    """
    if src_len < dst_len:
        return 0, (dst_len - src_len) // 2, src_len
    return (src_len - dst_len) // 2, 0, dst_len


def half_and_pad_image(image_name, save_dir, depth, size, hls):
    """Downsample an image by two, then pad/crop it to ``size`` x ``size``.

    Args:
        image_name: path of the image to read.
        save_dir: root of the output tree; the last ``depth`` parent folders
            of ``image_name`` and its file name are appended to it. Missing
            output folders are created.
        depth: number of parent folders to keep in the output path.
        size: edge length of the square output image.
        hls: when truthy, run ``hls_trans_smart`` on the downsampled image.

    Raises:
        IOError: if the image cannot be read.
    """
    tokens = image_name.rsplit(os.sep, depth + 1)
    image_name_ = os.path.join(save_dir, *tokens[1:])
    os.makedirs(os.path.dirname(image_name_), exist_ok=True)

    image = cv2.imread(image_name)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("cannot read image: %s" % image_name)

    # Halve the image size.
    image = cv2.pyrDown(image)

    # Change lightness and saturation of the image.
    if hls:
        image = hls_trans_smart(image)

    # Black 8-bit canvas; the image is centred on it, cropped on any axis
    # where it is larger than the canvas.
    new_image = np.zeros((size, size, 3), dtype=np.uint8)
    h, w, _ = image.shape
    src_y, dst_y, span_h = _centered_span(h, size)
    src_x, dst_x, span_w = _centered_span(w, size)
    new_image[dst_y:dst_y + span_h, dst_x:dst_x + span_w, :] = \
        image[src_y:src_y + span_h, src_x:src_x + span_w, :]

    cv2.imwrite(image_name_, new_image)
        

def batch_half_image(image_names, save_dir, depth, size, hls):
    """Run ``half_and_pad_image`` on every path in ``image_names``, in order.

    The remaining arguments are forwarded unchanged for each image.
    """
    for path in image_names:
        half_and_pad_image(path, save_dir, depth, size, hls)

In [27]:
def process(cells_dir, cells_dir_half, depth=1, size=299, hls=True, batch_size=1000):
    """Preprocess every ``.bmp`` under ``cells_dir`` in parallel.

    The files are split into batches of ``batch_size`` and each batch is
    handed to ``batch_half_image`` in a worker process.

    Args:
        cells_dir: folder scanned recursively for ``.bmp`` files.
        cells_dir_half: root folder for the processed images.
        depth, size, hls: forwarded to ``batch_half_image``.
        batch_size: number of images per submitted job.
    """
    image_names = scan_files(cells_dir, postfix=".bmp")
    print("# images", len(image_names))

    # The context manager shuts the pool down once all jobs have finished.
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        tasks = [
            executor.submit(
                batch_half_image,
                image_names[start:start + batch_size],
                cells_dir_half, depth, size, hls,
            )
            for start in range(0, len(image_names), batch_size)
        ]

        job_count = len(tasks)
        for future in as_completed(tasks):
            job_count -= 1
            # An exception raised inside a worker is only stored on the
            # future; report it so that a failed batch does not go unnoticed.
            error = future.exception()
            if error is not None:
                print("One Job Failed: %r" % (error,))
            print("One Job Done, Remaining Job Count: %s" % (job_count))

In [28]:
# Input folder with the cell images and output folder for the processed copies.
cells_dir = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells"
cells_dir_half299 = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells-half299-hls07"
# Pre-create one output sub-folder per entry found in the input folder.
for sub_dir in os.listdir(cells_dir):
    os.makedirs(os.path.join(cells_dir_half299, sub_dir), exist_ok=True)

# Runs with process()'s defaults: depth=1, size=299, hls=True.
process(cells_dir, cells_dir_half299)

# images 148300
One Job Done, Remaining Job Count: 148
One Job Done, Remaining Job Count: 147
One Job Done, Remaining Job Count: 146
One Job Done, Remaining Job Count: 145
One Job Done, Remaining Job Count: 144
One Job Done, Remaining Job Count: 143
One Job Done, Remaining Job Count: 142
One Job Done, Remaining Job Count: 141
One Job Done, Remaining Job Count: 140
One Job Done, Remaining Job Count: 139
One Job Done, Remaining Job Count: 138
One Job Done, Remaining Job Count: 137
One Job Done, Remaining Job Count: 136
One Job Done, Remaining Job Count: 135
One Job Done, Remaining Job Count: 134
One Job Done, Remaining Job Count: 133
One Job Done, Remaining Job Count: 132
One Job Done, Remaining Job Count: 131
One Job Done, Remaining Job Count: 130
One Job Done, Remaining Job Count: 129
One Job Done, Remaining Job Count: 128
One Job Done, Remaining Job Count: 127
One Job Done, Remaining Job Count: 126
One Job Done, Remaining Job Count: 125
One Job Done, Remaining Job Count: 124
One Job D

### split data to train/valid, randomly

In [6]:
import os
import random
import shutil

In [7]:
def split_train_and_valid(data_path, save_path, split=0.1):
    """Randomly split the ``.bmp`` files of each class folder into train/valid.

    Every sub-folder ``<data_path>/<class>`` is processed: its ``.bmp`` files
    are shuffled, ``int(n * split)`` of them are moved to
    ``<save_path>/valid/<class>`` and the rest to ``<save_path>/train/<class>``.
    The class folder is then removed, together with anything left in it.

    ``data_path`` and ``save_path`` may be the same folder. The output folders
    ``<save_path>/train`` and ``<save_path>/valid`` are never treated as class
    folders, so running the function again does not delete an earlier split.
    Entries of ``data_path`` that are not directories are ignored.

    Args:
        data_path: folder containing one sub-folder per class.
        save_path: folder that receives the ``train`` and ``valid`` trees.
        split: fraction of each class that goes to ``valid``.
    """
    output_roots = {
        os.path.abspath(os.path.join(save_path, datadir))
        for datadir in ("train", "valid")
    }
    subdirs = []
    for name in os.listdir(data_path):
        candidate = os.path.join(data_path, name)
        if os.path.isdir(candidate) and os.path.abspath(candidate) not in output_roots:
            subdirs.append(name)

    for subdir in subdirs:
        sub_path = os.path.join(data_path, subdir)
        sub_train_path = os.path.join(save_path, "train", subdir)
        sub_valid_path = os.path.join(save_path, "valid", subdir)
        os.makedirs(sub_train_path, exist_ok=True)
        os.makedirs(sub_valid_path, exist_ok=True)

        sub_files = [os.path.join(sub_path, f) for f in os.listdir(sub_path) if f.endswith(".bmp")]
        random.shuffle(sub_files)

        # The first n_valid shuffled files go to valid, the rest to train.
        n_valid = int(len(sub_files) * split)
        sub_valid_files = sub_files[:n_valid]
        sub_train_files = sub_files[n_valid:]

        for file in sub_train_files:
            shutil.move(file, sub_train_path)
        for file in sub_valid_files:
            shutil.move(file, sub_valid_path)

        print("{}: split # {} files to train, # {} files to valid".format(subdir, len(sub_train_files), len(sub_valid_files)))

        # The class folder has been emptied of .bmp files; drop it.
        shutil.rmtree(sub_path)

In [8]:
# Input and output are the same folder: train/ and valid/ are created next to
# the class folders, and split_train_and_valid() removes the class folders
# once their files have been moved.
data_path = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells-half299"
save_path = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells-half299"

# Uses the default split=0.1 for the valid fraction.
split_train_and_valid(data_path, save_path)

ASCUS: split # 5273 files to train, # 585 files to valid
VIRUS: split # 4255 files to train, # 472 files to valid
SCC_R: split # 4228 files to train, # 469 files to valid
LSIL_E: split # 2988 files to train, # 331 files to valid
CC: split # 11615 files to train, # 1290 files to valid
HSIL_M: split # 4831 files to train, # 536 files to valid
RC: split # 905 files to train, # 100 files to valid
HSIL_S: split # 26803 files to train, # 2978 files to valid
PH: split # 5239 files to train, # 582 files to valid
ACTINO: split # 8456 files to train, # 939 files to valid
GEC: split # 4822 files to train, # 535 files to valid
SC: split # 7133 files to train, # 792 files to valid
LSIL_F: split # 1240 files to train, # 137 files to valid
EC: split # 569 files to train, # 63 files to valid
MC: split # 5608 files to train, # 623 files to valid
AGC_B: split # 1178 files to train, # 130 files to valid
HSIL_B: split # 3000 files to train, # 333 files to valid
TRI: split # 25273 files to train, # 2808 fi

### split data to train/valid, based on given split

In [29]:
import os
import random
import shutil

In [30]:
def map_name(file_dir):
    """Index every ``.bmp`` found under ``file_dir`` by its base name.

    Returns a dict mapping the file name without extension to the file path.
    When several files share a base name, the one scanned last wins.
    """
    name_map = {}
    for path in scan_files(file_dir, postfix=".bmp"):
        stem, _ext = os.path.splitext(os.path.basename(path))
        name_map[stem] = path
    return name_map

def create_directory(save_path, subdirs):
    """Create ``<save_path>/train/<subdir>`` and ``<save_path>/valid/<subdir>``.

    Folders that already exist are left as they are.
    """
    for split_name in ["train", "valid"]:
        split_root = os.path.join(save_path, split_name)
        for subdir in subdirs:
            os.makedirs(os.path.join(split_root, subdir), exist_ok=True)

def remove_directory(data_path, subdirs):
    """Recursively delete each ``<data_path>/<subdir>`` named in ``subdirs``."""
    for name in subdirs:
        doomed = os.path.join(data_path, name)
        shutil.rmtree(doomed)

def get_inter_tokens(file_dir, file_path):
    """Return the folder names between ``file_dir`` and the file ``file_path``.

    Both paths are made absolute and split on ``os.sep``; the parent folder of
    ``file_path`` then loses as many leading components as ``file_dir`` has.
    It is not checked that ``file_path`` actually lies below ``file_dir``.
    """
    base_parts = os.path.abspath(file_dir).split(os.sep)
    parent = os.path.dirname(file_path)
    parent_parts = os.path.abspath(parent).split(os.sep)
    skip = len(base_parts)
    return parent_parts[skip:]

def arrange_by_template(temp_dir, file_dir):
    """Rearrange the ``.bmp`` files of ``file_dir`` to mirror ``temp_dir``.

    Files are matched by base name (file name without extension). Each file of
    ``file_dir`` whose base name also exists under ``temp_dir`` is moved to the
    same relative folder inside ``file_dir`` that its counterpart has inside
    ``temp_dir``. A file that is already in that folder is left alone.

    Files without a counterpart are reported and not moved. Afterwards the
    folders that were at the top level of ``file_dir`` before the call are
    deleted with all remaining content, except those that are themselves a
    destination of the rearrangement.

    Args:
        temp_dir: folder tree whose layout is used as the template.
        file_dir: folder tree that is rearranged in place.
    """
    subdirs = os.listdir(file_dir)

    temp_name_map = map_name(temp_dir)
    file_name_map = map_name(file_dir)

    # Top-level folder names of file_dir that files have been arranged into.
    destinations = set()
    for basename, source in file_name_map.items():
        if basename not in temp_name_map:
            print(basename + " not found in " + temp_dir)
            continue
        tokens = get_inter_tokens(temp_dir, temp_name_map[basename])
        target_dir = os.path.join(file_dir, *tokens)
        if tokens:
            destinations.add(tokens[0])
        # shutil.move refuses to move a file onto itself, so skip files that
        # already sit in the right folder.
        if os.path.abspath(os.path.dirname(source)) == os.path.abspath(target_dir):
            continue
        os.makedirs(target_dir, exist_ok=True)
        shutil.move(source, target_dir)

    # Remove the old layout, but never a folder that now holds arranged files,
    # and skip plain files, which rmtree cannot delete.
    stale = [
        subdir for subdir in subdirs
        if subdir not in destinations
        and os.path.isdir(os.path.join(file_dir, subdir))
    ]
    remove_directory(file_dir, stale)

In [31]:
# template_dir provides the folder layout; the files of source_dir are
# rearranged in place to match it.
template_dir = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells-half299"
source_dir = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells-half299-hls07"

arrange_by_template(template_dir, source_dir)

### data augmentation

In [32]:
import os
import cv2
import numpy as np
from PIL import Image

In [33]:
def rotate(image_name):
    """Write copies of an image rotated by 90, 180 and 270 degrees.

    The copies are saved next to the original as ``<name>_r90.bmp``,
    ``<name>_r180.bmp`` and ``<name>_r270.bmp``, where ``<name>`` is
    ``image_name`` without its extension.

    NOTE(review): ``Image.rotate`` is called with its default arguments;
    presumably the inputs are square, so that no content is cut off - confirm.
    """
    basename = os.path.splitext(image_name)[0]
    # The context manager closes the file even when a save fails.
    with Image.open(image_name) as image:
        for angle in (90, 180, 270):
            image.rotate(angle).save("%s_r%d.bmp" % (basename, angle))
    
def batch_rotate(image_names):
    """Run ``rotate`` on every path in ``image_names``, in order."""
    for path in image_names:
        rotate(path)
        
def process(cells_dir, batch_size=1000):
    """Create rotated copies of every ``.bmp`` under ``cells_dir`` in parallel.

    The files are split into batches of ``batch_size`` and each batch is
    handed to ``batch_rotate`` in a worker process.

    NOTE: every ``.bmp`` found is rotated, and the rotated copies are written
    as ``.bmp`` next to their source, so running this twice on the same folder
    also rotates the copies made by the first run.

    Args:
        cells_dir: folder scanned recursively for ``.bmp`` files.
        batch_size: number of images per submitted job.
    """
    image_names = scan_files(cells_dir, postfix=".bmp")
    print("# images", len(image_names))

    # The context manager shuts the pool down once all jobs have finished.
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        tasks = [
            executor.submit(batch_rotate, image_names[start:start + batch_size])
            for start in range(0, len(image_names), batch_size)
        ]

        job_count = len(tasks)
        for future in as_completed(tasks):
            job_count -= 1
            # An exception raised inside a worker is only stored on the
            # future; report it so that a failed batch does not go unnoticed.
            error = future.exception()
            if error is not None:
                print("One Job Failed: %r" % (error,))
            print("One Job Done, Remaining Job Count: %s" % (job_count))

In [34]:
# Only the train folder is passed to the rotation step.
cells_dir = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells-half299-hls07/train"

process(cells_dir)

# images 133481
One Job Done, Remaining Job Count: 133
One Job Done, Remaining Job Count: 132
One Job Done, Remaining Job Count: 131
One Job Done, Remaining Job Count: 130
One Job Done, Remaining Job Count: 129
One Job Done, Remaining Job Count: 128
One Job Done, Remaining Job Count: 127
One Job Done, Remaining Job Count: 126
One Job Done, Remaining Job Count: 125
One Job Done, Remaining Job Count: 124
One Job Done, Remaining Job Count: 123
One Job Done, Remaining Job Count: 122
One Job Done, Remaining Job Count: 121
One Job Done, Remaining Job Count: 120
One Job Done, Remaining Job Count: 119
One Job Done, Remaining Job Count: 118
One Job Done, Remaining Job Count: 117
One Job Done, Remaining Job Count: 116
One Job Done, Remaining Job Count: 115
One Job Done, Remaining Job Count: 114
One Job Done, Remaining Job Count: 113
One Job Done, Remaining Job Count: 112
One Job Done, Remaining Job Count: 111
One Job Done, Remaining Job Count: 110
One Job Done, Remaining Job Count: 109
One Job D

### copy files, change filename if a file already exists in target directory

In [35]:
import os
import shutil

In [36]:
def scan_files(directory, prefix=None, postfix=None):
    """Walk ``directory`` recursively and return the matching file paths.

    The filter looks at the bare file name. A truthy ``postfix`` selects names
    that end with it and makes ``prefix`` irrelevant; failing that, a truthy
    ``prefix`` selects names that start with it; with neither, all files match.

    Returns a list of paths in ``os.walk`` order.
    """
    def matches(file_name):
        if postfix:
            return file_name.endswith(postfix)
        if prefix:
            return file_name.startswith(prefix)
        return True

    found = []
    for folder, _sub_dirs, file_names in os.walk(directory):
        found.extend(
            os.path.join(folder, file_name)
            for file_name in file_names
            if matches(file_name)
        )
    return found

def get_inter_tokens(file_dir, file_path):
    """List the folder names that lie between ``file_dir`` and ``file_path``.

    Both paths are made absolute and split on ``os.sep``. The components of
    the folder containing ``file_path`` are returned, minus as many leading
    components as ``file_dir`` has. No check is made that ``file_path`` is
    really located below ``file_dir``.
    """
    root_depth = len(os.path.abspath(file_dir).split(os.sep))
    containing_folder = os.path.abspath(os.path.dirname(file_path))
    return containing_folder.split(os.sep)[root_depth:]

def copy_and_addon(src_folder, dst_folder, addon, postfix):
    """Copy files into ``dst_folder``, inserting ``addon`` into each file name.

    Every file under ``src_folder`` whose name ends with ``postfix`` is copied
    to the same relative folder inside ``dst_folder``. The new file name is the
    old one without its extension, followed by ``addon`` and ``postfix``.
    Destination folders that do not exist yet are created.

    Args:
        src_folder: folder scanned recursively for files to copy.
        dst_folder: root folder that receives the copies.
        addon: text appended to the extension-less file name.
        postfix: file name ending used both to select files and as the
            ending of the new file name.
    """
    for file in scan_files(src_folder, postfix=postfix):
        tokens = get_inter_tokens(src_folder, file)
        new_name = os.path.splitext(os.path.basename(file))[0] + addon + postfix
        target_dir = os.path.join(dst_folder, *tokens)
        # Without this, the copy fails when dst_folder lacks the sub-folder.
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy(file, os.path.join(target_dir, new_name))
    
def move_and_addon(src_folder, dst_folder, addon, postfix):
    """Move files into ``dst_folder``, inserting ``addon`` into each file name.

    Every file under ``src_folder`` whose name ends with ``postfix`` is moved
    to the same relative folder inside ``dst_folder``. The new file name is the
    old one without its extension, followed by ``addon`` and ``postfix``.
    Destination folders that do not exist yet are created. A progress line is
    printed every 10000 files.

    Args:
        src_folder: folder scanned recursively for files to move.
        dst_folder: root folder that receives the files.
        addon: text appended to the extension-less file name.
        postfix: file name ending used both to select files and as the
            ending of the new file name.
    """
    src_files = scan_files(src_folder, postfix=postfix)
    for i, file in enumerate(src_files):
        if i % 10000 == 0:
            print("# files merged", i)
        tokens = get_inter_tokens(src_folder, file)
        new_name = os.path.splitext(os.path.basename(file))[0] + addon + postfix
        target_dir = os.path.join(dst_folder, *tokens)
        # Without this, the move fails when dst_folder lacks the sub-folder.
        os.makedirs(target_dir, exist_ok=True)
        shutil.move(file, os.path.join(target_dir, new_name))

In [37]:
# Merge the files of src_folder into dst_folder; "_hls07" is appended to each
# file name (before the ".bmp" ending) as part of the move.
src_folder = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells-half299-hls07"
dst_folder = "/home/hdd_array0/batch6.2_xcp/batch6.2-cells-half299"
addon = "_hls07"
postfix = ".bmp"

# The files are moved rather than copied; the copy variant is kept for reference.
# copy_and_addon(src_folder, dst_folder, addon, postfix)
move_and_addon(src_folder, dst_folder, addon, postfix)

# files merged 0
# files merged 10000
# files merged 20000
# files merged 30000
# files merged 40000
# files merged 50000
# files merged 60000
# files merged 70000
# files merged 80000
# files merged 90000
# files merged 100000
# files merged 110000
# files merged 120000
# files merged 130000
# files merged 140000
# files merged 150000
# files merged 160000
# files merged 170000
# files merged 180000
# files merged 190000
# files merged 200000
# files merged 210000
# files merged 220000
# files merged 230000
# files merged 240000
# files merged 250000
# files merged 260000
# files merged 270000
# files merged 280000
# files merged 290000
# files merged 300000
# files merged 310000
# files merged 320000
# files merged 330000
# files merged 340000
# files merged 350000
# files merged 360000
# files merged 370000
# files merged 380000
# files merged 390000
# files merged 400000
# files merged 410000
# files merged 420000
# files merged 430000
# files merged 440000
# files merged 450000
# 