### preprocess images

In [1]:
import os
import numpy as np
import cv2
from PIL import Image
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed

In [44]:
def scan_files(directory, prefix=None, postfix=None):
    """Recursively collect file paths under ``directory``.

    Args:
        directory: Root folder that is walked with ``os.walk``.
        prefix: If truthy, keep only files whose name starts with it.
        postfix: If truthy, keep only files whose name ends with it.

    Returns:
        A list of paths (``root`` joined with the file name) in ``os.walk``
        order. When both ``prefix`` and ``postfix`` are given a file must
        satisfy both; with neither, every file is returned.
    """
    files_list = []
    for root, _sub_dirs, files in os.walk(directory):
        for file_name in files:
            # Each filter is applied independently so that passing both
            # prefix and postfix no longer silently drops the prefix check.
            if prefix and not file_name.startswith(prefix):
                continue
            if postfix and not file_name.endswith(postfix):
                continue
            files_list.append(os.path.join(root, file_name))
    return files_list

# halve the image size
def half_image(image_name, save_dir, depth):
    """Save a half-resolution copy of ``image_name`` under ``save_dir``.

    The destination path is ``save_dir`` joined with every token except the
    first of ``image_name.rsplit(os.sep, depth + 1)``, i.e. the trailing
    path components of the source file.

    Args:
        image_name: Path of the source image.
        save_dir: Root directory that receives the resized copy.
        depth: Passed to ``rsplit`` as ``depth + 1``; controls how many
            trailing path components are reproduced below ``save_dir``.
    """
    tokens = image_name.rsplit(os.sep, depth + 1)
    image_name_ = os.path.join(save_dir, *tokens[1:])
    # Create the destination folder on demand so the save below does not
    # depend on the caller having pre-created it.
    out_dir = os.path.dirname(image_name_)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with Image.open(image_name) as image:
        w, h = image.size
        image.resize((w // 2, h // 2)).save(image_name_)

# halve the image size, then pad/crop to a square of side `size`
def _center_span(length, size):
    """Return ``(src_start, dst_start, count)`` for centring one axis.

    An axis shorter than ``size`` is pasted centred into the target; an axis
    of ``size`` or more is centre-cropped down to ``size``.
    """
    if length < size:
        return 0, (size - length) // 2, length
    return (length - size) // 2, 0, size


def half_and_pad_image(image_name, save_dir, depth, size):
    """Halve an image, centre it on a ``size`` x ``size`` canvas, and save it.

    The image is read with ``cv2.imread``, resized to half its width and
    height, then per axis either centre-padded with the value 255 (when
    shorter than ``size``) or centre-cropped (when longer). The result is
    written below ``save_dir`` using the trailing components of
    ``image_name`` (all tokens but the first of
    ``image_name.rsplit(os.sep, depth + 1)``).

    Args:
        image_name: Path of the source image.
        save_dir: Root directory that receives the processed copy.
        depth: Passed to ``rsplit`` as ``depth + 1``; controls how many
            trailing path components are reproduced below ``save_dir``.
        size: Side length, in pixels, of the square output image.

    Raises:
        IOError: If ``cv2.imread`` could not load ``image_name``.
    """
    tokens = image_name.rsplit(os.sep, depth + 1)
    image_name_ = os.path.join(save_dir, *tokens[1:])
    out_dir = os.path.dirname(image_name_)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    image = cv2.imread(image_name)
    if image is None:
        # imread signals failure by returning None instead of raising.
        raise IOError("cannot read image: {}".format(image_name))

    h, w, _ = image.shape
    # Keep at least one pixel per axis; a zero-sized target is rejected by resize.
    image = cv2.resize(image, (max(1, w // 2), max(1, h // 2)))
    h, w, _ = image.shape

    # Explicit 8-bit canvas filled with 255 (instead of a float64 array that
    # has to be converted when it is written out).
    new_image = np.full((size, size, 3), 255, dtype=np.uint8)
    src_y, dst_y, rows = _center_span(h, size)
    src_x, dst_x, cols = _center_span(w, size)
    new_image[dst_y:dst_y + rows, dst_x:dst_x + cols, :] = \
        image[src_y:src_y + rows, src_x:src_x + cols, :]
    cv2.imwrite(image_name_, new_image)
        

def batch_half_image(image_names, save_dir, depth, size):
    """Run ``half_and_pad_image`` on every path in ``image_names``, in order.

    ``save_dir``, ``depth`` and ``size`` are forwarded unchanged for each
    image. Returns ``None``.
    """
    # half_image(path, save_dir, depth) is the resize-only alternative.
    for source_path in image_names:
        half_and_pad_image(source_path, save_dir, depth, size)

In [45]:
def process(cells_dir, cells_dir_half, depth=1, size=299):
    """Halve and pad every ``.jpg`` under ``cells_dir`` using worker processes.

    Files are found with ``scan_files`` and handed to ``batch_half_image`` in
    batches of 1000 paths per submitted job. Progress is printed as jobs
    finish; a job that raised is reported instead of being silently dropped.

    Args:
        cells_dir: Root folder that is scanned recursively for ``.jpg`` files.
        cells_dir_half: Output root forwarded to ``batch_half_image``.
        depth: Forwarded to ``batch_half_image``.
        size: Forwarded to ``batch_half_image``.
    """
    image_names = scan_files(cells_dir, postfix=".jpg")
    print("# images", len(image_names))

    batch_size = 1000
    # Leave four cores free, but never request fewer than one worker
    # (ProcessPoolExecutor rejects max_workers <= 0).
    workers = max(1, cpu_count() - 4)

    # The context manager shuts the pool down once all jobs are finished.
    with ProcessPoolExecutor(max_workers=workers) as executor:
        tasks = [
            executor.submit(
                batch_half_image,
                image_names[i:i + batch_size],
                cells_dir_half,
                depth,
                size,
            )
            for i in range(0, len(image_names), batch_size)
        ]

        job_count = len(tasks)
        for future in as_completed(tasks):
            job_count -= 1
            error = future.exception()
            if error is not None:
                # Report the failure but keep going with the other batches.
                print("Job Failed: %r" % (error,))
            print("One Job Done, Remaining Job Count: %s" % (job_count))

In [46]:
# Source folder (one sub-folder per entry) and the output root for the
# halved, padded copies.
cells_dir = "/home/cnn/Documents/batch6.1/cells"
cells_dir_half299 = "/home/cnn/Documents/batch6.1/cells_half299_cv2_white"

# Mirror every top-level entry of the source folder below the output root
# before the workers start writing.
for sub_dir in os.listdir(cells_dir):
    target_dir = os.path.join(cells_dir_half299, sub_dir)
    os.makedirs(target_dir, exist_ok=True)

process(cells_dir, cells_dir_half299)

# images 126879
One Job Done, Remaining Job Count: 126
One Job Done, Remaining Job Count: 125
One Job Done, Remaining Job Count: 124
One Job Done, Remaining Job Count: 123
One Job Done, Remaining Job Count: 122
One Job Done, Remaining Job Count: 121
One Job Done, Remaining Job Count: 120
One Job Done, Remaining Job Count: 119
One Job Done, Remaining Job Count: 118
One Job Done, Remaining Job Count: 117
One Job Done, Remaining Job Count: 116
One Job Done, Remaining Job Count: 115
One Job Done, Remaining Job Count: 114
One Job Done, Remaining Job Count: 113
One Job Done, Remaining Job Count: 112
One Job Done, Remaining Job Count: 111
One Job Done, Remaining Job Count: 110
One Job Done, Remaining Job Count: 109
One Job Done, Remaining Job Count: 108
One Job Done, Remaining Job Count: 107
One Job Done, Remaining Job Count: 106
One Job Done, Remaining Job Count: 105
One Job Done, Remaining Job Count: 104
One Job Done, Remaining Job Count: 103
One Job Done, Remaining Job Count: 102
One Job D

In [35]:
# Compare two saved versions of the same cell image value by value.
image_jpg = "/home/cnn/Documents/batch6.1/cells_half299_image_q100/TRI/2018-01-15-15_13_04_x10119_y40386_w41_h38.jpg"
cv2_jpg = "/home/cnn/Documents/batch6.1/cells_half299_image/TRI/2018-01-15-15_13_04_x10119_y40386_w41_h38.jpg"
# Alternative comparison target:
# cv2_jpg = "/home/cnn/Documents/batch6.1/cells_half299_cv2_q100/TRI/2018-01-15-15_13_04_x10119_y40386_w41_h38.jpg"

image_img = cv2.imread(image_jpg)
cv2_img = cv2.imread(cv2_jpg)

print(image_img.shape)
print(cv2_img.shape)

if image_img.shape != cv2_img.shape:
    raise ValueError("images must have the same shape to be compared")

# Element-wise comparison over the whole array, whatever its size, instead
# of a Python loop hard-coded to 299 x 299 x 3.
diff = image_img != cv2_img

# Highlight every pixel where at least one channel differs by saturating
# channel index 2 in a copy of the first image.
mask = image_img.copy()
mask[diff.any(axis=2), 2] = 255

# argwhere yields (row, column, channel) indices in row-major order, the
# same order in which nested loops over i, j, k would visit them.
for i, j, k in np.argwhere(diff):
    print(i, j, k, image_img[i][j][k], cv2_img[i][j][k])

count = int(diff.sum())
print(count)
cv2.imwrite("/home/cnn/Documents/batch6.1/mask.jpg", mask)

(299, 299, 3)
(299, 299, 3)
128 129 2 1 0
128 130 2 1 0
128 132 0 2 0
128 133 0 2 0
128 136 1 1 0
128 137 1 1 0
128 141 0 2 0
128 144 2 0 4
128 145 2 0 4
128 146 2 0 4
128 147 2 0 6
128 148 2 0 6
128 149 2 0 6
128 150 1 1 0
128 150 2 0 6
128 151 1 1 0
128 151 2 0 6
128 152 0 2 0
128 152 2 0 6
128 153 0 2 0
128 153 2 0 4
128 154 2 0 4
128 155 2 0 3
128 156 2 0 3
128 157 2 0 1
128 158 2 0 1
128 159 2 0 1
129 137 1 1 0
129 144 0 0 2
129 144 2 0 3
129 145 0 0 2
129 145 2 0 3
129 146 0 0 2
129 146 2 0 3
129 147 2 0 3
129 148 2 0 4
129 149 2 0 4
129 150 2 0 4
129 151 2 0 4
129 152 2 0 4
129 153 2 0 3
129 154 2 0 3
129 155 2 0 1
129 156 2 0 1
129 157 0 0 2
129 158 0 0 2
129 159 0 0 2
130 144 0 0 4
130 145 0 0 4
130 146 0 0 4
130 147 0 0 4
130 149 0 0 2
130 149 2 0 1
130 150 0 0 2
130 150 2 0 1
130 151 0 0 2
130 151 2 0 1
130 152 0 0 2
130 153 0 0 2
130 154 0 0 2
130 155 0 0 2
130 156 0 0 4
130 157 0 2 4
130 158 0 0 4
130 158 1 0 1
130 159 0 0 4
130 159 1 0 1
131 129 2 1 0
131 130 2 1 0
131 14

146 146 2 90 89
146 147 0 148 142
146 147 1 111 112
146 147 2 83 87
146 148 0 147 144
146 148 1 112 110
146 148 2 79 87
146 149 0 146 149
146 149 1 112 113
146 149 2 82 89
146 150 0 156 157
146 150 1 123 122
146 150 2 97 96
146 151 0 162 163
146 151 1 128 129
146 151 2 104 99
146 152 0 168 169
146 152 1 135 139
146 152 2 109 104
146 153 0 173 168
146 153 1 144 140
146 153 2 113 105
146 154 1 146 149
146 154 2 112 114
146 155 0 164 168
146 155 1 142 147
146 155 2 107 119
146 156 0 187 172
146 156 1 166 155
146 156 2 135 134
146 157 0 187 185
146 157 1 170 169
146 157 2 149 157
146 158 0 201 200
146 158 1 192 185
146 159 0 2 12
146 159 1 1 0
147 128 0 0 2
147 129 0 0 2
147 130 0 0 2
147 133 2 0 1
147 134 2 0 1
147 136 0 0 1
147 136 1 0 1
147 136 2 0 1
147 137 0 0 2
147 138 0 2 6
147 138 1 0 3
147 139 0 206 201
147 139 1 200 197
147 139 2 201 192
147 140 0 189 193
147 140 1 178 187
147 140 2 174 180
147 141 0 177 166
147 141 1 164 159
147 142 0 159 145
147 142 1 142 136
147 142 2 121 126


156 155 0 211 216
156 155 1 205 211
156 155 2 194 196
156 156 0 214 202
156 156 1 208 198
156 156 2 203 187
156 157 0 206 204
156 157 1 201 198
156 157 2 202 193
156 158 0 202 211
156 158 1 199 203
156 158 2 201 203
156 159 0 2 5
156 159 2 0 1
157 128 0 0 4
157 128 1 0 1
157 129 0 0 4
157 129 1 0 1
157 130 0 0 2
157 132 0 2 0
157 133 0 2 0
157 133 2 0 1
157 135 2 0 1
157 136 0 0 4
157 136 1 1 6
157 136 2 0 7
157 138 0 0 2
157 139 0 207 208
157 139 1 202 205
157 139 2 204 201
157 140 0 204 203
157 140 1 197 199
157 140 2 200 194
157 141 0 209 208
157 141 1 201 204
157 141 2 201 199
157 142 0 196 182
157 142 1 190 176
157 142 2 185 169
157 143 0 178 189
157 143 1 175 182
157 143 2 167 173
157 144 0 172 160
157 144 1 156 148
157 144 2 139 120
157 145 0 165 164
157 145 1 144 151
157 145 2 117 125
157 146 0 181 175
157 146 1 160 161
157 146 2 128 142
157 147 0 195 185
157 147 1 180 172
157 147 2 154 158
157 148 1 176 172
157 148 2 162 164
157 149 0 189 182
157 149 1 181 170
157 149 2 168 16

True

In [42]:
# Decode one JPEG and write the decoded pixels back out as a BMP file.
image_jpg = "/home/cnn/Documents/batch6.1/2018-01-15-15_13_04_x10119_y40386_w41_h38.jpg"
image_img1 = cv2.imread(image_jpg)

# NOTE(review): a JPEG quality flag is passed although the target is a .bmp
# file; presumably it has no effect on the BMP output - confirm.
write_params = [int(cv2.IMWRITE_JPEG_QUALITY), 100]
cv2.imwrite(
    "/home/cnn/Documents/batch6.1/2018-01-15-15_13_04_x10119_y40386_w41_h38_2.bmp",
    image_img1,
    write_params,
)

True

In [43]:
# Reload the BMP copy and compare it against ``image_img1`` (decoded in an
# earlier cell) value by value.
image_jpg = "/home/cnn/Documents/batch6.1/2018-01-15-15_13_04_x10119_y40386_w41_h38_2.bmp"

image_img2 = cv2.imread(image_jpg)

if image_img1.shape != image_img2.shape:
    raise ValueError("images must have the same shape to be compared")

# Element-wise comparison over the whole array instead of a loop hard-coded
# to 299 x 299 x 3.
diff = image_img1 != image_img2

# Build the mask and the report from the two images actually being compared
# (not from the unrelated image_img / cv2_img pair of an earlier cell).
mask = image_img1.copy()
mask[diff.any(axis=2), 2] = 255

for i, j, k in np.argwhere(diff):
    print(i, j, k, image_img1[i][j][k], image_img2[i][j][k])

count = int(diff.sum())
print(count)
# cv2.imwrite("/home/cnn/Documents/batch6.1/mask.jpg", mask)

0


### split data to train/valid

In [47]:
import os
import random
import shutil

In [48]:
def split_train_and_valid(data_path, save_path, split=0.1):
    """Move each class folder's ``.jpg`` files into train/valid subsets.

    Every sub-directory of ``data_path`` is treated as one class. Its
    ``.jpg`` files are shuffled; the first ``int(n * split)`` go to
    ``save_path/valid/<class>`` and the rest to ``save_path/train/<class>``.
    Files are moved, not copied, and each source class directory is removed
    afterwards with ``shutil.rmtree`` - including any non-``.jpg`` files
    still inside it.

    Regular files directly inside ``data_path`` are ignored, and so are the
    ``train``/``valid`` output folders when they live inside ``data_path``
    (e.g. ``data_path == save_path``), so re-running on an already split
    folder can no longer delete the existing split.

    Args:
        data_path: Folder containing one sub-directory per class.
        save_path: Folder that receives the ``train`` and ``valid`` trees.
        split: Fraction of each class that is moved to ``valid``.
    """
    def create_directory(save_path, subdirs):
        for datadir in ["train", "valid"]:
            for subdir in subdirs:
                os.makedirs(os.path.join(save_path, datadir, subdir), exist_ok=True)

    def remove_directory(data_path, subdirs):
        for subdir in subdirs:
            shutil.rmtree(os.path.join(data_path, subdir))

    # Absolute paths of the two output roots, used to keep them out of the
    # class list when the split is done in place.
    output_roots = {
        os.path.abspath(os.path.join(save_path, name))
        for name in ("train", "valid")
    }
    subdirs = [
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
        and os.path.abspath(os.path.join(data_path, name)) not in output_roots
    ]
    create_directory(save_path, subdirs)

    for subdir in subdirs:
        sub_path = os.path.join(data_path, subdir)
        sub_files = [os.path.join(sub_path, f) for f in os.listdir(sub_path) if f.endswith(".jpg")]
        # One shuffle already yields a uniformly random order.
        random.shuffle(sub_files)

        valid_count = int(len(sub_files) * split)
        sub_valid_files = sub_files[:valid_count]
        sub_train_files = sub_files[valid_count:]

        sub_train_path = os.path.join(save_path, "train", subdir)
        for file_path in sub_train_files:
            shutil.move(file_path, sub_train_path)

        sub_valid_path = os.path.join(save_path, "valid", subdir)
        for file_path in sub_valid_files:
            shutil.move(file_path, sub_valid_path)

        print("{}: split # {} files to train, # {} files to valid".format(subdir, len(sub_train_files), len(sub_valid_files)))

    remove_directory(data_path, subdirs)

In [49]:
# Split in place: the train/valid trees are created inside the same folder
# that currently holds the per-class sub-directories.
data_path = save_path = "/home/cnn/Documents/batch6.1/cells_half299_cv2_white"

split_train_and_valid(data_path, save_path)

AGC_A: split # 2373 files to train, # 263 files to valid
ACTINO: split # 5411 files to train, # 601 files to valid
MC: split # 7263 files to train, # 807 files to valid
VIRUS: split # 2566 files to train, # 285 files to valid
RC: split # 371 files to train, # 41 files to valid
AGC_B: split # 1178 files to train, # 130 files to valid
EC: split # 569 files to train, # 63 files to valid
LSIL_F: split # 1471 files to train, # 163 files to valid
SC: split # 5493 files to train, # 610 files to valid
HSIL_M: split # 4908 files to train, # 545 files to valid
HSIL_S: split # 17292 files to train, # 1921 files to valid
HSIL_B: split # 3090 files to train, # 343 files to valid
CC: split # 12440 files to train, # 1382 files to valid
ASCUS: split # 5004 files to train, # 556 files to valid
TRI: split # 25273 files to train, # 2808 files to valid
GEC: split # 4821 files to train, # 535 files to valid
FUNGI: split # 4372 files to train, # 485 files to valid
LSIL_E: split # 3038 files to train, # 337 

### data augmentation

In [50]:
import os
import cv2
import numpy as np
from PIL import Image

In [51]:
def rotate(image_name):
    """Write 90, 180 and 270 degree rotated copies next to ``image_name``.

    The copies are saved as ``<name>_r90.jpg``, ``<name>_r180.jpg`` and
    ``<name>_r270.jpg``, where ``<name>`` is ``image_name`` without its
    extension.

    Args:
        image_name: Path of the source image.
    """
    basename = os.path.splitext(image_name)[0]
    # The context manager closes the file even if a rotate/save call raises.
    with Image.open(image_name) as jpg:
        for angle in (90, 180, 270):
            jpg.rotate(angle).save("{}_r{}.jpg".format(basename, angle))
    
def batch_rotate(image_names):
    """Call ``rotate`` on every path in ``image_names``, in order.

    Returns ``None``.
    """
    for source_path in image_names:
        rotate(source_path)
        
def process(cells_dir, cells_dir_aug):
    """Create rotated copies of every ``.jpg`` under ``cells_dir`` in parallel.

    Files are found with ``scan_files`` and handed to ``batch_rotate`` in
    batches of 1000 paths per submitted job, using one worker process per
    CPU. Progress is printed as jobs finish; a job that raised is reported
    instead of being silently dropped.

    Args:
        cells_dir: Root folder that is scanned recursively for ``.jpg`` files.
        cells_dir_aug: Accepted but not used; ``batch_rotate`` only receives
            the source paths.
    """
    image_names = scan_files(cells_dir, postfix=".jpg")
    print("# images", len(image_names))

    batch_size = 1000

    # The context manager shuts the pool down once all jobs are finished.
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        tasks = [
            executor.submit(batch_rotate, image_names[i:i + batch_size])
            for i in range(0, len(image_names), batch_size)
        ]

        job_count = len(tasks)
        for future in as_completed(tasks):
            job_count -= 1
            error = future.exception()
            if error is not None:
                # Report the failure but keep going with the other batches.
                print("Job Failed: %r" % (error,))
            print("One Job Done, Remaining Job Count: %s" % (job_count))

In [52]:
# Augment the training split; both arguments name the same folder.
cells_dir_aug = cells_dir = "/home/cnn/Documents/batch6.1/cells_half299_cv2_white/train"

process(cells_dir, cells_dir_aug)

# images 114198
One Job Done, Remaining Job Count: 114
One Job Done, Remaining Job Count: 113
One Job Done, Remaining Job Count: 112
One Job Done, Remaining Job Count: 111
One Job Done, Remaining Job Count: 110
One Job Done, Remaining Job Count: 109
One Job Done, Remaining Job Count: 108
One Job Done, Remaining Job Count: 107
One Job Done, Remaining Job Count: 106
One Job Done, Remaining Job Count: 105
One Job Done, Remaining Job Count: 104
One Job Done, Remaining Job Count: 103
One Job Done, Remaining Job Count: 102
One Job Done, Remaining Job Count: 101
One Job Done, Remaining Job Count: 100
One Job Done, Remaining Job Count: 99
One Job Done, Remaining Job Count: 98
One Job Done, Remaining Job Count: 97
One Job Done, Remaining Job Count: 96
One Job Done, Remaining Job Count: 95
One Job Done, Remaining Job Count: 94
One Job Done, Remaining Job Count: 93
One Job Done, Remaining Job Count: 92
One Job Done, Remaining Job Count: 91
One Job Done, Remaining Job Count: 90
One Job Done, Remai