In [1]:
import os
import cv2
import random
import shutil
import xml.dom.minidom
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

### move jpgs with xmls/txts out

In [2]:
def scan_files(directory, prefix=None, postfix=None):
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    return files_list

def copy_jpg_and_labels(from_path, to_path):
    image_names = scan_files(from_path, postfix=".jpg")
    for image_name in image_names:
        xml_name = os.path.splitext(image_name)[0] + ".xml"
        txt_name = os.path.splitext(image_name)[0] + ".txt"
        if os.path.isfile(xml_name):
            shutil.move(image_name, to_path)
            shutil.move(xml_name, to_path)
        elif os.path.isfile(txt_name):
            shutil.move(image_name, to_path)
            shutil.move(txt_name, to_path)
            

orig_path = "../../yolo_sc_20181128/SC_orig"
tmp_path = "../../yolo_sc_20181128/SC_tmp"

copy_jpg_and_labels(orig_path, tmp_path)

### find proper training input image size (608x608)

In [3]:
# collect all sizes of images
tmp_path = "../../yolo_sc_20181128/SC_tmp"
image_names = [os.path.join(tmp_path, f) for f in os.listdir(tmp_path) if f.endswith(".jpg")]
print("# of jpgs", len(image_names))
image_sizes = []
for image_name in image_names:
    with Image.open(image_name) as image:
        image_sizes.append(image.size)

# of jpgs 1973


In [4]:
# kmeans clustering
X = np.array(image_sizes)
kmeans = KMeans(n_clusters=9, random_state=2018).fit(X)
print(X.shape, kmeans)

(1973, 2) KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=2018, tol=0.0001, verbose=0)


In [5]:
centers = []
for center in kmeans.cluster_centers_:
    print(center)
    centers.append(center)

[103.93630573 103.94267516]
[131.1965812 137.8974359]
[96.65745856 85.98895028]
[122.68774704 113.96047431]
[101.10429448 130.24539877]
[121.05940594  91.46534653]
[77.74789916 78.84033613]
[149.48571429 109.27619048]
[ 86.0456621  103.23287671]


In [6]:
total = len(image_sizes)
large = len([image_size for image_size in image_sizes if image_size[0] > 608 or image_size[1] > 608])
print(total, large)

1973 0


### image padding and resizing

In [7]:
def read_labels_from_xml(xml_name):
    DOMTree = xml.dom.minidom.parse(xml_name)
    collection = DOMTree.documentElement
    filename = collection.getElementsByTagName("filename")
    objects = collection.getElementsByTagName("object")

    w = collection.getElementsByTagName("width")[0]
    w_val = int(w.firstChild.nodeValue)
    h = collection.getElementsByTagName("height")[0]
    h_val = int(h.firstChild.nodeValue)
    w.firstChild.replaceWholeText(str(h_val))
    h.firstChild.replaceWholeText(str(w_val))

    labels = {'w':w_val, 'h':h_val, 'boxes':[]}
    
    for object in objects:
        xmin = object.getElementsByTagName("xmin")[0]
        xmin_val = int(xmin.firstChild.nodeValue)
        xmax = object.getElementsByTagName("xmax")[0]
        xmax_val = int(xmax.firstChild.nodeValue)
        ymin = object.getElementsByTagName("ymin")[0]
        ymin_val = int(ymin.firstChild.nodeValue)
        ymax = object.getElementsByTagName("ymax")[0]
        ymax_val = int(ymax.firstChild.nodeValue)
        labels["boxes"].append([xmin_val, ymin_val, xmax_val, ymax_val])
        
    return labels


def read_labels_from_txt(txt_name):
    jpg_name = os.path.splitext(txt_name)[0] + ".jpg"
    with Image.open(jpg_name) as img:
        w, h = img.size
        
    labels = {'w':w, 'h':h, 'boxes':[]}
    with open(txt_name, 'r') as f:
        for line in f.readlines():
            tokens = line.strip().split()
            xc, yc = w*float(tokens[1]), h*float(tokens[2])
            w_, h_ = w*float(tokens[3]), h*float(tokens[4])
            xmin, ymin = int(xc - w_/2), int(yc - h_/2)
            xmax, ymax = int(xc + w_/2), int(yc + h_/2)
            labels["boxes"].append([xmin, ymin, xmax, ymax])
            
    return labels


def resize_img_with_padding(img_name, size, save_path):
    xml_name = os.path.splitext(img_name)[0] + ".xml"
    txt_name = os.path.splitext(img_name)[0] + ".txt"
    if os.path.isfile(xml_name):
        labels = read_labels_from_xml(xml_name)
    elif os.path.isfile(txt_name):
        labels = read_labels_from_txt(txt_name)
    else:
        return
    
    img_name_new = os.path.join(save_path, os.path.basename(img_name))
    txt_name_new = os.path.splitext(img_name_new)[0] + ".txt"
    
    img = Image.open(img_name)
    
    # pad and crop image
    x = -((size - img.size[0]) // 2)
    y = -((size - img.size[1]) // 2)
    img_croped = img.crop(
        (
            x,
            y,
            size + x,
            size + y
        )
    )
    img_croped.save(img_name_new)

    # change label coordinates
    labels_yolo = []
    for box in labels["boxes"]:
        xmin = box[0] - x
        ymin = box[1] - y
        xmax = box[2] - x
        ymax = box[3] - y
        xcenter = (xmin+xmax)/2.0/size
        ycenter = (ymin+ymax)/2.0/size
        w = (xmax-xmin)/size
        h = (ymax-ymin)/size
        labels_yolo.append(['0', str(xcenter), str(ycenter), str(w), str(h)])
        
#         # cut cells
#         cell_name = os.path.splitext(img_name_new)[0] + '_' + str(xmin) + '_' + str(ymin) + ".jpg"
#         img_croped.crop((xmin, ymin, xmax, ymax)).save(cell_name)
        
    # write lables to txt
    with open(txt_name_new, 'w') as f:
        for label in labels_yolo:
            f.write(' '.join(label) + '\n')
            
    print("processed ", img_name)

In [8]:
tmp_path = "../../yolo_sc_20181128/SC_tmp"
yolo_path = "../../yolo_sc_20181128/SC_yolo"
size = 608

image_names = [os.path.join(tmp_path, f) for f in os.listdir(tmp_path) if f.endswith(".jpg")]

for image_name in image_names:
    resize_img_with_padding(image_name, size, yolo_path)

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-17_33_43_x27345_y14363_w100_h82.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_48_59_x19973_y24859_w138_h130.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-18_09_35_x52527_y36814_w92_h126.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x35392_y43758_w118_h96.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-18_09_35_x45511_y39830_w118_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_31_06_x10580_y22014_w76_h72.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_19_03_x43409_y31368_w116_h114.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x18965_y7508_w90_h98.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x11646_y33988_w70_h76.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_42_03_x16927_y31182_w86_h86.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_18_16_x29445_y50956_w154_h98.jpg
processed  ../../yolo_sc_20181128/

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x15631_y44777_w92_h94.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_25_54_x34988_y38073_w130_h102.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_41_50_x28231_y31563_w118_h98.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_45_22_x18918_y51477_w116_h94.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x13378_y50063_w96_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_31_00_x27792_y15783_w104_h86.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_48_55_x23232_y38867_w94_h92.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x44477_y52083_w106_h72.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x29374_y16411_w82_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_15_56_x13513_y26206_w102_h124.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x37239_y22820_w80_h98.jpg
processed  ../../yolo_sc_20181128/S

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_42_03_x11350_y31899_w70_h92.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_39_43_x31544_y7402_w120_h102.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x47553_y13062_w78_h82.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-17_33_43_x54403_y22488_w82_h84.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_25_54_x11619_y31009_w102_h152.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_46_18_x16814_y39320_w132_h132.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_41_48_x37053_y46753_w106_h128.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_46_28_x33397_y15512_w118_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_45_56_x25531_y43334_w110_h114.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_14_03_x19619_y44092_w96_h102.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x17759_y34326_w92_h78.jpg
processed  ../../yolo_sc_201811

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_39_43_x25293_y46115_w106_h142.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x38491_y25975_w114_h126.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x24160_y46385_w126_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_25_54_x39445_y37056_w114_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x8893_y22770_w88_h84.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_19_02_x32711_y43833_w128_h112.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-18_01_00_x49284_y35134_w116_h98.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_42_03_x18718_y30590_w140_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_41_48_x36079_y29623_w152_h114.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_22_46_x31639_y48433_w142_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_29_16_x12259_y52249_w118_h88.jpg
processed  ../../yolo_sc_2

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_03_44_x24842_y23098_w106_h94.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-15_21_37_x14082_y39011_w96_h124.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-18_09_35_x36846_y16169_w90_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_38_53_x55548_y30381_w122_h114.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_48_55_x44911_y40228_w104_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x37367_y33511_w90_h78.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_48_55_x38337_y23481_w142_h142.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x44163_y45476_w106_h98.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_14_48_x48609_y28501_w84_h74.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_46_18_x30739_y28062_w114_h106.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_31_06_x29214_y31305_w98_h72.jpg
processed  ../../yolo_sc_2018112

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_24_43_x36308_y39313_w88_h80.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_14_04_x15703_y33768_w116_h150.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-18_09_35_x31440_y11735_w104_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x55809_y28117_w84_h78.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x42567_y27220_w86_h86.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_22_46_x13681_y25358_w88_h96.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-18_23_22_x44900_y19431_w146_h122.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_09_59_x38783_y31608_w116_h112.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x42821_y38943_w96_h140.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_01_07_x46770_y38919_w94_h82.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-17_01_50_x44875_y36310_w110_h110.jpg
processed  ../../yolo_sc_20181128

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_46_18_x50722_y39428_w130_h118.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-15_21_37_x17373_y13301_w110_h126.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_24_43_x43763_y56312_w90_h102.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_12_15_x26563_y34370_w126_h100.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-18_09_35_x45535_y19240_w102_h74.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_46_18_x27967_y49843_w82_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x36490_y41647_w94_h100.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-17_46_19_x35865_y71601_w124_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-15_21_37_x32636_y37062_w116_h102.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_25_54_x47732_y18706_w66_h144.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-18_09_35_x23017_y23243_w118_h114.jpg
processed  ../../yolo_sc_20

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-18_16_45_x30879_y12274_w102_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_41_50_x46552_y31514_w106_h84.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_31_00_x31664_y10942_w100_h102.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-17_14_41_x24561_y21245_w98_h106.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x22235_y9991_w92_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x44934_y15189_w84_h94.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_43_24_x49967_y34432_w102_h82.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-17_59_02_x10300_y25516_w126_h114.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_35_45_x47030_y62871_w90_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-15_03_13_x31767_y32407_w110_h110.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_35_45_x18565_y28627_w92_h86.jpg
processed  ../../yolo_sc_2018112

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-17_14_41_x38414_y31556_w102_h106.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_38_53_x27067_y24929_w106_h118.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_38_53_x29545_y14791_w110_h94.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_15_56_x50090_y34592_w110_h130.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-17_14_41_x22657_y11588_w84_h106.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_01_07_x50422_y19577_w122_h110.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x19879_y28729_w84_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_17_57_x42860_y59142_w124_h84.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_03_56_x6887_y17263_w108_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_37_32_x22541_y13897_w78_h100.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_22_46_x21856_y17450_w110_h74.jpg
processed  ../../yolo_sc_2018

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_06_40_x28313_y57638_w122_h102.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x34144_y6434_w148_h122.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_18_16_x43872_y36640_w104_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x7577_y36036_w114_h86.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_37_43_x45343_y39019_w90_h112.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-15_21_37_x23062_y44476_w120_h112.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_12_29_x13179_y17673_w128_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_09_21_x11483_y44969_w132_h166.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-18_01_00_x17684_y14755_w114_h120.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_42_03_x27148_y14389_w92_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_24_43_x30645_y45798_w128_h138.jpg
processed  ../../yolo_sc_20

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-15_21_37_x12193_y19800_w104_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x47750_y40739_w88_h82.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_24_43_x28742_y47206_w100_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-17_23_26_x55273_y37828_w104_h118.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_31_00_x30072_y28645_w102_h76.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_39_43_x28248_y9453_w164_h122.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_30_43_x20988_y51043_w108_h128.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_43_24_x38096_y23007_w80_h76.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-17_08_33_x31257_y40971_w90_h100.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x47558_y10769_w124_h130.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-17_14_41_x35020_y19420_w174_h94.jpg
processed  ../../yolo_sc_20181

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_56_00_x52218_y42502_w128_h98.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-17_14_41_x24884_y15772_w92_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x20751_y38047_w124_h106.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_43_24_x41683_y35272_w120_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_17_57_x57461_y33342_w122_h112.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x10378_y13110_w94_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_48_55_x31521_y15884_w112_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_14_48_x53181_y8449_w102_h100.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_35_45_x13727_y21456_w82_h72.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-18_27_35_x46636_y12525_w110_h102.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x50813_y7276_w74_h82.jpg
processed  ../../yolo_sc_20181128/

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_37_32_x17805_y43713_w130_h122.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_14_48_x10542_y22562_w84_h82.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x32978_y39550_w108_h112.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_39_43_x13066_y41296_w80_h100.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_39_43_x17864_y30378_w104_h84.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_42_03_x53077_y53954_w102_h74.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_35_45_x21183_y15385_w90_h114.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x38039_y53671_w132_h110.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x33902_y32293_w120_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x43639_y26858_w70_h76.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x51913_y22169_w100_h134.jpg
processed  ../../yolo_sc_20181

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_03_44_x44796_y36298_w74_h74.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x28886_y8856_w92_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_17_57_x61162_y52365_w90_h98.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_45_22_x34277_y46092_w108_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_48_55_x30680_y48102_w98_h86.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_38_53_x9357_y21526_w74_h64.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_37_32_x22377_y34989_w114_h96.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x47579_y18257_w100_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x29292_y17992_w110_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-16_38_53_x39622_y45539_w72_h68.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-15_03_44_x30219_y7460_w126_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_56_07_x34427_y40957_w144_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_39_43_x25740_y48890_w112_h110.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x12115_y42722_w84_h104.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_31_06_x15208_y12285_w112_h112.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-15_19_25_x31253_y13116_w82_h94.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_45_22_x52608_y37751_w110_h106.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-18_27_35_x26199_y11144_w104_h110.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_12_28_x9798_y13709_w88_h150.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_41_48_x47364_y41295_w90_h118.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-11-16_01_46_x30223_y33465_w102_h92.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_42_00_x28141_y38955_w92_h124.jpg
processed  ../../yolo_sc_2018

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_45_22_x30628_y24518_w106_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-17_59_02_x33441_y47653_w98_h132.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_43_24_x40717_y11553_w84_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_17_57_x39621_y12631_w134_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_32_17_x31304_y51265_w132_h128.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_37_32_x6615_y23987_w120_h114.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_20_45_x42980_y10000_w92_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_35_45_x39166_y8214_w86_h96.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-18_36_03_x57294_y17203_w128_h100.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_06_40_x26325_y57208_w140_h164.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_14_03_x20957_y34958_w140_h80.jpg
processed  ../../yolo_sc_2018112

processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_01_26_x39075_y17589_w88_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x27084_y26226_w92_h90.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_41_48_x27405_y27647_w104_h88.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_39_43_x41092_y16777_w126_h120.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-17_28_51_x25629_y33892_w90_h116.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_01_26_x37312_y42772_w106_h108.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_45_22_x52943_y28179_w134_h98.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-19_41_48_x46371_y28628_w112_h122.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-10-16_59_14_x39613_y9806_w120_h92.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x49175_y9897_w70_h68.jpg
processed  ../../yolo_sc_20181128/SC_tmp/2017-10-09-18_23_17_x45676_y15358_w70_h80.jpg
processed  ../../yolo_sc_20181128/S

### split data into train/valid

In [9]:
import random
import shutil


def split_train_valid(data_path):
    img_names = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith(".jpg")]

    random.shuffle(img_names)
    random.shuffle(img_names)

    split = 0.8

    train_names = img_names[:int(len(img_names)*split)]
    valid_names = img_names[int(len(img_names)*split):]

    train_path = os.path.join(data_path, "train")
    os.makedirs(train_path, exist_ok=True)
    for img_name in train_names:
        txt_name = os.path.splitext(img_name)[0] + ".txt"
        shutil.move(img_name, train_path)
        shutil.move(txt_name, train_path)

    valid_path = os.path.join(data_path, "valid")
    os.makedirs(valid_path, exist_ok=True)
    for img_name in valid_names:
        txt_name = os.path.splitext(img_name)[0] + ".txt"
        shutil.move(img_name, valid_path)
        shutil.move(txt_name, valid_path)
        

yolo_path = "../../yolo_sc_20181128/SC_yolo"
split_train_valid(yolo_path)

### kmeans clustering

In [10]:
def scan_files(directory, prefix=None, postfix=None):
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    return files_list

def collect_sizes_from_yolotxt(txt_name, size):
    sizes = []
    with open(txt_name, 'r') as f:
        for line in f.readlines():
            tokens = line.strip().split()
#             print(tokens)
            w = int(float(tokens[3])*size)
            h = int(float(tokens[4])*size)
            sizes.append([w, h])
    return sizes

def collect_sizes_from_yolotxt_all(txt_names, size):
    sizes = []
    for txt_name in txt_names:
        sizes += collect_sizes_from_yolotxt(txt_name, size)
    return sizes

In [11]:
yolo_path = "../../yolo_sc_20181128/SC_yolo"

sizes = collect_sizes_from_yolotxt_all(scan_files(yolo_path, postfix=".txt"), 608)

# kmeans clustering
X = np.array(sizes)
kmeans = KMeans(n_clusters=9, random_state=2018).fit(X)
print(kmeans)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=9, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=2018, tol=0.0001, verbose=0)


In [12]:
centers = []
for center in kmeans.cluster_centers_:
    print(center)
    centers.append(center)

[34.43137255 46.25      ]
[61.84090909 62.86363636]
[31.61237785 32.28338762]
[38.83888889 36.575     ]
[44.26623377 45.79220779]
[46.23611111 60.82638889]
[50.31395349 36.97093023]
[67.01941748 47.23300971]
[52.73553719 50.55371901]


In [13]:
tosort = {int(center[0]*center[1]):center for center in centers}
print(tosort)

{1860: array([50.31395349, 36.97093023]), 2812: array([46.23611111, 60.82638889]), 2665: array([52.73553719, 50.55371901]), 1592: array([34.43137255, 46.25      ]), 1020: array([31.61237785, 32.28338762]), 2027: array([44.26623377, 45.79220779]), 1420: array([38.83888889, 36.575     ]), 3165: array([67.01941748, 47.23300971]), 3887: array([61.84090909, 62.86363636])}


In [14]:
hassorted = sorted(tosort.items())
print(hassorted)
print(",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))

[(1020, array([31.61237785, 32.28338762])), (1420, array([38.83888889, 36.575     ])), (1592, array([34.43137255, 46.25      ])), (1860, array([50.31395349, 36.97093023])), (2027, array([44.26623377, 45.79220779])), (2665, array([52.73553719, 50.55371901])), (2812, array([46.23611111, 60.82638889])), (3165, array([67.01941748, 47.23300971])), (3887, array([61.84090909, 62.86363636]))]
31,32,  38,36,  34,46,  50,36,  44,45,  52,50,  46,60,  67,47,  61,62


### cut cells from yolo images/txts

In [3]:
import os
from PIL import Image

image_dir = "../../yolo_sc_20181128/SC_yolo/train"
save_path = "../../yolo_sc_20181128/SC_cell"

def cut_cell(img_name, txt_name, save_path):
    img = Image.open(img_name)
    w,h = img.size
    labels = []
    with open(txt_name, 'r') as f:
        for line in f.readlines():
            tokens = line.strip().split()
            cx, cy = float(tokens[1]), float(tokens[2])
            w_, h_ = float(tokens[3]), float(tokens[4])
            xmin, ymin = int((cx-w_/2)*w), int((cy-h_/2)*h)
            xmax, ymax = int((cx+w_/2)*w), int((cy+h_/2)*h)
            img.crop((xmin, ymin, xmax, ymax)).save(os.path.join(save_path, "{}_{}_{}_{}.jpg".format(xmin, ymin, xmax, ymax)))
    img.close()
    
img_names = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(".jpg")]
for img_name in img_names:
    txt_name = os.path.splitext(img_name)[0] + ".txt"
    cut_cell(img_name, txt_name, save_path)