In [1]:
import os
import cv2
import xml.dom.minidom
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

### find proper training input image size (608x608)

In [2]:
# collect all sizes of images
image_path = "./CC"
image_names = [os.path.join(image_path, f) for f in os.listdir(image_path) if f.endswith(".jpg")]
image_sizes = []
for image_name in image_names:
    with Image.open(image_name) as image:
        image_sizes.append(image.size)

In [None]:
# kmeans clustering
X = np.array(image_sizes)
kmeans = KMeans(n_clusters=9, random_state=2018).fit(X)
print(kmeans)

In [None]:
centers = []
for center in kmeans.cluster_centers_:
    print(center)
    centers.append(center)

In [None]:
total = len(image_sizes)
large = len([image_size for image_size in image_sizes if image_size[0] > 608 or image_size[1] > 608])
print(total, large)

### image padding and resizing

In [3]:
def read_labels(xml_name):
    DOMTree = xml.dom.minidom.parse(xml_name)
    collection = DOMTree.documentElement
    filename = collection.getElementsByTagName("filename")
    objects = collection.getElementsByTagName("object")
        
    w = collection.getElementsByTagName("width")[0]
    w_val = int(w.firstChild.nodeValue)
    h = collection.getElementsByTagName("height")[0]
    h_val = int(h.firstChild.nodeValue)
    w.firstChild.replaceWholeText(str(h_val))
    h.firstChild.replaceWholeText(str(w_val))
    
    labels = {'w':w_val, 'h':h_val, 'boxes':[]}
    
    for object in objects:
        xmin = object.getElementsByTagName("xmin")[0]
        xmin_val = int(xmin.firstChild.nodeValue)
        xmax = object.getElementsByTagName("xmax")[0]
        xmax_val = int(xmax.firstChild.nodeValue)
        ymin = object.getElementsByTagName("ymin")[0]
        ymin_val = int(ymin.firstChild.nodeValue)
        ymax = object.getElementsByTagName("ymax")[0]
        ymax_val = int(ymax.firstChild.nodeValue)
        labels["boxes"].append([xmin_val, ymin_val, xmax_val, ymax_val])
        
    return labels


def resize_img_with_padding(img_name, size, save_path):
    xml_name = os.path.splitext(img_name)[0] + ".xml"
    if not os.path.isfile(xml_name):
        return
    
    img_name_new = os.path.join(save_path, os.path.basename(img_name))
    txt_name = os.path.splitext(img_name_new)[0] + ".txt"
    
    img = Image.open(img_name)
    
    # pad and crop image
    x = -((size - img.size[0]) // 2)
    y = -((size - img.size[1]) // 2)
    img_croped = img.crop(
        (
            x,
            y,
            size + x,
            size + y
        )
    )
    img_croped.save(img_name_new)

    # change label coordinates
    labels = read_labels(xml_name)
    labels_yolo = []
    for box in labels["boxes"]:
        xmin = box[0] - x
        ymin = box[1] - y
        xmax = box[2] - x
        ymax = box[3] - y
        xcenter = (xmin+xmax)/2.0/size
        ycenter = (ymin+ymax)/2.0/size
        w = (xmax-xmin)/size
        h = (ymax-ymin)/size
        labels_yolo.append(['0', str(xcenter), str(ycenter), str(w), str(h)])
        
#         # cut cells
#         cell_name = os.path.splitext(img_name_new)[0] + '_' + str(xmin) + '_' + str(ymin) + ".jpg"
#         img_croped.crop((xmin, ymin, xmax, ymax)).save(cell_name)
        
    # write lables to txt
    with open(txt_name, 'w') as f:
        for label in labels_yolo:
            f.write(','.join(label) + '\n')
            
    print("processed ", img_name)

In [5]:
save_path = "./CC_yolo"
size = 608

for image_name in image_names:
    resize_img_with_padding(image_name, size, save_path)

processed  ./CC/2018-01-15-17_36_38_x35516_y13643_w290_h392.jpg
processed  ./CC/2017-09-07-18_47_44_x21331_y31270_w280_h632.jpg
processed  ./CC/2018-01-15-17_36_38_x31706_y31869_w388_h310.jpg
processed  ./CC/2017-09-07-18_47_44_x32544_y18086_w338_h396.jpg
processed  ./CC/2018-01-15-17_36_38_x7307_y40274_w484_h474.jpg
processed  ./CC/2017-09-07-18_47_44_x25617_y23376_w506_h324.jpg
processed  ./CC/2017-10-09-19_01_07_x52877_y40763_w326_h260.jpg
processed  ./CC/2018-01-15-17_52_36_x16884_y13624_w454_h462.jpg
processed  ./CC/2017-09-07-18_47_44_x25919_y13938_w344_h324.jpg
processed  ./CC/2017-09-07-18_47_44_x51861_y21597_w472_h608.jpg
processed  ./CC/2018-01-15-17_45_12_x29819_y22868_w534_h356.jpg
processed  ./CC/2017-09-07-18_47_44_x10470_y36268_w470_h420.jpg
processed  ./CC/2018-01-15-17_36_38_x11225_y44692_w296_h242.jpg
processed  ./CC/2017-09-07-18_47_44_x31515_y34090_w288_h434.jpg
processed  ./CC/2018-01-15-17_52_36_x17427_y47127_w354_h472.jpg
processed  ./CC/2018-01-15-17_36_38_x8129

processed  ./CC/2018-01-15-17_52_36_x18175_y41129_w462_h630.jpg
processed  ./CC/2018-01-15-17_52_36_x16068_y34410_w528_h370.jpg
processed  ./CC/2018-01-15-17_52_36_x20521_y11997_w478_h462.jpg
processed  ./CC/2018-01-15-17_41_06_x14313_y19783_w280_h428.jpg
processed  ./CC/2018-01-15-17_52_36_x24683_y5817_w450_h432.jpg
processed  ./CC/2018-01-15-17_36_38_x40328_y17265_w248_h458.jpg
processed  ./CC/2017-09-07-18_47_44_x23175_y8003_w530_h350.jpg
processed  ./CC/2017-09-07-18_47_44_x34206_y32292_w382_h354.jpg
processed  ./CC/2018-01-15-17_52_36_x18851_y20137_w228_h310.jpg
processed  ./CC/2018-01-15-17_36_38_x7991_y32261_w284_h242.jpg
processed  ./CC/2018-01-15-17_45_12_x8040_y27302_w458_h558.jpg
processed  ./CC/2017-09-07-18_47_44_x22484_y17034_w468_h400.jpg
processed  ./CC/2018-01-15-17_36_38_x22407_y13068_w330_h292.jpg
processed  ./CC/2018-01-15-17_36_38_x10946_y50670_w372_h340.jpg
processed  ./CC/2018-01-15-17_52_36_x25322_y22393_w350_h404.jpg
processed  ./CC/2018-01-15-17_36_38_x18236_y

processed  ./CC/2017-10-10-15_25_54_x13213_y16534_w490_h306.jpg
processed  ./CC/2017-09-07-18_47_44_x34365_y13841_w484_h406.jpg
processed  ./CC/2018-01-15-17_36_38_x18865_y19708_w314_h292.jpg
processed  ./CC/2018-01-15-17_52_36_x16656_y36748_w412_h356.jpg
processed  ./CC/2018-01-15-17_52_36_x19196_y27381_w376_h224.jpg
processed  ./CC/2018-01-15-17_52_36_x22113_y21717_w460_h512.jpg
processed  ./CC/2017-10-09-15_59_05_x54490_y44506_w354_h470.jpg
processed  ./CC/2018-01-15-17_36_38_x21172_y29467_w324_h420.jpg
processed  ./CC/2018-01-15-17_45_12_x26113_y28565_w444_h486.jpg
processed  ./CC/2017-10-09-18_43_24_x10121_y38247_w258_h454.jpg
processed  ./CC/2017-10-09-15_59_05_x23301_y37052_w398_h332.jpg
processed  ./CC/2018-01-15-17_45_12_x32698_y25894_w526_h630.jpg
processed  ./CC/2018-01-15-17_36_38_x47922_y36294_w386_h304.jpg
processed  ./CC/2017-09-07-18_47_44_x14239_y42406_w234_h444.jpg
processed  ./CC/2018-01-15-17_52_36_x25226_y7747_w438_h346.jpg
processed  ./CC/2018-01-15-17_41_06_x1438

processed  ./CC/2018-01-15-17_45_12_x9355_y37177_w424_h430.jpg
processed  ./CC/2018-01-15-17_52_36_x23542_y37594_w388_h372.jpg
processed  ./CC/2017-10-09-19_01_07_x26680_y29503_w226_h420.jpg
processed  ./CC/2017-09-07-18_47_44_x34180_y35924_w474_h398.jpg
processed  ./CC/2017-09-07-18_47_44_x33820_y44407_w414_h438.jpg
processed  ./CC/2017-10-10-15_25_54_x10345_y38300_w426_h422.jpg
processed  ./CC/2017-10-09-19_01_07_x19638_y19565_w314_h202.jpg
processed  ./CC/2017-10-09-19_01_07_x30036_y13907_w346_h332.jpg
processed  ./CC/2018-01-15-17_36_38_x16323_y26279_w464_h400.jpg
processed  ./CC/2018-01-15-17_36_38_x7852_y30996_w384_h454.jpg
processed  ./CC/2018-01-15-17_36_38_x37359_y16467_w266_h228.jpg
processed  ./CC/2017-09-07-18_47_44_x28626_y37030_w438_h466.jpg
processed  ./CC/2017-10-09-19_01_07_x50494_y45590_w418_h340.jpg
processed  ./CC/2018-01-15-17_41_06_x14000_y40657_w358_h336.jpg
processed  ./CC/2018-01-15-17_52_36_x26157_y49074_w442_h442.jpg
processed  ./CC/2018-01-15-17_36_38_x34154

processed  ./CC/2018-01-15-17_36_38_x27380_y34357_w536_h440.jpg
processed  ./CC/2017-10-09-19_01_07_x13199_y29721_w356_h396.jpg
processed  ./CC/2018-01-15-17_52_36_x22393_y21056_w512_h476.jpg
processed  ./CC/2018-01-15-17_52_36_x18592_y29297_w426_h324.jpg
processed  ./CC/2017-10-12-21_37_42_x24730_y41706_w306_h452.jpg
processed  ./CC/2018-01-15-17_36_38_x29128_y53843_w332_h286.jpg
processed  ./CC/2018-01-15-17_52_36_x28983_y5259_w468_h346.jpg
processed  ./CC/2018-01-15-17_52_36_x24624_y17978_w412_h280.jpg
processed  ./CC/2017-11-24-12_36_50_x34627_y45183_w344_h358.jpg
processed  ./CC/2017-10-10-15_25_54_x10572_y42052_w334_h348.jpg
processed  ./CC/2018-01-15-17_36_38_x42010_y39766_w466_h410.jpg
processed  ./CC/2017-09-07-18_47_44_x26398_y21196_w252_h480.jpg
processed  ./CC/2017-10-09-19_01_07_x25450_y14999_w400_h368.jpg
processed  ./CC/2017-10-09-19_01_07_x45637_y26536_w300_h386.jpg
processed  ./CC/2018-01-15-17_36_38_x22331_y36501_w454_h384.jpg
processed  ./CC/2017-09-07-18_47_44_x2750

processed  ./CC/2017-11-24-13_10_54_x7131_y28267_w386_h288.jpg
processed  ./CC/2017-10-10-15_25_54_x10250_y32926_w376_h346.jpg
processed  ./CC/2018-01-15-17_47_42_x27686_y16547_w402_h310.jpg
processed  ./CC/2018-01-15-17_52_36_x12919_y35541_w238_h464.jpg
processed  ./CC/2018-01-15-17_36_38_x24647_y42048_w220_h412.jpg
processed  ./CC/2018-01-15-17_52_36_x18002_y28439_w430_h288.jpg
processed  ./CC/2017-10-09-19_01_07_x36557_y33077_w350_h340.jpg
processed  ./CC/2018-01-15-17_43_13_x40168_y6740_w392_h314.jpg
processed  ./CC/2017-10-09-19_01_07_x24998_y25530_w400_h350.jpg
processed  ./CC/2017-09-07-18_47_44_x36784_y26707_w374_h308.jpg
processed  ./CC/2017-10-10-15_25_54_x15820_y15364_w354_h306.jpg
processed  ./CC/2017-10-10-16_20_45_x37970_y45366_w384_h464.jpg
processed  ./CC/2017-09-07-18_47_44_x44074_y32559_w454_h358.jpg
processed  ./CC/2018-01-15-17_36_38_x27684_y23231_w304_h382.jpg
processed  ./CC/2018-01-15-17_52_36_x23836_y22529_w546_h282.jpg
processed  ./CC/2017-11-24-13_10_54_x40973

processed  ./CC/2018-01-15-17_36_38_x16644_y13446_w410_h380.jpg
processed  ./CC/2017-10-10-15_25_54_x13803_y35779_w344_h432.jpg
processed  ./CC/2018-01-15-17_36_38_x5559_y41464_w284_h412.jpg
processed  ./CC/2017-10-10-16_20_45_x13311_y32813_w346_h486.jpg
processed  ./CC/2018-01-15-17_36_38_x45800_y31664_w424_h358.jpg
processed  ./CC/2017-09-07-18_47_44_x18269_y38573_w424_h462.jpg
processed  ./CC/2017-10-09-19_01_07_x9934_y30880_w296_h354.jpg
processed  ./CC/2017-10-09-19_01_07_x31530_y19778_w432_h366.jpg
processed  ./CC/2018-01-15-17_36_38_x24852_y46417_w290_h378.jpg
processed  ./CC/2018-01-15-17_36_38_x37637_y29546_w238_h348.jpg
processed  ./CC/2018-01-15-17_41_06_x6397_y30913_w400_h322.jpg
processed  ./CC/2017-10-10-17_23_26_x48938_y29046_w370_h262.jpg
processed  ./CC/2017-10-10-15_25_54_x56898_y34191_w430_h558.jpg
processed  ./CC/2018-01-15-17_36_38_x6237_y23831_w224_h350.jpg
processed  ./CC/2018-01-15-17_36_38_x23877_y9567_w288_h240.jpg
processed  ./CC/2018-01-15-17_36_38_x40675_y2

processed  ./CC/2017-10-09-19_01_07_x14088_y18798_w438_h326.jpg
processed  ./CC/2017-09-07-18_47_44_x31361_y38599_w502_h368.jpg
processed  ./CC/2018-01-15-17_36_38_x30530_y45452_w312_h198.jpg
processed  ./CC/2017-10-10-15_25_54_x17932_y45056_w426_h380.jpg
processed  ./CC/2018-01-15-17_36_38_x45848_y36918_w344_h308.jpg
processed  ./CC/2017-10-10-15_25_54_x17915_y35155_w528_h470.jpg
processed  ./CC/2017-10-09-19_01_07_x29627_y39954_w314_h254.jpg
processed  ./CC/2017-11-24-13_16_54_x20087_y23886_w320_h440.jpg
processed  ./CC/2018-01-15-17_52_36_x22240_y9589_w216_h306.jpg
processed  ./CC/2017-11-24-13_16_54_x20322_y26544_w330_h312.jpg
processed  ./CC/2017-09-07-18_47_44_x12197_y24536_w334_h588.jpg
processed  ./CC/2017-09-22-13_57_59_x48874_y11075_w386_h348.jpg
processed  ./CC/2017-10-09-19_01_07_x27349_y10765_w430_h364.jpg
processed  ./CC/2017-09-07-18_47_44_x13110_y22188_w442_h372.jpg
processed  ./CC/2017-10-09-15_59_05_x54255_y36414_w336_h340.jpg
processed  ./CC/2018-01-15-17_52_36_x2173

processed  ./CC/2017-10-10-15_25_54_x13927_y30177_w376_h520.jpg
processed  ./CC/2018-01-15-17_52_36_x19297_y32582_w424_h410.jpg
processed  ./CC/2017-10-09-15_59_05_x45089_y21909_w562_h212.jpg
processed  ./CC/2018-01-15-17_41_06_x11600_y15161_w326_h452.jpg
processed  ./CC/2017-10-10-15_25_54_x10851_y33900_w544_h322.jpg
processed  ./CC/2018-01-15-17_36_38_x36716_y36076_w386_h338.jpg
processed  ./CC/2017-09-07-09_24_10_x22420_y19050_w390_h410.jpg
processed  ./CC/2017-10-09-15_59_05_x43813_y18783_w430_h384.jpg
processed  ./CC/2017-10-10-16_20_45_x42193_y11331_w376_h370.jpg
processed  ./CC/2018-01-15-17_36_38_x32726_y38399_w412_h284.jpg
processed  ./CC/2018-01-15-17_45_12_x8826_y37057_w424_h376.jpg
processed  ./CC/2017-10-10-15_25_54_x18624_y40841_w572_h460.jpg
processed  ./CC/2017-10-12-21_37_42_x24557_y28969_w380_h288.jpg
processed  ./CC/2017-09-07-18_47_44_x48716_y45969_w366_h390.jpg
processed  ./CC/2017-10-09-15_59_05_x16279_y46694_w316_h386.jpg
processed  ./CC/2018-01-15-17_52_36_x1565

processed  ./CC/2017-10-10-15_25_54_x54894_y31666_w492_h516.jpg
processed  ./CC/2018-01-15-17_36_38_x10476_y31356_w416_h418.jpg
processed  ./CC/2017-09-07-18_47_44_x21309_y10904_w378_h296.jpg
processed  ./CC/2017-10-10-15_25_54_x9667_y32814_w348_h650.jpg
processed  ./CC/2018-01-15-17_52_36_x14439_y48868_w414_h506.jpg
processed  ./CC/2018-01-15-17_36_38_x33034_y29291_w280_h248.jpg
processed  ./CC/2017-11-24-13_12_52_x50262_y22855_w296_h316.jpg
processed  ./CC/2017-10-09-19_01_07_x46332_y23026_w288_h448.jpg
processed  ./CC/2018-01-15-17_52_36_x18769_y33948_w340_h488.jpg
processed  ./CC/2018-01-15-17_52_36_x21125_y51519_w190_h300.jpg
processed  ./CC/2018-01-15-17_36_38_x36624_y14572_w414_h388.jpg
processed  ./CC/2017-10-10-15_25_54_x13056_y38896_w472_h478.jpg
processed  ./CC/2018-01-15-17_52_36_x15051_y43147_w570_h386.jpg
processed  ./CC/2017-10-09-19_01_07_x10265_y15055_w354_h466.jpg
processed  ./CC/2017-11-24-13_16_54_x17565_y22175_w300_h248.jpg
processed  ./CC/2017-09-07-18_47_44_x4588

processed  ./CC/2018-01-15-17_36_38_x35967_y26463_w412_h292.jpg
processed  ./CC/2018-01-15-17_52_36_x25395_y15536_w488_h426.jpg
processed  ./CC/2017-09-07-18_47_44_x38175_y24869_w296_h446.jpg
processed  ./CC/2018-01-15-17_41_06_x15169_y41133_w362_h414.jpg
processed  ./CC/2018-01-15-17_36_38_x13267_y34629_w502_h466.jpg
processed  ./CC/2018-01-15-17_52_36_x17956_y12178_w370_h230.jpg
processed  ./CC/2017-10-09-19_01_07_x7408_y35115_w382_h264.jpg
processed  ./CC/2017-10-09-19_01_07_x12609_y39199_w276_h452.jpg
processed  ./CC/2017-10-09-19_01_07_x33135_y49206_w274_h390.jpg
processed  ./CC/2018-01-15-17_36_38_x17025_y50236_w346_h306.jpg
processed  ./CC/2017-09-07-18_47_44_x29845_y14244_w446_h464.jpg
processed  ./CC/2017-09-07-18_47_44_x13647_y13406_w354_h428.jpg
processed  ./CC/2017-10-09-19_01_07_x46985_y13861_w424_h466.jpg
processed  ./CC/2018-01-15-17_52_36_x17576_y32885_w362_h194.jpg
processed  ./CC/2017-09-07-18_47_44_x40270_y41066_w484_h530.jpg
processed  ./CC/2018-01-15-17_45_12_x1095

processed  ./CC/2017-10-10-16_20_45_x15140_y11866_w528_h494.jpg
processed  ./CC/2017-09-07-18_47_44_x32390_y9460_w394_h482.jpg
processed  ./CC/2018-01-15-17_41_06_x5807_y31500_w316_h384.jpg
processed  ./CC/2018-01-15-17_36_38_x21857_y27181_w342_h308.jpg
processed  ./CC/2018-01-15-17_36_38_x10971_y36103_w360_h470.jpg
processed  ./CC/2018-01-15-17_36_38_x21091_y21462_w482_h378.jpg
processed  ./CC/2018-01-15-17_36_38_x22193_y16978_w296_h394.jpg
processed  ./CC/2018-01-15-17_52_36_x24166_y15930_w350_h472.jpg
processed  ./CC/2018-01-15-17_36_38_x15209_y49583_w404_h366.jpg
processed  ./CC/2018-01-15-17_36_38_x48485_y26303_w366_h420.jpg
processed  ./CC/2017-10-10-16_20_45_x46028_y35594_w488_h582.jpg
processed  ./CC/2017-09-07-18_47_44_x29527_y23443_w472_h444.jpg
processed  ./CC/2017-10-09-19_01_07_x35265_y20276_w376_h276.jpg
processed  ./CC/2018-01-15-17_36_38_x22119_y40835_w402_h326.jpg
processed  ./CC/2017-09-07-18_47_44_x27596_y5756_w446_h302.jpg
processed  ./CC/2018-01-15-17_45_12_x9151_y

processed  ./CC/2018-01-15-17_52_36_x14547_y48518_w274_h220.jpg
processed  ./CC/2018-01-15-17_52_36_x20304_y33197_w404_h414.jpg
processed  ./CC/2018-01-15-17_36_38_x21493_y53086_w412_h296.jpg
processed  ./CC/2017-09-07-18_47_44_x38489_y46623_w324_h362.jpg
processed  ./CC/2017-10-09-19_01_07_x26587_y39797_w352_h430.jpg
processed  ./CC/2017-09-07-18_47_44_x48192_y7200_w448_h512.jpg
processed  ./CC/2017-10-09-19_01_07_x20835_y51836_w350_h456.jpg
processed  ./CC/2017-09-07-18_47_44_x37842_y36256_w388_h390.jpg
processed  ./CC/2018-01-15-17_36_38_x42519_y49978_w380_h402.jpg
processed  ./CC/2018-01-15-17_36_38_x26993_y51739_w296_h374.jpg
processed  ./CC/2018-01-15-17_45_12_x8734_y15785_w424_h394.jpg
processed  ./CC/2017-09-07-18_47_44_x48860_y32050_w284_h408.jpg
processed  ./CC/2017-10-09-19_01_07_x32670_y7494_w322_h368.jpg
processed  ./CC/2017-10-10-15_25_54_x14263_y21355_w502_h400.jpg
processed  ./CC/2017-09-07-18_47_44_x7915_y32127_w430_h412.jpg
processed  ./CC/2018-01-15-17_36_38_x9332_y1

processed  ./CC/2017-10-10-16_20_45_x38532_y14699_w508_h334.jpg
processed  ./CC/2017-10-09-19_01_07_x18141_y8894_w236_h472.jpg
processed  ./CC/2017-10-10-16_20_45_x37405_y34393_w444_h592.jpg
processed  ./CC/2017-11-24-13_16_54_x18866_y27910_w402_h280.jpg
processed  ./CC/2017-10-09-15_59_05_x29194_y26766_w472_h452.jpg
processed  ./CC/2017-10-10-15_25_54_x14238_y38190_w526_h386.jpg
processed  ./CC/2017-10-10-17_23_26_x22777_y45435_w310_h316.jpg
processed  ./CC/2017-09-07-09_41_39_x41569_y10871_w340_h294.jpg
processed  ./CC/2017-10-09-15_59_05_x50943_y45195_w298_h438.jpg
processed  ./CC/2018-01-15-17_36_38_x28181_y23313_w304_h372.jpg
processed  ./CC/2018-01-15-17_41_06_x35277_y12934_w438_h480.jpg
processed  ./CC/2018-01-15-17_52_36_x23610_y18051_w328_h388.jpg
processed  ./CC/2018-01-15-17_52_36_x15825_y36133_w370_h404.jpg
processed  ./CC/2017-10-09-15_59_05_x55028_y21386_w366_h462.jpg
processed  ./CC/2017-09-07-18_47_44_x28859_y31602_w428_h480.jpg
processed  ./CC/2017-11-24-13_10_54_x4758

### split data into train/valid

In [6]:
import random
import shutil

data_path = "./CC_yolo"
img_names = [os.path.join(data_path, f) for f in os.listdir(data_path) if f.endswith(".jpg")]

random.shuffle(img_names)
random.shuffle(img_names)

split = 0.8

train_names = img_names[:int(len(img_names)*split)]
valid_names = img_names[int(len(img_names)*split):]

train_path = "./CC_yolo"
for img_name in train_names:
    txt_name = os.path.splitext(img_name)[0] + ".txt"
    shutil.move(img_name, os.path.join(train_path, "train"))
    shutil.move(txt_name, os.path.join(train_path, "train"))
    
for img_name in valid_names:
    txt_name = os.path.splitext(img_name)[0] + ".txt"
    shutil.move(img_name, os.path.join(train_path, "valid"))
    shutil.move(txt_name, os.path.join(train_path, "valid"))

### kmeans clustering

In [10]:
def scan_files(directory, prefix=None, postfix=None):
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    return files_list

def collect_sizes_from_yolotxt(txt_name, size):
    sizes = []
    with open(txt_name, 'r') as f:
        for line in f.readlines():
            tokens = line.strip().split(',')
#             print(tokens)
            w = int(float(tokens[3])*size)
            h = int(float(tokens[4])*size)
            sizes.append([w, h])
    return sizes

def collect_sizes_from_yolotxt_all(txt_names, size):
    sizes = []
    for txt_name in txt_names:
        sizes += collect_sizes_from_yolotxt(txt_name, size)
    return sizes

In [12]:
sizes = collect_sizes_from_yolotxt_all(scan_files("./CC_yolo", postfix=".txt"), 608)

# kmeans clustering
X = np.array(sizes)
kmeans = KMeans(n_clusters=9, random_state=2018).fit(X)
print(kmeans)

['0', '0.5213815789473685', '0.5032894736842105', '0.3223684210526316', '0.4144736842105263']
['0', '0.5032894736842105', '0.5008223684210527', '0.27631578947368424', '0.3470394736842105']
['0', '0.5082236842105263', '0.4876644736842105', '0.3157894736842105', '0.3404605263157895']
['0', '0.4901315789473684', '0.5008223684210527', '0.22697368421052633', '0.3371710526315789']
['0', '0.5016447368421053', '0.4942434210526316', '0.34210526315789475', '0.29769736842105265']
['0', '0.5205592105263158', '0.4975328947368421', '0.3667763157894737', '0.29769736842105265']
['0', '0.4975328947368421', '0.5024671052631579', '0.2878289473684211', '0.2944078947368421']
['0', '0.49917763157894735', '0.4967105263157895', '0.4292763157894737', '0.3717105263157895']
['0', '0.524671052631579', '0.5074013157894737', '0.3355263157894737', '0.37664473684210525']
['0', '0.5032894736842105', '0.4917763157894737', '0.25', '0.3026315789473684']
['0', '0.4975328947368421', '0.48355263157894735', '0.29111842105263

['0', '0.49588815789473684', '0.49506578947368424', '0.23848684210526316', '0.34868421052631576']
['0', '0.5057565789473685', '0.5024671052631579', '0.26480263157894735', '0.2450657894736842']
['0', '0.49588815789473684', '0.49835526315789475', '0.23190789473684212', '0.24013157894736842']
['0', '0.4975328947368421', '0.5139802631578947', '0.3569078947368421', '0.36019736842105265']
['0', '0.5', '0.5016447368421053', '0.27960526315789475', '0.33881578947368424']
['0', '0.5032894736842105', '0.48848684210526316', '0.1513157894736842', '0.27631578947368424']
['0', '0.48930921052631576', '0.5115131578947368', '0.36348684210526316', '0.3125']
['0', '0.4967105263157895', '0.5213815789473685', '0.3223684210526316', '0.35855263157894735']
['0', '0.4942434210526316', '0.4967105263157895', '0.2911184210526316', '0.2598684210526316']
['0', '0.5024671052631579', '0.5024671052631579', '0.32401315789473684', '0.20559210526315788']
['0', '0.49917763157894735', '0.5476973684210527', '0.36348684210526

In [13]:
centers = []
for center in kmeans.cluster_centers_:
    print(center)
    centers.append(center)

[147.3045977  146.37356322]
[186.50276243 249.98895028]
[246.2568306 170.3442623]
[287.83333333 239.97368421]
[153.48       200.14222222]
[233.54275093 220.91449814]
[240.08403361 277.05882353]
[197.34421365 196.72700297]
[194.19066148 153.46303502]


In [14]:
tosort = {int(center[0]*center[1]):center for center in centers}
print(tosort)

{69072: array([287.83333333, 239.97368421]), 66517: array([240.08403361, 277.05882353]), 38822: array([197.34421365, 196.72700297]), 29801: array([194.19066148, 153.46303502]), 51592: array([233.54275093, 220.91449814]), 21561: array([147.3045977 , 146.37356322]), 41948: array([246.2568306, 170.3442623]), 30717: array([153.48      , 200.14222222]), 46623: array([186.50276243, 249.98895028])}


In [15]:
hassorted = sorted(tosort.items())
print(hassorted)
print(",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))

[(21561, array([147.3045977 , 146.37356322])), (29801, array([194.19066148, 153.46303502])), (30717, array([153.48      , 200.14222222])), (38822, array([197.34421365, 196.72700297])), (41948, array([246.2568306, 170.3442623])), (46623, array([186.50276243, 249.98895028])), (51592, array([233.54275093, 220.91449814])), (66517, array([240.08403361, 277.05882353])), (69072, array([287.83333333, 239.97368421]))]
147,146,  194,153,  153,200,  197,196,  246,170,  186,249,  233,220,  240,277,  287,239


### cut cells from yolo images/txts

In [1]:
import os
from PIL import Image

image_dir = "./CC_yolo/train"
save_path = "./CC_cell"

def cut_cell(img_name, txt_name, save_path):
    img = Image.open(img_name)
    w,h = img.size
    labels = []
    with open(txt_name, 'r') as f:
        for line in f.readlines():
            tokens = line.strip().split(',')
            cx, cy = float(tokens[1]), float(tokens[2])
            w_, h_ = float(tokens[3]), float(tokens[4])
            xmin, ymin = int((cx-w_/2)*w), int((cy-h_/2)*h)
            xmax, ymax = int((cx+w_/2)*w), int((cy+h_/2)*h)
            img.crop((xmin, ymin, xmax, ymax)).save(os.path.join(save_path, "{}_{}_{}_{}.jpg".format(xmin, ymin, xmax, ymax)))
    img.close()
    
img_names = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith(".jpg")]
for img_name in img_names:
    txt_name = os.path.splitext(img_name)[0] + ".txt"
    cut_cell(img_name, txt_name, save_path)