In [1]:
import os
import numpy as np
from sklearn.cluster import KMeans

from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed

In [6]:
def scan_files(directory, prefix=None, postfix=None):
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    return files_list


def collect_sizes(txt_fname, size):
    sizes = []
    with open(txt_fname, 'r') as f:
        for line in f.readlines():
            tokens = line.strip().split()
            sizes.append([int(float(tokens[3])*size), int(float(tokens[4])*size)])
    return sizes

def batch_collect_sizes(txt_fnames, size):
    sizes = []
    for txt_fname in txt_fnames:
        sizes += collect_sizes(txt_fname, size)
    return sizes

def worker_single(data_path, size=608):
    txt_fnames = scan_files(data_path, postfix=".txt")
    sizes = []
    for i,txt_fname in enumerate(txt_fnames):
        if i % 10000 == 0:
            print(i)
        sizes += collect_sizes(txt_fname, size)
    return sizes

def worker_multip(data_path, size=608):
    files = scan_files(data_path, postfix=".txt")
    print("# files:", len(files))

    executor = ProcessPoolExecutor(max_workers=cpu_count())
    tasks = []

    batch_size = 1000
    for i in range(0, len(files), batch_size):
        batch = files[i : i+batch_size]
        tasks.append(executor.submit(batch_collect_sizes, batch, size))
    
    sizes = []
    job_count = len(tasks)
    for future in as_completed(tasks):
        result = future.result()  # get the returning result from calling fuction
        sizes += result
        job_count -= 1
        print("One Job Done, Remaining Job Count: %s, Files collected: %d" % (job_count, len(sizes)))
        
    return sizes

In [7]:
data_path = "/home/hdd0/Develop/liyu/batch6.3_1216"
sizes = worker_multip(data_path)

print("# files", len(sizes))

# files: 1294695
One Job Done, Remaining Job Count: 1294, Files collected: 2117
One Job Done, Remaining Job Count: 1293, Files collected: 4209
One Job Done, Remaining Job Count: 1292, Files collected: 6386
One Job Done, Remaining Job Count: 1291, Files collected: 8609
One Job Done, Remaining Job Count: 1290, Files collected: 10685
One Job Done, Remaining Job Count: 1289, Files collected: 12825
One Job Done, Remaining Job Count: 1288, Files collected: 15082
One Job Done, Remaining Job Count: 1287, Files collected: 17242
One Job Done, Remaining Job Count: 1286, Files collected: 19427
One Job Done, Remaining Job Count: 1285, Files collected: 21524
One Job Done, Remaining Job Count: 1284, Files collected: 23540
One Job Done, Remaining Job Count: 1283, Files collected: 25712
One Job Done, Remaining Job Count: 1282, Files collected: 27841
One Job Done, Remaining Job Count: 1281, Files collected: 30122
One Job Done, Remaining Job Count: 1280, Files collected: 32279
One Job Done, Remaining Job

One Job Done, Remaining Job Count: 1060, Files collected: 495764
One Job Done, Remaining Job Count: 1059, Files collected: 497885
One Job Done, Remaining Job Count: 1058, Files collected: 500067
One Job Done, Remaining Job Count: 1057, Files collected: 502212
One Job Done, Remaining Job Count: 1056, Files collected: 504457
One Job Done, Remaining Job Count: 1055, Files collected: 506589
One Job Done, Remaining Job Count: 1054, Files collected: 508592
One Job Done, Remaining Job Count: 1053, Files collected: 510684
One Job Done, Remaining Job Count: 1052, Files collected: 512826
One Job Done, Remaining Job Count: 1051, Files collected: 515017
One Job Done, Remaining Job Count: 1050, Files collected: 517215
One Job Done, Remaining Job Count: 1049, Files collected: 519407
One Job Done, Remaining Job Count: 1048, Files collected: 521565
One Job Done, Remaining Job Count: 1047, Files collected: 523685
One Job Done, Remaining Job Count: 1046, Files collected: 525757
One Job Done, Remaining J

Traceback (most recent call last):
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/queues.py", line 234, in _feed
    obj = ForkingPickler.dumps(obj)
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/reduction.py", line 50, in dumps
    cls(buf, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function batch_collect_sizes at 0x7f35bf0e30d0>: it's not the same object as __main__.batch_collect_sizes
Traceback (most recent call last):
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/queues.py", line 234, in _feed
    obj = ForkingPickler.dumps(obj)
  File "/home/unicorn/.conda/envs/algo-work/lib/python3.5/multiprocessing/reduction.py", line 50, in dumps
    cls(buf, protocol).dump(obj)
_pickle.PicklingError: Can't pickle <function batch_collect_sizes at 0x7f35bf0e30d0>: it's not the same object as __main__.batch_collect_sizes
Traceback (most recent call last):
  File "/home/unicorn/.conda/envs/algo-work/lib

One Job Done, Remaining Job Count: 1028, Files collected: 563188
One Job Done, Remaining Job Count: 1027, Files collected: 565399
One Job Done, Remaining Job Count: 1026, Files collected: 567568
One Job Done, Remaining Job Count: 1025, Files collected: 569489
One Job Done, Remaining Job Count: 1024, Files collected: 571493
One Job Done, Remaining Job Count: 1023, Files collected: 573619
One Job Done, Remaining Job Count: 1022, Files collected: 575743
One Job Done, Remaining Job Count: 1021, Files collected: 577873
One Job Done, Remaining Job Count: 1020, Files collected: 580208
One Job Done, Remaining Job Count: 1019, Files collected: 582378
One Job Done, Remaining Job Count: 1018, Files collected: 584570
One Job Done, Remaining Job Count: 1017, Files collected: 586706
One Job Done, Remaining Job Count: 1016, Files collected: 588810
One Job Done, Remaining Job Count: 1015, Files collected: 590935
One Job Done, Remaining Job Count: 1014, Files collected: 592988
One Job Done, Remaining J

One Job Done, Remaining Job Count: 900, Files collected: 834796
One Job Done, Remaining Job Count: 899, Files collected: 837057
One Job Done, Remaining Job Count: 898, Files collected: 839273
One Job Done, Remaining Job Count: 897, Files collected: 841282
One Job Done, Remaining Job Count: 896, Files collected: 843461
One Job Done, Remaining Job Count: 895, Files collected: 845576
One Job Done, Remaining Job Count: 894, Files collected: 847642
One Job Done, Remaining Job Count: 893, Files collected: 849736
One Job Done, Remaining Job Count: 892, Files collected: 851964
One Job Done, Remaining Job Count: 891, Files collected: 853994
One Job Done, Remaining Job Count: 890, Files collected: 856005
One Job Done, Remaining Job Count: 889, Files collected: 857946
One Job Done, Remaining Job Count: 888, Files collected: 859896
One Job Done, Remaining Job Count: 887, Files collected: 862200
One Job Done, Remaining Job Count: 886, Files collected: 864443
One Job Done, Remaining Job Count: 885, 

One Job Done, Remaining Job Count: 772, Files collected: 1105567
One Job Done, Remaining Job Count: 771, Files collected: 1107827
One Job Done, Remaining Job Count: 770, Files collected: 1109961
One Job Done, Remaining Job Count: 769, Files collected: 1112035
One Job Done, Remaining Job Count: 768, Files collected: 1114262
One Job Done, Remaining Job Count: 767, Files collected: 1116338
One Job Done, Remaining Job Count: 766, Files collected: 1118455
One Job Done, Remaining Job Count: 765, Files collected: 1120518
One Job Done, Remaining Job Count: 764, Files collected: 1122678
One Job Done, Remaining Job Count: 763, Files collected: 1124709
One Job Done, Remaining Job Count: 762, Files collected: 1126779
One Job Done, Remaining Job Count: 761, Files collected: 1128806
One Job Done, Remaining Job Count: 760, Files collected: 1130927
One Job Done, Remaining Job Count: 759, Files collected: 1133123
One Job Done, Remaining Job Count: 758, Files collected: 1135198
One Job Done, Remaining J

One Job Done, Remaining Job Count: 645, Files collected: 1373172
One Job Done, Remaining Job Count: 644, Files collected: 1375210
One Job Done, Remaining Job Count: 643, Files collected: 1377481
One Job Done, Remaining Job Count: 642, Files collected: 1379570
One Job Done, Remaining Job Count: 641, Files collected: 1381566
One Job Done, Remaining Job Count: 640, Files collected: 1383620
One Job Done, Remaining Job Count: 639, Files collected: 1385635
One Job Done, Remaining Job Count: 638, Files collected: 1387799
One Job Done, Remaining Job Count: 637, Files collected: 1389943
One Job Done, Remaining Job Count: 636, Files collected: 1392042
One Job Done, Remaining Job Count: 635, Files collected: 1394318
One Job Done, Remaining Job Count: 634, Files collected: 1396523
One Job Done, Remaining Job Count: 633, Files collected: 1398741
One Job Done, Remaining Job Count: 632, Files collected: 1401003
One Job Done, Remaining Job Count: 631, Files collected: 1403315
One Job Done, Remaining J

One Job Done, Remaining Job Count: 518, Files collected: 1641478
One Job Done, Remaining Job Count: 517, Files collected: 1643523
One Job Done, Remaining Job Count: 516, Files collected: 1645482
One Job Done, Remaining Job Count: 515, Files collected: 1647587
One Job Done, Remaining Job Count: 514, Files collected: 1649720
One Job Done, Remaining Job Count: 513, Files collected: 1651815
One Job Done, Remaining Job Count: 512, Files collected: 1654026
One Job Done, Remaining Job Count: 511, Files collected: 1656149
One Job Done, Remaining Job Count: 510, Files collected: 1658309
One Job Done, Remaining Job Count: 509, Files collected: 1660489
One Job Done, Remaining Job Count: 508, Files collected: 1662578
One Job Done, Remaining Job Count: 507, Files collected: 1664610
One Job Done, Remaining Job Count: 506, Files collected: 1666750
One Job Done, Remaining Job Count: 505, Files collected: 1668793
One Job Done, Remaining Job Count: 504, Files collected: 1670985
One Job Done, Remaining J

One Job Done, Remaining Job Count: 390, Files collected: 1912864
One Job Done, Remaining Job Count: 389, Files collected: 1915083
One Job Done, Remaining Job Count: 388, Files collected: 1917213
One Job Done, Remaining Job Count: 387, Files collected: 1919197
One Job Done, Remaining Job Count: 386, Files collected: 1921370
One Job Done, Remaining Job Count: 385, Files collected: 1923280
One Job Done, Remaining Job Count: 384, Files collected: 1925441
One Job Done, Remaining Job Count: 383, Files collected: 1927492
One Job Done, Remaining Job Count: 382, Files collected: 1929577
One Job Done, Remaining Job Count: 381, Files collected: 1931647
One Job Done, Remaining Job Count: 380, Files collected: 1933777
One Job Done, Remaining Job Count: 379, Files collected: 1935934
One Job Done, Remaining Job Count: 378, Files collected: 1938028
One Job Done, Remaining Job Count: 377, Files collected: 1940273
One Job Done, Remaining Job Count: 376, Files collected: 1942447
One Job Done, Remaining J

One Job Done, Remaining Job Count: 263, Files collected: 2181562
One Job Done, Remaining Job Count: 262, Files collected: 2183671
One Job Done, Remaining Job Count: 261, Files collected: 2185744
One Job Done, Remaining Job Count: 260, Files collected: 2187746
One Job Done, Remaining Job Count: 259, Files collected: 2189888
One Job Done, Remaining Job Count: 258, Files collected: 2191940
One Job Done, Remaining Job Count: 257, Files collected: 2193960
One Job Done, Remaining Job Count: 256, Files collected: 2196016
One Job Done, Remaining Job Count: 255, Files collected: 2198014
One Job Done, Remaining Job Count: 254, Files collected: 2200225
One Job Done, Remaining Job Count: 253, Files collected: 2202497
One Job Done, Remaining Job Count: 252, Files collected: 2204550
One Job Done, Remaining Job Count: 251, Files collected: 2206651
One Job Done, Remaining Job Count: 250, Files collected: 2208793
One Job Done, Remaining Job Count: 249, Files collected: 2211053
One Job Done, Remaining J

One Job Done, Remaining Job Count: 136, Files collected: 2451911
One Job Done, Remaining Job Count: 135, Files collected: 2454028
One Job Done, Remaining Job Count: 134, Files collected: 2456152
One Job Done, Remaining Job Count: 133, Files collected: 2458131
One Job Done, Remaining Job Count: 132, Files collected: 2460232
One Job Done, Remaining Job Count: 131, Files collected: 2462342
One Job Done, Remaining Job Count: 130, Files collected: 2464305
One Job Done, Remaining Job Count: 129, Files collected: 2466308
One Job Done, Remaining Job Count: 128, Files collected: 2468388
One Job Done, Remaining Job Count: 127, Files collected: 2470459
One Job Done, Remaining Job Count: 126, Files collected: 2472664
One Job Done, Remaining Job Count: 125, Files collected: 2474908
One Job Done, Remaining Job Count: 124, Files collected: 2477037
One Job Done, Remaining Job Count: 123, Files collected: 2479276
One Job Done, Remaining Job Count: 122, Files collected: 2481395
One Job Done, Remaining J

One Job Done, Remaining Job Count: 8, Files collected: 2723991
One Job Done, Remaining Job Count: 7, Files collected: 2726065
One Job Done, Remaining Job Count: 6, Files collected: 2728228
One Job Done, Remaining Job Count: 5, Files collected: 2730436
One Job Done, Remaining Job Count: 4, Files collected: 2732618
One Job Done, Remaining Job Count: 3, Files collected: 2734077
One Job Done, Remaining Job Count: 2, Files collected: 2736131
One Job Done, Remaining Job Count: 1, Files collected: 2738206
One Job Done, Remaining Job Count: 0, Files collected: 2740557
# files 2740557


In [None]:
# collects sizes from additional folder
data_path = "/home/hdd_array0/batch6_1216/VOC2012/images-HSIL-bar"
sizes2 = main(data_path)
print("# files", len(sizes2))

sizes += sizes2
print("# files", len(sizes))

#### save sizes

In [2]:
import pickle

# # save file
# with open("batch6.3-ori-sizes.pkl", 'wb') as f:
#     pickle.dump(sizes, f)

# open saved file
with open("batch6.3-ori-sizes.pkl", 'rb') as f:
    sizes = pickle.load(f)

In [3]:
print(len(sizes))
print([item for item in sizes if item[0] >= 608 or item[1] >= 608])

2740557
[[608, 305], [186, 608], [608, 316], [608, 217], [608, 288], [306, 608], [608, 218], [608, 345], [608, 143], [608, 247], [608, 175], [206, 608], [246, 608], [608, 217], [608, 608], [608, 243], [608, 227], [246, 608], [289, 608], [608, 219], [289, 608], [245, 608], [608, 229], [297, 608], [608, 246], [608, 608], [608, 243], [608, 608], [608, 266], [608, 304], [608, 274], [206, 608], [608, 279], [296, 608], [295, 608], [608, 175], [282, 608], [608, 222], [316, 608], [608, 213], [370, 608], [608, 303], [608, 272], [608, 279], [608, 274], [296, 608], [330, 608], [608, 166], [296, 608], [608, 273], [608, 175], [608, 215], [608, 283], [608, 303], [608, 280], [608, 303], [295, 608], [608, 331], [608, 305], [281, 608], [608, 219], [608, 282], [608, 283], [608, 213], [608, 144], [608, 288], [608, 213], [608, 228], [608, 274], [608, 243], [295, 608], [608, 218], [608, 166], [316, 608], [370, 608], [608, 331], [608, 345], [295, 608], [608, 228], [206, 608], [608, 302], [608, 265], [289, 6

### plot scattered points

In [None]:
import matplotlib.pyplot as plt

In [None]:
x = [size[0] for size in sizes]
y = [size[1] for size in sizes]

#### sizes, kmeans 15

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
"""
#1class: 12,12,  28,29,  40,65,  62,43,  77,84,  108,63,  78,129,  113,103,  197,89,  142,145,  127,227,  220,183,  277,285,  573,255,  337,595
#11 classes: 12,12,  28,28,  65,40,  43,63,  84,77,  67,110,  129,76,  106,112,  93,192,  148,140,  232,122,  185,217,  284,277,  264,577,  587,314
"""

plt.show()

#### sizes_new, kmeans 15

In [None]:
# get the number of boxes that is bigger > 600
w_large = [size for size in sizes if size[0] > 600]
h_large = [size for size in sizes if size[1] > 600]
print("  ", "total", len(sizes), "w_large", len(w_large), "h_large", len(h_large))

In [None]:
# remove abnormal boxes
sizes_new = [size for size in sizes if size[0] < 600 and size[1] < 600]
print("  ", "sizes_new total", len(sizes_new))


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
"""
12,12,  27,28,  55,35,  38,58,  76,65,  62,96,  124,63,  97,94,  91,137,  133,113,  231,111,  160,163,  114,230,  222,233,  346,298
"""

plt.show()

#### sizes, cv2.kmeans2 15

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
"""
13,13,  29,34,  49,27,  69,50,  54,77,  101,75,  85,108,  192,75,  127,119,  112,193,  193,162,  244,258,  535,247,  248,583,  556,560
"""

plt.show()

#### sizes_new, cv2.kmeans2 15

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
"""
12,12,  26,27,  48,29,  26,59,  48,53,  80,63,  59,94,  129,61,  94,100,  130,114,  96,173,  229,113,  171,175,  194,291,  308,246
"""

plt.show()

### kmeans

In [9]:
X = np.array(sizes)
kmeans = KMeans(n_clusters=15, random_state=1).fit(X)

In [10]:
centers = []
for center in kmeans.cluster_centers_:
    print("du", center)
    centers.append(center)
print(centers)

du [ 22.98086634  23.2255064 ]
du [ 100.27611079   99.94783955]
du [ 223.41498143  110.37364149]
du [ 12.5134726   12.51055885]
du [ 64.69865801  41.2986681 ]
du [ 125.67787524  225.49415205]
du [ 43.44046915  65.58272368]
du [ 124.22938733   61.46071679]
du [ 137.27112783  115.94252681]
du [  95.08829886  148.07895406]
du [ 238.08174331  228.98490048]
du [ 80.11308856  73.81451565]
du [  63.74935099  103.98852026]
du [ 164.35727273  165.95059561]
du [ 33.61110741  34.8148654 ]
[array([ 22.98086634,  23.2255064 ]), array([ 100.27611079,   99.94783955]), array([ 223.41498143,  110.37364149]), array([ 12.5134726 ,  12.51055885]), array([ 64.69865801,  41.2986681 ]), array([ 125.67787524,  225.49415205]), array([ 43.44046915,  65.58272368]), array([ 124.22938733,   61.46071679]), array([ 137.27112783,  115.94252681]), array([  95.08829886,  148.07895406]), array([ 238.08174331,  228.98490048]), array([ 80.11308856,  73.81451565]), array([  63.74935099,  103.98852026]), array([ 164.3572727

In [11]:
tosort = {int(center[0]*center[1]):center for center in centers}
print(tosort)

{2848: array([ 43.44046915,  65.58272368]), 14080: array([  95.08829886,  148.07895406]), 27275: array([ 164.35727273,  165.95059561]), 28339: array([ 125.67787524,  225.49415205]), 10022: array([ 100.27611079,   99.94783955]), 15915: array([ 137.27112783,  115.94252681]), 7635: array([ 124.22938733,   61.46071679]), 2671: array([ 64.69865801,  41.2986681 ]), 1170: array([ 33.61110741,  34.8148654 ]), 24659: array([ 223.41498143,  110.37364149]), 533: array([ 22.98086634,  23.2255064 ]), 5913: array([ 80.11308856,  73.81451565]), 156: array([ 12.5134726 ,  12.51055885]), 6629: array([  63.74935099,  103.98852026]), 54517: array([ 238.08174331,  228.98490048])}


In [12]:
hassorted = sorted(tosort.items())
print(hassorted)
print("  " + ",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))

[(156, array([ 12.5134726 ,  12.51055885])), (533, array([ 22.98086634,  23.2255064 ])), (1170, array([ 33.61110741,  34.8148654 ])), (2671, array([ 64.69865801,  41.2986681 ])), (2848, array([ 43.44046915,  65.58272368])), (5913, array([ 80.11308856,  73.81451565])), (6629, array([  63.74935099,  103.98852026])), (7635, array([ 124.22938733,   61.46071679])), (10022, array([ 100.27611079,   99.94783955])), (14080, array([  95.08829886,  148.07895406])), (15915, array([ 137.27112783,  115.94252681])), (24659, array([ 223.41498143,  110.37364149])), (27275, array([ 164.35727273,  165.95059561])), (28339, array([ 125.67787524,  225.49415205])), (54517, array([ 238.08174331,  228.98490048]))]
  12,12,  22,23,  33,34,  64,41,  43,65,  80,73,  63,103,  124,61,  100,99,  95,148,  137,115,  223,110,  164,165,  125,225,  238,228


### cv2.kmeans2

In [None]:
import cv2

In [None]:
X = np.array(sizes_new, dtype=np.float32)

criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
flags = cv2.KMEANS_RANDOM_CENTERS
compactness, label, centers = cv2.kmeans(X, 15, None, criteria, 10, flags)

In [None]:
print(centers)
tosort = {int(center[0]*center[1]):center for center in centers}
print(tosort)
hassorted = sorted(tosort.items())
print(hassorted)
print("  " + ",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))