In [1]:
import os
import numpy as np
from sklearn.cluster import KMeans

from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
def scan_files(directory, prefix=None, postfix=None):
    """Recursively list the files under ``directory``, optionally filtered by name.

    Args:
        directory: Root directory that is walked with ``os.walk``.
        prefix: If given, keep only files whose base name starts with this string.
        postfix: If given, keep only files whose base name ends with this string.

    Returns:
        A list of paths (walk root joined with the file name), in ``os.walk``
        order. When both ``prefix`` and ``postfix`` are given, a file must
        satisfy both filters (previously ``prefix`` was silently ignored
        whenever ``postfix`` was set).
    """
    files_list = []
    for root, _sub_dirs, files in os.walk(directory):
        for file_name in files:
            # Each filter is only applied when it was actually supplied.
            if prefix and not file_name.startswith(prefix):
                continue
            if postfix and not file_name.endswith(postfix):
                continue
            files_list.append(os.path.join(root, file_name))
    return files_list


def collect_sizes(txt_fname, size):
    """Read one whitespace-separated label file and return its scaled box sizes.

    For every non-blank line, the 4th and 5th tokens are parsed as floats,
    multiplied by ``size`` and truncated to ``int``.
    (Presumably these are normalized box width/height in a YOLO-style label
    file -- TODO confirm against the data format.)

    Args:
        txt_fname: Path of the text file to read.
        size: Scale factor applied to both values.

    Returns:
        A list of ``[int, int]`` pairs, one per non-blank line.

    Raises:
        IndexError: If a non-blank line has fewer than five tokens.
        ValueError: If the 4th or 5th token is not a valid float.
    """
    sizes = []
    with open(txt_fname, 'r') as f:
        # Iterate lazily instead of materialising the whole file with readlines().
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank / whitespace-only lines (e.g. a trailing newline) carry no box.
                continue
            sizes.append([int(float(tokens[3]) * size), int(float(tokens[4]) * size)])
    return sizes

def batch_collect_sizes(txt_fnames, size):
    """Collect the sizes of every file in ``txt_fnames`` into one flat list.

    Each file is processed with ``collect_sizes(fname, size)`` and the
    per-file results are concatenated in input order.
    """
    return [
        entry
        for fname in txt_fnames
        for entry in collect_sizes(fname, size)
    ]

def worker_single(data_path, size=608):
    """Sequentially collect sizes from every ``.txt`` file under ``data_path``.

    Prints the running file index every 10000 files as a progress marker and
    returns the concatenated results of ``collect_sizes`` for all files.
    """
    collected = []
    label_files = scan_files(data_path, postfix=".txt")
    for idx, label_file in enumerate(label_files):
        if idx % 10000 == 0:
            print(idx)
        collected.extend(collect_sizes(label_file, size))
    return collected

def worker_multip(data_path, size=608):
    """Collect sizes from every ``.txt`` file under ``data_path`` using a process pool.

    The file list is split into batches of 1000 files; each batch is handed to
    ``batch_collect_sizes`` in a worker process, and results are merged in
    completion order (so the order of the returned entries is not stable
    between runs).

    Args:
        data_path: Root directory scanned recursively for ``.txt`` files.
        size: Scale factor forwarded to ``batch_collect_sizes``.

    Returns:
        A flat list with the entries returned by all batches.

    Raises:
        Whatever a worker raised; it is re-raised by ``future.result()``.
    """
    files = scan_files(data_path, postfix=".txt")
    print("# files:", len(files))

    batch_size = 1000
    sizes = []

    # The context manager shuts the pool down (and reaps the worker processes)
    # even when a batch raises; the original never called shutdown().
    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        tasks = [
            executor.submit(batch_collect_sizes, files[i:i + batch_size], size)
            for i in range(0, len(files), batch_size)
        ]

        job_count = len(tasks)
        for future in as_completed(tasks):
            sizes += future.result()  # re-raises any exception from the worker
            job_count -= 1
            # Note: the number printed after "Files collected" is len(sizes),
            # i.e. the count of collected entries, not the count of files.
            print("One Job Done, Remaining Job Count: %s, Files collected: %d" % (job_count, len(sizes)))

    return sizes

In [3]:
# Scan the dataset folder in parallel and gather one entry per label line.
data_path = "/home/TMP4T/batch6.3-1216-yearend/original"
sizes = worker_multip(data_path)

# Despite the "# files" label, len(sizes) is the number of collected entries
# (collect_sizes appends one entry per parsed line), not the number of files.
print("# files", len(sizes))

# files: 1283355
One Job Done, Remaining Job Count: 1283, Files collected: 2219
One Job Done, Remaining Job Count: 1282, Files collected: 4366
One Job Done, Remaining Job Count: 1281, Files collected: 6421
One Job Done, Remaining Job Count: 1280, Files collected: 8705
One Job Done, Remaining Job Count: 1279, Files collected: 10773
One Job Done, Remaining Job Count: 1278, Files collected: 12951
One Job Done, Remaining Job Count: 1277, Files collected: 15064
One Job Done, Remaining Job Count: 1276, Files collected: 17324
One Job Done, Remaining Job Count: 1275, Files collected: 19507
One Job Done, Remaining Job Count: 1274, Files collected: 21745
One Job Done, Remaining Job Count: 1273, Files collected: 23894
One Job Done, Remaining Job Count: 1272, Files collected: 26083
One Job Done, Remaining Job Count: 1271, Files collected: 28196
One Job Done, Remaining Job Count: 1270, Files collected: 30196
One Job Done, Remaining Job Count: 1269, Files collected: 32258
One Job Done, Remaining Job

One Job Done, Remaining Job Count: 1149, Files collected: 289736
One Job Done, Remaining Job Count: 1148, Files collected: 291783
One Job Done, Remaining Job Count: 1147, Files collected: 293926
One Job Done, Remaining Job Count: 1146, Files collected: 295988
One Job Done, Remaining Job Count: 1145, Files collected: 298128
One Job Done, Remaining Job Count: 1144, Files collected: 300185
One Job Done, Remaining Job Count: 1143, Files collected: 302191
One Job Done, Remaining Job Count: 1142, Files collected: 304293
One Job Done, Remaining Job Count: 1141, Files collected: 306543
One Job Done, Remaining Job Count: 1140, Files collected: 308593
One Job Done, Remaining Job Count: 1139, Files collected: 310759
One Job Done, Remaining Job Count: 1138, Files collected: 312988
One Job Done, Remaining Job Count: 1137, Files collected: 314982
One Job Done, Remaining Job Count: 1136, Files collected: 317035
One Job Done, Remaining Job Count: 1135, Files collected: 319260
One Job Done, Remaining J

One Job Done, Remaining Job Count: 1021, Files collected: 562208
One Job Done, Remaining Job Count: 1020, Files collected: 564336
One Job Done, Remaining Job Count: 1019, Files collected: 566539
One Job Done, Remaining Job Count: 1018, Files collected: 568713
One Job Done, Remaining Job Count: 1017, Files collected: 570802
One Job Done, Remaining Job Count: 1016, Files collected: 572857
One Job Done, Remaining Job Count: 1015, Files collected: 574969
One Job Done, Remaining Job Count: 1014, Files collected: 577201
One Job Done, Remaining Job Count: 1013, Files collected: 579432
One Job Done, Remaining Job Count: 1012, Files collected: 581490
One Job Done, Remaining Job Count: 1011, Files collected: 583508
One Job Done, Remaining Job Count: 1010, Files collected: 585582
One Job Done, Remaining Job Count: 1009, Files collected: 587811
One Job Done, Remaining Job Count: 1008, Files collected: 589953
One Job Done, Remaining Job Count: 1007, Files collected: 592120
One Job Done, Remaining J

One Job Done, Remaining Job Count: 893, Files collected: 835181
One Job Done, Remaining Job Count: 892, Files collected: 837309
One Job Done, Remaining Job Count: 891, Files collected: 839628
One Job Done, Remaining Job Count: 890, Files collected: 841597
One Job Done, Remaining Job Count: 889, Files collected: 843688
One Job Done, Remaining Job Count: 888, Files collected: 845794
One Job Done, Remaining Job Count: 887, Files collected: 848035
One Job Done, Remaining Job Count: 886, Files collected: 850152
One Job Done, Remaining Job Count: 885, Files collected: 852260
One Job Done, Remaining Job Count: 884, Files collected: 854366
One Job Done, Remaining Job Count: 883, Files collected: 856446
One Job Done, Remaining Job Count: 882, Files collected: 858511
One Job Done, Remaining Job Count: 881, Files collected: 860651
One Job Done, Remaining Job Count: 880, Files collected: 862787
One Job Done, Remaining Job Count: 879, Files collected: 864966
One Job Done, Remaining Job Count: 878, 

One Job Done, Remaining Job Count: 761, Files collected: 1116610
One Job Done, Remaining Job Count: 760, Files collected: 1118847
One Job Done, Remaining Job Count: 759, Files collected: 1120897
One Job Done, Remaining Job Count: 758, Files collected: 1123211
One Job Done, Remaining Job Count: 757, Files collected: 1125204
One Job Done, Remaining Job Count: 756, Files collected: 1127425
One Job Done, Remaining Job Count: 755, Files collected: 1129508
One Job Done, Remaining Job Count: 754, Files collected: 1131706
One Job Done, Remaining Job Count: 753, Files collected: 1133700
One Job Done, Remaining Job Count: 752, Files collected: 1135920
One Job Done, Remaining Job Count: 751, Files collected: 1138080
One Job Done, Remaining Job Count: 750, Files collected: 1140164
One Job Done, Remaining Job Count: 749, Files collected: 1142232
One Job Done, Remaining Job Count: 748, Files collected: 1144388
One Job Done, Remaining Job Count: 747, Files collected: 1146446
One Job Done, Remaining J

One Job Done, Remaining Job Count: 631, Files collected: 1394202
One Job Done, Remaining Job Count: 630, Files collected: 1396333
One Job Done, Remaining Job Count: 629, Files collected: 1398589
One Job Done, Remaining Job Count: 628, Files collected: 1400712
One Job Done, Remaining Job Count: 627, Files collected: 1402772
One Job Done, Remaining Job Count: 626, Files collected: 1404796
One Job Done, Remaining Job Count: 625, Files collected: 1406845
One Job Done, Remaining Job Count: 624, Files collected: 1408949
One Job Done, Remaining Job Count: 623, Files collected: 1411198
One Job Done, Remaining Job Count: 622, Files collected: 1413447
One Job Done, Remaining Job Count: 621, Files collected: 1415514
One Job Done, Remaining Job Count: 620, Files collected: 1417565
One Job Done, Remaining Job Count: 619, Files collected: 1419727
One Job Done, Remaining Job Count: 618, Files collected: 1421934
One Job Done, Remaining Job Count: 617, Files collected: 1424089
One Job Done, Remaining J

One Job Done, Remaining Job Count: 504, Files collected: 1665726
One Job Done, Remaining Job Count: 503, Files collected: 1667941
One Job Done, Remaining Job Count: 502, Files collected: 1670042
One Job Done, Remaining Job Count: 501, Files collected: 1672095
One Job Done, Remaining Job Count: 500, Files collected: 1674246
One Job Done, Remaining Job Count: 499, Files collected: 1676461
One Job Done, Remaining Job Count: 498, Files collected: 1678573
One Job Done, Remaining Job Count: 497, Files collected: 1680697
One Job Done, Remaining Job Count: 496, Files collected: 1682945
One Job Done, Remaining Job Count: 495, Files collected: 1685112
One Job Done, Remaining Job Count: 494, Files collected: 1687193
One Job Done, Remaining Job Count: 493, Files collected: 1689370
One Job Done, Remaining Job Count: 492, Files collected: 1691537
One Job Done, Remaining Job Count: 491, Files collected: 1693772
One Job Done, Remaining Job Count: 490, Files collected: 1695838
One Job Done, Remaining J

One Job Done, Remaining Job Count: 373, Files collected: 1946816
One Job Done, Remaining Job Count: 372, Files collected: 1948904
One Job Done, Remaining Job Count: 371, Files collected: 1951050
One Job Done, Remaining Job Count: 370, Files collected: 1953252
One Job Done, Remaining Job Count: 369, Files collected: 1955352
One Job Done, Remaining Job Count: 368, Files collected: 1957537
One Job Done, Remaining Job Count: 367, Files collected: 1959798
One Job Done, Remaining Job Count: 366, Files collected: 1961902
One Job Done, Remaining Job Count: 365, Files collected: 1964026
One Job Done, Remaining Job Count: 364, Files collected: 1966267
One Job Done, Remaining Job Count: 363, Files collected: 1968506
One Job Done, Remaining Job Count: 362, Files collected: 1970732
One Job Done, Remaining Job Count: 361, Files collected: 1973052
One Job Done, Remaining Job Count: 360, Files collected: 1975290
One Job Done, Remaining Job Count: 359, Files collected: 1977312
One Job Done, Remaining J

One Job Done, Remaining Job Count: 243, Files collected: 2226505
One Job Done, Remaining Job Count: 242, Files collected: 2228638
One Job Done, Remaining Job Count: 241, Files collected: 2230710
One Job Done, Remaining Job Count: 240, Files collected: 2232809
One Job Done, Remaining Job Count: 239, Files collected: 2234896
One Job Done, Remaining Job Count: 238, Files collected: 2236980
One Job Done, Remaining Job Count: 237, Files collected: 2239081
One Job Done, Remaining Job Count: 236, Files collected: 2241202
One Job Done, Remaining Job Count: 235, Files collected: 2243456
One Job Done, Remaining Job Count: 234, Files collected: 2245720
One Job Done, Remaining Job Count: 233, Files collected: 2247834
One Job Done, Remaining Job Count: 232, Files collected: 2249991
One Job Done, Remaining Job Count: 231, Files collected: 2252102
One Job Done, Remaining Job Count: 230, Files collected: 2254207
One Job Done, Remaining Job Count: 229, Files collected: 2256288
One Job Done, Remaining J

One Job Done, Remaining Job Count: 114, Files collected: 2501467
One Job Done, Remaining Job Count: 113, Files collected: 2503628
One Job Done, Remaining Job Count: 112, Files collected: 2505680
One Job Done, Remaining Job Count: 111, Files collected: 2507869
One Job Done, Remaining Job Count: 110, Files collected: 2510147
One Job Done, Remaining Job Count: 109, Files collected: 2512242
One Job Done, Remaining Job Count: 108, Files collected: 2514360
One Job Done, Remaining Job Count: 107, Files collected: 2516550
One Job Done, Remaining Job Count: 106, Files collected: 2518658
One Job Done, Remaining Job Count: 105, Files collected: 2520835
One Job Done, Remaining Job Count: 104, Files collected: 2523006
One Job Done, Remaining Job Count: 103, Files collected: 2525190
One Job Done, Remaining Job Count: 102, Files collected: 2527277
One Job Done, Remaining Job Count: 101, Files collected: 2529465
One Job Done, Remaining Job Count: 100, Files collected: 2531640
One Job Done, Remaining J

In [None]:
# Collect sizes from an additional folder and merge them into `sizes`.
# The original cell called `main`, which is not defined anywhere in this
# notebook; `worker_multip` is the collector used for the first folder.
data_path = "/home/hdd_array0/batch6_1216/VOC2012/images-HSIL-bar"
sizes2 = worker_multip(data_path)
print("# files", len(sizes2))

# In-place extension: re-running this cell appends `sizes2` to `sizes` again.
sizes += sizes2
print("# files", len(sizes))

#### save sizes

In [4]:
import pickle

# Persist the collected sizes so the expensive directory scan does not have to be repeated.
with open("/home/TMP4T/batch6.3-1216-yearend/original/sizes.pkl", 'wb') as f:
    pickle.dump(sizes, f)

# To reload the saved list instead of rescanning, uncomment the lines below.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# with open("/home/TMP4T/batch6.3-1216-yearend/original/sizes.pkl", 'rb') as f:
#     sizes = pickle.load(f)

In [5]:
# Total number of collected entries.
print(len(sizes))
# Dump every entry that has at least one dimension >= 608 (608 is the default `size` scale of the workers).
print([item for item in sizes if item[0] >= 608 or item[1] >= 608])

2743959
[[608, 144], [297, 608], [608, 217], [608, 273], [608, 317], [608, 217], [296, 608], [282, 608], [289, 608], [608, 273], [608, 305], [295, 608], [608, 166], [608, 215], [608, 215], [608, 228], [608, 608], [370, 608], [330, 608], [608, 305], [289, 608], [245, 608], [608, 279], [608, 288], [608, 228], [608, 331], [608, 272], [608, 283], [608, 175], [206, 608], [608, 303], [608, 280], [608, 246], [246, 608], [608, 247], [608, 166], [608, 227], [608, 218], [608, 331], [295, 608], [186, 608], [206, 608], [306, 608], [608, 265], [608, 219], [295, 608], [608, 175], [608, 274], [370, 608], [608, 243], [282, 608], [608, 302], [296, 608], [281, 608], [608, 274], [608, 279], [608, 283], [608, 215], [608, 219], [608, 608], [608, 282], [608, 265], [608, 288], [608, 222], [206, 608], [608, 345], [608, 243], [608, 303], [608, 175], [296, 608], [608, 247], [246, 608], [608, 143], [608, 229], [608, 222], [330, 608], [608, 304], [608, 608], [316, 608], [608, 222], [608, 213], [608, 345], [186, 6

### plot scattered points

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Split the [size[0], size[1]] pairs into two parallel coordinate lists for plotting.
# Note: these are built from the unfiltered `sizes` list.
x = [size[0] for size in sizes]
y = [size[1] for size in sizes]

#### sizes, kmeans 15

In [None]:
# Scatter every collected size as a black dot and overlay the cluster centers as red squares.
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# NOTE(review): `centers` is not assigned in this cell; in this notebook it is only
# assigned further down (the kmeans cells), so on a fresh top-to-bottom run this
# cell raises NameError unless `centers` is filled in here first.
# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
# The bare string below is an expression whose value is discarded; it appears to
# record center lists from earlier runs -- presumably kept for reference only.
"""
#1class: 12,12,  28,29,  40,65,  62,43,  77,84,  108,63,  78,129,  113,103,  197,89,  142,145,  127,227,  220,183,  277,285,  573,255,  337,595
#11 classes: 12,12,  28,28,  65,40,  43,63,  84,77,  67,110,  129,76,  106,112,  93,192,  148,140,  232,122,  185,217,  284,277,  264,577,  587,314
"""

plt.show()

#### sizes_new, kmeans 15

In [None]:
# get the number of boxes that is bigger > 600
w_large = [size for size in sizes if size[0] > 600]
h_large = [size for size in sizes if size[1] > 600]
print("  ", "total", len(sizes), "w_large", len(w_large), "h_large", len(h_large))

In [None]:
# remove abnormal boxes
sizes_new = [size for size in sizes if size[0] < 600 and size[1] < 600]
print("  ", "sizes_new total", len(sizes_new))


In [None]:
# Scatter the filtered boxes (`sizes_new`) as black dots and overlay the cluster
# centers as red squares. The original cell plotted `x`/`y`, which are built from
# the unfiltered `sizes`, so the removed outliers were still drawn in this section.
fig, ax = plt.subplots(1, 1, figsize=(15,15))
x_new = [size[0] for size in sizes_new]
y_new = [size[1] for size in sizes_new]
ax.scatter(x_new, y_new, marker='.', color='black')

# `centers` must already exist in the kernel (it is assigned by the kmeans cells
# further down the notebook) or be filled in at the placeholder below.
# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
# Bare string: its value is discarded; it appears to record centers from an earlier run.
"""
12,12,  27,28,  55,35,  38,58,  76,65,  62,96,  124,63,  97,94,  91,137,  133,113,  231,111,  160,163,  114,230,  222,233,  346,298
"""

plt.show()

#### sizes, cv2.kmeans 15

In [None]:
# Scatter every collected size as a black dot and overlay the cluster centers as red squares.
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# NOTE(review): `centers` is not assigned in this cell; in this notebook it is only
# assigned further down (the kmeans cells), so it must exist in the kernel
# or be filled in here before this cell runs.
# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
# Bare string: its value is discarded; it appears to record centers from an earlier run.
"""
13,13,  29,34,  49,27,  69,50,  54,77,  101,75,  85,108,  192,75,  127,119,  112,193,  193,162,  244,258,  535,247,  248,583,  556,560
"""

plt.show()

#### sizes_new, cv2.kmeans 15

In [None]:
# Scatter the filtered boxes (`sizes_new`) as black dots and overlay the cluster
# centers as red squares. The original cell plotted `x`/`y`, which are built from
# the unfiltered `sizes`, so the removed outliers were still drawn in this section.
fig, ax = plt.subplots(1, 1, figsize=(15,15))
x_new = [size[0] for size in sizes_new]
y_new = [size[1] for size in sizes_new]
ax.scatter(x_new, y_new, marker='.', color='black')

# `centers` must already exist in the kernel (it is assigned by the kmeans cells
# further down the notebook) or be filled in at the placeholder below.
# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
# Bare string: its value is discarded; it appears to record centers from an earlier run.
"""
12,12,  26,27,  48,29,  26,59,  48,53,  80,63,  59,94,  129,61,  94,100,  130,114,  96,173,  229,113,  171,175,  194,291,  308,246
"""

plt.show()

### kmeans

In [6]:
# Cluster the collected pairs into 15 groups with scikit-learn's KMeans.
# This fits on the unfiltered `sizes` list (not `sizes_new`); random_state is fixed to 1.
X = np.array(sizes)
kmeans = KMeans(n_clusters=15, random_state=1).fit(X)

In [7]:
# Copy the fitted cluster centers into a plain list, printing each one on the way.
centers = []
for center in kmeans.cluster_centers_:
    print("du", center)
    centers.append(center)
print(centers)

du [ 94.7366241  146.84761174]
du [22.9334211  23.14736064]
du [79.50291639 73.26298062]
du [163.76520024 165.70112257]
du [33.35011585 34.4700928 ]
du [12.51464004 12.51185255]
du [236.48906375 229.4232462 ]
du [125.17637981  60.23033769]
du [99.44314821 99.22045081]
du [122.96277367 224.67884522]
du [136.92351245 115.37597753]
du [64.29506402 40.59385551]
du [ 61.42073225 103.7482135 ]
du [224.08106477 110.99899423]
du [42.59314909 64.31822564]
[array([ 94.7366241 , 146.84761174]), array([22.9334211 , 23.14736064]), array([79.50291639, 73.26298062]), array([163.76520024, 165.70112257]), array([33.35011585, 34.4700928 ]), array([12.51464004, 12.51185255]), array([236.48906375, 229.4232462 ]), array([125.17637981,  60.23033769]), array([99.44314821, 99.22045081]), array([122.96277367, 224.67884522]), array([136.92351245, 115.37597753]), array([64.29506402, 40.59385551]), array([ 61.42073225, 103.7482135 ]), array([224.08106477, 110.99899423]), array([42.59314909, 64.31822564])]


In [8]:
# Key every center by its truncated area (center[0] * center[1]) so the centers
# can be ordered by area in the next step.
tosort = {int(center[0]*center[1]):center for center in centers}
# Two centers with the same truncated area would share a dict key and one of
# them would be dropped silently; fail loudly instead of losing a center.
assert len(tosort) == len(centers), "two centers share the same integer area; one was dropped"
print(tosort)

{5824: array([79.50291639, 73.26298062]), 27136: array([163.76520024, 165.70112257]), 6372: array([ 61.42073225, 103.7482135 ]), 24872: array([224.08106477, 110.99899423]), 9866: array([99.44314821, 99.22045081]), 27627: array([122.96277367, 224.67884522]), 2739: array([42.59314909, 64.31822564]), 54256: array([236.48906375, 229.4232462 ]), 2609: array([64.29506402, 40.59385551]), 530: array([22.9334211 , 23.14736064]), 7539: array([125.17637981,  60.23033769]), 15797: array([136.92351245, 115.37597753]), 13911: array([ 94.7366241 , 146.84761174]), 156: array([12.51464004, 12.51185255]), 1149: array([33.35011585, 34.4700928 ])}


In [9]:
# Order the (area, center) pairs by ascending area; dict keys are unique, so the
# tuple comparison never falls through to comparing the center arrays.
hassorted = sorted(tosort.items())
print(hassorted)
# Print the centers as "a,b,  a,b,  ..." with each coordinate truncated to int.
print("  " + ",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))

[(156, array([12.51464004, 12.51185255])), (530, array([22.9334211 , 23.14736064])), (1149, array([33.35011585, 34.4700928 ])), (2609, array([64.29506402, 40.59385551])), (2739, array([42.59314909, 64.31822564])), (5824, array([79.50291639, 73.26298062])), (6372, array([ 61.42073225, 103.7482135 ])), (7539, array([125.17637981,  60.23033769])), (9866, array([99.44314821, 99.22045081])), (13911, array([ 94.7366241 , 146.84761174])), (15797, array([136.92351245, 115.37597753])), (24872, array([224.08106477, 110.99899423])), (27136, array([163.76520024, 165.70112257])), (27627, array([122.96277367, 224.67884522])), (54256, array([236.48906375, 229.4232462 ]))]
  12,12,  22,23,  33,34,  64,40,  42,64,  79,73,  61,103,  125,60,  99,99,  94,146,  136,115,  224,110,  163,165,  122,224,  236,229


### cv2.kmeans

In [None]:
import cv2

In [None]:
# Cluster the filtered pairs (`sizes_new`) with OpenCV; the data is converted to float32 first.
# This overwrites the `X` and `centers` produced by the scikit-learn cells.
X = np.array(sizes_new, dtype=np.float32)

# Termination criteria tuple: (type flags, 10, 1.0) -- per the OpenCV docs the
# second and third items are the max iteration count and the epsilon.
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
# Random initial centers; NOTE(review): no seed is set here, so results
# presumably differ between runs -- confirm if reproducibility matters.
flags = cv2.KMEANS_RANDOM_CENTERS
# Positional arguments: data, K=15, bestLabels=None, criteria, attempts=10, flags.
compactness, label, centers = cv2.kmeans(X, 15, None, criteria, 10, flags)

In [None]:
print(centers)
# Pair every center with its truncated area (center[0] * center[1]).
by_area = [(int(center[0]*center[1]), center) for center in centers]
# Same dict as before (later duplicates overwrite earlier ones); kept for display.
tosort = dict(by_area)
print(tosort)
# Sort the pairs themselves rather than the dict items: centers that share the
# same truncated area collide on the dict key and would silently disappear.
# The explicit key avoids comparing the center arrays when areas tie.
hassorted = sorted(by_area, key=lambda item: item[0])
print(hassorted)
# Print the centers as "a,b,  a,b,  ..." with each coordinate truncated to int.
print("  " + ",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))