In [1]:
import numpy as np
from sklearn.cluster import KMeans
import os

In [2]:
def scan_files(directory, prefix=None, postfix=None):
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    return files_list


def collect_sizes(txt_fname, size):
    sizes = []
    with open(txt_fname, 'r') as f:
        for line in f.readlines():
            tokens = line.strip().split()
            sizes.append([int(float(tokens[3])*size), int(float(tokens[4])*size)])
    return sizes

def main(data_path, size=608):
    txt_fnames = scan_files(data_path, postfix=".txt")
    sizes = []
    for txt_fname in txt_fnames:
        sizes += collect_sizes(txt_fname, size)
    return sizes

In [3]:
data_path = "/home/ssd0/Develop/liyu/train4/train5"
sizes = main(data_path)

print("# files", len(sizes))

# files 419424


In [None]:
data_path = "/home/hdd_array0/batch6_1216/train_selected"
sizes2 = main(data_path)
print("# files", len(sizes2))

sizes += sizes2
print("# files", len(sizes))

#### save sizes

In [None]:
import pickle

# # save file
# with open("1216_sizes.pkl", 'wb') as f:
#     pickle.dump(sizes, f)

# open saved file
with open("1216_sizes.pkl", 'rb') as f:
    sizes = pickle.load(f)

In [None]:
sizes_withrotated = sizes.copy()
sizes_withrotated += [[item[1], item[0]] for item in sizes]

print("  ", len(sizes), len(sizes_withrotated))

### plot scattered points

In [None]:
import matplotlib.pyplot as plt

In [None]:
x = [size[0] for size in sizes]
y = [size[1] for size in sizes]

#### sizes, kmeans 15

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
"""
#1class: 12,12,  28,29,  40,65,  62,43,  77,84,  108,63,  78,129,  113,103,  197,89,  142,145,  127,227,  220,183,  277,285,  573,255,  337,595
#11 classes: 12,12,  28,28,  65,40,  43,63,  84,77,  67,110,  129,76,  106,112,  93,192,  148,140,  232,122,  185,217,  284,277,  264,577,  587,314
"""

plt.show()

#### sizes_new, kmeans 15

In [None]:
# get the number of boxes that is bigger > 600
w_large = [size for size in sizes if size[0] > 600]
h_large = [size for size in sizes if size[1] > 600]
print("  ", "total", len(sizes), "w_large", len(w_large), "h_large", len(h_large))

In [None]:
# remove abnormal boxes
sizes_new = [size for size in sizes if size[0] < 600 and size[1] < 600]
print("  ", "sizes_new total", len(sizes_new))


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
"""
12,12,  27,28,  55,35,  38,58,  76,65,  62,96,  124,63,  97,94,  91,137,  133,113,  231,111,  160,163,  114,230,  222,233,  346,298
"""

plt.show()

#### sizes, cv2.kmeans2 15

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
"""
13,13,  29,34,  49,27,  69,50,  54,77,  101,75,  85,108,  192,75,  127,119,  112,193,  193,162,  244,258,  535,247,  248,583,  556,560
"""

plt.show()

#### sizes_new, cv2.kmeans2 15

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15,15))
ax.scatter(x, y, marker='.', color='black')

# centers = 
cx = [center[0] for center in centers]
cy = [center[1] for center in centers]
ax.scatter(cx, cy, marker='s', color='red')
"""
12,12,  26,27,  48,29,  26,59,  48,53,  80,63,  59,94,  129,61,  94,100,  130,114,  96,173,  229,113,  171,175,  194,291,  308,246
"""

plt.show()

### kmeans

In [4]:
X = np.array(sizes)
kmeans = KMeans(n_clusters=15, random_state=1).fit(X)

In [5]:
centers = []
for center in kmeans.cluster_centers_:
    print("du", center)
    centers.append(center)
print(centers)

du [70.10373461 53.85243843]
du [12.32059272 12.32059272]
du [ 76.60933864 122.08295173]
du [194.45599152 142.30567338]
du [53.02015897 80.58063587]
du [46.87818821 31.0485195 ]
du [104.94854432 102.83119779]
du [231.65372829 231.15883555]
du [35.19475229 51.19500917]
du [139.51789976 199.06980907]
du [123.3221559  137.23814958]
du [25.78879912 26.44159511]
du [105.86662788  67.97452806]
du [81.67199604 85.49078622]
du [141.37435111  95.7386739 ]
[array([70.10373461, 53.85243843]), array([12.32059272, 12.32059272]), array([ 76.60933864, 122.08295173]), array([194.45599152, 142.30567338]), array([53.02015897, 80.58063587]), array([46.87818821, 31.0485195 ]), array([104.94854432, 102.83119779]), array([231.65372829, 231.15883555]), array([35.19475229, 51.19500917]), array([139.51789976, 199.06980907]), array([123.3221559 , 137.23814958]), array([25.78879912, 26.44159511]), array([105.86662788,  67.97452806]), array([81.67199604, 85.49078622]), array([141.37435111,  95.7386739 ])]


In [6]:
tosort = {int(center[0]*center[1]):center for center in centers}
print(tosort)

{7196: array([105.86662788,  67.97452806]), 6982: array([81.67199604, 85.49078622]), 10791: array([104.94854432, 102.83119779]), 9352: array([ 76.60933864, 122.08295173]), 1801: array([35.19475229, 51.19500917]), 53548: array([231.65372829, 231.15883555]), 681: array([25.78879912, 26.44159511]), 1455: array([46.87818821, 31.0485195 ]), 4272: array([53.02015897, 80.58063587]), 151: array([12.32059272, 12.32059272]), 27672: array([194.45599152, 142.30567338]), 16924: array([123.3221559 , 137.23814958]), 27773: array([139.51789976, 199.06980907]), 13534: array([141.37435111,  95.7386739 ]), 3775: array([70.10373461, 53.85243843])}


In [7]:
hassorted = sorted(tosort.items())
print(hassorted)
print("  " + ",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))

[(151, array([12.32059272, 12.32059272])), (681, array([25.78879912, 26.44159511])), (1455, array([46.87818821, 31.0485195 ])), (1801, array([35.19475229, 51.19500917])), (3775, array([70.10373461, 53.85243843])), (4272, array([53.02015897, 80.58063587])), (6982, array([81.67199604, 85.49078622])), (7196, array([105.86662788,  67.97452806])), (9352, array([ 76.60933864, 122.08295173])), (10791, array([104.94854432, 102.83119779])), (13534, array([141.37435111,  95.7386739 ])), (16924, array([123.3221559 , 137.23814958])), (27672, array([194.45599152, 142.30567338])), (27773, array([139.51789976, 199.06980907])), (53548, array([231.65372829, 231.15883555]))]
  12,12,  25,26,  46,31,  35,51,  70,53,  53,80,  81,85,  105,67,  76,122,  104,102,  141,95,  123,137,  194,142,  139,199,  231,231


### cv2.kmeans2

In [None]:
import cv2

In [None]:
X = np.array(sizes_new, dtype=np.float32)

criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
flags = cv2.KMEANS_RANDOM_CENTERS
compactness, label, centers = cv2.kmeans(X, 15, None, criteria, 10, flags)

In [None]:
print(centers)
tosort = {int(center[0]*center[1]):center for center in centers}
print(tosort)
hassorted = sorted(tosort.items())
print(hassorted)
print("  " + ",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))