In [1]:
import numpy as np
from sklearn.cluster import KMeans
import os

In [2]:
def scan_files(directory, prefix=None, postfix=None):
    """Recursively list the files under ``directory``, optionally filtered by name.

    Args:
        directory: Root directory to walk.
        prefix: If truthy, keep only files whose name starts with this string.
        postfix: If truthy, keep only files whose name ends with this string.

    Returns:
        A list of paths (each directory joined with the file name). When both
        ``prefix`` and ``postfix`` are given, a file must satisfy both filters.
        Directories and files are visited in sorted order so the result does
        not depend on the order the filesystem happens to report entries in.
    """
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        # Sorting in place steers os.walk (top-down) into a deterministic descent.
        sub_dirs.sort()
        for special_file in sorted(files):
            if prefix and not special_file.startswith(prefix):
                continue
            if postfix and not special_file.endswith(postfix):
                continue
            files_list.append(os.path.join(root, special_file))
    return files_list


def collect_sizes(txt_fname, size):
    """Read a whitespace-separated label file and return scaled size pairs.

    For every non-blank line, the 4th and 5th tokens (indices 3 and 4) are
    parsed as floats, multiplied by ``size`` and truncated to ``int``.
    Presumably these are a box's normalized width and height -- confirm
    against the label format in use.

    Args:
        txt_fname: Path of the text file to read.
        size: Multiplier applied to both values before truncation.

    Returns:
        A list of ``[int, int]`` pairs, one per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than five tokens.
        ValueError: If token 3 or 4 cannot be parsed as a float.
    """
    sizes = []
    with open(txt_fname, 'r') as f:
        # Iterate lazily instead of loading every line with readlines().
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank lines (e.g. a trailing newline) carry no label.
                continue
            sizes.append([int(float(tokens[3]) * size), int(float(tokens[4]) * size)])
    return sizes

def main(data_path, size=608):
    """Gather the scaled size pairs from every ".txt" file under ``data_path``.

    Args:
        data_path: Directory that is scanned (recursively) for ".txt" files.
        size: Scale factor forwarded to ``collect_sizes``; defaults to 608.

    Returns:
        One flat list holding the pairs of all files, concatenated in the
        order the files were returned by ``scan_files``.
    """
    collected = []
    for label_file in scan_files(data_path, postfix=".txt"):
        collected.extend(collect_sizes(label_file, size))
    return collected

In [3]:
# Root directory that main() walks recursively looking for ".txt" files.
# NOTE(review): this is an absolute, machine-specific path -- it has to be
# adjusted before the notebook can be re-run on another machine.
data_path = "/home/ssd0/Develop/liyu/batch6_hls09_1216/train"
# One two-element entry per label line, scaled by main()'s default size of 608.
sizes = main(data_path)

In [4]:
# Stack the collected pairs into an array with one row per pair.
X = np.array(sizes)
# Fit k-means with 9 clusters; random_state is fixed so repeated runs on the
# same input produce the same centers.
kmeans = KMeans(n_clusters=9, random_state=1).fit(X)

In [5]:
# Keep the fitted cluster centers as a plain list of rows, then echo each one.
centers = list(kmeans.cluster_centers_)
for center in centers:
    print("du", center)

du [12.95459679 12.92078274]
du [ 89.45918712 100.49992146]
du [227.22202378 209.10584199]
du [31.61061157 32.62705684]
du [125.73313712 165.1151063 ]
du [521.82590612 261.91206179]
du [142.68227105  93.7540693 ]
du [66.38727395 62.6538129 ]
du [322.93981083 562.21410146]


In [6]:
# Map each center to the integer-truncated product of its two components.
# Centers whose truncated products coincide share a key; the later one wins.
tosort = dict((int(center[0] * center[1]), center) for center in centers)
print(tosort)

{136672: array([521.82590612, 261.91206179]), 13377: array([142.68227105,  93.7540693 ]), 1031: array([31.61061157, 32.62705684]), 20760: array([125.73313712, 165.1151063 ]), 47513: array([227.22202378, 209.10584199]), 167: array([12.95459679, 12.92078274]), 181561: array([322.93981083, 562.21410146]), 8990: array([ 89.45918712, 100.49992146]), 4159: array([66.38727395, 62.6538129 ])}


In [7]:
# Order the cluster centers by the integer-truncated product of their two
# components, smallest first. The (area, center) pairs are built directly from
# `centers` rather than from the `tosort` dict, because a dict keyed by the
# truncated area silently drops a center whenever two areas collide. Sorting
# with an explicit key also guarantees that ties never fall back to comparing
# the numpy arrays themselves.
hassorted = sorted(
    ((int(center[0] * center[1]), center) for center in centers),
    key=lambda item: item[0],
)
print(hassorted)
# Emit the centers as "a,b" integer pairs on a single comma-separated line.
print("  " + ",  ".join(["{},{}".format(int(value[1][0]),int(value[1][1])) for value in hassorted]))

[(167, array([12.95459679, 12.92078274])), (1031, array([31.61061157, 32.62705684])), (4159, array([66.38727395, 62.6538129 ])), (8990, array([ 89.45918712, 100.49992146])), (13377, array([142.68227105,  93.7540693 ])), (20760, array([125.73313712, 165.1151063 ])), (47513, array([227.22202378, 209.10584199])), (136672, array([521.82590612, 261.91206179])), (181561, array([322.93981083, 562.21410146]))]
  12,12,  31,32,  66,62,  89,100,  142,93,  125,165,  227,209,  521,261,  322,562
