In [1]:
import os
import random
import shutil

from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
def scan_files(directory, prefix=None, postfix=None):
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        for special_file in files:
            if postfix:
                if special_file.endswith(postfix):
                    files_list.append(os.path.join(root, special_file))
            elif prefix:
                if special_file.startswith(prefix):
                    files_list.append(os.path.join(root, special_file))
            else:
                files_list.append(os.path.join(root, special_file))
    return files_list

### partial select

In [None]:
# 11 yolo classes
yolo_classes = {"ACTINO":9, "CC":8, "VIRUS":10, "FUNGI":6, "TRI":7, "AGC_A":4, 
                "AGC_B":4, "EC":5, "HSIL_B":2, "HSIL_M":2, "HSIL_S":2, "SCC_G":3, 
                "ASCUS":0, "LSIL_F":0, "LSIL_E":1, "SCC_R":3}

not_choose = ["AGC_A", "AGC_B", "HSIL_B", "HSIL_M", "HSIL_S", "FUNGI", "SCC_G", "SCC_R"]
not_choose_index = [yolo_classes[key] for key in not_choose]

In [None]:
def choose(txt_name):
    with open(txt_name, 'r') as f:
        for line in f.readlines():
            if int(line.strip().split()[0]) in not_choose_index:
                return False
    return True

def move(txt, dst_dir):
#     if txt.endswith("_hls09.txt"):  # these are hl changed, we only need original images
#         continue
    if (not txt.endswith("_hls09.txt")) and choose(txt):
        shutil.move(txt, dst_dir)
        bmp = os.path.splitext(txt)[0] + ".bmp"
        shutil.move(bmp, dst_dir)

def batch_move(txts, dst_dir):
    for txt in txts:
        move(txt, dst_dir)
        
def main(src_dir, dst_dir):
    print("start work")
    txts = scan_files(src_dir, postfix=".txt")
    print("total files", len(txts))
    
    executor = ProcessPoolExecutor(max_workers=4)
    tasks = []

    batch_size = 1000
    for i in range(0, len(txts), batch_size):
        batch = txts[i : i+batch_size]
        tasks.append(executor.submit(batch_move, txts, dst_dir))
    
    job_count = len(tasks)
    for future in as_completed(tasks):
        # result = future.result()  # get the returning result from calling fuction
        job_count -= 1
        print("One Job Done, Remaining Job Count: %s" % (job_count))

In [None]:
src_dir = "/home/hdd_array0/batch6_1216/train"
dst_dir = "/home/hdd_array0/batch6_1216/train_selected"

main(src_dir, dst_dir)

### select HSIL-SCC_G
 - randomly select half the data from four rotated partitions

In [30]:
classes = set([1])  # HSIL-SCC_G
data_path1 = "/home/ssd_array/data/batch6.4_1216/original"
data_path2 = "/home/ssd_array/data/batch6.4_1216/rotate"

In [31]:
def read_and_record(txt_names):
    name_map = {}
    
    for i,txt_name in enumerate(txt_names):
        is_in = False
        with open(txt_name, 'r') as f:
            for line in f.readlines():
                tokens = line.strip().split()
                if int(tokens[0]) in classes:
                    is_in = True
                    break

        if is_in:
            basename = os.path.splitext(os.path.basename(txt_name))[0]
            name_map[basename] = [txt_name]
    
        if i % 10000 == 0:
            print(i)
    
    return name_map

In [32]:
files1 = scan_files(data_path1, postfix='.txt')
name_map = read_and_record(files1)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000


In [33]:
print(len(name_map))

365874


In [34]:
def read_and_record(txt_names, name_map):
    
    for i,txt_name in enumerate(txt_names):
        basename = os.path.splitext(os.path.basename(txt_name))[0]
        basename = basename.split('_90')[0]
        basename = basename.split('_180')[0]
        basename = basename.split('_270')[0]
        if basename in name_map:
            name_map[basename].append(txt_name)
    
        if i % 10000 == 0:
            print(i)
    
    return name_map

In [35]:
files2 = scan_files(data_path2, postfix='.txt')
name_map4 = read_and_record(files2, name_map.copy())

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
13

In [36]:
print(len(name_map4))

365874


In [37]:
def select_to_delete(name_map, N=2):
    name_map_new = {}
    for name,paths in name_map.items():
        if len(paths) <= N:
            continue
        paths_new = random.sample(paths, N)
        name_map_new[name] = paths_new
    return name_map_new

In [38]:
name_map_to_delete = select_to_delete(name_map4.copy())
print(len(name_map_to_delete))

357665


In [39]:
# flatten path names
paths_to_delete = []
for name,paths in name_map_to_delete.items():
    paths_to_delete += paths
print(len(paths_to_delete))

715330


In [40]:
files1_rest = set(files1) - set(paths_to_delete)
print(len(files1_rest))

876978


In [41]:
files2_rest = set(files2) - set(paths_to_delete)
print(len(files2_rest))

2565426


In [42]:
files1_rest = [os.path.splitext(f)[0]+'.bmp' for f in files1_rest]
files1_lines = '\n'.join(files1_rest) + '\n'
with open('./original.txt', 'w') as f:
    f.write(files1_lines)

In [43]:
files2_rest = [os.path.splitext(f)[0]+'.bmp' for f in files2_rest]
files2_lines = '\n'.join(files2_rest) + '\n'
with open('./rotate.txt', 'w') as f:
    f.write(files2_lines)

### remove those without matching img&txt files

In [44]:
data_path = "/home/ssd_array/data/batch6.4_1216/rotate_new"
all_files = scan_files(data_path)
print("  ", "# files", len(all_files))

name_match = {}
for f in all_files:
    b = os.path.splitext(os.path.basename(f))[0]
    if not b in name_match:
        name_match[b] = []
    name_match[b].append(f)
print("  ", "# basenames", len(name_match))
    
cnt = 0
for n,m in name_match.items():
    if len(m) < 2:
        cnt += 1
        for f in m:
            os.remove(f)
    elif len(m) > 2:
        print(n)
print("  ", "# unmatched", cnt)

   # files 1999998
   # basenames 999999
   # unmatched 0


### check correctness of renaming

In [28]:
data_path = "/home/ssd_array/data/batch6.4_1216/rotate_"
all_files = scan_files(data_path)
print("  ", "# files", len(all_files))

for f in all_files:
    f = os.path.basename(f)
    if not f.startswith('_'):
        print("  ", f)

   # files 169456


### write img file paths to txt

In [64]:
data_path = ["/home/ssd_array/data/batch6.4_1216/ascus", 
             "/home/ssd_array/data/batch6.4_1216/ascus-flip", 
             "/home/ssd_array/data/batch6.4_1216/flip", 
             "/home/ssd_array/data/batch6.4_1216/fungi", 
             "/home/ssd_array/data/batch6.4_1216/fungi-flip", 
             "/home/ssd_array/data/batch6.4_1216/original", 
             "/home/ssd_array/data/batch6.4_1216/rotate", 
             "/home/ssd_array/data/batch6.4_1216/tri", 
             "/home/ssd_array/data/batch6.4_1216/rotate_new"]

In [65]:
all_imgs = []
for path in data_path:
    all_imgs += scan_files(path, prefix="_n", postfix=".bmp")
    print("  ", path, len(all_imgs))

   /home/ssd_array/data/batch6.4_1216/ascus 132148
   /home/ssd_array/data/batch6.4_1216/ascus-flip 396444
   /home/ssd_array/data/batch6.4_1216/flip 1617820
   /home/ssd_array/data/batch6.4_1216/fungi 1661640
   /home/ssd_array/data/batch6.4_1216/fungi-flip 1749280
   /home/ssd_array/data/batch6.4_1216/original 2809886
   /home/ssd_array/data/batch6.4_1216/rotate 4907014
   /home/ssd_array/data/batch6.4_1216/tri 5809106
   /home/ssd_array/data/batch6.4_1216/rotate_new 6809105


In [66]:
all_imgs = [f for f in all_imgs if os.path.basename(f).startswith("_n")]

In [67]:
matched = []
for img in all_imgs:
    txt = os.path.splitext(img)[0] + '.txt'
    if os.path.isfile(txt):
        matched.append(img)
print("  ", len(matched))

   5642729


In [68]:
print("  ", set(all_imgs)-set(matched))

   set()


In [69]:
matched_basenames = [os.path.basename(f) for f in matched]
print("  ", len(set(matched_basenames)))

   5375146


In [70]:
random.shuffle(matched)
random.shuffle(matched)
random.shuffle(matched)
random.shuffle(matched)
random.shuffle(matched)
random.shuffle(matched)

In [71]:
train = matched[len(matched)//10:]
valid = matched[:len(matched)//10]
with open('train.txt', 'w') as f:
    for l in train:
        f.write(l+'\n')
with open('valid.txt', 'w') as f:
    for l in valid:
        f.write(l+'\n')    

In [73]:
tmp_path = "/home/ssd_array/data/batch6.4_1216/tmp"

tmp = matched[:500]
for f in tmp:
    shutil.copy(f, tmp_path)
    txt = os.path.splitext(f)[0] + '.txt'
    shutil.copy(txt, tmp_path)