In [None]:
from pathlib import Path
import numpy as np
import cv2 as cv

# Resolve the PyHIST output directory relative to the notebook's working directory.
thispath = Path.cwd().resolve()
maskdir = thispath.parent / "data" / "Mask_PyHIST"

# Identifier of the slide whose tiles are inspected in this cell.
image = "000029488200270022"

# Keep every PNG whose path mentions both "tiles" and the slide identifier.
patches_path = [
    png
    for png in maskdir.rglob("*.png")
    if all(token in str(png) for token in ("tiles", image))
]
print(len(patches_path))

# Print the mean intensity of each tile and list the ones brighter than 180.
means = []
for path in patches_path:
    patch = cv.imread(str(path))
    patch_mean = np.mean(patch)
    print(patch_mean)
    means.append(patch_mean)
    if not patch_mean > 180:
        continue
    print(path)

print(f"Mean of means: {np.mean(means)}")

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import cv2 as cv
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from natsort import natsorted
import openslide
import threading
import time
import multiprocessing

thispath = Path.cwd().resolve()


def eval_histogram_threshold(mask, thumb_data):
    """Choose the histogram bounds from the mean of ``thumb_data`` inside ``mask``.

    Elements of ``thumb_data`` where ``mask`` is falsy are excluded from the
    mean. The mean is printed and then mapped to a pair of thresholds:

    * mean <= 155        -> (40, 195)
    * 155 < mean <= 180  -> (45, 200)
    * mean > 180         -> (50, 205)

    A mean for which none of the comparisons holds (for example NaN) also
    yields (45, 200).

    Returns:
        tuple: ``(lower_thr, upper_thr)``.
    """
    inside_mask = np.ma.array(thumb_data, mask=np.logical_not(mask))
    mean_thumb_data = inside_mask.mean()
    print(f"Mean image within the mask of: {mean_thumb_data}")

    if mean_thumb_data <= 155:
        return 40, 195
    if mean_thumb_data > 180:
        return 50, 205
    # Middle band, and the fallback when neither comparison above is true.
    return 45, 200


def get_histogram(img, lower, upper):
    """Return the histogram counts of ``img`` over unit-wide bins.

    The bin edges are ``np.arange(lower, upper)``, i.e. ``lower, lower + 1,
    ..., upper - 1``. Only the counts are returned; the edges computed by
    ``np.histogram`` are discarded.
    """
    bin_edges = np.arange(lower, upper)
    counts, _ = np.histogram(img, bins=bin_edges)
    return counts


def _filter_slide_patches(filename, datadir):
    """Filter the PyHIST tiles of one slide directory and write its two CSV reports."""
    print(f"== Filtering patches {filename.stem} ==")

    # Same location the original built from the module-level `maskdir`
    # (maskdir / <list name> / <slide name>), derived from the slide directory
    # itself so the function does not depend on notebook state.
    mask_root = filename.parent.parent
    mask_path = mask_root / filename.parent.stem / filename.stem / f"binary_{filename.stem}.png"
    binary_mask = cv.imread(str(mask_path))
    if binary_mask is None:
        # cv.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Binary mask not found or unreadable: {mask_path}")
    binary_mask[binary_mask == 255] = 1
    mask_shape = binary_mask.shape
    # Work on a half-resolution copy of the mask.
    binary_mask = cv.resize(binary_mask, (int(mask_shape[1] * 0.5), int(mask_shape[0] * 0.5)))
    mask_shape = binary_mask.shape

    slide_path = datadir / filename.parent.stem / f"{filename.stem}.svs"
    # The context manager closes the slide handle even if the thumbnail fails.
    with openslide.OpenSlide(str(slide_path)) as slide:
        thumbnail_data = np.array(slide.get_thumbnail((mask_shape[1], mask_shape[0])))
    if thumbnail_data.shape != mask_shape:
        thumbnail_data = cv.resize(thumbnail_data, (mask_shape[1], mask_shape[0]))

    lower, upper = eval_histogram_threshold(binary_mask, thumbnail_data)
    print(f"Set an lower threshold of {lower} and upper {upper} to compute the histogram")

    # The tile table must be loaded BEFORE the filtering loop, which looks up
    # the row/column of every accepted tile in it.
    patches_metadata = pd.read_csv(filename / "tile_selection.tsv", sep='\t').set_index("Tile")

    patches_path = [p for p in filename.rglob("*.png") if "tiles" in str(p)]

    # The pixel count of the first tile is the reference for the 60% rule.
    # With no tiles the loop below is skipped and empty reports are written.
    total_pixels_patch = 0
    if patches_path:
        first_patch = cv.imread(str(patches_path[0]))
        total_pixels_patch = first_patch.shape[0] * first_patch.shape[1]

    filtered_patches = []
    names = []
    all_row = []
    all_col = []

    for image_patch in tqdm(patches_path, desc=f"Filtering patches of {filename.stem}"):
        image = cv.imread(str(image_patch))
        gray_image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)

        histo = get_histogram(gray_image, lower, upper)
        total_pixels_in_range = np.sum(histo)

        # Keep the tile when more than 60% of the reference pixel count falls
        # inside the histogram range.
        if total_pixels_in_range > 0.6 * total_pixels_patch:
            name = image_patch.stem
            names.append(name)
            all_row.append(patches_metadata.loc[name, 'Row'])
            all_col.append(patches_metadata.loc[name, 'Column'])
            filtered_patches.append(image_patch)

    # Create .csv with metadata information of the filtered patches
    outputdir_metadata = filename / f"{filename.stem}_densely_filtered_metadata.csv"
    df_metadata = pd.DataFrame.from_dict({'patch_name': names, 'row': all_row, 'column': all_col})
    df_metadata.to_csv(outputdir_metadata, index=False)

    # Create .csv with the paths of the filtered patches
    outputdir_paths = filename / f"{filename.stem}_densely_filtered_paths.csv"
    df_paths = pd.DataFrame.from_dict({'filtered_patch_path': filtered_patches})
    df_paths.to_csv(outputdir_paths, index=False)

    print(f"Filtered patches: {len(filtered_patches)} from a total of {len(patches_path)}")
    print(f"Filtered .csv for {filename.stem} saved on {outputdir_paths.parent}")


def analyze_file(list_dirs, datadir=None):
    """Extract the usable tiles of each slide and save the metadata related to them.

    For every slide directory the function reads the binary mask and the slide
    thumbnail, derives histogram thresholds from them, keeps the tiles whose
    gray-level histogram has more than 60% of the reference pixel count in
    range, and writes two CSV files into the slide directory
    (``<slide>_densely_filtered_metadata.csv`` and
    ``<slide>_densely_filtered_paths.csv``).

    Args:
        list_dirs: An iterable of slide directories, or a single slide
            directory (``str`` or ``Path``). Each directory is expected to sit
            two levels below the mask root (``<root>/<list>/<slide>``), as
            built by ``main``.
        datadir: Root directory holding the ``.svs`` slides. Defaults to the
            dataset location previously hard-coded in this function.

    Raises:
        FileNotFoundError: If a binary mask or ``tile_selection.tsv`` is missing.
        KeyError: If an accepted tile is absent from ``tile_selection.tsv``.
    """
    if datadir is None:
        datadir = Path("/mnt/nas4/datasets/ToReadme/ExaMode_Dataset1/AOEC")
    datadir = Path(datadir)

    # A lone path is wrapped in a list: a Path object is not iterable, and a
    # string would otherwise be iterated character by character.
    if isinstance(list_dirs, (str, Path)):
        list_dirs = [list_dirs]

    for filename in tqdm(list_dirs, desc="Filtering patches from PyHIST"):
        _filter_slide_patches(Path(filename), datadir)


def explore_list(list_dirs):
    """Run the patch filtering over every slide directory in ``list_dirs``.

    ``analyze_file`` loops over the collection it is given, so the whole chunk
    is handed over in a single call. Passing the entries one at a time made it
    iterate over a single path object, which is not iterable.

    Args:
        list_dirs: The chunk of slide directories assigned to this worker.
    """
    analyze_file(list_dirs)


def chunker_list(seq, size):
    """Lazily split ``seq`` into ``size`` interleaved slices.

    Slice number ``k`` is ``seq[k::size]``, so consecutive elements of ``seq``
    land in consecutive slices. The result is a generator producing exactly
    ``size`` slices (some may be empty).
    """
    offsets = range(size)
    return (seq[start::size] for start in offsets)

def main():
    """Filter the tiles of every slide directory using a fixed pool of threads.

    The slide directories found two levels below ``<parent of cwd>/data/Mask_PyHIST``
    are dealt out to ``THREAD_NUMBER`` interleaved chunks; one thread per chunk
    runs ``explore_list`` on it. The elapsed wall-clock time is printed once
    every thread has finished.
    """
    start_time = time.time()

    np.random.seed(0)

    maskdir = thispath.parent / "data" / "Mask_PyHIST"

    # First level: one directory per slide list, in natural sort order.
    subdirs = natsorted([entry for entry in maskdir.iterdir() if entry.is_dir()])
    # Second level: one directory per slide.
    slide_dirs = [
        child
        for subdir in subdirs
        for child in subdir.iterdir()
        if child.is_dir()
    ]

    THREAD_NUMBER = 10

    # One chunk of slide directories per thread.
    chunks = list(chunker_list(slide_dirs, THREAD_NUMBER))

    threads = [
        threading.Thread(target=explore_list, args=(chunks[idx],))
        for idx in range(THREAD_NUMBER)
    ]

    for worker in threads:
        worker.start()

    for worker in threads:
        worker.join()

    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time}")


# Run the threaded filtering pipeline when this code executes as the main module.
if __name__ == "__main__":
	main()


In [11]:
from pathlib import Path
import numpy as np
import cv2 as cv
import pandas as pd
thispath = Path.cwd().resolve()
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from natsort import natsorted
import openslide


def eval_histogram_threshold(mask, thumb_data):
    """Choose the histogram bounds from the mean of ``thumb_data`` inside ``mask``.

    Elements of ``thumb_data`` where ``mask`` is falsy are excluded from the
    mean. The mean is printed and then mapped to a pair of thresholds:

    * mean <= 155        -> (40, 195)
    * 155 < mean <= 180  -> (45, 200)
    * mean > 180         -> (50, 205)

    A mean for which none of the comparisons holds (for example NaN) also
    yields (45, 200).

    Returns:
        tuple: ``(lower_thr, upper_thr)``.
    """
    inside_mask = np.ma.array(thumb_data, mask=np.logical_not(mask))
    mean_thumb_data = inside_mask.mean()
    print(f"Mean image within the mask of: {mean_thumb_data}")

    if mean_thumb_data <= 155:
        return 40, 195
    if mean_thumb_data > 180:
        return 50, 205
    # Middle band, and the fallback when neither comparison above is true.
    return 45, 200


def get_histogram(img, lower, upper):
    """Return the histogram counts of ``img`` over unit-wide bins.

    The bin edges are ``np.arange(lower, upper)``, i.e. ``lower, lower + 1,
    ..., upper - 1``. Only the counts are returned; the edges computed by
    ``np.histogram`` are discarded.
    """
    bin_edges = np.arange(lower, upper)
    counts, _ = np.histogram(img, bins=bin_edges)
    return counts


# Root of the .svs slides and root of the PyHIST outputs.
datadir = Path("/mnt/nas4/datasets/ToReadme/ExaMode_Dataset1/AOEC")
maskdir = Path(thispath.parent / "data" / "Mask_PyHIST")

# Collect every slide directory (two levels below maskdir). A comprehension is
# used so that no loop variable named `dir` overwrites the builtin in the
# notebook namespace.
subdirs = natsorted([e for e in maskdir.iterdir() if e.is_dir()])
list_dirs = [
    slide_dir
    for subdir in subdirs
    for slide_dir in subdir.iterdir()
    if slide_dir.is_dir()
]

for filename in tqdm(list_dirs, desc="Filtering patches from PyHIST"):

    print(f"== Filtering patches {filename.stem} ==")

    # Binary mask: map 255 -> 1, then work on a half-resolution copy.
    binary_mask = cv.imread(str(Path(maskdir / filename.parent.stem / filename.stem / f"binary_{filename.stem}.png")))
    binary_mask[binary_mask == 255] = 1
    mask_shape = binary_mask.shape
    binary_mask = cv.resize(binary_mask, (int(mask_shape[1] * 0.5), int(mask_shape[0] * 0.5)))
    mask_shape = binary_mask.shape

    # The context manager closes the slide handle once the thumbnail is read.
    with openslide.OpenSlide(str(Path(datadir / filename.parent.stem / f"{filename.stem}.svs"))) as slide:
        thumbnail = slide.get_thumbnail((mask_shape[1], mask_shape[0]))
        thumbnail_data = np.array(thumbnail)
    if thumbnail_data.shape != mask_shape:
        thumbnail_data = cv.resize(thumbnail_data, (mask_shape[1], mask_shape[0]))

    lower, upper = eval_histogram_threshold(binary_mask, thumbnail_data)
    print(f"Set an lower threshold of {lower} and upper {upper} to compute the histogram")

    # Load THIS slide's tile table before the filtering loop. Reading it after
    # the loop raised NameError on a fresh kernel and silently reused the
    # previous slide's table on later iterations.
    patches_metadata = pd.read_csv(Path(filename / "tile_selection.tsv"), sep='\t').set_index("Tile")

    patches_path = [i for i in filename.rglob("*.png") if "tiles" in str(i)]

    # The pixel count of the first tile is the reference for the 60% rule.
    # With no tiles the loop below is skipped and empty reports are written.
    total_pixels_patch = 0
    if patches_path:
        patch = cv.imread(str(patches_path[0]))
        patch_shape = patch.shape
        total_pixels_patch = patch_shape[0] * patch_shape[1]

    filtered_patches = []
    names = []
    all_row = []
    all_col = []

    for image_patch in tqdm(patches_path, desc=f"Filtering patches of {filename.stem}"):

        image = cv.imread(str(image_patch))
        gray_image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)

        histo = get_histogram(gray_image, lower, upper)
        total_pixels_in_range = np.sum(histo)

        # Keep the tile when more than 60% of the reference pixel count falls
        # inside the histogram range.
        if total_pixels_in_range > 0.6 * total_pixels_patch:
            name = image_patch.stem
            names.append(name)
            all_row.append(patches_metadata.loc[name, 'Row'])
            all_col.append(patches_metadata.loc[name, 'Column'])
            filtered_patches.append(image_patch)

    # Create .csv with metadata information of the filtered patches
    outputdir_metadata = Path(filename / f"{filename.stem}_densely_filtered_metadata.csv")
    File_metadata = {'patch_name': names, 'row': all_row, 'column': all_col}
    df_metadata = pd.DataFrame.from_dict(File_metadata)
    df_metadata.to_csv(outputdir_metadata, index=False)

    # Create .csv with the paths of the filtered patches
    outputdir_paths = Path(filename / f"{filename.stem}_densely_filtered_paths.csv")
    File = {'filtered_patch_path': filtered_patches}
    df_paths = pd.DataFrame.from_dict(File)
    df_paths.to_csv(outputdir_paths, index=False)

    print(f"Filtered patches: {len(filtered_patches)} from a total of {len(patches_path)}")
    print(f"Filtered .csv for {filename.stem} saved on {outputdir_paths.parent}")



Filtering patches from PyHIST:   0%|          | 0/721 [00:00<?, ?it/s]

== Filtering patches 000031068400351370 ==
Mean image within the mask of: 131.35898865573915
Set an lower threshold of 40 and upper 195 to compute the histogram
                         Width  Height  Keep  Row  Column
Tile                                                     
000031068400351370_0000    224     224     0    0       0
000031068400351370_0001    224     224     0    0       1
000031068400351370_0002    224     224     0    0       2
000031068400351370_0003    224     224     0    0       3
000031068400351370_0004    224     224     0    0       4
...                        ...     ...   ...  ...     ...
000031068400351370_9043    224       6     0   57     151
000031068400351370_9044    224       6     0   57     152
000031068400351370_9045    224       6     0   57     153
000031068400351370_9046    224       6     0   57     154
000031068400351370_9047    140       6     0   57     155

[9048 rows x 5 columns]


Filtering patches of 000031068400351370: 100%|██████████| 545/545 [00:02<00:00, 231.34it/s]
Filtering patches from PyHIST:   0%|          | 1/721 [00:06<1:12:19,  6.03s/it]

                  patch_name  row  column
0    000031068400351370_4268   27      56
1    000031068400351370_3640   23      52
2    000031068400351370_3957   25      57
3    000031068400351370_2696   17      44
4    000031068400351370_4248   27      36
..                       ...  ...     ...
513  000031068400351370_3336   21      60
514  000031068400351370_4566   29      42
515  000031068400351370_6441   41      45
516  000031068400351370_5659   36      43
517  000031068400351370_3168   20      48

[518 rows x 3 columns]
                                   filtered_patch_path
0    /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
1    /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
2    /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
3    /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
4    /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
..                                                 ...
513  /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
514  /home/lluis/histo_lung/data

Filtering patches of 000035601500689055: 100%|██████████| 2330/2330 [00:09<00:00, 239.92it/s]
Filtering patches from PyHIST:   0%|          | 2/721 [00:16<1:44:36,  8.73s/it]

                    patch_name  row  column
0     000035601500689055_07687   62      61
1     000035601500689055_04077   33      18
2     000035601500689055_03373   27      52
3     000035601500689055_07145   58      11
4     000035601500689055_03956   32      20
...                        ...  ...     ...
1531  000035601500689055_02121   17      30
1532  000035601500689055_07034   57      23
1533  000035601500689055_06606   53      87
1534  000035601500689055_06421   52      25
1535  000035601500689055_05452   44      40

[1536 rows x 3 columns]
                                    filtered_patch_path
0     /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
1     /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
2     /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
3     /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
4     /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...
...                                                 ...
1531  /home/lluis/histo_lung/data/Mask_PyHIST/LungAO...

Filtering patches of 000031326800365854:  12%|█▏        | 403/3346 [00:01<00:11, 246.57it/s]
Filtering patches from PyHIST:   0%|          | 2/721 [00:18<1:53:49,  9.50s/it]


KeyboardInterrupt: 

In [18]:
def explore_list(list_dirs):
    """Run the patch filtering over every slide directory in ``list_dirs``.

    ``analyze_file`` loops over the collection it is given, so the whole chunk
    is handed over in a single call. Passing the entries one at a time made it
    iterate over a single path object, which is not iterable.

    Args:
        list_dirs: The chunk of slide directories assigned to this worker.
    """
    analyze_file(list_dirs)


def chunker_list(seq, size):
    """Lazily split ``seq`` into ``size`` interleaved slices.

    Slice number ``k`` is ``seq[k::size]``, so consecutive elements of ``seq``
    land in consecutive slices. The result is a generator producing exactly
    ``size`` slices (some may be empty).
    """
    offsets = range(size)
    return (seq[start::size] for start in offsets)


# Record the start time (no output directory is created in this cell)
start_time = time.time()

np.random.seed(0)

maskdir = Path(thispath.parent / "data" / "Mask_PyHIST")

# Collect every slide directory (two levels below maskdir). A comprehension is
# used so that no loop variable named `dir` overwrites the builtin `dir` in
# the notebook namespace.
subdirs = natsorted([e for e in maskdir.iterdir() if e.is_dir()])
list_dirs = [
    slide_dir
    for subdir in subdirs
    for slide_dir in subdir.iterdir()
    if slide_dir.is_dir()
]


THREAD_NUMBER = 10

# Split in chunks for the threads: list_dirs becomes a list of THREAD_NUMBER lists.
list_dirs = list(chunker_list(list_dirs, THREAD_NUMBER))
print(list_dirs)

[[PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000031068400351370'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000034838900615671'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000031796600390253'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000031264900361450'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000032202000412410'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000034131800547425'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000031915900395454'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000031945100398119'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000031209100360993'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000032673000434048'), PosixPath('/home/lluis/histo_lung/data/Mask_PyHIST/LungAOEC_List2/000030769200337776'), PosixPath('/home/lluis/histo_l