In [6]:
import cv2
import os
import shutil
import numpy as np

# Def dimension of new images
DIM = 224
umbral_tejido = 0.35 # 10% de tejido al menos en la imagen para que sea guardada
umbral_patologico = 0.25 # 10% de cancer al menos en la imagen para que sea guardada

# Def  paths
path_SICAP = r'../SICAPv1'
path_SICAP_512 = path_SICAP + r'/512_patch'
path_SICAP_1024 = path_SICAP + r'/1024_patch'

path_dataset_read = path_SICAP_512                            # lectura
path_dataset_write_par = path_SICAP + f'/{DIM}_patch_par'     # Escritura
path_dataset_write_impar = path_SICAP + f'/{DIM}_patch_impar' # Escritura

### Información sobre el Dataset
SICAPv1 is a public patch-wise database composed by 78 histological Whole Slide Images (WSI) of the prostate. These images were collected by specialists of the Hospital Clínico Universitario de Valencia. 

The use of SICAPv1 database is restricted to research pourposes. Please cite SICAPv1 in your publications if it helps your research:

Ángel E. Esteban, Miguel López-Lara, Adrián Colomer, María A. Sales, Rafael Molina and Valery Naranjo, "A new optical density granulometry-based descriptor for the classification of prostate histological images using shallow and deep gaussian processes"

SICAPv1 is composed  of of 78 WSI: 18 correspond to benign prostate tissue biopsies (negative class) and 60 to pathological prostate tissue biopsies (positive class). This dataset was divided into two subsets, 60 WSI (17 benign and 43 pathological) were used to learn the models and the remaining 18 to test them. The 43 pathological WSI are distributed as follows: 18 WSI diagnosed as grade 3, 15 WSI catalogued as grade 4 and the remaining 10 images were marked as grade 5 by the pathologists.

In order to automatically analyse these gigapixel images,  the images weredownsampled from 40� to 10� and divided in patches with a 50% overlap.  To test the influence of the patch size, different sizes were selected: 512^2 and 1024^2,resulting on the two different datasets detailed in the following table:

		Benign	Grade3	Grade4	Grade5	Malign

#WSIs		  17		  18		  15		  10		  43

#512 patch	 6725		  380		  589		  173		 1142

#1024 patch 1909		  113		  181		  50		  344


SICAP1 database is composed of two external folders (one containing the patches of size 512^2 and the other one containing the patches of size 1024^2. Inside 512^2 folder two subfolders containing train and test partition can be found. In 1024^2 case, the test images are not available because this case is outperformed by 512^2 patches in validation. Inside train/test folders, the data is divided taking into account the class (i.e. benign and pathological). Inside each WSI identifier, five different subfolders can be found:

- Annotation: It contains the RGB patches marked as pathological by the experts (in the case of benign samples this folder is empty).

- AnnotationMask: It contains the binary masks belonging to the patches marked as pathological by the experts (in the case of benign samples this folder is empty)

- NoAnnotation: It corresponds to the RGB patches of a WSI without annotation.

- NoAnnotationMask: It contains the tissue masks (i.e. mask discerning tissue and background) from the NoAnnotated patches.

- TissueMask: It contains the tissue masks from the patches containing annotation. (EN TEST NO HAY)

If you have any doubt about the distrubution of the images, do not hesitate to contact us:

cvblab@i3b.upv.es


=============================================================================================

Notas: 
1.- Nombre de archivos: 16B0006668_Block_Region_1_0_0_xini_14356_yini_78832
		El primer número se refiere a la región (no pertenecen a la misma imagen inicial). El segundo número pertenece al desplazamiento vertical, y el tercero al horizontal.








### Funciones para procesar el dataset

In [7]:
def generate_new_patches(file, path_wsi, phase, label, wsi, patch_type_mask, numero):
    if numero%2==0:
        path_write = os.path.join(path_dataset_write_par, phase, label) # falta el patch_type[Mask] y file
        path_write_mask = os.path.join(path_dataset_write_par, phase+'_mask', label) # falta el patch_type[Mask] y file        
        path_write_pathological_noannotation = os.path.join(path_dataset_write_par, phase+'_pathological_noannotation', label) # falta el patch_type[Mask] y file        
    else:
        path_write = os.path.join(path_dataset_write_impar, phase, label) # falta el patch_type[Mask] y file
        path_write_mask = os.path.join(path_dataset_write_impar, phase+'_mask', label) # falta el patch_type[Mask] y file
        path_write_pathological_noannotation = os.path.join(path_dataset_write_impar, phase+'_pathological_noannotation', label) # falta el patch_type[Mask] y file
    
        
    
    umbral = umbral_tejido if 'noannotation' in patch_type_mask else umbral_patologico
    
    
    img_mask = cv2.imread(path_wsi+f'/{patch_type_mask}/{file}')
    img_mask[img_mask<=20]=0
    img_mask[img_mask>20]=255

    file2 = file.replace('_b.jpg','.jpg')
    patch_type = patch_type_mask.replace('Mask','')
    # Pueden pasar que: 
    #     1) lo lea bien
    #     2) esté en la carpeta contraria (en test por ejemplo
    #        la carpeta 'annotation' está vacía y todas las imágenes,
    #        están sin filtrar en noannotation)

    path_img = path_wsi+f'/{patch_type}/{file2}'   # Tiene el nombre 
    if not os.path.exists(path_img): # Si no existe es que es benigno
        path_img = path_wsi+f'/no{patch_type}/{file2}'   # Tiene el nombre exacto

        if not os.path.exists(path_img):
            print(f'ERROR: {path_img}, no existe el archivo')

    img = cv2.imread(path_img)

    assert(not(img is None))
        
        
    """
    [:DIM][:DIM]                                   # (0,0)
    [:DIM][256-DIM/2:256+DIM/2]                    # (0,1)
    [:DIM][-DIM:]                                  # (0,2)
    
    [256-DIM/2:256+DIM/2][:DIM]                    # (1,0)
    [256-DIM/2:256+DIM/2][256-DIM/2:256+DIM/2]     # (1,1)
    [256-DIM/2:256+DIM/2][-DIM:]                   # (1,2)
    
    [-DIM:][:DIM]                                  # (2,0)
    [-DIM:][256-DIM/2:256+DIM/2]                   # (2,1)
    [-DIM:][-DIM:]                                 # (2,2)
    """    
    
    inicios_fila = [0,0,0, int(256-DIM/2), int(256-DIM/2), int(256-DIM/2),-DIM,-DIM,-DIM]
    finales_fila = [DIM, DIM, DIM, int(256+DIM/2), int(256+DIM/2), int(256+DIM/2), None, None, None]
    
    inicios_columna = [0, int(256-DIM/2), -DIM, 0, int(256-DIM/2), -DIM, 0, int(256-DIM/2), -DIM]
    finales_columna = [DIM, int(256+DIM/2), None, DIM, int(256+DIM/2), None, DIM, int(256+DIM/2), None]
    
    for i, f_ini,f_fin, c_ini,c_fin in zip(range(9),inicios_fila, finales_fila, inicios_columna, finales_columna):
        parche = img[f_ini:f_fin, c_ini:c_fin, :]
        parchemask = img_mask[f_ini:f_fin, c_ini:c_fin, :]
        
        if parchemask.mean()/255.0>umbral:
            # Nombre
            parche_name = f'__{i}.'.join(file2.split('.')) # sin el _b que tienen algunas imágenes
            
            # Imagen original
            if ('annotation'==patch_type and label=='Pathological') or ('noannotation'==patch_type and label=='Benign'):
                # Guardamos en los directorios de donde generamos el dataset para tratarlo
                os.makedirs(os.path.join(path_write).replace('\\','/'), exist_ok=True)
                cv2.imwrite(os.path.join(path_write, parche_name).replace('\\','/'), parche) 
                
                # Máscara annotation/noannotation
                os.makedirs(os.path.join(path_write_mask).replace('\\','/'), exist_ok=True)
                cv2.imwrite(os.path.join(path_write_mask, parche_name).replace('\\','/'), parchemask) 
            else:
                # Guardamos en los directorios de _info
                os.makedirs(os.path.join(path_write_pathological_noannotation, patch_type).replace('\\','/'), exist_ok=True)
                cv2.imwrite(os.path.join(path_write_pathological_noannotation, patch_type, parche_name).replace('\\','/'), parche)
                
                # Máscara pathological noannotation
                os.makedirs(os.path.join(path_write_pathological_noannotation, patch_type_mask).replace('\\','/'), exist_ok=True)
                cv2.imwrite(os.path.join(path_write_pathological_noannotation, patch_type_mask, parche_name).replace('\\','/'), parchemask) 
                
            
            
    
    
    
    
    

### Lectura de las imágenes

In [30]:
print(os.listdir(path_dataset_read))

def generate_dataset():
    for root, dirs, files in os.walk(path_dataset_read, topdown=False):
        for directory in dirs:
            if 'annotationmask' not in directory.lower():
                continue

            root = root.replace('\\','/')
            root_std = os.path.join(root, directory).replace('\\','/')
            split = root_std.split('/')[-4:-1]

            phase = split[0]               # test or train
            label = split[1]               # Pathological or Bening
            wsi   = split[2]               # id
            patch_type = directory         # annotationMask, noannotationMask
            print(split)         
            
            for file in os.listdir(root_std):
                if '.jpg' not in file.lower():
                    continue

                vertical = float(file.split('_xini')[0].split('_')[-1])
                horizontal = float(file.split('_xini')[0].split('_')[-2])
                
                # Necesitamos que los dos sean pares, o los dos impares
                if (vertical+horizontal)%2:
                    # Como son del mismo tipo, nos bastaría saber si uno es impar o par
                    # para meterlos en su clase
                    par_impar = horizontal%2
                    
                    generate_new_patches(file=file, 
                                         path_wsi = root, 
                                         phase=phase, 
                                         label=label, 
                                         wsi=wsi,
                                         patch_type_mask=patch_type,
                                         numero=par_impar)
           
        
def generate_validation_set(split_size = 0.2):
    for path_dataset_write_ in [path_dataset_write_par, path_dataset_write_impar]:
        for label in ['Benign', 'Pathological']:
            if os.path.exists(os.path.join(path_dataset_write_,'val',label)):
                continue

            # Cogemos los archivos y los desordenamos
            files = os.listdir(os.path.join(path_dataset_write_,'train',label))
            np.random.shuffle(files)

            print(os.path.join(path_dataset_write_,'train', label))
            # Creamos las rutas
            os.makedirs(os.path.join(path_dataset_write_,'val', label))
            os.makedirs(os.path.join(path_dataset_write_,'val_mask', label))
            
            # Movemos
            size_validation = int(split_size * len(files))
            for file in files[:size_validation]:
                # Image
                src = os.path.join(path_dataset_write_, 'train', label, file)
                dest = os.path.join(path_dataset_write_, 'val', label, file)
                shutil.move(src, dest)

                # Mask
                src = os.path.join(path_dataset_write_, 'train'+'_mask', label, file)
                dest = os.path.join(path_dataset_write_, 'val'+'_mask', label, file)
                shutil.move(src, dest)

        
        
            
            
    
#generate_dataset()
generate_validation_set()

['test', 'train']
../SICAPv1/224_patch_par\train\Pathological
../SICAPv1/224_patch_impar\train\Pathological


### Balanceo de clases: comprobación

In [26]:
for path_read, dataset_par_impar in zip([path_dataset_read, path_dataset_write_par, path_dataset_write_impar], ["ORIGINAL", "PAR", "IMPAR"]):
    test_balanceo = {'Benign':0, 'Pathological':0}
    train_balanceo = {'Benign':0, 'Pathological':0}
    
    he_entrado_veces = 0
    for root, dirs, files in os.walk(path_read):
        root_std = root.replace('\\','/')
        
        if dataset_par_impar == "ORIGINAL":
            try:
                curr_dir = root_std.split('/')[-1]
                phase = root_std.split('/')[-4]
                label = root_std.split('/')[-3]
            except:
                print("CUIDADO: ",root_std)
                continue
                
            
            if ('annotationMask'==curr_dir and label=='Pathological') or ('noannotation'==curr_dir and label=='Benign'):
                if 'train'==phase:
                    he_entrado_veces += 1
                    train_balanceo[f'{label}'] += len(files)
                elif phase=='test':
                    he_entrado_veces += 1
                    test_balanceo[f'{label}'] += len(files)
        else:
            phase = root_std.split('/')[-2]
            label = root_std.split('/')[-1]

            if phase=='train':
                train_balanceo[f'{label}'] += len(files)
            elif phase=='test':
                 test_balanceo[f'{label}'] += len(files)

    print("He entrado estas veces: ", he_entrado_veces)
    print(dataset_par_impar)
    print(f"\tTEST\t{test_balanceo}")
    try:
        print(f"\tBenign: {test_balanceo['Benign']/(test_balanceo['Benign']+test_balanceo['Pathological'])}\n\tPathological: {test_balanceo['Pathological']/(test_balanceo['Benign']+test_balanceo['Pathological'])}")
    except:
        print()
    print("\t"+"="*50)
    print(f"\tTRAIN\t{train_balanceo}")
    try:
        print(f"\tBenign: {train_balanceo['Benign']/(train_balanceo['Benign']+train_balanceo['Pathological'])}\n\tPathological: {train_balanceo['Pathological']/(train_balanceo['Benign']+train_balanceo['Pathological'])}")
    except:
        print()
    print("\n"+"#"*40)
    print("#"*40)



CUIDADO:  ../SICAPv1/512_patch
He entrado estas veces:  79
ORIGINAL
	TEST	{'Benign': 5681, 'Pathological': 3907}
	Benign: 0.592511472674176
	Pathological: 0.40748852732582397
	TRAIN	{'Benign': 28472, 'Pathological': 3697}
	Benign: 0.8850756939911094
	Pathological: 0.11492430600889054

########################################
########################################
He entrado estas veces:  0
PAR
	TEST	{'Benign': 4014, 'Pathological': 3322}
	Benign: 0.547164667393675
	Pathological: 0.45283533260632497
	TRAIN	{'Benign': 11387, 'Pathological': 2929}
	Benign: 0.7954037440625873
	Pathological: 0.20459625593741268

########################################
########################################
He entrado estas veces:  0
IMPAR
	TEST	{'Benign': 3751, 'Pathological': 3233}
	Benign: 0.5370847651775487
	Pathological: 0.46291523482245134
	TRAIN	{'Benign': 11292, 'Pathological': 2906}
	Benign: 0.795323284969714
	Pathological: 0.20467671503028595

########################################
#########

In [20]:
import numpy as np

for path_dataset_write_ in [path_dataset_write_par, path_dataset_write_impar]:
    for phase in ['train','test']:
        for label in ['Benign', 'Pathological']:
            a=np.array(os.listdir(os.path.join(path_dataset_write_,phase,label)))
            print("\n",a)
            print(np.random.shuffle(a))


 ['16B0022616_Block_Region_0_0_1_xini_4935_yini_141611__0.jpg'
 '16B0022616_Block_Region_0_0_1_xini_4935_yini_141611__1.jpg'
 '16B0022616_Block_Region_0_0_1_xini_4935_yini_141611__2.jpg' ...
 '16B0028822_Block_Region_5_8_33_xini_41306_yini_10787__3.jpg'
 '16B0028822_Block_Region_5_8_33_xini_41306_yini_10787__4.jpg'
 '16B0028822_Block_Region_5_8_33_xini_41306_yini_10787__5.jpg']
None

 ['16B0001851_Block_Region_1_0_1_xini_7827_yini_59786__0.jpg'
 '16B0001851_Block_Region_1_0_1_xini_7827_yini_59786__1.jpg'
 '16B0001851_Block_Region_1_0_1_xini_7827_yini_59786__2.jpg' ...
 '17B0024162_Block_Region_4_8_5_xini_21057_yini_44634__4.jpg'
 '17B0024162_Block_Region_4_8_5_xini_21057_yini_44634__6.jpg'
 '17B0024162_Block_Region_4_8_5_xini_21057_yini_44634__7.jpg']
None

 ['18B0006623A_Block_Region_0_0_11_xini_29998_yini_168002__6.jpg'
 '18B0006623A_Block_Region_0_0_11_xini_29998_yini_168002__7.jpg'
 '18B0006623A_Block_Region_0_0_13_xini_31022_yini_168002__6.jpg' ...
 '18B0006623I_Block_Region_8_8_

In [25]:
a=np.array(["hola", "que tal", "adios"])
np.random.shuffle(a)
print(a)

['adios' 'hola' 'que tal']
