In [2]:
from pathlib import Path
import pandas as pd


# The data directory sits next to the notebook's working directory.
thispath = Path.cwd().resolve()
datadir = thispath.parent / "data"

# Label table indexed by `image_num`; the columns used below are per-class flags.
labels = pd.read_csv(datadir / "labels.csv", index_col="image_num")

# Boolean masks marking the rows whose flag equals 1, one mask per class.
is_scc = labels["cancer_scc"].eq(1)
is_nscc_adeno = labels["cancer_nscc_adeno"].eq(1)
is_nscc_squamous = labels["cancer_nscc_squamous"].eq(1)
is_no_cancer = labels["no_cancer"].eq(1)

# Number of flagged rows per class.
count_scc = is_scc.sum()
count_nscc_adeno = is_nscc_adeno.sum()
count_nscc_squamous = is_nscc_squamous.sum()
count_no_cancer = is_no_cancer.sum()

print(f"Nº scc: {count_scc}")
print(f"Nº nscc_adeno: {count_nscc_adeno}")
print(f"Nº nscc_squamous: {count_nscc_squamous}")
print(f"Nº no_cancer: {count_no_cancer}")

# `image_num` values of the flagged rows, in table order.
wsi_scc = labels.index[is_scc].tolist()
wsi_nscc_adeno = labels.index[is_nscc_adeno].tolist()
wsi_nscc_squamous = labels.index[is_nscc_squamous].tolist()
wsi_no_cancer = labels.index[is_no_cancer].tolist()

def _shared_ids(first, *others):
    """Return the IDs of `first` that also appear in every sequence of `others`.

    The result keeps the order of `first` and contains each ID once.
    Membership is tested against sets, so the cost is linear in the total
    number of IDs instead of the product of the list lengths.
    """
    other_sets = [set(ids) for ids in others]
    hits = (wsi for wsi in first if all(wsi in ids for ids in other_sets))
    # dict.fromkeys drops repeated IDs while preserving first-seen order.
    return list(dict.fromkeys(hits))


# Slides carrying more than one cancer label.
# Each overlap is computed independently, so an empty class list can no longer
# suppress an unrelated overlap (the previous nested loops only evaluated the
# adeno/squamous pair while iterating over SCC slides, and the SCC/squamous
# pair while iterating over adeno slides).
scc_nscc_adeno = _shared_ids(wsi_scc, wsi_nscc_adeno)
scc_nscc_squamous = _shared_ids(wsi_scc, wsi_nscc_squamous)
nscc_adeno_squamous = _shared_ids(wsi_nscc_adeno, wsi_nscc_squamous)
scc_nscc_adeno_squamous = _shared_ids(wsi_scc, wsi_nscc_adeno, wsi_nscc_squamous)

# Declared by the original cell but never populated here; kept as empty lists
# so that any other cell referring to these names still finds them.
scc_no_cancer = []
nscc_adeno_no_cancer = []
nscc_squamous_no_cancer = []


print("WSI with more than one label")
print(f"SCC and Adeno: {len(scc_nscc_adeno)}")
print(f"SCC and Squamous: {len(scc_nscc_squamous)}")
print(f"Adeno and Squamous: {len(nscc_adeno_squamous)}")
print(f"SCC, Adeno and Squamous: {len(scc_nscc_adeno_squamous)}")

Nº scc: 232
Nº nscc_adeno: 690
Nº nscc_squamous: 251
Nº no_cancer: 339
WSI with more than one label
SCC and Adeno: 13
SCC and Squamous: 180
Adeno and Squamous: 12
SCC, Adeno and Squamous: 11


Check the splits

In [3]:
from ast import literal_eval

k = 10
fold = 0

split_csv = datadir / f"{k}_fold_crossvalidation_data_split.csv"
data_split = pd.read_csv(split_csv, index_col=0)

# Data fold 0
# The split table stores each list as text, so every cell is parsed back
# into a Python object with literal_eval.
fold_row = data_split.loc[fold]
train_dataset = literal_eval(fold_row["images_train"])
validation_dataset = literal_eval(fold_row["images_test"])
train_labels = literal_eval(fold_row["labels_train"])
validation_labels = literal_eval(fold_row["labels_test"])

# Quick look at the contents, sizes and container types of the fold.
for shown in (
    train_dataset,
    type(train_dataset),
    validation_dataset,
    len(validation_dataset),
    train_labels,
    len(train_labels),
    validation_labels,
    type(validation_labels),
):
    print(shown)


['000029269500258388', '000029488200270022', '000029488200270028', '000029488200270029', '000029496800270408', '000029496800270409', '000029496800270412', '000029496800270413', '000029496800270415', '000029496800270416', '000029496800270417', '000029496800270418', '000029536100271824', '000029709100281888', '000029832000288268', '000029919100292999', '000030069800299917', '000030274100310453', '000030303300314205', '000030303300314206', '000030303300314209', '000030303300314210', '000030397000318685', '000030397000318686', '000030397000318689', '000030399700317760', '000030399700317761', '000030399700317762', '000030399700317763', '000030412700319317', '000030412700319318', '000030412700319319', '000030412700319320', '000030412700319321', '000030412700319322', '000030438500320102', '000030438500320110', '000030438500320111', '000030438500320112', '000030443200320380', '000030466600322332', '000030467500322347', '000030473100323233', '000030487200323364', '000030487200323365', '00003048

In [4]:
# Load features from MoCo model
experiment_name = "MoCo_try_Adam"

# <parent of working dir>/trained_models/MoCo/<experiment_name>
mocodir = thispath.parent.joinpath("trained_models", "MoCo", experiment_name)

# The first CSV column becomes the DataFrame index.
features_csv = mocodir / f"features_{experiment_name}.csv"
df_features = pd.read_csv(features_csv, index_col=0)


In [15]:
from natsort import natsorted
from tqdm import tqdm

# Train and validation
pyhistdir = datadir / "Mask_PyHIST_v2"

# Every per-slide CSV of filtered patch paths, in natural sort order.
dataset_path = natsorted(pyhistdir.rglob("*_densely_filtered_paths.csv"))

path_patches = {}   # slide name -> array of rows read from that slide's CSV
patches_names = {}  # slide name -> last "/"-separated token of each stringified row
for wsi_patches in tqdm(dataset_path, desc="Selecting patches to extract features"):
    # The slide name is taken from the folder that contains the CSV.
    name = wsi_patches.parent.stem
    csv_instances = pd.read_csv(wsi_patches).to_numpy()
    path_patches[name] = csv_instances

    # NOTE(review): each `instance` is a NumPy row, so str(instance) is the
    # array's printed form and the extracted token keeps its closing
    # bracket/quote characters. Confirm these names match the index of the
    # feature table before relying on them for lookups.
    patches_names[name] = [str(instance).split("/")[-1] for instance in csv_instances]

Selecting patches to extract features: 100%|██████████| 1368/1368 [00:50<00:00, 26.88it/s]


In [20]:
test_csv = pd.read_csv(datadir / "labels_test.csv", index_col=0)
# Test identifiers use "/" in the CSV; swap it for "-" before matching slide names.
test_dataset = [wsi_id.replace("/", "-") for wsi_id in test_csv.index]
test_labels = test_csv.values

patches_train, patches_validation, patches_test = [], [], []

# Each split's slide list paired with the list that collects its patch names.
split_buckets = (
    (train_dataset, patches_train),
    (validation_dataset, patches_validation),
    (test_dataset, patches_test),
)

# Keys come from `path_patches` and values from `patches_names`; the pairing
# relies on both dicts holding the same slides in the same insertion order.
for key, value in zip(path_patches.keys(), patches_names.values()):
    for members, bucket in split_buckets:
        if key in members:
            bucket.extend(value)

print(len(patches_train))
print(len(patches_validation))
print(len(patches_test))


['18-11601', '18-12515', '18-12521', '18-12847', '18-13373', '18-13374', '18-14600', '18-15236', '18-15752', '18-16261', '18-16939', '18-17571', '18-17779', '18-17997', '18-18496', '18-19002', '18-19137', '18-19729', '18-20012', '18-20708', '18-2381', '18-3221', '18-3631', '18-5409', '18-6944a', '18-7357', '18-8259', '18-9844b', '19-10326', '19-10334', '19-10946', '19-1867', '19-2335', '19-2382', '19-312', '19-3162', '19-3493', '19-3796', '19-4072', '19-5287', '19-5741', '19-5928', '19-6510', '19-6511', '19-6552', '19-7424', '19-8224', '19-8416', '19-9378', '19-9801']
1968670
474965
12474


In [7]:
# Select the feature rows whose index labels match the collected patch names.
# NOTE(review): `features_valid` is filled from `patches_test`, not from
# `patches_validation` — confirm this is intended.
features_train, features_valid = (
    df_features.loc[patch_names] for patch_names in (patches_train, patches_test)
)

In [13]:
k = 10

data_split = pd.read_csv(datadir / f"{k}_fold_crossvalidation_data_split.csv", index_col=0)

# One entry per fold, in the row order of the split table.
train_dataset_k = []
validation_dataset_k = []
train_labels_k = []
validation_labels_k = []

# Read each value straight from the row yielded by iterrows() instead of
# looking the row up again with chained `.loc[...][...]` indexing. The loop
# variable is not called `fold`, so a notebook-level `fold` chosen elsewhere
# is no longer overwritten with the last fold label.
for _, fold_row in data_split.iterrows():
    # Cells hold lists stored as text; literal_eval turns them back into objects.
    train_dataset_k.append(literal_eval(fold_row["images_train"]))
    validation_dataset_k.append(literal_eval(fold_row["images_test"]))
    train_labels_k.append(literal_eval(fold_row["labels_train"]))
    validation_labels_k.append(literal_eval(fold_row["labels_test"]))

# Load fold 0
train_dataset = train_dataset_k[0]
validation_dataset = validation_dataset_k[0]
train_labels = train_labels_k[0]
validation_labels = validation_labels_k[0]

print(train_dataset)
print(len(train_dataset))
print(test_dataset)
print(len(test_dataset))
print(train_labels)
print(len(train_labels))
print(validation_labels)
print(len(validation_labels))


['000029269500258388', '000029488200270022', '000029488200270028', '000029488200270029', '000029496800270408', '000029496800270409', '000029496800270412', '000029496800270413', '000029496800270415', '000029496800270416', '000029496800270417', '000029496800270418', '000029536100271824', '000029709100281888', '000029832000288268', '000029919100292999', '000030069800299917', '000030274100310453', '000030303300314205', '000030303300314206', '000030303300314209', '000030303300314210', '000030397000318685', '000030397000318686', '000030397000318689', '000030399700317760', '000030399700317761', '000030399700317762', '000030399700317763', '000030412700319317', '000030412700319318', '000030412700319319', '000030412700319320', '000030412700319321', '000030412700319322', '000030438500320102', '000030438500320110', '000030438500320111', '000030438500320112', '000030443200320380', '000030466600322332', '000030467500322347', '000030473100323233', '000030487200323364', '000030487200323365', '00003048