In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

thispath = Path.cwd().resolve()

datadir = Path(thispath.parent / "data")

labels = pd.read_csv(Path(datadir / "labels.csv"), index_col='image_num')

count_scc = (labels['cancer_scc'] == 1).sum()
count_nscc_adeno = (labels['cancer_nscc_adeno'] == 1).sum()
count_nscc_squamous = (labels['cancer_nscc_squamous'] == 1).sum()
count_no_cancer = (labels['no_cancer'] == 1).sum()

print(f"Nº scc: {count_scc}")
print(f"Nº nscc_adeno: {count_nscc_adeno}")
print(f"Nº nscc_squamous: {count_nscc_squamous}")
print(f"Nº no_cancer: {count_no_cancer}")

wsi_scc = labels.index[labels['cancer_scc'] == 1].tolist()
wsi_nscc_adeno = labels.index[labels['cancer_nscc_adeno'] == 1].tolist()
wsi_nscc_squamous = labels.index[labels['cancer_nscc_squamous'] == 1].tolist()
wsi_no_cancer = labels.index[labels['no_cancer'] == 1].tolist()

scc_nscc_adeno = []
scc_nscc_squamous = []
scc_no_cancer = []
nscc_adeno_squamous = []
scc_nscc_adeno_squamous = []
nscc_adeno_no_cancer = []
nscc_squamous_no_cancer = []


for scc in wsi_scc:
    
    for nscc_adeno in wsi_nscc_adeno:
        if scc == nscc_adeno and nscc_adeno not in scc_nscc_adeno:
            scc_nscc_adeno.append(nscc_adeno)

        for nscc_squamous in wsi_nscc_squamous:
            if scc == nscc_squamous and nscc_squamous not in scc_nscc_squamous:
                scc_nscc_squamous.append(nscc_squamous)

            if nscc_adeno == nscc_squamous and nscc_squamous not in nscc_adeno_squamous:
                nscc_adeno_squamous.append(nscc_squamous)
            
            if scc == nscc_adeno == nscc_squamous and nscc_squamous not in scc_nscc_adeno_squamous:
                scc_nscc_adeno_squamous.append(nscc_squamous)


print("WSI with more than one label")
print(f"SCC and Adeno: {len(scc_nscc_adeno)}")
print(f"SCC and Squamous: {len(scc_nscc_squamous)}")
print(f"Adeno and Squamous: {len(nscc_adeno_squamous)}")
print(f"SCC, Adeno and Squamous: {len(scc_nscc_adeno_squamous)}")

Nº scc: 232
Nº nscc_adeno: 690
Nº nscc_squamous: 251
Nº no_cancer: 339
WSI with more than one label
SCC and Adeno: 13
SCC and Squamous: 180
Adeno and Squamous: 12
SCC, Adeno and Squamous: 11


In [51]:

k = 10

datadir = Path(thispath.parent / "data")

csv_dataset_AOEC = Path(datadir / "labels.csv")

#read data
dataset_AOEC = pd.read_csv(csv_dataset_AOEC, sep=',', header=0).values

mskf = MultilabelStratifiedKFold(n_splits=k, shuffle=True, random_state=33)

images = dataset_AOEC[:, 0]

labels = dataset_AOEC[:, 1:]

header = ["images_train", "images_test", "labels_train", "labels_test"]
folds = pd.DataFrame(columns=header)
i = 0

for train_index, test_index in mskf.split(images, labels):
   images_train, images_test = images[train_index], images[test_index]
   labels_train, labels_test = labels[train_index], labels[test_index] 
   folds.loc[i] = [images_train, images_test, labels_train, labels_test]
   i += 1

   print(f"Datasplit labels TRAIN: {np.sum(labels_train, axis=0)}"
         f"Datasplit labels TEST: {np.sum(labels_test, axis=0)}")

folds.to_csv(Path(datadir / f"{k}_fold_crossvalidation_data_split.csv"))

print(f"{k}_fold_crossvalidation_data_split.csv in {datadir}")


Datasplit labels TRAIN: [209 621 226 305]Datasplit labels TEST: [23 69 25 34]
Datasplit labels TRAIN: [209 621 226 305]Datasplit labels TEST: [23 69 25 34]
Datasplit labels TRAIN: [209 621 226 305]Datasplit labels TEST: [23 69 25 34]
Datasplit labels TRAIN: [209 621 226 305]Datasplit labels TEST: [23 69 25 34]
Datasplit labels TRAIN: [209 621 226 305]Datasplit labels TEST: [23 69 25 34]
Datasplit labels TRAIN: [208 621 226 306]Datasplit labels TEST: [24 69 25 33]
Datasplit labels TRAIN: [209 621 225 305]Datasplit labels TEST: [23 69 26 34]
Datasplit labels TRAIN: [209 621 226 305]Datasplit labels TEST: [23 69 25 34]
Datasplit labels TRAIN: [209 621 226 305]Datasplit labels TEST: [23 69 25 34]
Datasplit labels TRAIN: [208 621 226 305]Datasplit labels TEST: [24 69 25 34]
10_fold_crossvalidation_data_split.csv in /home/lluis/histo_lung/data


Check the splits

In [52]:
k = 10

data_split = pd.read_csv(Path(datadir / f"{k}_fold_crossvalidation_data_split.csv"), index_col=0)
fold_0 = data_split.loc[0]
print(fold_0)
train_dataset = fold_0["images_train"]

for image in train_dataset:
    print(image)

images_train    ['000030274100310453.svs' '000029536100271824....
images_test     ['000029832000288268.svs' '000030786300337665....
labels_train    [[0 1 0 0]\n [1 0 1 0]\n [0 1 0 0]\n ...\n [0 ...
labels_test     [[0 0 0 1]\n [0 1 0 0]\n [0 1 0 0]\n [0 1 0 0]...
Name: 0, dtype: object
[
'
0
0
0
0
3
0
2
7
4
1
0
0
3
1
0
4
5
3
.
s
v
s
'
 
'
0
0
0
0
2
9
5
3
6
1
0
0
2
7
1
8
2
4
.
s
v
s
'


 
'
0
0
0
0
3
1
0
1
6
2
0
0
5
6
5
0
7
2
.
s
v
s
'
 
.
.
.
 
'
0
0
0
0
3
6
4
2
9
9
0
0
7
7
2
9
2
2
.
s
v
s
'


 
'
0
0
0
0
3
6
4
2
9
9
0
0
7
7
2
9
2
3
.
s
v
s
'
 
'
0
0
0
0
3
6
4
3
4
9
0
0
7
7
3
0
5
6
.
s
v
s
'
]
