In [112]:
from pathlib import Path
import pandas as pd
import numpy as np

# Working directory of the notebook; the data folder is resolved relative to its parent.
thispath = Path.cwd().resolve()

rumcdir = thispath.parent / "data" / "csv_folder" / "rumc_data"

# Raw report sheet: one row per slide with "Studynumber", "Type" and "Label" columns.
rumc_reports = pd.read_excel(rumcdir / "lung_rumc_biopsies_reports.xlsx")

wsi_id = rumc_reports["Studynumber"].values
wsi_type = rumc_reports["Type"].values
labels_string = rumc_reports["Label"].values

columns = ["cancer_scc", "cancer_nscc_adeno", "cancer_nscc_squamous", "no_cancer"]

# Forward-fill: a row whose stringified "Type" contains "nan" inherits the
# (type, label) pair of the most recent row that had a type. The carry-over
# starts from the raw values of the first row.
temp_type, temp_labels = wsi_type[0], labels_string[0]

correct_reports = {}
for study_id, raw_type, raw_label in zip(wsi_id, wsi_type, labels_string):
    type_text = str(raw_type)
    if "nan" in type_text:
        correct_reports[study_id] = (temp_type, temp_labels)
        continue
    temp_type, temp_labels = type_text, raw_label
    correct_reports[study_id] = (type_text, raw_label)

# Persist the filled (type, label) pairs, without header or index.
ccr = pd.DataFrame.from_dict(correct_reports, orient="index")
ccr.to_csv(f"{rumcdir}/correct_reports.csv", header=None, index=None)

# Diagnosis keywords listed for exclusion.
# NOTE(review): `nolabels` is not referenced by the filter below, which only
# rejects "cellymphoma" explicitly - confirm whether the list should be applied.
nolabels = ["cellymphoma", "SFT", "aspergilloma", "atypical", "carcinoid", "chondrosarcoma",
            "fibrosis", "lymphangioleiomyomatosis", "meso", "nec", "schwa", "x", "osteosarcoma",
            "pleioform", "fibrous", "synoviosarcoma", "ewing", "?"]

# Substrings that mark a report label as one of the classes of interest.
desired_labels = ["adeno", "squam", "normal", "small"]

# Keep reports whose type mentions "lung", whose label contains at least one
# desired substring, and whose label does not mention "cellymphoma".
dataset_rumc = {}
for key, value in correct_reports.items():
    if "lung" not in value[0]:
        continue
    # np.char.find is the public home of this routine; the np.core.* path used
    # before is a private namespace that is deprecated in NumPy 2.0.
    # It returns one index per desired label, -1 meaning "not found".
    label_hits = np.char.find(value[1], desired_labels) != -1
    if np.flatnonzero(label_hits).size > 0 and "cellymphoma" not in value[1]:
        dataset_rumc[key] = value

dataset_rumc_df = pd.DataFrame.from_dict(dataset_rumc, orient="index")
dataset_rumc_df.to_csv(f"{rumcdir}/dataset_rumc_2.csv", header=None, index=None)

def _expand_dash_ranges(text):
    """Spell out the digits hidden behind one or two "a-b" ranges in ``text``.

    The string is split on "-". The last character of each part and the first
    character of the following part are read as single-digit range bounds, and
    the digits from the lower bound up to (but excluding) the upper bound are
    inserted between the parts, all joined with "-".
    Example: "lung 1-3" -> "lung -1-2-3".

    Only the first three "-"-separated parts are considered; any further parts
    are dropped. Raises ValueError if a bound character is not a digit and
    IndexError if a part is empty.
    """
    parts = text.split("-")
    first_run = "-".join(str(n) for n in range(int(parts[0][-1]), int(parts[1][0])))
    if len(parts) < 3:
        return "-".join([parts[0][:-1], first_run, parts[1]])

    second_run = "-".join(str(n) for n in range(int(parts[1][-1]), int(parts[2][0])))
    # The tail always comes from the string being expanded. The previous inline
    # code appended the tail of the *type* string when expanding the *label*.
    return "-".join([parts[0][:-1], first_run, parts[1], second_run, parts[2]])


useful_data = {}
for key, value in dataset_rumc.items():
    # Reports describing a single specimen (no list, no range) are kept as they are.
    if "," not in value[0] and "-" not in value[0]:
        useful_data[key] = value
        continue

    # Fifth "_"-separated field of the slide name without its leading character
    # (e.g. "B101" -> "101"); only its first character is used for matching below.
    wsi_mini_id = key.split("_")[4][1:]

    # Expand dash ranges so every covered number appears literally in the text.
    new_types = _expand_dash_ranges(value[0]) if "-" in value[0] else value[0]
    new_labels = _expand_dash_ranges(value[1]) if "-" in value[1] else value[1]
    value = (new_types, new_labels)

    patient_types = value[0].split(",")
    patient_labels = value[1].split(",")

    # Types and labels are paired positionally; reports where the counts differ are skipped.
    if len(patient_types) != len(patient_labels):
        continue

    for type_tissue, label in zip(patient_types, patient_labels):
        if wsi_mini_id[0] not in type_tissue:
            continue
        # NOTE(review): the reason for rejecting labels containing "10" is not
        # visible in this notebook - confirm.
        if "lung" in type_tissue and "10" not in label:
            # np.char.find is the public alias of the deprecated np.core.defchararray.find.
            if np.flatnonzero(np.char.find(label, desired_labels) != -1).size > 0:
                # A later matching pair overwrites an earlier one for the same slide.
                useful_data[key] = (type_tissue, label)

useful_data_df = pd.DataFrame.from_dict(useful_data, orient="index")
useful_data_df.to_csv(f"{rumcdir}/useful_data.csv", header=None)

# Group slides by the keyword found in their label text. Each group maps the
# slide name to a one-hot vector in the column order
# (cancer_scc, cancer_nscc_adeno, cancer_nscc_squamous, no_cancer).
# A slide whose label holds several keywords lands in several groups.
normal = {name: [0, 0, 0, 1] for name, report in useful_data.items() if "normal" in report[1]}
squam = {name: [0, 0, 1, 0] for name, report in useful_data.items() if "squam" in report[1]}
adeno = {name: [0, 1, 0, 0] for name, report in useful_data.items() if "adeno" in report[1]}
small = {name: [1, 0, 0, 0] for name, report in useful_data.items() if "small" in report[1]}

for group in (normal, squam, adeno, small):
    print(group)
for group in (normal, squam, adeno, small):
    print(len(group))

# Slides of interest, listed group by group (a slide in two groups appears twice).
wsi_of_interest = [*normal, *squam, *adeno]

# Final label per slide. Groups are applied in the order normal -> squam -> adeno,
# so a later group overrides an earlier one for the same slide.
# NOTE(review): `small` is printed above but never added here, so the
# "cancer_scc" column is always 0 - confirm this is intended.
labels = {}
for group, one_hot in ((normal, [0, 0, 0, 1]), (squam, [0, 0, 1, 0]), (adeno, [0, 1, 0, 0])):
    for name in group:
        labels[name] = list(one_hot)

print(wsi_of_interest)
print(len(wsi_of_interest))

col_name = ["cancer_scc","cancer_nscc_adeno","cancer_nscc_squamous","no_cancer"]
labels_df = pd.DataFrame.from_dict(labels, orient="index", columns=col_name)
print(labels_df)
# wsi_of_interest_df = pd.DataFrame(wsi_of_interest)
# wsi_of_interest_df.to_csv(f"{rumcdir}/wsi_of_interest.csv", header=None, index=None)

{'EX_S09_P000001_C0001_B101_V01_T01_E001': [0, 0, 0, 1], 'EX_S09_P000001_C0001_B101_V02_T01_E001': [0, 0, 0, 1], 'EX_S09_P000006_C0001_B101_V01_T02': [0, 0, 0, 1], 'EX_S09_P000006_C0001_B102_V01_T02': [0, 0, 0, 1], 'EX_S09_P000006_C0001_B103_V01_T02': [0, 0, 0, 1], 'EX_S09_P000009_C0001_B101_V01_T01': [0, 0, 0, 1], 'EX_S09_P000009_C0001_B101_V02_T01': [0, 0, 0, 1], 'EX_S09_P000009_C0001_B101_V03_T01': [0, 0, 0, 1], 'EX_S09_P000009_C0001_B101_V04_T01': [0, 0, 0, 1], 'EX_S09_P000011_C0001_B103_V01_T02': [0, 0, 0, 1], 'EX_S09_P000011_C0001_B104_V01_T02': [0, 0, 0, 1], 'EX_S09_P000015_C0001_B201_V01_T01': [0, 0, 0, 1], 'EX_S09_P000020_C0001_B103_V01_T02': [0, 0, 0, 1], 'EX_S09_P000020_C0001_B201_V01_T02': [0, 0, 0, 1], 'EX_S09_P000020_C0001_B203_V01_T02': [0, 0, 0, 1], 'EX_S09_P000024_C0001_B102_V01_T02': [0, 0, 0, 1], 'EX_S09_P000024_C0001_B103_V01_T02': [0, 0, 0, 1], 'EX_S09_P000025_C0001_B101_V01_T01': [0, 0, 0, 1], 'EX_S09_P000025_C0001_B102_V01_T01': [0, 0, 0, 1], 'EX_S09_P000025_C000

In [3]:
# Sort slides by name so slides sharing the same name[7:15] slice are adjacent.
labels_df.sort_index(inplace=True)

names = labels_df.index.values
print(names)

# Characters 7-14 of each slide name act as the grouping key
# (presumably the patient code, e.g. "P000001_" - confirm against the naming scheme).
group_keys = [name[7:15] for name in names]

# Give every run of identical keys a consecutive, zero-padded three-digit ID.
id_list = []
count = 0
temp = names[0][7:15]
for group_key in group_keys:
    if group_key != temp:
        count, temp = count + 1, group_key
    id_list.append(f"{count:03d}")

print(id_list)
print(len(id_list))

labels_df["ID"] = id_list

datadir = thispath.parent / "data"

labels_df.to_csv(f"{datadir}/labels_id_rumc.csv")

<_PropertyMap {'openslide.level-count': '10', 'openslide.level[0].downsample': '1', 'openslide.level[0].height': '294144', 'openslide.level[0].tile-height': '512', 'openslide.level[0].tile-width': '512', 'openslide.level[0].width': '272128', 'openslide.level[1].downsample': '2', 'openslide.level[1].height': '147072', 'openslide.level[1].tile-height': '512', 'openslide.level[1].tile-width': '512', 'openslide.level[1].width': '136064', 'openslide.level[2].downsample': '4', 'openslide.level[2].height': '73536', 'openslide.level[2].tile-height': '512', 'openslide.level[2].tile-width': '512', 'openslide.level[2].width': '68032', 'openslide.level[3].downsample': '8', 'openslide.level[3].height': '36768', 'openslide.level[3].tile-height': '512', 'openslide.level[3].tile-width': '512', 'openslide.level[3].width': '34016', 'openslide.level[4].downsample': '16', 'openslide.level[4].height': '18384', 'openslide.level[4].tile-height': '512', 'openslide.level[4].tile-width': '512', 'openslide.level

KeyboardInterrupt: 

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Absolute path of the current working directory; data paths below are built from its parent.
thispath = Path.cwd().resolve()

def create_folds(seq, k):
    """Split ``seq`` into ``k`` contiguous folds of near-equal size.

    Fold ``i`` is ``seq[i * n // k : (i + 1) * n // k]`` with ``n = len(seq)``.
    The boundaries are computed with integer arithmetic, so the result always
    contains exactly ``k`` folds that together cover ``seq`` once, in order.
    (Accumulating a float step instead can drift just below ``len(seq)`` and
    emit an extra fold, e.g. for one element and ``k=10``.)

    Parameters
    ----------
    seq : sliceable sequence supporting ``len`` (list, tuple, numpy array, ...)
    k : positive integer (an integral float such as ``5.0`` is accepted)

    Returns
    -------
    list
        ``k`` slices of ``seq``; some are empty when ``len(seq) < k``.
        An empty ``seq`` yields an empty list.

    Raises
    ------
    ValueError
        If ``k`` is not a positive whole number.
    """
    num_folds = int(k)
    if num_folds != k or num_folds <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    n_items = len(seq)
    if n_items == 0:
        return []

    # Integer cut points: 0 = b_0 <= b_1 <= ... <= b_k = n_items.
    bounds = [(i * n_items) // num_folds for i in range(num_folds + 1)]
    return [seq[start:stop] for start, stop in zip(bounds, bounds[1:])]

# Split parameters, gathered here instead of being repeated as literals below.
N_TRAINVAL_PATIENTS = 300   # first N unique patient IDs are used for cross-validation
N_FOLDS = 5                 # number of cross-validation folds
MIN_FILTERED_PATCHES = 10   # slides with fewer filtered patches are discarded

datadir = Path(thispath.parent/ "data")

csv_ids = Path(datadir / "labels_id_rumc.csv")

# One row per slide: one-hot label columns plus the "ID" column.
labels_ids = pd.read_csv(csv_ids, index_col=0)

# read data
dataset_rumc = labels_ids.index.values

pyhistdir_rumc = Path(datadir / "Mask_PyHIST")

# Unique patient IDs in order of first appearance.
patients_id = labels_ids["ID"]
df = patients_id.drop_duplicates(keep="first")
patients = df.values

metadata_dataset_rumc = pd.read_csv(pyhistdir_rumc / "metadata_slides_v2.csv", index_col=0,
                                   dtype={"ID wsi": str})

# Drop slides that yielded too few patches (vectorised instead of a row-by-row scan).
labels = labels_ids.drop("ID", axis=1)
too_few_patches = metadata_dataset_rumc["number_filtered_patches"] < MIN_FILTERED_PATCHES
discard_wsi_dataset = metadata_dataset_rumc.loc[too_few_patches].index.tolist()
print(f"There is {len(discard_wsi_dataset)} WSI discarded in train/valid becasue <10 patches")

labels.drop(discard_wsi_dataset, inplace = True)

# Patient-level split: the first N_TRAINVAL_PATIENTS go to cross-validation and the
# rest are held out. Splitting by patient keeps all slides of a patient together.
patients_test = patients[N_TRAINVAL_PATIENTS:]
patients_trainval = patients[:N_TRAINVAL_PATIENTS]

folds = create_folds(patients_trainval, N_FOLDS)
header = ["images_train", "images_validation", "labels_train", "labels_validation"]
folds_dataset = pd.DataFrame(columns=header)

for i in range(N_FOLDS):
    # Fold i is the validation set; every other fold is flattened into the training set.
    train_patients = [patient for fold in folds[:i] + folds[i+1:] for patient in fold]
    validation_patients = folds[i]

    train_filenames = patients_id[patients_id.isin(train_patients)].index
    validation_filenames = patients_id[patients_id.isin(validation_patients)].index
    # Filtering through `labels` also removes the slides discarded above.
    train = labels[labels.index.isin(train_filenames)]
    validation = labels[labels.index.isin(validation_filenames)]

    images_train = train.index.to_list()
    labels_train = train.values.tolist()
    images_validation = validation.index.to_list()
    labels_validation = validation.values.tolist()

    folds_dataset.loc[i] = [images_train, images_validation, labels_train, labels_validation]

    print(f"Number WSI TRAIN: {len(images_train)}, Number WSI VALID: {len(images_validation)}")
    # These are the validation-fold counts; they used to be printed under the label "TEST".
    print(f"Datasplit labels TRAIN: {np.sum(labels_train, axis=0)}, "
        f"Datasplit labels VALID: {np.sum(labels_validation, axis=0)}")

# NOTE(review): saving is disabled, so the message printed below does not
# correspond to a file written by this cell.
# folds_dataset.index.name = "fold"
# folds_dataset.to_csv(Path(datadir / f"5_fold_crossvalidation_data_split_rumc.csv"))

print(f"5_fold_crossvalidation_data_split.csv in {datadir}")

In [None]:
# Slides belonging to the held-out patients, restricted to slides still present in `labels`.
is_test_patient = patients_id.isin(patients_test)
test_filenames = patients_id[is_test_patient].index
test = labels.loc[labels.index.isin(test_filenames)]

test.to_csv(datadir / "labels_test_rumc.csv")

In [None]:
import statistics as stat

"""
Number WSI TRAIN: 690, Number WSI VALID: 311
Datasplit labels TRAIN: [  0 200 135 355], Datasplit labels TEST: [  0  97  70 144]
Number WSI TRAIN: 821, Number WSI VALID: 180
Datasplit labels TRAIN: [  0 268 166 387], Datasplit labels TEST: [  0  29  39 112]
Number WSI TRAIN: 839, Number WSI VALID: 162
Datasplit labels TRAIN: [  0 239 175 425], Datasplit labels TEST: [ 0 58 30 74]
Number WSI TRAIN: 803, Number WSI VALID: 198
Datasplit labels TRAIN: [  0 230 177 396], Datasplit labels TEST: [  0  67  28 103]
Number WSI TRAIN: 851, Number WSI VALID: 150
Datasplit labels TRAIN: [  0 251 167 433], Datasplit labels TEST: [ 0 46 38 66]
"""

# Per-fold slide counts for each class, one entry per cross-validation fold.
luad_train = [200, 268, 239, 230, 251]
lusc_train = [135, 166, 175, 177, 167]
nl_train = [355, 387, 425, 396, 433]

luad_valid = [97, 29, 58, 67, 46]
lusc_valid = [70, 39, 30, 28, 38]
nl_valid = [144, 112, 74, 103, 66]

train_counts = (luad_train, lusc_train, nl_train)
valid_counts = (luad_valid, lusc_valid, nl_valid)

# Mean and sample standard deviation of each class count across the folds.
mean_luad_train, mean_lusc_train, mean_nl_train = (stat.mean(c) for c in train_counts)
std_luad_train, std_lusc_train, std_nl_train = (stat.stdev(c) for c in train_counts)

mean_luad_valid, mean_lusc_valid, mean_nl_valid = (stat.mean(c) for c in valid_counts)
std_luad_valid, std_lusc_valid, std_nl_valid = (stat.stdev(c) for c in valid_counts)

# Report "mean ± std" per class, training folds first and validation folds second.
summary_rows = (
    ("LUAD train", mean_luad_train, std_luad_train),
    ("LUSC train", mean_lusc_train, std_lusc_train),
    ("NL train", mean_nl_train, std_nl_train),
    ("LUAD valid", mean_luad_valid, std_luad_valid),
    ("LUSC valid", mean_lusc_valid, std_lusc_valid),
    ("NL valid", mean_nl_valid, std_nl_valid),
)
for title, average, spread in summary_rows:
    print(f"{title}: {average} \u00B1 {spread}")

In [None]:
import pandas as pd
import numpy as np

# Column-wise totals of the saved test labels, i.e. the number of test slides per class column.
test_labels_path = datadir / "labels_test_rumc.csv"
test_labels = pd.read_csv(test_labels_path, index_col=0).values
print(test_labels.sum(axis=0))