In [76]:
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from natsort import natsorted

# Resolve the data folders relative to the notebook's working directory.
thispath = Path.cwd().resolve()

datadir = thispath.parent / "data"
reportsdir = datadir / "csv_folder" / "reports"

# Every .xls file under the reports folder whose path contains "Lung", in natural order.
reports_path = natsorted([i for i in reportsdir.rglob("*.xls") if "Lung" in str(i)])
if not reports_path:
    # Fail early with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No '*.xls' report containing 'Lung' found under {reportsdir}")

# Load only the two columns needed; both are read as strings so leading zeros are preserved.
dfs = [
    pd.read_excel(report_path, index_col=0, usecols=["FILENAME", "NATOIL"],
                  dtype={"FILENAME": str, "NATOIL": str})
    for report_path in reports_path
]

# Sorting by NATOIL makes rows that share a NATOIL value contiguous.
reports = pd.concat(dfs).sort_values("NATOIL")

# Number the runs of identical NATOIL values 0, 1, 2, ...: a row starts a new run
# whenever its NATOIL differs from the previous row's. The first row always compares
# unequal to the shifted-in NaN, hence the "- 1" so numbering starts at zero.
# Column access is by label ("NATOIL") rather than by integer position, which pandas
# has deprecated for label-indexed Series.
natoil = reports["NATOIL"]
run_number = natoil.ne(natoil.shift()).cumsum() - 1

# Zero-padded, three-character string IDs ("000", "001", ...).
id_list = run_number.astype(str).str.zfill(3).tolist()

# Keep only the FILENAME index and the generated ID column.
reports = reports.assign(ID=id_list).drop(columns="NATOIL")
print(reports)
reports.to_csv(datadir / "patients_ID.csv")

                     ID
FILENAME               
000030069800299917  000
000030069800301408  000
000030069800301406  000
000033385500476542  001
000033340300479660  002
...                 ...
000032301600414125  223
000032301600414861  223
000032301600414863  223
000032302500414881  223
000032301600414865  223

[2087 rows x 1 columns]


In [142]:
import numpy as np

# Build a k-fold cross-validation split at patient level and save it as a CSV.
# Relies on `thispath` (defined in an earlier cell) and on `create_folds`, which is
# not defined in this cell and must already be available in the kernel.
datadir = thispath.parent / "data"
k = 5  # number of cross-validation folds

csv_ids = datadir / "patients_ID.csv"
csv_dataset_AOEC = datadir / "labels.csv"

# read data (identifier columns are read as strings so leading zeros are preserved)
dataset_AOEC = pd.read_csv(csv_dataset_AOEC,
                           sep=',',
                           index_col=0,
                           dtype={"image_num": str})

patients_id = pd.read_csv(csv_ids,
                          sep=',',
                          index_col=0,
                          dtype={"FILENAME": str, "ID": str})

# One row per distinct patient ID; `.values` is a 2-D array with a single "ID" column.
df = patients_id.drop_duplicates(subset='ID', keep="first")
patients = df.values

folds = create_folds(patients, k)
header = ["images_train", "images_validation", "labels_train", "labels_validation"]
fold_records = []

for i in range(k):
    # NOTE(review): assumes each fold is a sequence of 1-element rows (as produced by
    # splitting `patients`), so a fold is flattened rows -> IDs - confirm against
    # create_folds.
    train_patients = [
        patient_id
        for j, fold in enumerate(folds) if j != i
        for row in fold
        for patient_id in row
    ]
    validation_patients = [patient_id for row in folds[i] for patient_id in row]

    # The split is only meaningful if no patient appears on both sides.
    assert not set(train_patients) & set(validation_patients), \
        f"fold {i}: patients shared between train and validation"

    # Map patient IDs to file names, then keep the labelled images for those file names.
    train_filenames = patients_id[patients_id['ID'].isin(train_patients)].index
    validation_filenames = patients_id[patients_id['ID'].isin(validation_patients)].index
    train = dataset_AOEC[dataset_AOEC.index.isin(train_filenames)]
    validation = dataset_AOEC[dataset_AOEC.index.isin(validation_filenames)]

    images_train = train.index.to_list()
    labels_train = train.values.tolist()
    images_validation = validation.index.to_list()
    labels_validation = validation.values.tolist()

    # Collect one record per fold; the DataFrame is built once after the loop.
    fold_records.append(dict(zip(
        header,
        [images_train, images_validation, labels_train, labels_validation],
    )))

    print(f"Number WSI TRAIN: {len(images_train)}, Number WSI VALID: {len(images_validation)}")
    # Column-wise sums of the label rows for each side of the split.
    print(f"Datasplit labels TRAIN: {np.sum(labels_train, axis=0)}, "
          f"Datasplit labels VALID: {np.sum(labels_validation, axis=0)}")

folds_dataset = pd.DataFrame(fold_records, columns=header)
folds_dataset.index.name = "fold"
folds_dataset.to_csv(datadir / f"{k}_fold_crossvalidation_data_split.csv")

print(f"{k}_fold_crossvalidation_data_split.csv in {datadir}")



Number WSI TRAIN: 1069, Number WSI VALID: 247
Datasplit labels TRAIN: [187 560 194 293], Datasplit labels TEST: [ 45 128  57  46]
Number WSI TRAIN: 1038, Number WSI VALID: 278
Datasplit labels TRAIN: [184 535 198 270], Datasplit labels TEST: [ 48 153  53  69]
Number WSI TRAIN: 1018, Number WSI VALID: 298
Datasplit labels TRAIN: [171 545 184 264], Datasplit labels TEST: [ 61 143  67  75]
Number WSI TRAIN: 1060, Number WSI VALID: 256
Datasplit labels TRAIN: [167 611 182 230], Datasplit labels TEST: [ 65  77  69 109]
Number WSI TRAIN: 1079, Number WSI VALID: 237
Datasplit labels TRAIN: [219 501 246 299], Datasplit labels TEST: [ 13 187   5  40]
5_fold_crossvalidation_data_split.csv in /home/lluis/histo_lung/data
