In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import click
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

thispath = Path.cwd().resolve()


# Build repeated multilabel-stratified train/validation splits of the
# LungAOEC dataset and store all of them in a single CSV file.
#
# Settings
# --------
# k : int
#     Number of splits ("folds") to generate.
# test_size : float
#     Fraction (from 0 to 1) of the images assigned to the validation side
#     of each split; the remaining images go to the train side.
#
# NOTE(review): the splitter used below is MultilabelStratifiedShuffleSplit,
# i.e. a shuffle-split rather than a k-fold partition, so the validation
# sets of different splits presumably may overlap -- confirm this is the
# intended procedure.
k = 10
test_size = 0.2
datadir = thispath.parent / "data"

csv_dataset_AOEC = datadir / "labels.csv"

# Read the label table; the first CSV column becomes the index (image ids)
# and every remaining column is treated as a label column.
dataset_AOEC = pd.read_csv(csv_dataset_AOEC,
                           sep=',',
                           header=0,
                           index_col=0,
                           dtype={'image_num': str})

# Fixed random_state so the generated splits are reproducible across runs.
mskf = MultilabelStratifiedShuffleSplit(n_splits=k,
                                        test_size=test_size,
                                        random_state=33)

images = dataset_AOEC.index
labels = dataset_AOEC.values

header = ["images_train", "images_validation", "labels_train", "labels_validation"]

# Collect one record per split and build the DataFrame once at the end,
# instead of enlarging it row by row with a hand-maintained counter.
rows = []
for fold, (train_index, test_index) in enumerate(mskf.split(images, labels)):
    images_train, images_test = images[train_index], images[test_index]
    labels_train, labels_test = labels[train_index], labels[test_index]
    rows.append({
        "images_train": images_train.tolist(),
        "images_validation": images_test.tolist(),
        "labels_train": labels_train.tolist(),
        "labels_validation": labels_test.tolist(),
    })

folds = pd.DataFrame(rows, columns=header)

# Per-label counts of the LAST generated split only (the loop variables keep
# their final values here). The explicit newline keeps the TRAIN and TEST
# parts from being glued together in the output.
print(f"Datasplit labels TRAIN: {np.sum(labels_train, axis=0)}\n"
      f"Datasplit labels TEST: {np.sum(labels_test, axis=0)}")

folds.index.name = "fold"
folds.to_csv(datadir / f"{k}_fold_crossvalidation_data_split.csv")

print(f"{k}_fold_crossvalidation_data_split.csv in {datadir}")

Datasplit labels TRAIN: [185 550 201 271]Datasplit labels TEST: [ 47 138  50  68]
10_fold_crossvalidation_data_split.csv in /home/lluis/histo_lung/data
