Inspiration taken from https://github.com/e-pet/adni-bias/blob/main/Repeated_CV_Splitter.py

Similar to lr_split_aug.ipynb

In [11]:
from warnings import warn

import numpy as np
import pandas as pd

from image_augmentation import image_augmentation

Split data into training, validation and test sets for the CNN, i.e. keeping just img_id, gender, is_cancerous and fitzpatrick, with no synthetic data but with additional data from augmented images. Two splits can be produced:

1: Augmenting each image at most one time

2: Augmenting each image the minimum number of times necessary to achieve an even 50/50 split

In [None]:
def get_ca_nc_split_dfs(df, split_col):
    """Partition ``df`` by a binary column and by diagnosis class.

    The distinct values of ``split_col`` are sorted; group "a" is the first
    value and group "b" the second. Within each group, rows are separated by
    whether ``diagnostic`` is listed in the module-level
    ``cancerous_conditions`` or ``non_cancerous_conditions``.

    Returns:
        Tuple ``(a_ca_df, b_ca_df, a_nc_df, b_nc_df)``.

    Raises:
        AssertionError: if ``split_col`` does not hold exactly two distinct
            values.
    """
    group_values = df[split_col].unique()  # distinct values of the column we split on
    group_values.sort()
    assert (len(group_values) == 2)  # the split column must be binary

    def _select(group_value, conditions):
        # Rows of one group whose diagnosis belongs to the given condition list
        group_df = df[df[split_col] == group_value]
        return group_df[group_df['diagnostic'].isin(conditions)]

    first_value, second_value = group_values[0], group_values[1]
    a_nc_df = _select(first_value, non_cancerous_conditions)
    a_ca_df = _select(first_value, cancerous_conditions)
    b_nc_df = _select(second_value, non_cancerous_conditions)
    b_ca_df = _select(second_value, cancerous_conditions)
    return a_ca_df, b_ca_df, a_nc_df, b_nc_df


def get_train_set_sizes(ca_a_df, ca_b_df, nc_a_df, nc_b_df, ratio, train_set_size, cancerous_fraction=None):
    """Compute how many samples of each of the four subgroups go into a training set.

    Args:
        ca_a_df, ca_b_df, nc_a_df, nc_b_df: Sized containers for the four
            subgroups; only their lengths are used, and only when
            ``cancerous_fraction`` is None.
        ratio: Share of the training set taken from group "a"; group "b"
            receives ``1 - ratio``.
        train_set_size: Total number of training samples to distribute.
        cancerous_fraction: Cancerous share applied to both groups. When None,
            each group's own cancerous share is used and a warning is issued.

    Returns:
        Tuple ``(n_ca_a, n_ca_b, n_nc_a, n_nc_b)`` summing to ``train_set_size``.
    """
    if cancerous_fraction is not None:
        ca_a_fraction = ca_b_fraction = cancerous_fraction
    else:
        ca_a_fraction = len(ca_a_df) / (len(ca_a_df) + len(nc_a_df))
        ca_b_fraction = len(ca_b_df) / (len(ca_b_df) + len(nc_b_df))
        warn(
            "Using legacy group-wise disease label stratification. Will lead to different disease prevalences in the different groups in the training set. NOT RECOMMENDED.")

    nc_a_fraction = 1 - ca_a_fraction
    nc_b_fraction = 1 - ca_b_fraction

    # Nominal (fractional) sizes, ordered ca_a, ca_b, nc_a, nc_b
    nominal = [
        ca_a_fraction * train_set_size * ratio,
        ca_b_fraction * train_set_size * (1 - ratio),
        nc_a_fraction * train_set_size * ratio,
        nc_b_fraction * train_set_size * (1 - ratio),
    ]
    counts = [round(value) for value in nominal]

    # Rounding may leave the total short: top up the subgroup that is furthest
    # below its nominal size (first one wins on ties).
    while sum(counts) < train_set_size:
        shortfalls = [target - current for target, current in zip(nominal, counts)]
        counts[shortfalls.index(max(shortfalls))] += 1

    # ... or too large: trim the subgroup that is furthest above its nominal size.
    while sum(counts) > train_set_size:
        shortfalls = [target - current for target, current in zip(nominal, counts)]
        counts[shortfalls.index(min(shortfalls))] -= 1

    assert (sum(counts) == train_set_size)

    return tuple(counts)


def assign_test_sets(df, n_test_sets, condition, rng):
    """Mark rows of ``df`` as members of ``n_test_sets`` test sets.

    Adds one 0/1 column ``test_set_<i>`` per test set and one 0/1 column
    ``rerun_<j>`` per rerun to ``df`` (assigned in place on the passed frame),
    then samples only from rows whose ``img_id`` does not start with 'aug_'.
    Rows are sampled per ``patient_id`` group.

    Relies on module-level state: ``test_size_per_sex_per_group`` (target size
    of every test set) and ``complement_lesions_to_use`` (per-test-set list of
    patient ids, consumed and appended to here, so it carries over between
    calls).

    Args:
        df: Frame with at least 'img_id', 'patient_id' and
            'both_cancerous_and_non_cancerous' columns.
        n_test_sets: Number of test sets to create.
        condition: Compared against 1 to select the cancerous-set behaviour.
        rng: Passed as ``random_state`` to ``DataFrame.sample``.

    Returns:
        The non-augmented rows (with assignments) concatenated with the
        augmented rows.

    Raises:
        AssertionError: if any test set does not end up with exactly
            ``test_size_per_sex_per_group`` rows, or if the checked
            complement list is not empty when ``condition == 1``.
    """

    # Number of passes needed so that all test sets can be filled from len(df) rows
    n_reruns = int(np.ceil(test_size_per_sex_per_group * n_test_sets / len(df)))
    for test_idx in range(0, n_test_sets):
        test_set_name = 'test_set_' + str(test_idx)
        df[test_set_name] = 0

    for rerun_idx in range(0, n_reruns):
        rerun_name = 'rerun_' + str(rerun_idx)
        df[rerun_name] = 0
    
    df_without_aug = df[~df['img_id'].str.startswith('aug_')] # Makes a dataframe without the augmented data, so test sets are made with wholly original data
    df_with_aug = df[df['img_id'].str.startswith('aug_')] # Makes a dataframe with only the augmented data, to be concatenated when test splitting is done

    
    for rerun_idx in range(0, n_reruns):
        rerun_name = 'rerun_' + str(rerun_idx)

        grouped_patient = df_without_aug.groupby('patient_id') # Group lesions from the same patient to ensure that they are not distributed across different tests or train/validation df

        for test_idx in range(0, n_test_sets):
            test_set_name = 'test_set_' + str(test_idx)

            # First drain the queue of patients recorded for this test set index
            # (patients flagged as having both cancerous and non-cancerous lesions)
            while complement_lesions_to_use[test_idx]:
                patient_id = complement_lesions_to_use[test_idx].pop(0)
                lesion = grouped_patient.get_group(patient_id)
                remaining = sum(df_without_aug[rerun_name] == 0)
                missing = test_size_per_sex_per_group - sum(df_without_aug[test_set_name])
                eligibles = lesion[(lesion[test_set_name] == 0) & (lesion[rerun_name] == 0)]
                eligible_count = len(eligibles)
                sample_size = min([missing, min([remaining, int(np.ceil(len(df_without_aug) / n_reruns))])])
                sample_size = min(sample_size, eligible_count)  # Ensure sample size doesn't exceed eligible items
                local_sample = eligibles.sample(n=sample_size, replace=False, random_state=rng)

                df_without_aug.loc[local_sample.index, test_set_name] = 1
                df_without_aug.loc[local_sample.index, rerun_name] = 1
            
            # Then fill the rest of the test set patient by patient
            for patient_id, lesion in grouped_patient:
                remaining = sum(df_without_aug[rerun_name] == 0)
                missing = test_size_per_sex_per_group - sum(df_without_aug[test_set_name])
                # NOTE(review): `1 in lesion` tests membership among the DataFrame's column labels,
                # not its values; the check two lines below uses
                # lesion['both_cancerous_and_non_cancerous'].values -- confirm which was intended here.
                # NOTE(review): the literal 7 is an unexplained threshold -- confirm its origin.
                if missing < test_size_per_sex_per_group - 7 and 1 in lesion: # Only add patients with both cancerous and non-cancerous lesions in the first half of sampling process, to make sure there aren't too many complements to be used in the corresponding test set
                    continue
                if condition == 1 and 1 in lesion['both_cancerous_and_non_cancerous'].values: # Skip to next iteration if patient has both cancerous and non-cancerous lesions and we are in cancer set, as no more data is added afterwards, and therefore can't add more pairs
                    continue
                if (len(lesion) > missing): # Skip to next iteration if more lesions are grouped than are missing
                    continue                   
                if 1 in lesion['both_cancerous_and_non_cancerous'].values:
                    complement_lesions_to_use[test_idx].append(patient_id)
                eligibles = lesion[(lesion[test_set_name] == 0) & (lesion[rerun_name] == 0)]    

                eligible_count = len(eligibles)
                sample_size = min([missing, min([remaining, int(np.ceil(len(df_without_aug) / n_reruns))])])
                sample_size = min(sample_size, eligible_count)  # Ensure sample size doesn't exceed eligible items
                local_sample = eligibles.sample(n=sample_size, replace=False, random_state=rng)

                df_without_aug.loc[local_sample.index, test_set_name] = 1
                df_without_aug.loc[local_sample.index, rerun_name] = 1     
        # NOTE(review): this check runs after the inner test-set loop has finished, so test_idx is
        # the last test set index only -- confirm whether every test set should be checked.
        if condition == 1:
            assert (complement_lesions_to_use[test_idx] == []) # Ensure that all lesions from same patient are used in the same test set

    df = pd.concat([df_without_aug, df_with_aug]) # Concatenate the augmented data back to the dataframe

    # Every test set must have reached exactly the target size
    for test_idx in range(0, n_test_sets):
        test_set_name = 'test_set_' + str(test_idx)
        assert (sum(df[test_set_name] == 1) == test_size_per_sex_per_group)

    return df


def check_unique(df) -> None:
    """Assert that the identifier columns of ``df`` contain no duplicates.

    Checks 'lesion_id' first and then 'img_id', whichever are present. If
    neither column exists, a warning is issued instead.

    Raises:
        AssertionError: if a present identifier column has duplicate values.
    """
    present_id_columns = [name for name in ('lesion_id', 'img_id') if name in df.columns]
    if not present_id_columns:
        warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")
        return
    for name in present_id_columns:
        assert (df[name].is_unique)


def sort_df(df) -> pd.DataFrame:
    """Return ``df`` sorted by its lesion/image identifier.

    The sort key is chosen in this order of precedence: the 'lesion_id'
    column, the 'img_id' column, then the index if it is named 'lesion_id' or
    'img_id'.

    If no identifier is available, a warning is issued and ``df`` is returned
    unsorted, so callers always receive a DataFrame rather than ``None``.
    """
    id_names = ('lesion_id', 'img_id')
    for id_name in id_names:
        if id_name in df.columns:
            return df.sort_values(id_name)
    if df.index.name in id_names:
        return df.sort_index()
    warn("could not sort dataframe since neither 'lesion_id' nor 'img_id' is present as column or index")
    return df

def clean_data(df):
    """Return a cleaned copy of the metadata frame, one row per lesion.

    Rows are sorted by 'lesion_id', rows without a 'gender' entry are removed,
    and only the first row of each 'lesion_id' is kept. An integer
    'is_cancerous' column is added (1 if the 'diagnostic' string contains
    'SCC', 'BCC' or 'MEL', else 0), and the frame is reduced to a fixed set
    of columns.
    """
    cancer_codes = ['SCC', 'BCC', 'MEL']
    # PAD-UFES spells the column 'fitspatrick'; the spelling is kept as-is on purpose
    kept_columns = ['patient_id', 'img_id', 'lesion_id', 'gender', 'fitspatrick', 'is_cancerous', 'diagnostic']

    def _mentions_cancer(diagnosis):
        # True when any cancer code occurs within the diagnosis string
        return any(code in diagnosis for code in cancer_codes)

    ordered = df.sort_values(["lesion_id"])
    with_gender = ordered[ordered["gender"].notna()]  # drop rows lacking a gender entry
    one_per_lesion = with_gender.drop_duplicates("lesion_id", keep="first")

    # 1 = cancerous, 0 = non-cancerous
    cancer_flags = one_per_lesion['diagnostic'].apply(_mentions_cancer).astype(int)
    labelled = one_per_lesion.assign(is_cancerous=cancer_flags)

    return labelled[kept_columns]

def get_augmented_data(df):
    """Return the two augmented variants of ``df`` as a pair.

    Delegates to ``image_augmentation.balance_dataset`` with 'is_cancerous' as
    the column to balance on, and unpacks its result into exactly two frames.
    Presumably the first is the "augmented once" dataset and the second the
    50/50-balanced dataset -- verify against ``balance_dataset``.
    """
    balanced = image_augmentation.balance_dataset(df, 'is_cancerous')
    once_augmented, evenly_split = balanced
    return once_augmented, evenly_split


# Script entry point: builds n_test_sets test sets and, for each test set and each
# ratio, a training+validation set split into n_folds cross-validation folds.
# All resulting frames are written as CSV files below `basepath`.
if __name__ == '__main__':
    
    # ----- Configuration (also read as module-level globals by the helper functions above)
    n_test_sets = 5
    ratios = [0, 0.25, 0.5, 0.75, 1.0]
    n_folds = 5
    cancerous_conditions = ["BCC", "MEL", "SCC"]
    non_cancerous_conditions = ["ACK", "NEV", "SEK"]
    cancer = 1
    non_cancer = 0
    genders = ["FEMALE", "MALE"]
    # A single bit generator is shared by every sampling call, so results depend on call order
    rng = np.random.default_rng(1173).bit_generator
    complement_lesions_to_use = {}
    basepaths = ['../data/cnn/cnn_splitted_data_once_augmented/',  '../data/cnn/cnn_splitted_data_50_50_split/'] # List of basepaths for the two different augmented datasets

    test_size_per_sex_per_group = 26 # 26 each of male/female non-cancerous/cancerous patients, i.e., total test size is 104

    for i in range(0, n_test_sets): # Create a list for each test set to store the complement lesions to use
        complement_lesions_to_use[i] = []

    df = pd.read_csv("../data/metadata/fixed_metadata.csv") # Dataframe with all metadata for the PAD-UFES-20 dataset where duplicate lesion_id's have been renamed
    df = clean_data(df) # Clean data after feature extraction
    df_once_augmented, df_50_50 = get_augmented_data(df) # add augmented images to dataset (df 50/50 not used in this cell)

    dfs = [df_once_augmented, df_50_50] # List of dataframes to iterate over

    df_once_augmented.to_csv('../data/metadata/once_augmented.csv', index=False)
    df_50_50.to_csv('../data/metadata/50_50_augmented.csv', index=False)

    # Only the first dataset/basepath pair is processed here; the loop over both is disabled
    # for i in range(len(basepaths)):
    basepath = basepaths[0]
    df = dfs[0]

    basename = basepath + 'm_f_ca_nc'
    check_unique(df)

    # Biggest training set possible, found by trial and error
    train_set_size = 535

    # Create boolean masks for cancerous and non-cancerous conditions
    cancerous_mask = df['diagnostic'].isin(cancerous_conditions)
    non_cancerous_mask = df['diagnostic'].isin(non_cancerous_conditions)

    # Calculate the fraction of cancerous conditions among all conditions
    cancerous_fraction = sum(cancerous_mask) / (sum(cancerous_mask) + sum(non_cancerous_mask))

    # Make dataframe with only non-cancerous conditions
    # NOTE(review): df_nc is not referenced again in this block -- confirm it is still needed
    df_nc = df[non_cancerous_mask]

    # Check for whether a patient has both cancerous and non cancerous lesions 
    nc_ca_df = df.groupby(['patient_id'])['is_cancerous'].nunique()

    # Filter groups where both True (cancerous) and False (non-cancerous) exist
    nc_ca_df = nc_ca_df[nc_ca_df == 2]

    # List of patient_id's with both cancerous and non-cancerous lesions
    nc_ca_df_patient_ids = nc_ca_df.index.tolist()

    # Add identifier to the dataframe for patients with both cancerous and non-cancerous lesions
    df['both_cancerous_and_non_cancerous'] = 0
    df.loc[df['patient_id'].isin(nc_ca_df_patient_ids), 'both_cancerous_and_non_cancerous'] = 1

    # Split data into dataframes on gender and diagnostic
    ca_a_df, ca_b_df, nc_a_df, nc_b_df = get_ca_nc_split_dfs(df, 'gender') #ca: cancerous, nc: non-cancerous, a: female, b: male

    # Compute cross-validation fold sizes
    # (the first fold_size_rem folds get one extra sample so the sizes sum to train_set_size)
    fold_size_base, fold_size_rem = divmod(train_set_size, n_folds)
    fold_sizes = []
    for ii in range(0, n_folds):
        if ii < fold_size_rem:
            fold_sizes.append(fold_size_base + 1)
        else:
            fold_sizes.append(fold_size_base)
    assert (sum(fold_sizes) == train_set_size)

    # Set up the desired test sets
    # (call order matters: the calls share `rng` and `complement_lesions_to_use`)
    nc_a_df = assign_test_sets(nc_a_df, n_test_sets, non_cancer, rng)
    ca_a_df = assign_test_sets(ca_a_df, n_test_sets, cancer, rng)
    nc_b_df = assign_test_sets(nc_b_df, n_test_sets, non_cancer, rng)
    ca_b_df = assign_test_sets(ca_b_df, n_test_sets, cancer, rng)

    for test_idx in range(0, n_test_sets):

        test_set_name = 'test_set_' + str(test_idx)

        # The test set is the union of the rows flagged for this test set in all four subgroups
        test_df = sort_df(pd.concat([ca_a_df[ca_a_df[test_set_name] == 1],
                                    nc_a_df[nc_a_df[test_set_name] == 1],
                                    ca_b_df[ca_b_df[test_set_name] == 1],
                                    nc_b_df[nc_b_df[test_set_name] == 1]]))

        check_unique(test_df)

        test_df.to_csv(basename + f'_test_{test_idx}.csv')

        # Mark corresponding augmented lesions of the ones used in test as used in test set as well
        img_with_corresponding_aug = test_df['img_id'].tolist() 
        img_with_corresponding_aug = ['aug_' + img_id for img_id in img_with_corresponding_aug]
        # NOTE(review): this looks like a debugging snapshot written outside `basepath` -- confirm it is wanted
        nc_a_df.to_csv(f'../test/test_csvs/before_addition_{test_idx}.csv')

        for img_id in img_with_corresponding_aug: # Since we only augment non-cancerous lesions, we only need to check these
            if img_id in nc_a_df['img_id'].values: 
                nc_a_df.loc[nc_a_df['img_id'] == img_id, test_set_name] = 1
            if img_id in nc_b_df['img_id'].values:
                nc_b_df.loc[nc_b_df['img_id'] == img_id, test_set_name] = 1

        # Mark that nothing has been used in the training / validation sets belong to this test set yet
        ca_a_df.loc[:, 'used_with_curr_test'] = 0
        ca_b_df.loc[:, 'used_with_curr_test'] = 0
        nc_a_df.loc[:, 'used_with_curr_test'] = 0
        nc_b_df.loc[:, 'used_with_curr_test'] = 0

        # Ratios must be processed in increasing order for the sample-reuse logic below
        ratios.sort()
        # Set up the training and validation sets for each test set
        for ratio in ratios:
            # ----- Determine (based on ratio and train_set_size) how many CA/NC F/M there should be and
            # split into the four corresponding DFs.
            train_set_n_ca_a, train_set_n_ca_b, train_set_n_nc_a, train_set_n_nc_b = \
                get_train_set_sizes(ca_a_df, ca_b_df, nc_a_df, nc_b_df, ratio, train_set_size,
                                    cancerous_fraction=cancerous_fraction)

            # Compose a training + validation dataset with the desired sex ratio from the remaining non-test data
            # Reuse samples that have been used for earlier ratios wherever possible to minimize training set
            # variations across ratios.
            if ratio == min(ratios):
                # first ratio, just sample from scratch
                train_ca_a_df = ca_a_df[ca_a_df[test_set_name] == 0].sample(n=train_set_n_ca_a, random_state=rng)
                train_ca_b_df = ca_b_df[ca_b_df[test_set_name] == 0].sample(n=train_set_n_ca_b, random_state=rng)
                train_nc_a_df = nc_a_df[nc_a_df[test_set_name] == 0].sample(n=train_set_n_nc_a, random_state=rng)
                train_nc_b_df = nc_b_df[nc_b_df[test_set_name] == 0].sample(n=train_set_n_nc_b, random_state=rng)
            else:
                # We work with increasing ratios, i.e., we now have less males and more females than for the
                # previous ratio.
                # Draw males only from the ones that have been used so far
                train_ca_b_df = ca_b_df[ca_b_df['used_with_curr_test'] == 1].sample(n=train_set_n_ca_b,
                                                                                    random_state=rng)
                train_nc_b_df = nc_b_df[nc_b_df['used_with_curr_test'] == 1].sample(n=train_set_n_nc_b,
                                                                                    random_state=rng)
                # Use all females used so far + draw new ones as needed
                n_prev = ca_a_df['used_with_curr_test'].sum()
                train_ca_a_df = pd.concat([ca_a_df[ca_a_df['used_with_curr_test'] == 1],
                                            ca_a_df[(ca_a_df[test_set_name] == 0) & (
                                                        ca_a_df['used_with_curr_test'] == 0)].sample(
                                                n=train_set_n_ca_a - n_prev, random_state=rng)])
                n_prev = nc_a_df['used_with_curr_test'].sum()
                train_nc_a_df = pd.concat([nc_a_df[nc_a_df['used_with_curr_test'] == 1],
                                            nc_a_df[(nc_a_df[test_set_name] == 0) & (
                                                        nc_a_df['used_with_curr_test'] == 0)].sample(
                                                n=train_set_n_nc_a - n_prev, random_state=rng)])

            # Mark which ones we have used so far with the current test set + this and previous ratios
            ca_a_df.loc[train_ca_a_df.index, 'used_with_curr_test'] = 1
            ca_b_df.loc[train_ca_b_df.index, 'used_with_curr_test'] = 1
            nc_a_df.loc[train_nc_a_df.index, 'used_with_curr_test'] = 1
            nc_b_df.loc[train_nc_b_df.index, 'used_with_curr_test'] = 1

            train_and_vali_df = sort_df(pd.concat([train_ca_a_df, train_ca_b_df, train_nc_a_df, train_nc_b_df]))
            

            # Set up folds for cross-validation
            train_and_vali_df['fold'] = np.nan
            # Randomly sampling augmented data with a corresponding value so each fold has the same amount of augmented data
            aug_train_and_vali_df = train_and_vali_df[train_and_vali_df['img_id'].str.startswith('aug_')]
            img_ids_train_and_vali = train_and_vali_df[~train_and_vali_df['img_id'].str.startswith('aug_')]['img_id'].tolist()

            # remove prefix from all img_ids in aug for easier comparison
            # NOTE(review): str.replace removes every occurrence of 'aug_', not only a leading one --
            # confirm img_ids never contain 'aug_' elsewhere
            aug_train_and_vali_df['img_id'] = aug_train_and_vali_df['img_id'].str.replace('aug_', '')

            # Only keep values in aug where img_id is in img_ids
            aug_train_and_vali_df = aug_train_and_vali_df[aug_train_and_vali_df['img_id'].isin(img_ids_train_and_vali)]

            # Add prefix back to img_ids in aug
            aug_train_and_vali_df['img_id'] = 'aug_' + aug_train_and_vali_df['img_id']

            len_aug = len(aug_train_and_vali_df)

            # Distribute the paired augmented rows over the folds as evenly as possible
            fold_aug_size_base, fold_aug_size_rem = divmod(len_aug, n_folds)
            fold_aug_sizes = []
            for iii in range(0, n_folds):
                if iii < fold_aug_size_rem:
                    fold_aug_sizes.append(fold_aug_size_base + 1)
                else:
                    fold_aug_sizes.append(fold_aug_size_base)
            assert (sum(fold_aug_sizes) == len_aug)

            for fold_aug_idx, fold_aug_size in enumerate(fold_aug_sizes):
                # Split augmented data into folds
                aug_train_and_vali_df.loc[aug_train_and_vali_df[aug_train_and_vali_df.fold.isna()].sample(n=fold_aug_size,
                                                                                                random_state=rng).index, 'fold'] = fold_aug_idx
            
            
            # Copy the fold assignment of the augmented rows back into the full frame (matched on img_id)
            train_and_vali_df = train_and_vali_df.set_index('img_id')
            aug_train_and_vali_df = aug_train_and_vali_df.set_index('img_id')
            train_and_vali_df.update(aug_train_and_vali_df)
            aug_train_and_vali_df = aug_train_and_vali_df.reset_index()
            aug_train_and_vali_df['img_id'] = aug_train_and_vali_df['img_id'].str.replace('aug_', '')
            
            # Assign the fold of the augmented image to the corresponding image
            fold_map = aug_train_and_vali_df.set_index('img_id')['fold'].to_dict()

            train_and_vali_df['fold'] = train_and_vali_df.index.to_series().apply(lambda x: fold_map.get(x, train_and_vali_df.loc[x, 'fold']))
            train_and_vali_df = train_and_vali_df.reset_index()

            for fold_idx, fold_size in enumerate(fold_sizes):
                assert (train_and_vali_df.index.is_unique)
                # Amount of data added to fold so far from augs is then subtracted from fold_size
                # (times 2: each paired augmented row and its original row were both placed in the fold above)
                train_and_vali_df.loc[train_and_vali_df[train_and_vali_df.fold.isna()].sample(n=fold_size - fold_aug_sizes[fold_idx] * 2,
                                                                                                random_state=rng).index, 'fold'] = fold_idx
            
            # check that everything looks nice
            check_unique(test_df)
            check_unique(train_and_vali_df)
            assert (~train_and_vali_df.fold.isna().any())   

            # Write one train/validation CSV pair per fold; the fold with index fold_idx is the validation set
            for fold_idx in range(0, n_folds):
                train_df = train_and_vali_df[train_and_vali_df.fold != fold_idx]
                val_df = train_and_vali_df[train_and_vali_df.fold == fold_idx]
                all_dfs:list[pd.DataFrame] = [train_df, val_df, test_df]
                all_df = pd.concat(all_dfs)
                # Train, validation and test rows must not share any lesion/image id
                check_unique(all_df)

                train_df.to_csv(basename + f'_train_{test_idx}_{ratio:.2f}_{fold_idx}.csv')
                val_df.to_csv(basename + f'_val_{test_idx}_{ratio:.2f}_{fold_idx}.csv')