Inspiration taken from https://github.com/e-pet/adni-bias/blob/main/Repeated_CV_Splitter.py

In [57]:
from warnings import warn

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
import random
from collections import Counter

from image_augmentation import image_augmentation

import sys
sys.path.append('../')
from handcrafted_methods import get_all_features as features

Create and add all features to master_dataset

In [20]:
if __name__ == '__main__':

    img = "../data/images/lesion_images/" 
    masks = "../data/images/lesion_masks"
    df = pd.read_csv("../data/metadata/fixed_metadata.csv", index_col=0)

    df = features.getAllFeatures(df, img, masks)
    
    df.to_csv("../data/metadata/master_data.csv", index=False)

Superposed PAT_887_1688_436.png
Superposed PAT_431_850_472.png
Superposed PAT_554_1051_301.png
Superposed PAT_1468_1622_219.png
Superposed PAT_759_1433_973.png
Superposed PAT_1483_1678_538.png
Superposed PAT_1031_148_66.png


KeyboardInterrupt: 

Split data into training, validation and test tests for LR, aka. with all features and SMOTE data

In [32]:
def get_ca_nc_split_dfs(df, split_col):

    print("pad-ufes before split \n", df.head())

    col_vals = df[split_col].unique() # Gets the unique values for the coloumn we choose to split our data on
    col_vals.sort()
    assert (len(col_vals) == 2) # Checks that our value (sex) is binary in our dataframe 
    a_df = df[df[split_col] == col_vals[0]] # Female, since it's sorted alphabetically
    b_df = df[df[split_col] == col_vals[1]] # Male
    a_nc_df = a_df[a_df['diagnostic'].isin(non_cancerous_conditions)]
    a_ca_df = a_df[a_df['diagnostic'].isin(cancerous_conditions)]
    b_nc_df = b_df[b_df['diagnostic'].isin(non_cancerous_conditions)]
    b_ca_df = b_df[b_df['diagnostic'].isin(cancerous_conditions)]
    return a_ca_df, b_ca_df, a_nc_df, b_nc_df


def get_train_set_sizes(ca_a_df, ca_b_df, nc_a_df, nc_b_df, ratio, train_set_size, cancerous_fraction=None):
    if cancerous_fraction is None:
        ca_a_fraction = len(ca_a_df) / (len(ca_a_df) + len(nc_a_df))
        ca_b_fraction = len(ca_b_df) / (len(ca_b_df) + len(nc_b_df))
        warn(
            "Using legacy group-wise disease label stratification. Will lead to different disease prevalences in the different groups in the training set. NOT RECOMMENDED.")
    else:
        ca_a_fraction = ca_b_fraction = cancerous_fraction

    nc_a_fraction = 1 - ca_a_fraction
    nc_b_fraction = 1 - ca_b_fraction
    train_set_n_ca_a_nom = ca_a_fraction * train_set_size * ratio
    train_set_n_ca_a = round(train_set_n_ca_a_nom)
    train_set_n_ca_b_nom = ca_b_fraction * train_set_size * (1 - ratio)
    train_set_n_ca_b = round(train_set_n_ca_b_nom)
    train_set_n_nc_a_nom = nc_a_fraction * train_set_size * ratio
    train_set_n_nc_a = round(train_set_n_nc_a_nom)
    train_set_n_nc_b_nom = nc_b_fraction * train_set_size * (1 - ratio)
    train_set_n_nc_b = round(train_set_n_nc_b_nom)

    while train_set_n_ca_a + train_set_n_ca_b + train_set_n_nc_a + train_set_n_nc_b < train_set_size:
        diffs = [train_set_n_ca_a_nom - train_set_n_ca_a, train_set_n_ca_b_nom - train_set_n_ca_b,
                 train_set_n_nc_a_nom - train_set_n_nc_a, train_set_n_nc_b_nom - train_set_n_nc_b]
        max_diff_idx = diffs.index(max(diffs))
        if max_diff_idx == 0:
            train_set_n_ca_a += 1
        elif max_diff_idx == 1:
            train_set_n_ca_b += 1
        elif max_diff_idx == 2:
            train_set_n_nc_a += 1
        else:
            train_set_n_nc_b += 1

    while train_set_n_ca_a + train_set_n_ca_b + train_set_n_nc_a + train_set_n_nc_b > train_set_size:
        diffs = [train_set_n_ca_a_nom - train_set_n_ca_a, train_set_n_ca_b_nom - train_set_n_ca_b,
                 train_set_n_nc_a_nom - train_set_n_nc_a, train_set_n_nc_b_nom - train_set_n_nc_b]
        min_diff_idx = diffs.index(min(diffs))
        if min_diff_idx == 0:
            train_set_n_ca_a -= 1
        elif min_diff_idx == 1:
            train_set_n_ca_b -= 1
        elif min_diff_idx == 2:
            train_set_n_nc_a -= 1
        else:
            train_set_n_nc_b -= 1

    assert (train_set_n_ca_a + train_set_n_ca_b + train_set_n_nc_a + train_set_n_nc_b == train_set_size)

    return train_set_n_ca_a, train_set_n_ca_b, train_set_n_nc_a, train_set_n_nc_b


def assign_test_sets(df, n_test_sets, condition, rng):
    n_reruns = int(np.ceil(test_size_per_sex_per_group * n_test_sets / len(df)))
    for test_idx in range(0, n_test_sets):
        test_set_name = 'test_set_' + str(test_idx)
        df[test_set_name] = 0

    for rerun_idx in range(0, n_reruns):
        rerun_name = 'rerun_' + str(rerun_idx)
        df[rerun_name] = 0

    for rerun_idx in range(0, n_reruns):
        rerun_name = 'rerun_' + str(rerun_idx)

        grouped_patient = df.groupby('patient_id') # Group lesions from the same patient to ensure that they are not distributed across different tests or train/validation df

        for test_idx in range(0, n_test_sets):
            test_set_name = 'test_set_' + str(test_idx)

            while complement_lesions_to_use[test_idx]:
                patient_id = complement_lesions_to_use[test_idx].pop(0)
                print("patient_id: ", patient_id)
                print("without the patient_id: ", complement_lesions_to_use[test_idx])
                lesion = grouped_patient.get_group(patient_id)
                remaining = sum(df[rerun_name] == 0)
                missing = test_size_per_sex_per_group - sum(df[test_set_name])
                eligibles = lesion[(lesion[test_set_name] == 0) & (lesion[rerun_name] == 0)]
                eligible_count = len(eligibles)
                sample_size = min([missing, min([remaining, int(np.ceil(len(df) / n_reruns))])])
                sample_size = min(sample_size, eligible_count)  # Ensure sample size doesn't exceed eligible items
                local_sample = eligibles.sample(n=sample_size, replace=False, random_state=rng)

                df.loc[local_sample.index, test_set_name] = 1
                df.loc[local_sample.index, rerun_name] = 1
            
            for patient_id, lesion in grouped_patient:
                remaining = sum(df[rerun_name] == 0)
                missing = test_size_per_sex_per_group - sum(df[test_set_name])
                if missing < test_size_per_sex_per_group - 7 and 1 in lesion: # Only add patients with both cancerous and non-cancerous lesions in the first half of sampling process, to make sure there aren't too many complements to be used in the corresponding test set
                    continue
                if condition == 1 and 1 in lesion['both_cancerous_and_non_cancerous'].values: # Skip to next iteration if patient has both cancerous and non-cancerous lesions and we are in cancer set, as no more data is added afterwards, and therefore can't add more pairs
                    continue
                if (len(lesion) > missing): # Skip to next iteration if more lesions are grouped than are missing
                    continue                   
                if 1 in lesion['both_cancerous_and_non_cancerous'].values:
                    complement_lesions_to_use[test_idx].append(patient_id)
                eligibles = lesion[(lesion[test_set_name] == 0) & (lesion[rerun_name] == 0)]    

                eligible_count = len(eligibles)
                sample_size = min([missing, min([remaining, int(np.ceil(len(df) / n_reruns))])])
                sample_size = min(sample_size, eligible_count)  # Ensure sample size doesn't exceed eligible items
                local_sample = eligibles.sample(n=sample_size, replace=False, random_state=rng)

                df.loc[local_sample.index, test_set_name] = 1
                df.loc[local_sample.index, rerun_name] = 1     
            print("test set: ", test_set_name, " complement lesions remaining: ", complement_lesions_to_use[test_idx])
        if condition == 1:
            assert (complement_lesions_to_use[test_idx] == []) # Ensure that all lesions from same patient are used in the same test set
       

    for test_idx in range(0, n_test_sets):
        test_set_name = 'test_set_' + str(test_idx)
        print("length of test set: ", sum(df[test_set_name] == 1))
        assert (sum(df[test_set_name] == 1) == test_size_per_sex_per_group)
        
    return df


def check_unique(df) -> None:
    if 'lesion_id' in df.columns:
        assert (df.lesion_id.is_unique)
    if 'img_id' in df.columns:
        assert (df.img_id.is_unique)        
    if 'lesion_id' not in df.columns and 'img_id' not in df.columns:
        warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")    


def sort_df(df) -> None:
    if 'lesion_id' in df.columns:
        return df.sort_values('lesion_id')
    if 'img_id' in df.columns:
        return df.sort_values('img_id')
    if df.index.name in ['lesion_id', 'img_id']:
        return df.sort_index()
    if 'lesion_id' not in df.columns and 'img_id' not in df.columns and df.index.name not in ['lesion_id', 'img_id']:
        warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")
        

def clean_data(df):
    df = df.sort_values(["lesion_id"]) 
    df = df[~df["gender"].isna()] # Removing data without gender entry
    df = df.drop_duplicates("lesion_id", keep = "first") # Dropping duplicates, but keeping the first occurence of the lesion_id
    return df

def synthetic_fold(synthetic_length, fold_sizes_lst):

    # Calculate the equal part
    equal_part = synthetic_length // 5
    # Calculate the remainder
    remainder = synthetic_length % 5

    print("equal part: ", equal_part)
    print("remainder: ", remainder)

    # Add equal part to each element of the list
    fold_sizes_new_lst = [x + equal_part for x in fold_sizes_lst]

    print("fold sizes after equal part added: ", fold_sizes)
    # Increment the lowest value(s) by 1 for each remainder
    for _ in range(remainder):
        min_val = min(fold_sizes)
        min_indices = [i for i, val in enumerate(fold_sizes) if val == min_val]
        idx = random.choice(min_indices)
        fold_sizes_new_lst[idx] += 1

    return fold_sizes_new_lst

if __name__ == '__main__':

    n_test_sets = 5
    ratios = [0, 0.25, 0.5, 0.75, 1.0]
    n_folds = 5
    cancerous_conditions = ["BCC", "MEL", "SCC"]
    non_cancerous_conditions = ["ACK", "NEV", "SEK"]
    cancer = 1
    non_cancer = 0
    genders = ["FEMALE", "MALE"]
    rng = np.random.default_rng(1173).bit_generator
    complement_lesions_to_use = {}
    basepath = '../data/lr/lr_splitted_data/'
    basename = basepath + 'm_f_ca_nc'

    test_size_per_sex_per_group = 25  # 25 each of male/female non-cancerous/cancerous patients, i.e., total test size is 100

    for i in range(0, n_test_sets): # Create a list for each test set to store the complement lesions to use
        complement_lesions_to_use[i] = []

    # Dataframe with all metadata for the PAD-UFES-20 dataset where duplicate lesion_id's have been renamed
    df = pd.read_csv("../data/metadata/master_data.csv")
    df = clean_data(df) # Clean data after feature extraction
    check_unique(df)
    df['is_cancerous'] = df['diagnostic'].apply(lambda x: any(cancer in x for cancer in ['SCC', 'BCC', 'MEL'])).astype(int)
    print("pad-ufes length after clean \n", len(df))

    # Biggest training set possible, found by trial and error
    train_set_size = 415

    # Create boolean masks for cancerous and non-cancerous conditions
    cancerous_mask = df['diagnostic'].isin(cancerous_conditions)
    non_cancerous_mask = df['diagnostic'].isin(non_cancerous_conditions)

    # Calculate the fraction of cancerous conditions among all conditions
    cancerous_fraction = sum(cancerous_mask) / (sum(cancerous_mask) + sum(non_cancerous_mask))

    # Make dataframe with only non-cancerous conditions
    df_nc = df[non_cancerous_mask]

    # check for whether a patient has both cancerous and non cancerous lesions 
    nc_ca_df = df.groupby(['patient_id'])['is_cancerous'].nunique()

    # Filter groups where both True (cancerous) and False (non-cancerous) exist
    nc_ca_df = nc_ca_df[nc_ca_df == 2]

    # List of patient_id's with both cancerous and non-cancerous lesions
    nc_ca_df_patient_ids = nc_ca_df.index.tolist()
    print("length of patients with both cancerous and non-cancerous lesions: ", len(nc_ca_df_patient_ids))
    print("list: ", nc_ca_df_patient_ids)

    # Add identifier to the dataframe for patients with both cancerous and non-cancerous lesions
    df['both_cancerous_and_non_cancerous'] = 0
    df.loc[df['patient_id'].isin(nc_ca_df_patient_ids), 'both_cancerous_and_non_cancerous'] = 1

    # Split data into dataframes on gender and diagnostic
    ca_a_df, ca_b_df, nc_a_df, nc_b_df = get_ca_nc_split_dfs(df, 'gender') #ca: cancerous, nc: non-cancerous, a: female, b: male

    # Compute cross-validation fold sizes
    fold_size_base, fold_size_rem = divmod(train_set_size, n_folds)
    fold_sizes = []
    for ii in range(0, n_folds):
        if ii < fold_size_rem:
            fold_sizes.append(fold_size_base + 1)
        else:
            fold_sizes.append(fold_size_base)
    assert (sum(fold_sizes) == train_set_size)

    print("Just after creation: ", fold_sizes)

    # Set up the desired test sets
    nc_a_df = assign_test_sets(nc_a_df, n_test_sets, non_cancer, rng)
    ca_a_df = assign_test_sets(ca_a_df, n_test_sets, cancer, rng)
    nc_b_df = assign_test_sets(nc_b_df, n_test_sets, non_cancer, rng)
    ca_b_df = assign_test_sets(ca_b_df, n_test_sets, cancer, rng)

    print("length of female without cancer: ", len(nc_a_df))
    print("length of male without cancer: ", len(nc_b_df))

    for test_idx in range(0, n_test_sets):
        test_set_name = 'test_set_' + str(test_idx)

        test_df = sort_df(pd.concat([ca_a_df[ca_a_df[test_set_name] == 1],
                                     nc_a_df[nc_a_df[test_set_name] == 1],
                                     ca_b_df[ca_b_df[test_set_name] == 1],
                                     nc_b_df[nc_b_df[test_set_name] == 1]]))
        

        check_unique(test_df)

        # Convert test set to the same format as train/validation sets
        categorical_cols = ['smoke', 'drink', 'background_father', 'background_mother', 
                    'pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 
                    'has_piped_water', 'has_sewage_system', 'region', 'itch', 'grew', 
                    'hurt', 'changed', 'bleed', 'elevation', 'biopsed'] # Features with string or boolean values
        test_df = pd.get_dummies(test_df, columns=categorical_cols)

        test_df.to_csv(basename + f'_test_{test_idx}.csv')

        # Mark that nothing has been used in the training / validation sets belong to this test set yet
        ca_a_df.loc[:, 'used_with_curr_test'] = 0
        ca_b_df.loc[:, 'used_with_curr_test'] = 0
        nc_a_df.loc[:, 'used_with_curr_test'] = 0
        nc_b_df.loc[:, 'used_with_curr_test'] = 0

        print("Fold sizes before ratios: ", fold_sizes)
        ratios.sort()
        print("Fold sizes after ratios: ", fold_sizes)
        # Set up the training and validation sets for each test set
        for ratio in ratios:
            print("started loop: ", ratio)
            # ----- Determine (based on ratio and train_set_size) how many AD/HC F/M/1.5/3 there should be and
            # split into the four corresponding DFs.
            train_set_n_ca_a, train_set_n_ca_b, train_set_n_nc_a, train_set_n_nc_b = \
                get_train_set_sizes(ca_a_df, ca_b_df, nc_a_df, nc_b_df, ratio, train_set_size,
                                    cancerous_fraction=cancerous_fraction)

            # Compose a training + validation dataset with the desired sex ratio from the remaining non-test data
            # Reuse samples that have been used for earlier ratios wherever possible to minimize training set
            # variations across ratios.
            if ratio == min(ratios):
                # first ratio, just sample from scratch
                train_ca_a_df = ca_a_df[ca_a_df[test_set_name] == 0].sample(n=train_set_n_ca_a, random_state=rng)
                train_ca_b_df = ca_b_df[ca_b_df[test_set_name] == 0].sample(n=train_set_n_ca_b, random_state=rng)
                train_nc_a_df = nc_a_df[nc_a_df[test_set_name] == 0].sample(n=train_set_n_nc_a, random_state=rng)
                train_nc_b_df = nc_b_df[nc_b_df[test_set_name] == 0].sample(n=train_set_n_nc_b, random_state=rng)
            else:
                # We work with increasing ratios, i.e., we now have less males and more females than for the
                # previous ratio.
                # Draw males only from the ones that have been used so far
                train_ca_b_df = ca_b_df[ca_b_df['used_with_curr_test'] == 1].sample(n=train_set_n_ca_b,
                                                                                    random_state=rng)
                train_nc_b_df = nc_b_df[nc_b_df['used_with_curr_test'] == 1].sample(n=train_set_n_nc_b,
                                                                                    random_state=rng)
                # Use all females used so far + draw new ones as needed
                n_prev = ca_a_df['used_with_curr_test'].sum()
                train_ca_a_df = pd.concat([ca_a_df[ca_a_df['used_with_curr_test'] == 1],
                                            ca_a_df[(ca_a_df[test_set_name] == 0) & (
                                                        ca_a_df['used_with_curr_test'] == 0)].sample(
                                                n=train_set_n_ca_a - n_prev, random_state=rng)])
                n_prev = nc_a_df['used_with_curr_test'].sum()
                train_nc_a_df = pd.concat([nc_a_df[nc_a_df['used_with_curr_test'] == 1],
                                            nc_a_df[(nc_a_df[test_set_name] == 0) & (
                                                        nc_a_df['used_with_curr_test'] == 0)].sample(
                                                n=train_set_n_nc_a - n_prev, random_state=rng)])

            # Mark which ones we have used so far with the current test set + this and previous ratios
            ca_a_df.loc[train_ca_a_df.index, 'used_with_curr_test'] = 1
            ca_b_df.loc[train_ca_b_df.index, 'used_with_curr_test'] = 1
            nc_a_df.loc[train_nc_a_df.index, 'used_with_curr_test'] = 1
            nc_b_df.loc[train_nc_b_df.index, 'used_with_curr_test'] = 1

            train_and_vali_df = sort_df(pd.concat([train_ca_a_df, train_ca_b_df, train_nc_a_df, train_nc_b_df]))
            
            # SMOTE
            train_val_size = len(train_and_vali_df)
            cancerous_count = train_and_vali_df[train_and_vali_df['is_cancerous'] == 1].shape[0]
            non_cancerous_count = train_and_vali_df[train_and_vali_df['is_cancerous'] == 0].shape[0]

            # Calculate percentages
            non_cancerous_percentage = non_cancerous_count / train_val_size * 100
            cancerous_percentage = cancerous_count / train_val_size * 100

            # Print results
            print("Percentage of non-cancerous rows:", non_cancerous_percentage)
            print("Percentage of cancerous rows:", cancerous_percentage)


            # Apply SMOTE to the training and validation set in relation to cancer/non-cancer
            
            # Encode categorical string features to numerical values
            categorical_cols = ['smoke', 'drink', 'background_father', 'background_mother', 
                    'pesticide', 'gender', 'skin_cancer_history', 'cancer_history', 
                    'has_piped_water', 'has_sewage_system', 'region', 'itch', 'grew', 
                    'hurt', 'changed', 'bleed', 'elevation', 'biopsed'] # Features with string values
            df_encoded = pd.get_dummies(train_and_vali_df, columns=categorical_cols)

            df_encoded['is_cancerous'] = df_encoded['is_cancerous'].astype('category')
            X = df_encoded.drop(['is_cancerous', 'diagnostic', 'patient_id', 'img_id', 'lesion_id', 'asymmetry'], axis=1) # Asymmetry dropped, because it contains nan values
            y = df_encoded['is_cancerous'] # Label
            
            X.to_csv('../test/testSmote/XBefore.csv')
            y.to_csv('../test/testSmote/yBefore.csv')
            smote = SMOTE(sampling_strategy='not majority', random_state=2024) # our rng doesn't work with SMOTE
            print("y before resampling: ", y.value_counts())
            X_resampled, y_resampled = smote.fit_resample(X, y)
            X_resampled.to_csv('../test/testSmote/XAfter.csv')
            y_resampled.to_csv('../test/testSmote/yAfter.csv')
            print("y-labels after resampling: ", Counter(y_resampled))
            print("Resampled data :")
            print("y_resampled: ", y_resampled.value_counts())
            print("x_resampled: ", X_resampled)
            
            authentic_data_length = len(train_and_vali_df)

            train_and_vali_df = pd.merge(X_resampled, y_resampled, how ='outer', left_index=True, right_index=True)
            print("Train and vali dataframe after SMOTE: ", train_and_vali_df.head())

            synthetic_data_length = len(train_and_vali_df) - authentic_data_length

            print("Length of synthetic data: ", synthetic_data_length)

            print("fold_sizes before synthetic: ", fold_sizes) 
            ratio_fold_sizes = synthetic_fold(synthetic_data_length, fold_sizes) 
                  

            # Set up folds for cross-validation
            train_and_vali_df['fold'] = np.nan
            for fold_idx, fold_size in enumerate(ratio_fold_sizes):
                print("length of train and vali df: ", len(train_and_vali_df))
                print(train_and_vali_df.index)
                assert (train_and_vali_df.index.is_unique)
                train_and_vali_df.loc[train_and_vali_df[train_and_vali_df.fold.isna()].sample(n=fold_size,
                                                                                                random_state=rng).index, 'fold'] = fold_idx
            
            # check that everything looks nice
            check_unique(test_df)
            check_unique(train_and_vali_df)
            print("sum of foldsizes: ", sum(ratio_fold_sizes))
            print("length of train_and_vali_df: ", len(train_and_vali_df))
            assert (~train_and_vali_df.fold.isna().any())
            assert (len(train_and_vali_df) == train_set_size + synthetic_data_length)     

            for fold_idx in range(0, n_folds):
                train_df = train_and_vali_df[train_and_vali_df.fold != fold_idx]
                val_df = train_and_vali_df[train_and_vali_df.fold == fold_idx]
                
                # TODO: Make this test case work. Difficult because we removed id's for SMOTE
                # all_dfs:list[pd.DataFrame] = [train_df, val_df, test_df]
                # all_df = pd.concat(all_dfs)
                # check_unique(all_df)

                train_df.to_csv(basename + f'_train_{test_idx}_{ratio:.2f}_{fold_idx}.csv')
                val_df.to_csv(basename + f'_val_{test_idx}_{ratio:.2f}_{fold_idx}.csv')


            print("Completed loop with ratio ", ratio)  

pad-ufes length after clean 
 1173
length of patients with both cancerous and non-cancerous lesions:  110
list:  ['PAT_104', 'PAT_108', 'PAT_110', 'PAT_126', 'PAT_15', 'PAT_155', 'PAT_158', 'PAT_166', 'PAT_167', 'PAT_177', 'PAT_180', 'PAT_181', 'PAT_192', 'PAT_198', 'PAT_202', 'PAT_207', 'PAT_21', 'PAT_217', 'PAT_220', 'PAT_223', 'PAT_224', 'PAT_233', 'PAT_256', 'PAT_263', 'PAT_265', 'PAT_267', 'PAT_270', 'PAT_277', 'PAT_279', 'PAT_281', 'PAT_300', 'PAT_302', 'PAT_304', 'PAT_306', 'PAT_307', 'PAT_309', 'PAT_316', 'PAT_325', 'PAT_328', 'PAT_333', 'PAT_337', 'PAT_354', 'PAT_356', 'PAT_359', 'PAT_36', 'PAT_366', 'PAT_368', 'PAT_373', 'PAT_379', 'PAT_38', 'PAT_380', 'PAT_386', 'PAT_388', 'PAT_409', 'PAT_412', 'PAT_419', 'PAT_433', 'PAT_441', 'PAT_45', 'PAT_457', 'PAT_468', 'PAT_482', 'PAT_492', 'PAT_493', 'PAT_505', 'PAT_522', 'PAT_526', 'PAT_531', 'PAT_532', 'PAT_544', 'PAT_571', 'PAT_587', 'PAT_59', 'PAT_632', 'PAT_639', 'PAT_645', 'PAT_650', 'PAT_655', 'PAT_677', 'PAT_691', 'PAT_698', '

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0
started loop:  0.25
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({0: 293, 1: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74     3.000000    6.000000    6.000000                  4.789734   
1     65     2.000000   35.000000   15.000000                  1.293945   
2     62     2.000000    1.200000    1.200000                  1.884460   
3     71     2.000000   20.000000   16.000000                  9.794617   
4     53     2.000000   11.000000    7.000000                  7.284546   
..   ...          ...         ...         ...                       ...   
581   70     2.038638    7.154553    5.115914                  9.125734   
582   56     2.

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.25
started loop:  0.5
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({0: 293, 1: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74     3.000000    6.000000    6.000000                  4.789734   
1     65     2.000000   35.000000   15.000000                  1.293945   
2     71     2.000000   20.000000   16.000000                  9.794617   
3     70     2.000000   10.000000    6.000000                  5.009460   
4     55     3.000000    7.000000    6.000000                  7.095337   
..   ...          ...         ...         ...                       ...   
581   75     3.000000   24.536342   10.000000                  3.243553   
582   77     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.5
started loop:  0.75
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({0: 293, 1: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74     3.000000    6.000000    6.000000                  4.789734   
1     71     2.000000   20.000000   16.000000                  9.794617   
2     53     2.000000   11.000000    7.000000                  7.284546   
3     55     2.000000   16.000000    5.000000                  6.072998   
4     78     1.000000   10.000000   10.000000                  2.589417   
..   ...          ...         ...         ...                       ...   
581   76     2.961362   24.652257    9.806809                  3.018867   
582   63     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.75
started loop:  1.0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({0: 293, 1: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74          3.0    6.000000    6.000000                  4.789734   
1     71          2.0   20.000000   16.000000                  9.794617   
2     54          1.0   10.000000    8.000000                  8.815002   
3     55          2.0   16.000000    5.000000                  6.072998   
4     66          2.0   17.000000    9.000000                  8.137512   
..   ...          ...         ...         ...                       ...   
581   77          3.0   24.420428    9.884086                  4.868109   
582   61     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  1.0
Fold sizes before ratios:  [83, 83, 83, 83, 83]
Fold sizes after ratios:  [83, 83, 83, 83, 83]
started loop:  0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     65     2.000000   35.000000   15.000000                  1.293945   
2     55     3.000000    6.000000    5.000000                  8.799744   
3     62     2.000000    1.200000    1.200000                  1.884460   
4     59     4.000000   11.000000   10.000000                  9.756470   
..   ...          ...         ...         ...                       ...

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0
started loop:  0.25
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     65     2.000000   35.000000   15.000000                  1.293945   
3     55     3.000000    6.000000    5.000000                  8.799744   
4     62     2.000000    1.200000    1.200000                  1.884460   
..   ...          ...         ...         ...                       ...   
581   62     2.000000   14.961362   11.038638                 10.558914   
582   51     2.

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.25
started loop:  0.5
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74     1.000000   15.000000   10.000000                  5.961609   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     71     2.000000   20.000000   16.000000                  9.794617   
3     78     3.000000   11.000000    6.000000                  2.258301   
4     78     3.000000   11.000000    6.000000                  1.382446   
..   ...          ...         ...         ...                       ...   
581   62     2.038638   15.077276   11.154553                 10.409045   
582   61     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.5
started loop:  0.75
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     74     1.000000   15.000000   10.000000                  5.961609   
2     90     1.000000    7.000000    4.000000                  8.944702   
3     65     2.000000   35.000000   15.000000                  1.293945   
4     71     2.000000   20.000000   16.000000                  9.794617   
..   ...          ...         ...         ...                       ...   
581   79     3.000000    5.231829    4.077276                  5.700280   
582   30     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.75
started loop:  1.0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74     1.000000   15.000000   10.000000                  5.961609   
1     74     3.000000    6.000000    6.000000                  4.789734   
2     90     1.000000    7.000000    4.000000                  8.944702   
3     71     2.000000   20.000000   16.000000                  9.794617   
4     54     3.000000   15.000000   13.000000                  3.039551   
..   ...          ...         ...         ...                       ...   
581   76     2.961362   24.652257    9.806809                  3.018867   
582   71     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  1.0
Fold sizes before ratios:  [83, 83, 83, 83, 83]
Fold sizes after ratios:  [83, 83, 83, 83, 83]
started loop:  0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     65     2.000000   35.000000   15.000000                  1.293945   
2     55     3.000000    6.000000    5.000000                  8.799744   
3     62     2.000000    1.200000    1.200000                  1.884460   
4     59     4.000000   11.000000   10.000000                  9.756470   
..   ...          ...         ...         ...                       ...

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0
started loop:  0.25
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     65     2.000000   35.000000   15.000000                  1.293945   
3     62     2.000000    1.200000    1.200000                  1.884460   
4     59     4.000000   11.000000   10.000000                  9.756470   
..   ...          ...         ...         ...                       ...   
581   84     1.961362   10.806809    9.806809                  2.748914   
582   70     3.

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.25
started loop:  0.5
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({0: 293, 1: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     90     1.000000    7.000000    4.000000                  8.944702   
1     65     2.000000   35.000000   15.000000                  1.293945   
2     55     3.000000    6.000000    5.000000                  8.799744   
3     62     2.000000    1.200000    1.200000                  1.884460   
4     71     2.000000   20.000000   16.000000                  9.794617   
..   ...          ...         ...         ...                       ...   
581   67     2.000000   14.652257   11.690895                  4.960862   
582   45     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.5
started loop:  0.75
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     74     1.000000   15.000000   10.000000                  5.961609   
2     90     1.000000    7.000000    4.000000                  8.944702   
3     62     2.000000    1.200000    1.200000                  1.884460   
4     71     2.000000   20.000000   16.000000                  9.794617   
..   ...          ...         ...         ...                       ...   
581   58     4.922724    5.231829    5.038638                  1.721988   
582   47     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.75
started loop:  1.0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74     1.000000   15.000000   10.000000                  5.961609   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     71     2.000000   20.000000   16.000000                  9.794617   
3     78     3.000000   11.000000    6.000000                  2.258301   
4     56     2.000000    9.000000    8.000000                  8.895874   
..   ...          ...         ...         ...                       ...   
581   62     2.038638   18.922724   12.115914                  0.315503   
582   72     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  1.0
Fold sizes before ratios:  [83, 83, 83, 83, 83]
Fold sizes after ratios:  [83, 83, 83, 83, 83]
started loop:  0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     65     2.000000   35.000000   15.000000                  1.293945   
2     55     3.000000    6.000000    5.000000                  8.799744   
3     59     4.000000   11.000000   10.000000                  9.756470   
4     59     4.000000   11.000000   10.000000                  0.817871   
..   ...          ...         ...         ...                       ...

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0
started loop:  0.25
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     55     3.000000    6.000000    5.000000                  8.799744   
3     59     4.000000   11.000000   10.000000                  9.756470   
4     59     4.000000   11.000000   10.000000                  0.817871   
..   ...          ...         ...         ...                       ...   
581   61     2.038638   14.806809   10.922724                 10.351208   
582   67     2.

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.25
started loop:  0.5
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     54     3.000000   15.000000   13.000000                  3.039551   
3     59     4.000000   11.000000   10.000000                  9.756470   
4     59     4.000000   11.000000   10.000000                  0.817871   
..   ...          ...         ...         ...                       ...   
581   54     2.961362   10.038638   10.000000                  2.855818   
582   52     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.5
started loop:  0.75
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     74     1.000000   15.000000   10.000000                  5.961609   
2     90     1.000000    7.000000    4.000000                  8.944702   
3     71     2.000000   20.000000   16.000000                  9.794617   
4     54     3.000000   15.000000   13.000000                  3.039551   
..   ...          ...         ...         ...                       ...   
581   76     2.961362   24.343151    9.884086                  4.789873   
582   60     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.75
started loop:  1.0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74     1.000000   15.000000   10.000000                  5.961609   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     71     2.000000   20.000000   16.000000                  9.794617   
3     54     3.000000   15.000000   13.000000                  3.039551   
4     54     2.000000   12.000000   10.000000                 20.947266   
..   ...          ...         ...         ...                       ...   
581   50     1.077276   24.536342   10.922724                  4.656934   
582   60     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  1.0
Fold sizes before ratios:  [83, 83, 83, 83, 83]
Fold sizes after ratios:  [83, 83, 83, 83, 83]
started loop:  0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     65     2.000000   35.000000   15.000000                  1.293945   
2     55     3.000000    6.000000    5.000000                  8.799744   
3     62     2.000000    1.200000    1.200000                  1.884460   
4     59     4.000000   11.000000   10.000000                  9.756470   
..   ...          ...         ...         ...                       ...

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0
started loop:  0.25
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     55     3.000000    6.000000    5.000000                  8.799744   
3     62     2.000000    1.200000    1.200000                  1.884460   
4     52     1.000000   13.000000   10.000000                  4.364014   
..   ...          ...         ...         ...                       ...   
581   61     2.000000   14.690895   10.768171                 10.357045   
582   77     1.

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.25
started loop:  0.5
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({0: 293, 1: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     90     1.000000    7.000000    4.000000                  8.944702   
1     65     2.000000   35.000000   15.000000                  1.293945   
2     55     3.000000    6.000000    5.000000                  8.799744   
3     62     2.000000    1.200000    1.200000                  1.884460   
4     52     1.000000   13.000000   10.000000                  4.364014   
..   ...          ...         ...         ...                       ...   
581   70     2.038638    7.154553    5.115914                  9.125734   
582   61     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.5
started loop:  0.75
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     65     2.000000    7.000000    5.000000                  4.605103   
1     74     1.000000   15.000000   10.000000                  5.961609   
2     90     1.000000    7.000000    4.000000                  8.944702   
3     54     3.000000   15.000000   13.000000                  3.039551   
4     52     1.000000   13.000000   10.000000                  4.364014   
..   ...          ...         ...         ...                       ...   
581   51     1.038638   24.497704   10.961362                  3.966546   
582   86     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Completed loop with ratio  0.75
started loop:  1.0
Percentage of non-cancerous rows: 29.397590361445786
Percentage of cancerous rows: 70.60240963855422
y before resampling:  is_cancerous
1    293
0    122
Name: count, dtype: int64
y-labels after resampling:  Counter({1: 293, 0: 293})
Resampled data :
y_resampled:  is_cancerous
0    293
1    293
Name: count, dtype: int64
x_resampled:       age  fitspatrick  diameter_1  diameter_2  pigment_network_coverage  \
0     74     1.000000   15.000000   10.000000                  5.961609   
1     90     1.000000    7.000000    4.000000                  8.944702   
2     54     3.000000   15.000000   13.000000                  3.039551   
3     52     1.000000   13.000000   10.000000                  4.364014   
4     78     3.000000   11.000000    6.000000                  2.258301   
..   ...          ...         ...         ...                       ...   
581   77     3.000000   24.420428    9.884086                  4.868109   
582   64     

  warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")


Split data into training, validation and test tests for CNN, aka just img_id, gender, is_cancerous and fitzpatrick with no synthetic data, but additional data from augmented images. Splits twice, once augmenting images at most one time, and another augmenting each image the minimum amount of times necessairy to achieve 50/50 split

In [67]:
def get_ca_nc_split_dfs(df, split_col):

    print("pad-ufes before split \n", df.head())

    col_vals = df[split_col].unique() # Gets the unique values for the coloumn we choose to split our data on
    col_vals.sort()
    assert (len(col_vals) == 2) # Checks that our value (sex) is binary in our dataframe 
    a_df = df[df[split_col] == col_vals[0]] # Female, since it's sorted alphabetically
    b_df = df[df[split_col] == col_vals[1]] # Male
    a_nc_df = a_df[a_df['diagnostic'].isin(non_cancerous_conditions)]
    a_ca_df = a_df[a_df['diagnostic'].isin(cancerous_conditions)]
    b_nc_df = b_df[b_df['diagnostic'].isin(non_cancerous_conditions)]
    b_ca_df = b_df[b_df['diagnostic'].isin(cancerous_conditions)]
    return a_ca_df, b_ca_df, a_nc_df, b_nc_df


def get_train_set_sizes(ca_a_df, ca_b_df, nc_a_df, nc_b_df, ratio, train_set_size, cancerous_fraction=None):
    if cancerous_fraction is None:
        ca_a_fraction = len(ca_a_df) / (len(ca_a_df) + len(nc_a_df))
        ca_b_fraction = len(ca_b_df) / (len(ca_b_df) + len(nc_b_df))
        warn(
            "Using legacy group-wise disease label stratification. Will lead to different disease prevalences in the different groups in the training set. NOT RECOMMENDED.")
    else:
        ca_a_fraction = ca_b_fraction = cancerous_fraction

    nc_a_fraction = 1 - ca_a_fraction
    nc_b_fraction = 1 - ca_b_fraction
    train_set_n_ca_a_nom = ca_a_fraction * train_set_size * ratio
    train_set_n_ca_a = round(train_set_n_ca_a_nom)
    train_set_n_ca_b_nom = ca_b_fraction * train_set_size * (1 - ratio)
    train_set_n_ca_b = round(train_set_n_ca_b_nom)
    train_set_n_nc_a_nom = nc_a_fraction * train_set_size * ratio
    train_set_n_nc_a = round(train_set_n_nc_a_nom)
    train_set_n_nc_b_nom = nc_b_fraction * train_set_size * (1 - ratio)
    train_set_n_nc_b = round(train_set_n_nc_b_nom)

    while train_set_n_ca_a + train_set_n_ca_b + train_set_n_nc_a + train_set_n_nc_b < train_set_size:
        diffs = [train_set_n_ca_a_nom - train_set_n_ca_a, train_set_n_ca_b_nom - train_set_n_ca_b,
                 train_set_n_nc_a_nom - train_set_n_nc_a, train_set_n_nc_b_nom - train_set_n_nc_b]
        max_diff_idx = diffs.index(max(diffs))
        if max_diff_idx == 0:
            train_set_n_ca_a += 1
        elif max_diff_idx == 1:
            train_set_n_ca_b += 1
        elif max_diff_idx == 2:
            train_set_n_nc_a += 1
        else:
            train_set_n_nc_b += 1

    while train_set_n_ca_a + train_set_n_ca_b + train_set_n_nc_a + train_set_n_nc_b > train_set_size:
        diffs = [train_set_n_ca_a_nom - train_set_n_ca_a, train_set_n_ca_b_nom - train_set_n_ca_b,
                 train_set_n_nc_a_nom - train_set_n_nc_a, train_set_n_nc_b_nom - train_set_n_nc_b]
        min_diff_idx = diffs.index(min(diffs))
        if min_diff_idx == 0:
            train_set_n_ca_a -= 1
        elif min_diff_idx == 1:
            train_set_n_ca_b -= 1
        elif min_diff_idx == 2:
            train_set_n_nc_a -= 1
        else:
            train_set_n_nc_b -= 1

    assert (train_set_n_ca_a + train_set_n_ca_b + train_set_n_nc_a + train_set_n_nc_b == train_set_size)

    return train_set_n_ca_a, train_set_n_ca_b, train_set_n_nc_a, train_set_n_nc_b


def assign_test_sets(df, n_test_sets, condition, rng):

    n_reruns = int(np.ceil(test_size_per_sex_per_group * n_test_sets / len(df)))
    for test_idx in range(0, n_test_sets):
        test_set_name = 'test_set_' + str(test_idx)
        df[test_set_name] = 0

    for rerun_idx in range(0, n_reruns):
        rerun_name = 'rerun_' + str(rerun_idx)
        df[rerun_name] = 0
    
    df_without_aug = df[~df['img_id'].str.startswith('aug_')] # Makes a dataframe without the augmented data, so test sets are made with wholly original data
    df_with_aug = df[df['img_id'].str.startswith('aug_')] # Makes a dataframe with only the augmented data, to be concatenated with the test sets afterwards

    
    for rerun_idx in range(0, n_reruns):
        rerun_name = 'rerun_' + str(rerun_idx)

        grouped_patient = df_without_aug.groupby('patient_id') # Group lesions from the same patient to ensure that they are not distributed across different tests or train/validation df

        print("grouped patient: ", grouped_patient.describe())

        for test_idx in range(0, n_test_sets):
            test_set_name = 'test_set_' + str(test_idx)

            while complement_lesions_to_use[test_idx]:
                patient_id = complement_lesions_to_use[test_idx].pop(0)
                print("patient_id: ", patient_id)
                print("without the patient_id: ", complement_lesions_to_use[test_idx])
                lesion = grouped_patient.get_group(patient_id)
                remaining = sum(df_without_aug[rerun_name] == 0)
                missing = test_size_per_sex_per_group - sum(df_without_aug[test_set_name])
                eligibles = lesion[(lesion[test_set_name] == 0) & (lesion[rerun_name] == 0)]
                eligible_count = len(eligibles)
                sample_size = min([missing, min([remaining, int(np.ceil(len(df_without_aug) / n_reruns))])])
                sample_size = min(sample_size, eligible_count)  # Ensure sample size doesn't exceed eligible items
                local_sample = eligibles.sample(n=sample_size, replace=False, random_state=rng)

                df_without_aug.loc[local_sample.index, test_set_name] = 1
                df_without_aug.loc[local_sample.index, rerun_name] = 1
            
            for patient_id, lesion in grouped_patient:
                remaining = sum(df_without_aug[rerun_name] == 0)
                missing = test_size_per_sex_per_group - sum(df_without_aug[test_set_name])
                if missing < test_size_per_sex_per_group - 7 and 1 in lesion: # Only add patients with both cancerous and non-cancerous lesions in the first half of sampling process, to make sure there aren't too many complements to be used in the corresponding test set
                    continue
                if condition == 1 and 1 in lesion['both_cancerous_and_non_cancerous'].values: # Skip to next iteration if patient has both cancerous and non-cancerous lesions and we are in cancer set, as no more data is added afterwards, and therefore can't add more pairs
                    continue
                if (len(lesion) > missing): # Skip to next iteration if more lesions are grouped than are missing
                    continue                   
                if 1 in lesion['both_cancerous_and_non_cancerous'].values:
                    complement_lesions_to_use[test_idx].append(patient_id)
                eligibles = lesion[(lesion[test_set_name] == 0) & (lesion[rerun_name] == 0)]    

                eligible_count = len(eligibles)
                sample_size = min([missing, min([remaining, int(np.ceil(len(df_without_aug) / n_reruns))])])
                sample_size = min(sample_size, eligible_count)  # Ensure sample size doesn't exceed eligible items
                local_sample = eligibles.sample(n=sample_size, replace=False, random_state=rng)

                df_without_aug.loc[local_sample.index, test_set_name] = 1
                df_without_aug.loc[local_sample.index, rerun_name] = 1     
            print("test set: ", test_set_name, " complement lesions remaining: ", complement_lesions_to_use[test_idx])
        if condition == 1:
            assert (complement_lesions_to_use[test_idx] == []) # Ensure that all lesions from same patient are used in the same test set

    df = pd.concat([df_without_aug, df_with_aug]) # Concatenate the augmented data back to the dataframe

    for test_idx in range(0, n_test_sets):
        test_set_name = 'test_set_' + str(test_idx)
        assert (sum(df[test_set_name] == 1) == test_size_per_sex_per_group)

    return df


def check_unique(df) -> None:
    if 'lesion_id' in df.columns:
        assert (df.lesion_id.is_unique)
    if 'img_id' in df.columns:
        assert (df.img_id.is_unique)        
    if 'lesion_id' not in df.columns and 'img_id' not in df.columns:
        warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")    


def sort_df(df) -> None:
    if 'lesion_id' in df.columns:
        return df.sort_values('lesion_id')
    if 'img_id' in df.columns:
        return df.sort_values('img_id')
    if df.index.name in ['lesion_id', 'img_id']:
        return df.sort_index()
    if 'lesion_id' not in df.columns and 'img_id' not in df.columns and df.index.name not in ['lesion_id', 'img_id']:
        warn("could not check subject uniqueness since neither 'lesion_id' nor 'img_id' column present")

def clean_data(df):
    df = df.sort_values(["lesion_id"]) 
    df = df[~df["gender"].isna()] # Removing data without gender entry
    df = df.drop_duplicates("lesion_id", keep = "first") # Dropping duplicates, but keeping the first occurence of the lesion_id

    df['is_cancerous'] = df['diagnostic'].apply(lambda x: any(cancer in x for cancer in ['SCC', 'BCC', 'MEL'])).astype(int) # Add is_cancerous column to df, with 1 being cancerous and 0 being non-cancerous

    # Remove unnecessary columns for cleanliness
    df = df[['patient_id', 'img_id', 'lesion_id', 'gender', 'fitspatrick', 'is_cancerous', 'diagnostic']] # Pad-Ufes has misspelled fitzpatrick and I can't be bothered to change all our datasets right now
    
    return df

def get_augmented_data(df):
    # Call function for balancing dataset via augmented data
    df_once_augmented, df_50_50 = image_augmentation.balance_dataset(df, 'is_cancerous')
    
    return df_once_augmented, df_50_50


if __name__ == '__main__':
    
    n_test_sets = 5
    ratios = [0, 0.25, 0.5, 0.75, 1.0]
    n_folds = 5
    cancerous_conditions = ["BCC", "MEL", "SCC"]
    non_cancerous_conditions = ["ACK", "NEV", "SEK"]
    cancer = 1
    non_cancer = 0
    genders = ["FEMALE", "MALE"]
    rng = np.random.default_rng(1173).bit_generator

    basepaths = ['../data/cnn/cnn_splitted_data_once_augmented/',  '../data/cnn/cnn_splitted_data_50_50_split/'] # List of basepaths for the two different augmented datasets

    test_size_per_sex_per_group = 26 # 24 each of male/female non-cancerous/cancerous patients, i.e., total test size is 100

    df = pd.read_csv("../data/metadata/fixed_metadata.csv") # Dataframe with all metadata for the PAD-UFES-20 dataset where duplicate lesion_id's have been renamed
    df = clean_data(df) # Clean data after feature extraction
    df_once_augmented, df_50_50 = get_augmented_data(df) # add augmented images to dataset (df 50/50 not used in this cell)

    dfs = [df_once_augmented, df_50_50] # List of dataframes to iterate over

    df_once_augmented.to_csv('../data/metadata/once_augmented.csv', index=False)
    df_50_50.to_csv('../data/metadata/50_50_augmented.csv', index=False)

    # for i in range(len(basepaths)):
    basepath = basepaths[0]
    df = dfs[0]

    basename = basepath + 'm_f_ca_nc'
    check_unique(df)
    
    print("pad-ufes length after clean \n", len(df))

    # Biggest training set possible, found by trial and error
    train_set_size = 535

    # Create boolean masks for cancerous and non-cancerous conditions
    cancerous_mask = df['diagnostic'].isin(cancerous_conditions)
    non_cancerous_mask = df['diagnostic'].isin(non_cancerous_conditions)

    # Calculate the fraction of cancerous conditions among all conditions
    cancerous_fraction = sum(cancerous_mask) / (sum(cancerous_mask) + sum(non_cancerous_mask))

    # Make dataframe with only non-cancerous conditions
    df_nc = df[non_cancerous_mask]

       # check for whether a patient has both cancerous and non cancerous lesions 
    nc_ca_df = df.groupby(['patient_id'])['is_cancerous'].nunique()

    # Filter groups where both True (cancerous) and False (non-cancerous) exist
    nc_ca_df = nc_ca_df[nc_ca_df == 2]

    # List of patient_id's with both cancerous and non-cancerous lesions
    nc_ca_df_patient_ids = nc_ca_df.index.tolist()
    print("length of patients with both cancerous and non-cancerous lesions: ", len(nc_ca_df_patient_ids))
    print("list: ", nc_ca_df_patient_ids)

    # Add identifier to the dataframe for patients with both cancerous and non-cancerous lesions
    df['both_cancerous_and_non_cancerous'] = 0
    df.loc[df['patient_id'].isin(nc_ca_df_patient_ids), 'both_cancerous_and_non_cancerous'] = 1

    # Split data into dataframes on gender and diagnostic
    ca_a_df, ca_b_df, nc_a_df, nc_b_df = get_ca_nc_split_dfs(df, 'gender') #ca: cancerous, nc: non-cancerous, a: female, b: male

    # Compute cross-validation fold sizes
    fold_size_base, fold_size_rem = divmod(train_set_size, n_folds)
    fold_sizes = []
    for ii in range(0, n_folds):
        if ii < fold_size_rem:
            fold_sizes.append(fold_size_base + 1)
        else:
            fold_sizes.append(fold_size_base)
    assert (sum(fold_sizes) == train_set_size)

    print("Just after creation: ", fold_sizes)

    # Set up the desired test sets
    nc_a_df = assign_test_sets(nc_a_df, n_test_sets, non_cancer, rng)
    ca_a_df = assign_test_sets(ca_a_df, n_test_sets, cancer, rng)
    nc_b_df = assign_test_sets(nc_b_df, n_test_sets, non_cancer, rng)
    ca_b_df = assign_test_sets(ca_b_df, n_test_sets, cancer, rng)

    print("length of female without cancer: ", len(nc_a_df))
    print("length of male without cancer: ", len(nc_b_df))

    for test_idx in range(0, n_test_sets):

        test_set_name = 'test_set_' + str(test_idx)

        test_df = sort_df(pd.concat([ca_a_df[ca_a_df[test_set_name] == 1],
                                    nc_a_df[nc_a_df[test_set_name] == 1],
                                    ca_b_df[ca_b_df[test_set_name] == 1],
                                    nc_b_df[nc_b_df[test_set_name] == 1]]))

        check_unique(test_df)

        test_df.to_csv(basename + f'_test_{test_idx}.csv')

        # Mark corresponding augmented lesions ot the ones used in test as used in test set as well
        img_with_corresponding_aug = test_df['img_id'].tolist() 
        img_with_corresponding_aug = ['aug_' + img_id for img_id in img_with_corresponding_aug]
        print("img with corresponding aug: ", img_with_corresponding_aug)
        print("non-cancerous dataframes before marking: ", nc_a_df, nc_b_df)
        nc_a_df.to_csv(f'../test/test_csvs/before_addition_{test_idx}.csv')

        for img_id in img_with_corresponding_aug:
            if img_id in nc_a_df['img_id'].values: # Since we only augment non-cancerous lesions, we only need to check these
                nc_a_df.loc[nc_a_df['img_id'] == img_id, test_set_name] = 1
            if img_id in nc_b_df['img_id'].values: # Since we only augment non-cancerous lesions, we only need to check these
                nc_b_df.loc[nc_b_df['img_id'] == img_id, test_set_name] = 1
        
        print("non-cancerous dataframes before marking: ", nc_a_df, nc_b_df)
        nc_a_df.to_csv(f'../test/test_csvs/after_addition_{test_idx}.csv')

        print("length of each test set for test set: ", test_set_name, len(ca_a_df[ca_a_df[test_set_name] == 1]), len(nc_a_df[nc_a_df[test_set_name] == 1]), len(ca_b_df[ca_b_df[test_set_name] == 1]), len(nc_b_df[nc_b_df[test_set_name] == 1]))
        print("length of each split: ", test_set_name, len(ca_a_df), len(nc_a_df), len(ca_b_df), len(nc_b_df))


        # Mark that nothing has been used in the training / validation sets belong to this test set yet
        ca_a_df.loc[:, 'used_with_curr_test'] = 0
        ca_b_df.loc[:, 'used_with_curr_test'] = 0
        nc_a_df.loc[:, 'used_with_curr_test'] = 0
        nc_b_df.loc[:, 'used_with_curr_test'] = 0

        print("Fold sizes before ratios: ", fold_sizes)
        ratios.sort()
        print("Fold sizes after ratios: ", fold_sizes)
        # Set up the training and validation sets for each test set
        for ratio in ratios:
            print("started loop: ", ratio)
            # ----- Determine (based on ratio and train_set_size) how many AD/HC F/M/1.5/3 there should be and
            # split into the four corresponding DFs.
            train_set_n_ca_a, train_set_n_ca_b, train_set_n_nc_a, train_set_n_nc_b = \
                get_train_set_sizes(ca_a_df, ca_b_df, nc_a_df, nc_b_df, ratio, train_set_size,
                                    cancerous_fraction=cancerous_fraction)

            # Compose a training + validation dataset with the desired sex ratio from the remaining non-test data
            # Reuse samples that have been used for earlier ratios wherever possible to minimize training set
            # variations across ratios.
            if ratio == min(ratios):
                # first ratio, just sample from scratch
                train_ca_a_df = ca_a_df[ca_a_df[test_set_name] == 0].sample(n=train_set_n_ca_a, random_state=rng)
                train_ca_b_df = ca_b_df[ca_b_df[test_set_name] == 0].sample(n=train_set_n_ca_b, random_state=rng)
                train_nc_a_df = nc_a_df[nc_a_df[test_set_name] == 0].sample(n=train_set_n_nc_a, random_state=rng)
                train_nc_b_df = nc_b_df[nc_b_df[test_set_name] == 0].sample(n=train_set_n_nc_b, random_state=rng)
            else:
                # We work with increasing ratios, i.e., we now have less males and more females than for the
                # previous ratio.
                # Draw males only from the ones that have been used so far
                train_ca_b_df = ca_b_df[ca_b_df['used_with_curr_test'] == 1].sample(n=train_set_n_ca_b,
                                                                                    random_state=rng)
                train_nc_b_df = nc_b_df[nc_b_df['used_with_curr_test'] == 1].sample(n=train_set_n_nc_b,
                                                                                    random_state=rng)
                # Use all females used so far + draw new ones as needed
                n_prev = ca_a_df['used_with_curr_test'].sum()
                train_ca_a_df = pd.concat([ca_a_df[ca_a_df['used_with_curr_test'] == 1],
                                            ca_a_df[(ca_a_df[test_set_name] == 0) & (
                                                        ca_a_df['used_with_curr_test'] == 0)].sample(
                                                n=train_set_n_ca_a - n_prev, random_state=rng)])
                n_prev = nc_a_df['used_with_curr_test'].sum()
                train_nc_a_df = pd.concat([nc_a_df[nc_a_df['used_with_curr_test'] == 1],
                                            nc_a_df[(nc_a_df[test_set_name] == 0) & (
                                                        nc_a_df['used_with_curr_test'] == 0)].sample(
                                                n=train_set_n_nc_a - n_prev, random_state=rng)])

            # Mark which ones we have used so far with the current test set + this and previous ratios
            ca_a_df.loc[train_ca_a_df.index, 'used_with_curr_test'] = 1
            ca_b_df.loc[train_ca_b_df.index, 'used_with_curr_test'] = 1
            nc_a_df.loc[train_nc_a_df.index, 'used_with_curr_test'] = 1
            nc_b_df.loc[train_nc_b_df.index, 'used_with_curr_test'] = 1

            train_and_vali_df = sort_df(pd.concat([train_ca_a_df, train_ca_b_df, train_nc_a_df, train_nc_b_df]))
            

            # Set up folds for cross-validation
            train_and_vali_df['fold'] = np.nan
            for fold_idx, fold_size in enumerate(fold_sizes):
                print("length of train and vali df: ", len(train_and_vali_df))
                print(train_and_vali_df.index)
                assert (train_and_vali_df.index.is_unique)
                train_and_vali_df.loc[train_and_vali_df[train_and_vali_df.fold.isna()].sample(n=fold_size,
                                                                                                random_state=rng).index, 'fold'] = fold_idx
            
            # check that everything looks nice
            check_unique(test_df)
            check_unique(train_and_vali_df)
            print("length of train_and_vali_df: ", len(train_and_vali_df))
            assert (~train_and_vali_df.fold.isna().any())   

            for fold_idx in range(0, n_folds):
                train_df = train_and_vali_df[train_and_vali_df.fold != fold_idx]
                val_df = train_and_vali_df[train_and_vali_df.fold == fold_idx]
                
                all_dfs:list[pd.DataFrame] = [train_df, val_df, test_df]
                all_df = pd.concat(all_dfs)
                check_unique(all_df)

                train_df.to_csv(basename + f'_train_{test_idx}_{ratio:.2f}_{fold_idx}.csv')
                val_df.to_csv(basename + f'_val_{test_idx}_{ratio:.2f}_{fold_idx}.csv')


            print("Completed loop with ratio ", ratio)  

Augmenting images of class 0
Augmenting 487 images of  346  images
pad-ufes length after clean 
 1525
length of patients with both cancerous and non-cancerous lesions:  110
list:  ['PAT_104', 'PAT_108', 'PAT_110', 'PAT_126', 'PAT_15', 'PAT_155', 'PAT_158', 'PAT_166', 'PAT_167', 'PAT_177', 'PAT_180', 'PAT_181', 'PAT_192', 'PAT_198', 'PAT_202', 'PAT_207', 'PAT_21', 'PAT_217', 'PAT_220', 'PAT_223', 'PAT_224', 'PAT_233', 'PAT_256', 'PAT_263', 'PAT_265', 'PAT_267', 'PAT_270', 'PAT_277', 'PAT_279', 'PAT_281', 'PAT_300', 'PAT_302', 'PAT_304', 'PAT_306', 'PAT_307', 'PAT_309', 'PAT_316', 'PAT_325', 'PAT_328', 'PAT_333', 'PAT_337', 'PAT_354', 'PAT_356', 'PAT_359', 'PAT_36', 'PAT_366', 'PAT_368', 'PAT_373', 'PAT_379', 'PAT_38', 'PAT_380', 'PAT_386', 'PAT_388', 'PAT_409', 'PAT_412', 'PAT_419', 'PAT_433', 'PAT_441', 'PAT_45', 'PAT_457', 'PAT_468', 'PAT_482', 'PAT_492', 'PAT_493', 'PAT_505', 'PAT_522', 'PAT_526', 'PAT_531', 'PAT_532', 'PAT_544', 'PAT_571', 'PAT_587', 'PAT_59', 'PAT_632', 'PAT_639', 

In [2]:

def clean_data(df):
    df = df.sort_values(["lesion_id"]) 
    df = df[~df["gender"].isna()] # Removing data without gender entry
    df = df.drop_duplicates("lesion_id", keep = "first") # Dropping duplicates, but keeping the first occurence of the lesion_id

    df['is_cancerous'] = df['diagnostic'].apply(lambda x: any(cancer in x for cancer in ['SCC', 'BCC', 'MEL'])).astype(int) # Add is_cancerous column to df, with 1 being cancerous and 0 being non-cancerous

    # Remove unnecessary columns for cleanliness
    df = df[['img_id', 'lesion_id', 'gender', 'fitspatrick', 'is_cancerous', 'diagnostic']] # Pad-Ufes has misspelled fitzpatrick and I can't be bothered to change all our datasets right now
    
    return df


def get_augmented_data(df):
    # Call function for balancing dataset via augmented data
    df_once_augmented, df_50_50 = image_augmentation.balance_dataset(df, 'is_cancerous')
    
    return df_once_augmented, df_50_50

df = pd.read_csv("../data/metadata/fixed_metadata.csv") # Dataframe with all metadata for the PAD-UFES-20 dataset where duplicate lesion_id's have been renamed
df = clean_data(df) # Clean data after feature extraction
df_once_augmented, df_50_50 = get_augmented_data(df) # add augmented images to dataset

df_once_augmented.to_csv('../data/metadata/once_augmented.csv', index=False)

df_50_50.to_csv('../data/metadata/50_50_augmented.csv', index=False)


Counts of class 0 and class
Augmenting images of class 0
Number of augmentable images:  346
Augmenting 487 images of  346  images
Entire dataframe of images to augment:                      img_id  lesion_id  gender  fitspatrick  is_cancerous  \
200       PAT_90_3_648.png          3  FEMALE          3.0             0   
286      PAT_256_4_583.png          4  FEMALE          1.0             0   
355       PAT_91_5_332.png          5    MALE          2.0             0   
363      PAT_236_7_180.png          7    MALE          3.0             0   
609     PAT_180_10_831.png         10  FEMALE          3.0             0   
...                    ...        ...     ...          ...           ...   
2266  PAT_886_4635_304.png       4635  FEMALE          3.0             0   
2274  PAT_877_4672_612.png       4672    MALE          2.0             0   
2278  PAT_492_4723_890.png       4723    MALE          2.0             0   
2282  PAT_975_4734_783.png       4734    MALE          2.0            

In [5]:
df_once = pd.read_csv("../data/metadata/once_augmented.csv") # Dataframe with all metadata for the PAD-UFES-20 dataset where duplicate lesion_id's have been renamed
df_50_50 = pd.read_csv("../data/metadata/50_50_augmented.csv") # Dataframe with all metadata for the PAD-UFES-20 dataset where duplicate lesion_id's have been renamed

print("cancerous once augmented: ", df_once[df_once['is_cancerous'] == 1].shape[0])
print("non-cancerous once augmented: ", df_once[df_once['is_cancerous'] == 0].shape[0])

print("ratio: ", (df_once[df_once['is_cancerous'] == 1].shape[0] / df_once[df_once['is_cancerous'] ==0].shape[0]))

print("cancerous once augmented: ", df_50_50[df_50_50['is_cancerous'] == 1].shape[0])
print("non-cancerous once augmented: ", df_50_50[df_50_50['is_cancerous'] == 0].shape[0])

print("ratio: ", (df_50_50[df_50_50['is_cancerous'] == 1].shape[0] / df_50_50[df_50_50['is_cancerous'] ==0].shape[0]))

_


In [26]:
import pandas as pd

# Assuming df is your dataframe
# Group by patient_id and lesion_id, count unique values of is_cancerous
df = pd.read_csv("../data/metadata/once_augmented.csv")
grouped = df.groupby(['patient_id'])['is_cancerous'].nunique()

# Filter groups where both True (cancerous) and False (non-cancerous) exist
filtered = grouped[grouped == 2]

# Count the number of unique patients from filtered groups
num_patients = filtered.index.get_level_values('patient_id').nunique()

print("Number of patients with different lesions having both cancerous and non-cancerous:", num_patients)

# Assuming df is your dataframe

# Group by patient_id and count unique lesions for each patient
lesion_counts = df.groupby('patient_id')['lesion_id'].nunique()

# Filter patients with multiple lesions
patients_with_multiple_lesions = lesion_counts[lesion_counts > 2]

# Count the number of patients with multiple lesions
num_patients_with_multiple_lesions = patients_with_multiple_lesions.count()

# Print the number of patients with multiple lesions and their respective lesion counts
print("Number of patients with multiple lesions:", num_patients_with_multiple_lesions)
print("Lesion counts for patients with multiple lesions:")
print(patients_with_multiple_lesions)


Number of patients with different lesions having both cancerous and non-cancerous: 110
Number of patients with multiple lesions: 174
Lesion counts for patients with multiple lesions:
patient_id
PAT_104    5
PAT_108    5
PAT_110    3
PAT_115    4
PAT_126    5
          ..
PAT_935    4
PAT_963    4
PAT_966    3
PAT_975    3
PAT_983    4
Name: lesion_id, Length: 174, dtype: int64
