In [2]:
# Splits nodules between training and testing sets, using subtype data to stratify the sets

In [3]:
import pandas as pd
import numpy as np
import random

In [29]:
# Directory that holds every input and output csv (trailing slash included)
data_folder = "../data/"

# Input csv files: subgroup assignments and max-slice features per nodule
subtypes_path = f"{data_folder}lidc_spic_subgrouped.csv"
max_slice_path = f"{data_folder}LIDC_20130817_AllFeatures2D_MaxSlicePerNodule_inLineRatings.csv"

# Output csv that receives the train/test flag for each nodule
split_path = f"{data_folder}lidc_train_test_split_stratified.csv"

In [30]:
# Group the nodule IDs by the subgroup each nodule belongs to

subtype_col = "subgroup"
nodule_col = "noduleID"
subtype_df = pd.read_csv(subtypes_path)

# Maps each distinct subgroup label to the Series of nodule IDs carrying it
nodules = {}

subtype_labels = subtype_df[subtype_col]
for subtype in subtype_labels.unique():
    has_label = subtype_labels == subtype
    nodules[subtype] = subtype_df.loc[has_label, nodule_col]

In [34]:
# Randomly sample a portion of each subtype into training and test sets.
# Splitting inside every subtype (rather than over the whole pool) keeps the
# subtype proportions of both sets close to those of the full data.

train_frac = 0.8

# Fixed seed + dedicated generator: re-running this cell (or the whole
# notebook) reproduces the exact same split and leaves the global `random`
# state untouched.
split_seed = 42
split_rng = random.Random(split_seed)

train_set, test_set = [], []

for subtype in nodules:
    # int() truncates, so very small subtypes can end up with no training
    # nodules (e.g. a single-nodule subtype goes entirely to the test set).
    threshold = int(train_frac * len(nodules[subtype]))

    # Shuffle a *copy* of the IDs. `.values` may be a view of the Series' own
    # buffer: shuffling it in place would scramble `nodules[subtype]` on every
    # re-run, and it raises when pandas returns a read-only array
    # (copy-on-write).
    nodule_list = nodules[subtype].values.copy()
    split_rng.shuffle(nodule_list)

    train_set.extend(nodule_list[:threshold])
    test_set.extend(nodule_list[threshold:])

In [35]:
# Format into dataframe: one row per nodule (same order and index as
# subtype_df) with a "dataset" column holding "train" or "test".

# Vectorised membership test. Every nodule that is not in train_set is
# labelled "test", exactly as the row-by-row version did, but without the
# per-row list scan (quadratic) and without first filling the column with
# uninitialised floats from np.empty and then overwriting them with strings
# (an incompatible-dtype assignment that recent pandas warns on or rejects).
is_train = subtype_df[nodule_col].isin(train_set)

train_test_df = pd.DataFrame({
    nodule_col: subtype_df[nodule_col],
    "dataset": np.where(is_train, "train", "test"),
})

In [36]:
# Write the train/test assignment to disk, leaving out the dataframe index

train_test_df.to_csv(path_or_buf=split_path, index=False)