# Config

In [18]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import GroupShuffleSplit

In [19]:
# Root of the project tree. Defaults to the original location, but can be
# overridden with the PROJECT_DIR environment variable so the notebook runs
# on other machines without editing this cell.
PROJECT_DIR = os.environ.get('PROJECT_DIR', '/home/jupyter')
# Directory that holds the processed metadata CSV files.
METADATA_DIR = os.path.join(PROJECT_DIR, 'data', 'processed_metadata')

# Reproducibility

In [20]:
# Ensure deterministic behavior.
# The seed has to be a fixed integer: Python salts str hashes per process
# (see PYTHONHASHSEED), so a seed derived from hash("...") changes on every
# kernel restart. In addition, `hash(...) % 2**32 - 1` parses as
# `(hash(...) % 2**32) - 1`, which can be -1 and makes np.random.seed raise.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Split train, validation and test sets for each diagnosed spinal condition

<div style="font-size: 16px;">
    Note: We split the data by study ID to avoid data leakage (i.e., no study appears in more than one of the train, validation and test sets).
</div>

In [21]:
# Load the combined metadata table and collect the distinct condition labels
# (in order of first appearance) that drive the per-condition splits.
metadata_csv = os.path.join(METADATA_DIR, 'processed_metadata.csv')
df = pd.read_csv(metadata_csv)
condition_types = pd.unique(df['condition']).tolist()

In [22]:
def group_split(df, group_col, test_size=0.15, random_state=42):
    """Split ``df`` in two so that rows sharing a ``group_col`` value stay together.

    Args:
        df: DataFrame to split.
        group_col: Name of the column whose values define the groups.
        test_size: Forwarded to ``GroupShuffleSplit`` as the size of the
            held-out part.
        random_state: Forwarded to ``GroupShuffleSplit`` to make the shuffle
            repeatable.

    Returns:
        A ``(remaining, held_out)`` tuple of DataFrames selected from ``df``
        by position; the original index labels are kept.
    """
    shuffle_split = GroupShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state,
    )
    # n_splits=1, so the generator yields exactly one pair of position arrays.
    keep_positions, holdout_positions = next(
        shuffle_split.split(df, groups=df[group_col])
    )
    return df.iloc[keep_positions], df.iloc[holdout_positions]

In [26]:
for condition in condition_types:
    print(condition, ":")
    # Remove spaces so the condition can be used in file and directory names
    condition = condition.replace(" ", "")

    # Read in the dataset exclusive to the condition
    df_condition = pd.read_csv(os.path.join(METADATA_DIR, 'processed_metadata_' + condition + '.csv'))

    # Split into train, validation and test.
    # Both calls use group_split's default test_size, so the validation set is
    # carved out of the remaining train+val portion, not out of the full dataset.
    train_val_df, test_df = group_split(df_condition, 'study_id')
    train_df, val_df = group_split(train_val_df, 'study_id')

    # Verify the no-leakage guarantee instead of only stating it: no study may
    # appear in more than one subset, and no row may be lost or duplicated.
    train_ids = set(train_df['study_id'])
    val_ids = set(val_df['study_id'])
    test_ids = set(test_df['study_id'])
    assert train_ids.isdisjoint(val_ids), f"{condition}: train/val share study IDs"
    assert train_ids.isdisjoint(test_ids), f"{condition}: train/test share study IDs"
    assert val_ids.isdisjoint(test_ids), f"{condition}: val/test share study IDs"
    assert len(train_df) + len(val_df) + len(test_df) == len(df_condition), \
        f"{condition}: subset sizes do not add up to the full dataset"

    print(f"Number of samples in training set: {train_df.shape[0]}")
    print(f"Number of samples in validation set: {val_df.shape[0]}")
    print(f"Number of samples in test set: {test_df.shape[0]}")

    # Write out the split subsets, one directory per condition
    WRITE_DIR = os.path.join(METADATA_DIR, condition)
    os.makedirs(WRITE_DIR, exist_ok=True)

    train_df.to_csv(os.path.join(WRITE_DIR, 'train.csv'), index=False)
    val_df.to_csv(os.path.join(WRITE_DIR, 'val.csv'), index=False)
    test_df.to_csv(os.path.join(WRITE_DIR, 'test.csv'), index=False)

Spinal Canal Stenosis :
Number of samples in training set: 7045
Number of samples in validation set: 1245
Number of samples in test set: 1463
Left Neural Foraminal Narrowing :
Number of samples in training set: 7120
Number of samples in validation set: 1260
Number of samples in test set: 1480
Right Neural Foraminal Narrowing :
Number of samples in training set: 7070
Number of samples in validation set: 1249
Number of samples in test set: 1471
Left Subarticular Stenosis :
Number of samples in training set: 6939
Number of samples in validation set: 1223
Number of samples in test set: 1441
Right Subarticular Stenosis :
Number of samples in training set: 2999
Number of samples in validation set: 523
Number of samples in test set: 633
