In [1]:
import pandas as pd
import re, csv
from ast import literal_eval
from os import listdir
from os.path import isfile, join
import numpy as np
import random

Splits the original train dataset into train, validation, and test subsets.
Only the cases listed in CASE_NUMS are kept, and a fixed number of patient notes is sampled per case for each subset.

In [2]:
# Input files: the original annotated training set, the patient notes and the
# feature definitions.
DATASET_PATH = 'dataset/original/train.csv'
PN_NOTES_PATH = 'dataset/original/patient_notes.csv'
FEATURES_PATH = 'dataset/original/features.csv'

# Output files: every subsampled artifact produced by this notebook is written
# to one of these paths.
SUBSAMPLED_TRAIN_PATH = 'dataset/subset_250/train.csv'
SUBSAMPLED_VALID_PATH = 'dataset/subset_250/valid.csv'
SUBSAMPLED_TEST_PATH = 'dataset/subset_250/test.csv'
SUBSAMPLED_FEATURES_PATH = 'dataset/subset_250/features.csv'
SUBSAMPLED_PN_NOTES_PATH = 'dataset/subset_250/patient_notes.csv'

# Values of the `case_num` column that the splitting cell iterates over.
CASE_NUMS = [0, 1, 2, 3, 4]
# Number of distinct patient notes (`pn_num` values) sampled per case for the
# validation, train and test splits respectively.
VALID_NUM_PNS_PER_CASE = 20 
TRAIN_NUM_PNS_PER_CASE = 60
TEST_NUM_PNS_PER_CASE = 20

In [3]:
# Build the validation, test and train subsets of the annotated dataset.
#
# For every case in CASE_NUMS a fixed number of patient notes (pn_num values)
# is sampled for each split. All rows belonging to a sampled note go to that
# split, so a note never appears in more than one split.

# Dedicated, seeded RNG so that Restart & Run All reproduces the same split.
SPLIT_SEED = 42
rng = random.Random(SPLIT_SEED)

df = pd.read_csv(DATASET_PATH)

# Keep only the rows of the configured cases.
df = df.loc[df['case_num'].isin(CASE_NUMS)]

df_valid_list = []
df_test_list = []
for case_num in CASE_NUMS:
    df_case = df.loc[df['case_num'] == case_num]
    pn_nums = list(df_case.pn_num.unique())
    # A single draw covers both held-out splits, which keeps the validation
    # and test notes disjoint. Raises ValueError if the case has too few notes.
    pn_nums = rng.sample(pn_nums, VALID_NUM_PNS_PER_CASE + TEST_NUM_PNS_PER_CASE)
    pn_nums_val = pn_nums[:VALID_NUM_PNS_PER_CASE]
    pn_nums_test = pn_nums[VALID_NUM_PNS_PER_CASE:]

    # Validation: collect the rows, then remove them from the remaining pool.
    is_valid = df['pn_num'].isin(pn_nums_val)
    df_valid_list.append(df.loc[is_valid])
    df = df.loc[~is_valid]

    # Test: collect the rows, then remove them from the remaining pool.
    is_test = df['pn_num'].isin(pn_nums_test)
    df_test_list.append(df.loc[is_test])
    df = df.loc[~is_test]

df_valid = pd.concat(df_valid_list, axis=0)
df_valid.to_csv(SUBSAMPLED_VALID_PATH, index=False)

df_test = pd.concat(df_test_list, axis=0)
df_test.to_csv(SUBSAMPLED_TEST_PATH, index=False)

# Train: sampled from what is left after the validation and test notes were
# removed, so it cannot overlap with either held-out split.
df_train_list = []
for case_num in CASE_NUMS:
    df_case = df.loc[df['case_num'] == case_num]
    pn_nums = rng.sample(list(df_case.pn_num.unique()), TRAIN_NUM_PNS_PER_CASE)
    df_train_list.append(df.loc[df['pn_num'].isin(pn_nums)])

df_train_subset = pd.concat(df_train_list)
df_train_subset.to_csv(SUBSAMPLED_TRAIN_PATH, index=False)

In [4]:

# Write the feature definitions of the selected cases to the subset directory.
# The filter uses CASE_NUMS (rather than a hard-coded upper bound) so that the
# features file always matches the cases used for the train/valid/test splits.
df_features = pd.read_csv(FEATURES_PATH)
df_features = df_features.loc[df_features['case_num'].isin(CASE_NUMS)]
df_features.to_csv(SUBSAMPLED_FEATURES_PATH, index=False)

In [5]:
# Reload the three splits from disk, in train / valid / test order.
df_train_subset, df_valid_subset, df_test_subset = (
    pd.read_csv(split_path)
    for split_path in (SUBSAMPLED_TRAIN_PATH, SUBSAMPLED_VALID_PATH, SUBSAMPLED_TEST_PATH)
)

# Distinct patient-note ids referenced by each split.
train_unique, valid_unique, test_unique = (
    split_frame['pn_num'].unique()
    for split_frame in (df_train_subset, df_valid_subset, df_test_subset)
)

# Every patient-note id that appears in any of the splits.
unique = np.concatenate([train_unique, valid_unique, test_unique])

# Keep only the patient notes that at least one split refers to.
df_pn_notes = pd.read_csv(PN_NOTES_PATH).loc[
    lambda notes: notes['pn_num'].isin(unique)
]
df_pn_notes.to_csv(SUBSAMPLED_PN_NOTES_PATH, index=False)