In [38]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the preprocessed cohort CSV and preview its first rows.
# NOTE(review): `<your_base_path>` looks like a placeholder that must be replaced before running.
preprocessed_cohort_path = '<your_base_path>/off_policy_policy_evaluation/datasets/mimic_iii/preprocessed_cohort/sepsis_final_data_RAW_withTimes_90_day_death_window.csv'
preprocessed_cohort = pd.read_csv(filepath_or_buffer=preprocessed_cohort_path)
preprocessed_cohort.head(n=5)

# 30 stratified splits using trajectory length and survival outcome
### We will save each split in a CSV file with the following columns:
- traj
- survived
- traj_length
- split (train, val, test)

### First, compute the length and the survival outcome of each trajectory
### Next, compute the split
### Finally, save the split in a CSV file
### Repeat the process 30 times

In [3]:
# Trajectory length = largest `step` value of the trajectory plus one
# (steps are 0-based, so the maximum index is one less than the count).
traj_step_counts = (
    preprocessed_cohort
    .groupby('traj')['step']
    .max()
    .add(1)
    .rename('traj_length')
)
traj_step_counts.head(n=5)

In [19]:
# Spot check on a single trajectory (traj == 1): position of its first
# non-zero reward within the trajectory.
np.flatnonzero(preprocessed_cohort.loc[preprocessed_cohort['traj'] == 1, 'r:reward'].to_numpy())[0]

In [40]:
def _first_nonzero_reward(rewards):
    """Return the first non-zero entry of ``rewards``, or 0.0 if there is none.

    Parameters
    ----------
    rewards : pd.Series
        The ``r:reward`` values of one trajectory, in row order.

    Returns
    -------
    scalar
        The first non-zero reward. Falling back to 0.0 (instead of raising an
        IndexError inside the groupby) lets the assertion below report which
        trajectories are affected.
    """
    nonzero_positions = rewards.to_numpy().nonzero()[0]
    if nonzero_positions.size == 0:
        return 0.0
    return rewards.iloc[nonzero_positions[0]]


# One value per trajectory: its first non-zero reward.
survived = (
    preprocessed_cohort
    .groupby('traj')['r:reward']
    .aggregate(_first_nonzero_reward)
    .rename('survived')
)
# Every trajectory must carry a non-zero reward; list offenders if not.
assert (survived != 0).all(), (
    'Trajectories without a non-zero reward (first 10): '
    f'{survived.index[survived == 0].tolist()[:10]}'
)
survived.head()

In [44]:
# One row per trajectory: join the outcome reward with the trajectory length,
# then turn the reward into a boolean flag (True where the reward is close to 1.0).
traj_df = (
    survived
    .to_frame()
    .join(traj_step_counts.to_frame(), on='traj')
    .assign(survived=lambda frame: np.isclose(frame['survived'], 1.0))
)
traj_df.head(n=5)

In [49]:
# Stratification label combining both criteria: '<traj_length>_<survived>'.
traj_df['strat_col'] = [
    f'{length}_{outcome}'
    for length, outcome in zip(traj_df['traj_length'], traj_df['survived'])
]
traj_df.head(n=5)

In [45]:
# Attach the per-trajectory columns of traj_df to every row of the cohort.
# how='left' keeps every cohort row, whether or not it finds a match in traj_df.
test = pd.merge(preprocessed_cohort, traj_df, how='left', on='traj')
test.head(n=5)

In [57]:
# True if the merged frame contains at least one missing value anywhere.
test.isna().any(axis=None)

In [58]:
import math

# Generate `num_splits` stratified train/val/test partitions of the trajectories
# and write each one to `split_<i>.csv` under `split_save_path`.
train_split = 0.7
val_split = 0.1
test_split = 0.2
num_splits = 30
# Base seed: split i uses random_state=split_seed + i, so re-running this cell
# regenerates exactly the same split files.
split_seed = 0

# train_split is implied by the other two fractions; make sure they are consistent.
assert math.isclose(train_split + val_split + test_split, 1.0), 'Split fractions must sum to 1'

split_save_path = '<your_base_path>/off_policy_policy_evaluation/datasets/mimic_iii/stratified_splits/'
os.makedirs(split_save_path, exist_ok=True)

num_traj = traj_df.shape[0]
# Absolute sizes (rounded up), so that val and test are fractions of the full
# set of trajectories rather than of whatever remains after the first split.
num_test = math.ceil(num_traj * test_split)
num_val = math.ceil(num_traj * val_split)

for i in range(num_splits):
    # First carve out the test set, then split the remainder into train and val;
    # both steps are stratified on the combined length/survival label.
    remainder_data, test_data = train_test_split(
        traj_df,
        test_size=num_test,
        stratify=traj_df['strat_col'],
        random_state=split_seed + i,
    )
    train_data, val_data = train_test_split(
        remainder_data,
        test_size=num_val,
        stratify=remainder_data['strat_col'],
        random_state=split_seed + i,
    )

    # reset_index turns the 'traj' index into a regular column of the output.
    train_data = train_data.reset_index().assign(split='train')
    val_data = val_data.reset_index().assign(split='val')
    test_data = test_data.reset_index().assign(split='test')

    # strat_col is only a helper for stratification and is not saved.
    data = pd.concat([train_data, val_data, test_data]).drop(columns='strat_col')
    assert (data['split'] == 'train').sum() + (data['split'] == 'val').sum() + (data['split'] == 'test').sum() == data.shape[0] == traj_step_counts.shape[0], 'Error in split'

    # Write the split and read it back to verify the file round-trips unchanged.
    split_file = os.path.join(split_save_path, f'split_{i}.csv')
    data.to_csv(split_file)
    loaded_data = pd.read_csv(split_file, index_col=0)
    assert data.equals(loaded_data), f'Error in saving dataframe {i}'

In [59]:
# Preview the most recently generated split (`data` is left over from the final loop iteration).
data.head(n=5)

In [60]:
# Survival label counts within the train partition of the last split.
data['survived'][data['split'] == 'train'].value_counts()

In [61]:
# Survival label counts within the val partition of the last split.
data['survived'][data['split'] == 'val'].value_counts()

In [62]:
# Survival label counts within the test partition of the last split.
data['survived'][data['split'] == 'test'].value_counts()

In [63]:
# Compare the realised fractions of the last generated split with the
# configured targets, allowing a 1% relative tolerance.
num_trajectories = traj_step_counts.shape[0]
train_fraction = train_data.shape[0] / num_trajectories
val_fraction = val_data.shape[0] / num_trajectories
test_fraction = test_data.shape[0] / num_trajectories

assert math.isclose(train_fraction, train_split, rel_tol=0.01), f'Error in train split: {train_fraction}'
assert math.isclose(val_fraction, val_split, rel_tol=0.01), f'Error in val split: {val_fraction}'
assert math.isclose(test_fraction, test_split, rel_tol=0.01), f'Error in test split: {test_fraction}'

In [64]:
# Final look at the first rows of the last generated split.
data.head(n=5)