### Configuration

In [None]:
import os

# Number of train/valid/test splits to generate; each one is produced with
# its own random seed and written to its own numbered sub-folder.
num_bootstraps = 10
# Folder that receives one sub-folder per bootstrap for the full cohort.
bootstrap_folder = os.path.expanduser("~/dropbox/sts-data/bootstraps")
# CSV passed to the ml4cvd command line as --sample_csv.
sample_csv = os.path.expanduser("~/dropbox/sts-data/sts-mgh.csv")

In [None]:
import sys
import math
import numpy as np
import pandas as pd
from ml4cvd.arguments import parse_args
from ml4cvd.explorations import explore
from typing import List, Tuple, Union

def print_dataframe(df):
    """
    Print ``df`` in full, with no row, column, or cell truncation.

    Be careful when printing very large dataframes: nothing is elided.

    :param df: pandas object to print (a DataFrame or Series)
    """
    # ``None`` is the "no limit" value for each of these display options.
    # The older ``-1`` sentinel for ``display.max_colwidth`` is deprecated
    # and is rejected by recent pandas releases.
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', None,
        'display.max_colwidth', None,
    ):
        print(df)

### Load data

In [None]:
sys.argv = f"""
.
--tensors /storage/shared/ecg/mgh
--sample_csv {sample_csv}
--input_tensors
    ecg_patientid_clean_preop_newest
    ecg_age_preop_newest
    ecg_sex_preop_newest
    ecg_rate_md_preop_newest
    opcab
    opvalve
    opother
    status
    carshock
    sts_death
--explore_stratify_label
    sts_death
--output_folder /tmp
--id explore
""".split()
args = parse_args()
df = explore(args, disable_saving_output=True)

##### Quantiles are determined such that each unique combination of stratification labels has at least 3 patients (at least 1 per train/valid/test). These quantiles are tuned by manually adjusting and checking the resulting groups.

In [None]:
# Keep patients aged >= 21 and drop patients whose ECG is listed as bad.
# (Binning of continuous values happens in a later cell, not here.)
df['ecg_patientid_clean_preop_newest'] = df['ecg_patientid_clean_preop_newest'].astype(int)
df = df[df['ecg_age_preop_newest'] >= 21]
# The bad-ECG sheet is reduced to rows whose 'Problem' is not the literal
# string 'None'; the remaining rows are the ECGs to exclude.
bad = pd.read_csv(os.path.expanduser('~/dropbox/sts-data/mgh-bad-ecgs.csv'))
bad = bad[bad['Problem'] != 'None']
# Anti-join: the outer merge with indicator=True tags every row with its
# origin in '_merge'; keeping 'left_only' retains the patients in df that
# have no matching MRN among the bad ECGs. The merge also adds the columns
# of `bad` and '_merge' to df.
df = df.merge(bad, how='outer', left_on='ecg_patientid_clean_preop_newest', right_on='MRN', indicator=True)
df = df[df['_merge'] == 'left_only']

In [None]:
# Assemble the analysis table in one step: every output column is taken from
# its source column under a short name, and identifier / binary outcome
# columns are cast to int. The key order below is the output column order.
df = pd.DataFrame({
    'mrn': df['ecg_patientid_clean_preop_newest'].astype(int),
    'death': df['sts_death'].astype(int),
    'sex': df['ecg_sex_preop_newest_male'].astype(int),
    'age': df['ecg_age_preop_newest'],
    'hr': df['ecg_rate_md_preop_newest'],
    'opcab': df['opcab'],
    'opvalve': df['opvalve'],
    'opother': df['opother'],
    'status_elective': df['status_status_1'],
    'status_urgent': df['status_status_2'],
    'status_emergent': df['status_status_3'],
    'status_salvage': df['status_status_4'],
    'shock': df['carshock'],
})

# print_dataframe(df.groupby(['death', 'sex-male', 'age-quartile', 'heart-rate-tertile']).size())

In [None]:
# Number of quantile bins for each continuous stratification variable.
num_age_bins = 4
num_hr_bins = 3

# Bin on a copy so that `df` keeps the raw continuous values.
_df = df.copy()

# Replace each continuous column by its quantile bin, labelled 0..n-1, and
# keep the bin edges returned by qcut.
age_binned, age_bins = pd.qcut(
    _df['age'],
    num_age_bins,
    retbins=True,
    labels=list(range(num_age_bins)),
)
_df['age'] = age_binned

hr_binned, hr_bins = pd.qcut(
    _df['hr'],
    num_hr_bins,
    retbins=True,
    labels=list(range(num_hr_bins)),
)
_df['hr'] = hr_binned
# print_dataframe(_df.groupby(['death', 'age', 'opcab', 'opvalve', 'opother']).size())

### Stratify across train, valid, test splits

In [None]:
def train_valid_test_split(
    df: pd.DataFrame,
    stratify_by: Union[str, List[str]],
    test_ratio: float = 0.1,
    valid_ratio: float = 0.2,
    seed: Union[int, None] = None,
    combine_groups: Tuple[tuple, ...] = (
        (1, 1, 0.0, 0.0, 0.0),
        (1, 0, 0.0, 0.0, 0.0),
    ),
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split ``df`` into train, valid and test sets, stratified by ``stratify_by``.

    Rows are grouped by the stratification column(s) and every group is split
    on its own, so each group contributes at least one row to every split.
    From each group, ``int(test_ratio * size)`` rows (at least one) are drawn
    for test, then ``int(valid_ratio / (1 - test_ratio) * remaining)`` rows
    (at least one) for valid, and the rest go to train. The three splits are
    finally shuffled and re-indexed from 0.

    :param df: data to split
    :param stratify_by: column name, or list of column names, to stratify on
    :param test_ratio: fraction of each group assigned to the test split
    :param valid_ratio: fraction of each group assigned to the valid split
    :param seed: random state used for every sampling and shuffling step
    :param combine_groups: group keys (tuples of stratification values, in
        ``stratify_by`` order) that are pooled into one group before
        splitting, for groups too small to be split alone. The default keeps
        the keys this notebook has always pooled; pass ``()`` to disable.
    :return: ``(train_df, valid_df, test_df)``
    :raises ValueError: if the ratios are negative or leave no room for a
        training split, if ``df`` has no rows to split, if a group is too
        small to appear in all three splits, or if the splits do not add up
        to exactly the rows of ``df`` (for example because pandas drops rows
        whose stratification label is missing when grouping)
    """
    # The sum must stay strictly below 1: at exactly 1 every group would be
    # left without training rows, and test_ratio == 1 would divide by zero.
    if test_ratio < 0 or valid_ratio < 0 or test_ratio + valid_ratio >= 1:
        raise ValueError(
            f"test_ratio ({test_ratio}) and valid_ratio ({valid_ratio}) must be "
            f"non-negative and sum to less than 1"
        )
    # Valid rows are drawn from what remains after the test rows are removed.
    relative_valid_ratio = valid_ratio / (1 - test_ratio)

    pooled_keys = [tuple(key) for key in combine_groups]
    groups = []
    pooled = []
    # observed=True skips combinations of categorical labels with no rows.
    for name, group in df.groupby(stratify_by, observed=True):
        if group.empty:
            continue
        # Keys are tuples only when stratifying on several columns.
        if isinstance(name, tuple) and name in pooled_keys:
            pooled.append(group)
        else:
            groups.append(group)
    if pooled:
        groups.append(pd.concat(pooled))
    if not groups:
        raise ValueError("No rows to split")

    test_dfs = []
    valid_dfs = []
    train_dfs = []
    for group in groups:
        n_test = int(test_ratio * len(group)) or 1
        test_part = group.sample(n=n_test, replace=False, random_state=seed)
        remainder = group.drop(test_part.index)

        # One row each is still needed for valid and for train.
        if len(remainder) < 2:
            raise ValueError(
                f"Stratification group with {len(group)} row(s) is too small to "
                f"be represented in the train, valid and test splits"
            )
        n_valid = int(relative_valid_ratio * len(remainder)) or 1
        valid_part = remainder.sample(n=n_valid, replace=False, random_state=seed)
        train_part = remainder.drop(valid_part.index)
        if train_part.empty:
            raise ValueError(
                f"Stratification group with {len(group)} row(s) leaves no rows "
                f"for the train split"
            )

        test_dfs.append(test_part)
        valid_dfs.append(valid_part)
        train_dfs.append(train_part)

    test_df = pd.concat(test_dfs)
    valid_df = pd.concat(valid_dfs)
    train_df = pd.concat(train_dfs)

    # The splits together must reproduce the input exactly: no row lost,
    # none duplicated.
    cols = list(df.columns)
    recombined = pd.concat([train_df, valid_df, test_df]).sort_values(cols)
    if not df.sort_values(cols).equals(recombined):
        raise ValueError("Train, valid and test splits do not exactly match the original data")

    # Shuffle so that rows are no longer ordered by stratification group.
    train_df = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    valid_df = valid_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    return train_df, valid_df, test_df

### Stratify for bootstraps

In [None]:
# Generate one stratified split per bootstrap, seeded by the bootstrap number,
# and write it to <bootstrap_folder>/<i>/{train,valid,test}.csv.
for i in range(num_bootstraps):
    train, valid, test = train_valid_test_split(
        df=_df,
        stratify_by=['death', 'age', 'opcab', 'opvalve', 'opother'],
        test_ratio=0.1,
        valid_ratio=0.2,
        seed=i,
    )

    this_bootstrap = os.path.join(bootstrap_folder, str(i))
    os.makedirs(this_bootstrap, exist_ok=True)

    # Row count of this bootstrap, accumulated over its three splits.
    total = 0
    for csv_name, split_frame in (("train.csv", train), ("valid.csv", valid), ("test.csv", test)):
        split_frame.to_csv(os.path.join(this_bootstrap, csv_name), index=False)
        total += split_frame.shape[0]

# Reports the row count of the last bootstrap written.
print(f"All cases: N={total}")

### Save subcohort bootstraps

In [None]:
# Split names; each one is the stem of a "<split>.csv" file in a bootstrap folder.
splits = ['train', 'valid', 'test']
# (output folder, row-filter function) pairs, one per subcohort; filled in by
# the cells below and consumed by the export loop.
subcohorts = []

In [None]:
path_cabg = os.path.expanduser('~/dropbox/sts-data/bootstraps-cabg')


def filter_df(df):
    """Keep rows with opcab == 1, opvalve == 0 and opother == 0."""
    has_cab = df['opcab'] == 1
    no_valve = df['opvalve'] == 0
    no_other = df['opother'] == 0
    return df[has_cab & no_valve & no_other]


subcohorts.append((path_cabg, filter_df))

In [None]:
path_valve = os.path.expanduser('~/dropbox/sts-data/bootstraps-valve')


def filter_df(df):
    """Keep rows with opcab == 0, opvalve == 1 and opother == 0."""
    no_cab = df['opcab'] == 0
    has_valve = df['opvalve'] == 1
    no_other = df['opother'] == 0
    return df[no_cab & has_valve & no_other]


subcohorts.append((path_valve, filter_df))

In [None]:
path_cabg_valve = os.path.expanduser('~/dropbox/sts-data/bootstraps-cabg-valve')


def filter_df(df):
    """Keep rows with opcab == 1, opvalve == 1 and opother == 0."""
    has_cab = df['opcab'] == 1
    has_valve = df['opvalve'] == 1
    no_other = df['opother'] == 0
    return df[has_cab & has_valve & no_other]


subcohorts.append((path_cabg_valve, filter_df))

In [None]:
path_major = os.path.expanduser('~/dropbox/sts-data/bootstraps-major')


def filter_df(df):
    """
    Keep rows with opother == 0 whose (opcab, opvalve) pair is
    (1, 0), (0, 1) or (1, 1).
    """
    has_cab = df['opcab'] == 1
    no_cab = df['opcab'] == 0
    has_valve = df['opvalve'] == 1
    no_valve = df['opvalve'] == 0
    no_other = df['opother'] == 0

    cab_only = has_cab & no_valve & no_other
    valve_only = no_cab & has_valve & no_other
    cab_and_valve = has_cab & has_valve & no_other
    return df[cab_only | valve_only | cab_and_valve]


subcohorts.append((path_major, filter_df))

In [None]:
path_other = os.path.expanduser('~/dropbox/sts-data/bootstraps-other')


def filter_df(df):
    """Keep rows with opother == 1."""
    has_other = df['opother'] == 1
    return df[has_other]


subcohorts.append((path_other, filter_df))

In [None]:
path_office = os.path.expanduser('~/dropbox/sts-data/bootstraps-office')
# All three exclusions must hold at once. They are combined into a single
# filter because assigning `filter_df` three times in a row keeps only the
# last condition, which would leave emergent and salvage cases in the cohort.
filter_df = lambda df: df[
    (df['status_emergent'] == 0) &
    (df['status_salvage'] == 0) &
    (df['shock'] == 0)
]
subcohorts.append((path_office, filter_df))

In [None]:
# For every subcohort, filter each split of each full-cohort bootstrap and
# write the result to <path_subcohort>/<bootstrap>/<split>.csv, printing the
# number of deaths over the number of rows for every split written.
for path_subcohort, filter_df in subcohorts:
    os.makedirs(path_subcohort, exist_ok=True)

    # The cohort name is everything after the first '-' of the folder name,
    # so 'bootstraps-cabg-valve' is reported as 'cabg-valve', not 'cabg'.
    folder_name = os.path.basename(path_subcohort)
    cohort_name = folder_name.partition('-')[2] or folder_name
    print(f"------------- {cohort_name} -------------")

    # Defined up front so the summary below also works with zero bootstraps.
    total = 0

    for bootstrap in range(num_bootstraps):
        original_bootstrap_dir = os.path.join(bootstrap_folder, str(bootstrap))
        subcohort_bootstrap_dir = os.path.join(path_subcohort, str(bootstrap))
        os.makedirs(subcohort_bootstrap_dir, exist_ok=True)
        print(f"Bootstrap {bootstrap}")

        # Row count of this bootstrap, accumulated over its splits.
        total = 0

        for split in splits:
            original_split_csv = os.path.join(original_bootstrap_dir, f"{split}.csv")
            subcohort_split_csv = os.path.join(subcohort_bootstrap_dir, f"{split}.csv")

            original_split_df = pd.read_csv(original_split_csv)
            subcohort_split_df = filter_df(original_split_df)
            subcohort_split_df.to_csv(subcohort_split_csv, index=False)

            subcohort_split_died = len(subcohort_split_df[subcohort_split_df['death'] == 1])
            subcohort_split_total = len(subcohort_split_df)
            # A filter may leave a split empty; report nan instead of
            # failing with a division by zero.
            if subcohort_split_total:
                death_rate = subcohort_split_died / subcohort_split_total
            else:
                death_rate = float('nan')
            print(f"\t{subcohort_split_died:>5}/{subcohort_split_total:>5} = {death_rate:0.4f}")

            total += subcohort_split_df.shape[0]

    # Reports the row count of the last bootstrap processed.
    print(f"{path_subcohort} N={total}")

### Report distribution of each stratify label

In [None]:
def print_label_prevalence(train: pd.DataFrame, valid: pd.DataFrame, test: pd.DataFrame, label: str):
    """
    Print, for each split, the percentage of its rows taken by each value of ``label``.

    :param train: training split
    :param valid: validation split
    :param test: test split
    :param label: name of the column whose value distribution is reported
    """
    def _to_percent(counts):
        # Share of each count within its own split, as a percentage.
        return 100 * counts / float(counts.sum())

    # Stack the splits; the concat key becomes an ordinary 'split' column.
    stacked = pd.concat([train, valid, test], keys=['train', 'valid', 'test'])
    stacked = stacked.reset_index(0)
    stacked = stacked.rename({'level_0': 'split'}, axis=1)
    # Categorical with an explicit category order: train, valid, test.
    stacked['split'] = pd.Categorical(stacked['split'], ["train", "valid", "test"])

    counts_per_value = stacked.groupby([label, 'split']).size()
    # Level 1 of the index is the split, so percentages sum to 100 per split.
    percentages = counts_per_value.groupby(level=1).apply(_to_percent)
    print_dataframe(percentages)
    print()

In [None]:
# Columns whose distribution across train/valid/test is reported, using the
# splits of the last bootstrap generated above.
labels_to_report = [
    'death',
    'age',
    'opcab',
    'opvalve',
    'opother',
    'status_emergent',
    'status_salvage',
    'shock',
]
for label in labels_to_report:
    print_label_prevalence(train, valid, test, label)