### Configuration

In [None]:
import os

# Number of train/valid/test resamplings to generate.
num_bootstraps = 10

# Folder that receives one numbered subfolder per bootstrap.
bootstrap_folder = os.path.expanduser(
    "~/dropbox/sts-data/bootstraps-intersect-ecg-age-sex-metadata-newest",
)

# CSV of patients to split.
patient_csv = os.path.expanduser(
    "~/dropbox/sts-data/sts-intersect-ecg-age-sex-metadata-newest.csv",
)

In [None]:
import sys
import math
import numpy as np
import pandas as pd
from ml4c3.arguments import parse_args
from ml4c3.explorations import explore
from typing import List, Tuple, Union
from collections import defaultdict

def print_dataframe(df):
    """
    Print `df` in full, with pandas' row, column, width and cell-width
    display limits lifted for the duration of the call.

    Be careful when printing very large dataframes: nothing is truncated.

    :param df: object to print (a dataframe, series, or anything printable)
    """
    # `None` is the documented "no limit" value for each of these options.
    # The previous `-1` for max_colwidth is a deprecated spelling that newer
    # pandas releases reject.
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', None,
        'display.max_colwidth', None,
    ):
        print(df)

### Load data

In [None]:
# Get dataframe of surgeries with preop ECGs
# parse_args() is called without arguments, so the command line it should see
# is installed into sys.argv first (whitespace-split into tokens).
sys.argv = f"""
. explore
--tensors /storage/shared/ecg/mgh
--patient_csv {patient_csv}
--input_tensors
    ecg_patientid_clean_preop_newest
    age_with_preop_ecg_newest
    gender_with_preop_ecg_newest
    opcab_with_preop_ecg_newest
    opvalve_with_preop_ecg_newest
    opother_with_preop_ecg_newest
    status_with_preop_ecg_newest
    predmort_with_preop_ecg_newest
    sts_death_with_preop_ecg_newest
--output_folder /tmp/explore
""".split()
args = parse_args()
df = explore(args, disable_saving_output=True)
# Rows without a patient id cannot be matched to a patient; drop them.
df = df.dropna(subset=['ecg_patientid_clean_preop_newest'])

# Isolate patients >= 21 and exclude bad ECGs
df['ecg_patientid_clean_preop_newest'] = df['ecg_patientid_clean_preop_newest'].astype(int)
df = df[df['age_with_preop_ecg_newest'] >= 21]
bad = pd.read_csv(os.path.expanduser('~/dropbox/sts-data/mgh-bad-ecgs.csv'))
# NOTE(review): this keeps rows whose 'Problem' is not the literal string 'None'.
# Some pandas versions may parse 'None' as NaN on read, in which case every row
# would pass this filter — confirm against the pandas version in use.
bad = bad[bad['Problem'] != 'None']
# Anti-join: after the merge, '_merge' == 'left_only' marks rows of `df` whose
# id has no match in the bad-ECG list.
df = df.merge(bad, how='outer', left_on='ecg_patientid_clean_preop_newest', right_on='MRN', indicator=True)
# Explicit copy so the column assignments below act on an independent frame
# instead of a filtered slice (avoids chained-assignment warnings).
df = df[df['_merge'] == 'left_only'].copy()

# Clean columns
df['mrn'] = df['ecg_patientid_clean_preop_newest'].astype(int)
df['death'] = df['sts_death_with_preop_ecg_newest'].astype(int)
df['age'] = df['age_with_preop_ecg_newest']
df['sex'] = df['gender_with_preop_ecg_newest'].astype(int)
df['opcab'] = df['opcab_with_preop_ecg_newest'].astype(int)
df['opvalve'] = df['opvalve_with_preop_ecg_newest'].astype(int)
df['opother'] = df['opother_with_preop_ecg_newest'].astype(int)
# The status tensor arrives as one indicator column per status level.
df['status_elective'] = df['status_with_preop_ecg_newest_status_1'].astype(int)
df['status_urgent'] = df['status_with_preop_ecg_newest_status_2'].astype(int)
df['status_emergent'] = df['status_with_preop_ecg_newest_status_3'].astype(int)
df['status_salvage'] = df['status_with_preop_ecg_newest_status_4'].astype(int)
df['predmort'] = df['predmort_with_preop_ecg_newest']
# Major = (CAB or valve) and not "other". `~` is a bitwise NOT on integers;
# ANDed with a 0/1 value it yields 1 only where opother is 0.
# Assumes these three columns hold only 0/1 — TODO confirm.
df['opmajor'] = (df['opcab'] | df['opvalve']) & ~df['opother']
df['preop_ecg'] = 1
df = df[[
    'mrn',
    'death',
    'age',
    'sex',
    'opmajor',
    'opcab',
    'opvalve',
    'opother',
    'status_elective',
    'status_urgent',
    'status_emergent',
    'status_salvage',
    'predmort',
    'preop_ecg',
]]
df_ecg = df.sort_values('mrn').reset_index(drop=True)

In [None]:
# Get dataframe of surgeries with no preop ECGs, using the newest surgery for patients with multiple surgeries
d = pd.read_csv(patient_csv)
# Sort so that keep='last' retains the last surgery per patient.
# NOTE(review): assumes 'surgdt' sorts chronologically as stored (no date
# parsing is requested from read_csv here) — TODO confirm.
d = d.sort_values(['medrecn', 'surgdt'])
d = d.drop_duplicates('medrecn', keep='last').reset_index(drop=True)
# Keep only patients absent from the ECG cohort `df`.
# NOTE(review): patients dropped from `df` by the age or bad-ECG filters are
# also absent from df['mrn'], so they land here with preop_ecg = 0, and no
# age filter is applied to `d` — confirm this is intended.
# Explicit copy so the column assignments below act on an independent frame
# instead of a filtered slice (avoids chained-assignment warnings).
d = d[~d['medrecn'].isin(df['mrn'])].copy()

# Clean columns
d['mrn'] = d['medrecn'].astype(int)
d['death'] = d['mtopd']
d['sex'] = (d['gender'] == 1).astype(int)
d['opcab'] = d['opcab'].astype(int)
d['opvalve'] = d['opvalve'].astype(int)
d['opother'] = d['opother'].astype(int)
# Major = (CAB or valve) and not "other". `~` is a bitwise NOT on integers;
# ANDed with a 0/1 value it yields 1 only where opother is 0.
# Assumes these three columns hold only 0/1 — TODO confirm.
d['opmajor'] = (d['opcab'] | d['opvalve']) & ~d['opother']
# Expand the single 'status' code into one indicator column per level.
d['status_elective'] = (d['status'] == 1).astype(int)
d['status_urgent'] = (d['status'] == 2).astype(int)
d['status_emergent'] = (d['status'] == 3).astype(int)
d['status_salvage'] = (d['status'] == 4).astype(int)
d['preop_ecg'] = 0
# 'age' and 'predmort' are taken from the CSV unchanged.
d = d[[
    'mrn',
    'death',
    'age',
    'sex',
    'opmajor',
    'opcab',
    'opvalve',
    'opother',
    'status_elective',
    'status_urgent',
    'status_emergent',
    'status_salvage',
    'predmort',
    'preop_ecg',
]]

# Create dataframe of all surgeries
df_all = pd.concat([df, d]).sort_values('mrn').reset_index(drop=True)

In [None]:
# Replace continuous age with quantile-bin labels 0..num_age_bins-1 on a copy,
# leaving `df_all` untouched. `age_bins` receives the bin edges.
num_age_bins = 4
age_labels = [*range(num_age_bins)]
_df_all = df_all.copy()
_df_all['age'], age_bins = pd.qcut(
    _df_all['age'],
    q=num_age_bins,
    labels=age_labels,
    retbins=True,
)

### Stratify across train, valid, test splits

In [None]:
def train_valid_test_split(
    df: pd.DataFrame,
    stratify_by: Union[str, List[str]],
    test_ratio: float = 0.1,
    valid_ratio: float = 0.2,
    seed: Union[int, None] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split `df` into train, valid and test frames, stratified by `stratify_by`.

    Rows are grouped by the stratification column(s) and every group is
    sampled separately, so each group contributes at least one row to each
    split. Groups with fewer than 3 rows cannot do that on their own:

    * If the first key equals 1 ("died") and the 4th-6th keys match one of
      the hard-coded (major, elective, preop ecg) patterns, the group is
      merged with the other small groups of the same pattern and the merged
      group is split instead. These patterns only apply when `stratify_by`
      lists at least six keys in that order.
    * Any other small group has no rule. Its key and size are printed and
      the function fails with an AssertionError, since dropping the rows
      would make the splits incomplete.

    Rows are removed from a group by index label, so `df` is expected to
    have a unique index.

    :param df: frame to split
    :param stratify_by: column name or list of column names to stratify on
    :param test_ratio: fraction of each group sampled into the test split
    :param valid_ratio: fraction of each group (relative to the whole group)
                        sampled into the validation split
    :param seed: random state for sampling and the final shuffle
    :return: (train, valid, test) frames, each shuffled and re-indexed from 0
    :raises AssertionError: if the ratios exceed 1, a small group has no
                            custom grouping, a group is missing from a split,
                            or the splits do not reproduce `df` exactly
    """
    assert not test_ratio + valid_ratio > 1, (
        f"test_ratio ({test_ratio}) + valid_ratio ({valid_ratio}) must not exceed 1"
    )
    # Validation rows are drawn after the test rows were removed, so the
    # ratio is rescaled to the remaining fraction.
    relative_valid_ratio = valid_ratio / (1 - test_ratio)

    # Patterns of keys 4-6 (opmajor, status_elective, preop_ecg) for which
    # small "died" groups are pooled, mapped to the pooled group's label.
    small_died_groups = {
        (1, 1, 1): "died, major, elective, preop ecg",
        (1, 1, 0): "died, major, elective, no ecg",
        (0, 1, 0): "died, other, elective, no ecg",
    }

    test_dfs = []
    valid_dfs = []
    train_dfs = []

    combine = defaultdict(list)
    groups = []
    unassigned = []
    for name, group in df.groupby(stratify_by):
        tot = len(group)
        if tot == 0:
            continue

        if tot >= 3:
            groups.append(group)
            continue

        # custom groupings
        # A single stratification column can yield a scalar key; wrap it so
        # the positional checks below work for any number of keys.
        key = name if isinstance(name, tuple) else (name,)
        combine_name = None
        if key[0] == 1:  # died
            combine_name = small_died_groups.get(key[3:])

        if combine_name is None:
            # No rule for this small group: report its key and size.
            print(name)
            print(tot)
            unassigned.append(name)
        else:
            combine[combine_name].append(group)

    assert not unassigned, (
        f"{len(unassigned)} group(s) with fewer than 3 rows have no custom "
        f"grouping and cannot be split: {unassigned}"
    )

    for combine_name, combine_dfs in combine.items():
        combined = pd.concat(combine_dfs)
        # Reported only for seed 0.
        if seed == 0:
            print(f"Combined {len(combine_dfs)} {combine_name} groups into 1 group of size {len(combined)}")
        groups.append(combined)

    for group in groups:
        # `or 1` guarantees at least one row even when the ratio rounds to 0.
        n_test = int(test_ratio * len(group)) or 1
        test_df = group.sample(n=n_test, replace=False, random_state=seed)
        remainder = group.drop(test_df.index)

        n_valid = int(relative_valid_ratio * len(remainder)) or 1
        valid_df = remainder.sample(n=n_valid, replace=False, random_state=seed)
        train_df = remainder.drop(valid_df.index)

        # Assert that group is represented in all splits
        assert len(test_df) != 0, "group has no test rows"
        assert len(valid_df) != 0, "group has no validation rows"
        assert len(train_df) != 0, "group has no training rows"
        test_dfs.append(test_df)
        valid_dfs.append(valid_df)
        train_dfs.append(train_df)

    test_df = pd.concat(test_dfs)
    valid_df = pd.concat(valid_dfs)
    train_df = pd.concat(train_dfs)

    # Assert that split data exactly matches original data
    cols = list(df.columns)
    recombined = pd.concat([train_df, valid_df, test_df])
    assert df.sort_values(cols).equals(recombined.sort_values(cols)), (
        "train/valid/test splits do not reproduce the input frame exactly"
    )

    # Shuffle each split so rows are no longer ordered by group.
    train_df = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    valid_df = valid_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    return train_df, valid_df, test_df

In [None]:
# Load the patient table directly from `patient_csv`.
# NOTE(review): this rebinds `_df_all`, replacing the frame built from `df_all`
# (with quantile-binned 'age') in the earlier cell. The bootstrap cell below
# stratifies on whatever 'death', 'age' and 'sex' columns this CSV holds —
# confirm the overwrite is intended.
_df_all = pd.read_csv(patient_csv, low_memory=False)
# Bare expression: shown as the notebook cell's output.
_df_all

### Stratify for bootstraps

In [None]:
# Write one train/valid/test CSV triple per bootstrap; the bootstrap index
# doubles as the random seed so each bootstrap is reproducible.
for i in range(num_bootstraps):
    total = 0
    train, valid, test = train_valid_test_split(
        df=_df_all,
        stratify_by=[
            'death',
            'age',
            'sex',
            # Further stratification keys, currently disabled:
            # 'opmajor',
            # 'status_elective',
            # 'preop_ecg',
        ],
        test_ratio=0.1,
        valid_ratio=0.1,
        seed=i,
    )
    this_bootstrap = os.path.join(bootstrap_folder, str(i))
    os.makedirs(this_bootstrap, exist_ok=True)

    for _split_name, _split_df in (("train", train), ("valid", valid), ("test", test)):
        _split_df.to_csv(os.path.join(this_bootstrap, f"{_split_name}.csv"), index=False)
        total += _split_df.shape[0]

# Row count summed over the three splits of the last bootstrap written.
print(f"All cases: N={total}")

### Save subcohort bootstraps

In [None]:
def filter_df(df, lambda_func):
    """Return the rows of `df` selected by the boolean mask `lambda_func(df)`."""
    mask = lambda_func(df)
    return df[mask]


def ecg(df):
    """Mask of rows with a preop ECG."""
    return df['preop_ecg'] == 1


def major(df):
    """Mask of rows flagged as major surgery."""
    return df['opmajor'] == 1


def other(df):
    """Mask of rows not flagged as major surgery."""
    return df['opmajor'] == 0 # opmajor calculated above


def elective(df):
    """Mask of rows with elective status."""
    return df['status_elective'] == 1


# Subcohort folders produced from these masks:
# bootstraps-all (above)
# bootstraps-major
# bootstraps-major-elective
# bootstraps-other
# bootstraps-other-elective
# bootstraps-ecg
# bootstraps-ecg-major
# bootstraps-ecg-major-elective
# bootstraps-ecg-other
# bootstraps-ecg-other-elective

In [None]:
# Base names of the per-bootstrap CSV files.
splits = ['train', 'valid', 'test']
# (output folder, filter function) pairs, filled in by the following cells.
# Starts empty; the original line left the list literal unterminated.
subcohorts = []

In [None]:
# Subcohort: major surgeries.
def lambda_func_major(df):
    """Mask of major-surgery rows."""
    return major(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-major')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_major)))

In [None]:
# Subcohort: major, elective surgeries.
def lambda_func_major_elective(df):
    """Mask of rows that are both major and elective."""
    return major(df) & elective(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-major-elective')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_major_elective)))

In [None]:
# Subcohort: non-major ("other") surgeries.
def lambda_func_other(df):
    """Mask of rows not flagged as major surgery."""
    return other(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-other')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_other)))

In [None]:
# Subcohort: non-major, elective surgeries.
def lambda_func_other_elective(df):
    """Mask of rows that are both non-major and elective."""
    return other(df) & elective(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-other-elective')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_other_elective)))

In [None]:
# Subcohort: surgeries with a preop ECG.
def lambda_func_ecg(df):
    """Mask of rows with a preop ECG."""
    return ecg(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-ecg')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_ecg)))

In [None]:
# Subcohort: major surgeries with a preop ECG.
def lambda_func_ecg_major(df):
    """Mask of rows that are major and have a preop ECG."""
    return major(df) & ecg(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-ecg-major')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_ecg_major)))

In [None]:
# Subcohort: major, elective surgeries with a preop ECG.
def lambda_func_ecg_major_elective(df):
    """Mask of rows that are major, elective and have a preop ECG."""
    return major(df) & elective(df) & ecg(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-ecg-major-elective')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_ecg_major_elective)))

In [None]:
# Subcohort: non-major surgeries with a preop ECG.
def lambda_func_ecg_other(df):
    """Mask of rows that are non-major and have a preop ECG."""
    return other(df) & ecg(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-ecg-other')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_ecg_other)))

In [None]:
# Subcohort: non-major, elective surgeries with a preop ECG.
def lambda_func_ecg_other_elective(df):
    """Mask of rows that are non-major, elective and have a preop ECG."""
    return other(df) & elective(df) & ecg(df)


path = os.path.expanduser('~/dropbox/sts-data/bootstraps-ecg-other-elective')
subcohorts.append((path, lambda df: filter_df(df, lambda_func_ecg_other_elective)))

In [None]:
# For every subcohort, filter each split of each bootstrap written earlier and
# save the filtered split under the subcohort's own folder, printing
# died / total per split.
for path_subcohort, subcohort_filter in subcohorts:
    os.makedirs(path_subcohort, exist_ok=True)

    # Header shows the folder name without its "bootstraps-" prefix. The
    # previous fixed 15-character slice did not match that prefix length and
    # cut into (or removed) the cohort name.
    subcohort_name = os.path.basename(path_subcohort)
    if subcohort_name.startswith("bootstraps-"):
        subcohort_name = subcohort_name[len("bootstraps-"):]
    print(f"------------- {subcohort_name} -------------")

    for bootstrap in range(num_bootstraps):
        original_bootstrap_dir = os.path.join(bootstrap_folder, str(bootstrap))
        subcohort_bootstrap_dir = os.path.join(path_subcohort, str(bootstrap))
        os.makedirs(subcohort_bootstrap_dir, exist_ok=True)
        print(f"Bootstrap {bootstrap}")

        total = 0

        for split in splits:
            original_split_csv = os.path.join(original_bootstrap_dir, f"{split}.csv")
            subcohort_split_csv = os.path.join(subcohort_bootstrap_dir, f"{split}.csv")

            original_split_df = pd.read_csv(original_split_csv)
            subcohort_split_df = subcohort_filter(original_split_df)
            subcohort_split_df.to_csv(subcohort_split_csv, index=False)

            subcohort_split_died = len(subcohort_split_df[subcohort_split_df['death'] == 1])
            subcohort_split_total = len(subcohort_split_df)
            # An empty subcohort split has no death rate; report nan instead
            # of dividing by zero.
            if subcohort_split_total:
                death_rate = subcohort_split_died / subcohort_split_total
            else:
                death_rate = float("nan")
            print(f"\t{subcohort_split_died:>5}/{subcohort_split_total:>5} = {death_rate:0.4f}")

            total += subcohort_split_df.shape[0]

    # Row count summed over the splits of the last bootstrap processed.
    print(f"{path_subcohort} N={total}")

### Report distribution of each stratify label

In [None]:
def print_label_prevalence(train: pd.DataFrame, valid: pd.DataFrame, test: pd.DataFrame, label: str):
    """
    Print, for each split, the percentage of rows taking each value of `label`.

    :param train: training split
    :param valid: validation split
    :param test: test split
    :param label: column whose value distribution is reported
    """
    # Stack the splits and turn the outer index level (the split key) into a
    # regular 'split' column.
    stacked = pd.concat([train, valid, test], keys=['train', 'valid', 'test'])
    stacked = stacked.reset_index(0)
    stacked = stacked.rename({'level_0': 'split'}, axis=1)
    # Fixed category order: train, valid, test.
    stacked['split'] = pd.Categorical(stacked['split'], ["train", "valid", "test"])

    counts = stacked.groupby([label, 'split']).size()

    def to_percent(split_counts):
        # Share of each label value within one split, in percent.
        return 100 * split_counts / float(split_counts.sum())

    # Index level 1 is 'split', so percentages are computed per split.
    per_split = counts.groupby(level=1).apply(to_percent)
    print_dataframe(per_split)
    print()

In [None]:
# Report the per-split distribution of each label, using the `train`, `valid`
# and `test` frames currently in scope.
_prevalence_labels = ['death', 'age', 'sex', 'opmajor', 'status_elective', 'preop_ecg']
for label in _prevalence_labels:
    print_label_prevalence(train, valid, test, label)