In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Input: CSV of patient metadata that gets split below.
patient_csv = os.path.expanduser(
    "~/dropbox/sts-data/sts-intersect-ecg-age-sex-metadata-newest.csv"
)

# Output: root folder that receives one numbered sub-folder per split.
bootstrap_folder = os.path.expanduser(
    "~/dropbox/sts-data/bootstraps-intersect-ecg-age-sex-metadata-newest"
)

# Number of train/valid/test splits to generate.
num_bootstraps = 10

%matplotlib inline

In [None]:
# Load the full patient table; low_memory=False makes pandas read the file in
# one pass instead of chunk-wise, avoiding mixed-dtype columns.
df = pd.read_csv(filepath_or_buffer=patient_csv, low_memory=False)

In [None]:
# Bucket age into the right-closed intervals (0, 55], (55, 65], (65, 75] and
# (75, 500]; this column is used for stratified splitting and for the per-bin
# summary statistics. Ages outside (0, 500] or missing become NaN.
df['age_bins'] = pd.cut(df['age'], bins=[0, 55, 65, 75, 500], right=True)

In [None]:
# Display the frame (now including the age_bins column) as a visual sanity check.
df

In [None]:
def stats(df: pd.DataFrame, prefix_text: str) -> None:
    """Print death, sex and age-bin summary counts for one data split.

    Every printed line has the form ``<count> / <rows> (<percent>%)``.

    Args:
        df: Frame with ``death``, ``sex`` and ``age_bins`` columns. ``death``
            and ``sex`` are summed, so they are presumably 0/1 indicators
            with ``sex == 1`` meaning male -- TODO confirm against the data.
        prefix_text: Label prepended to every printed line (e.g. "train").

    Returns:
        None. The summary is written to stdout, followed by a blank line.

    For an empty frame the percentages are printed as ``nan`` instead of
    raising ``ZeroDivisionError``.
    """
    n_rows = df.shape[0]

    def percent_of_rows(count) -> float:
        # Guard the empty-frame case: a Python-int count divided by zero rows
        # would otherwise raise ZeroDivisionError.
        return count / n_rows * 100 if n_rows else float("nan")

    n_deaths = df['death'].sum()
    print(f"{prefix_text} deaths: {n_deaths} / {n_rows} ({percent_of_rows(n_deaths):.1f}%)")

    n_men = df['sex'].sum()
    print(f"{prefix_text} men: {n_men} / {n_rows} ({percent_of_rows(n_men):.1f}%)")

    # Named `age_bin` rather than `bin` so the builtin is not shadowed.
    for age_bin, count in df['age_bins'].value_counts().items():
        print(f"{prefix_text} age bin {age_bin}: n={count} / {n_rows} ({percent_of_rows(count):.1f}%)")
    print('')

In [None]:
# Generate `num_bootstraps` train/valid/test partitions of `df` (80/10/10),
# each stratified jointly on death, age bin and sex, print summary statistics
# for every partition and write it to its own numbered sub-folder of
# `bootstrap_folder`.
# NOTE(review): despite the "bootstrap" naming, these look like stratified
# random splits rather than bootstrap resamples -- confirm the intended
# terminology.
stratify_columns = ['death', 'age_bins', 'sex']

df_train_prior = None
df_valid_prior = None
df_test_prior = None

for i in range(num_bootstraps):
    # The loop index doubles as the random seed, so each iteration is
    # reproducible and differs from the others.
    df_train, df_valid_and_test = train_test_split(
        df,
        test_size=0.2,
        random_state=i,
        stratify=df[stratify_columns],
    )

    # Split the held-out 20% in half: 10% validation, 10% test.
    df_valid, df_test = train_test_split(
        df_valid_and_test,
        test_size=0.5,
        random_state=i,
        stratify=df_valid_and_test[stratify_columns],
    )

    splits = {"train": df_train, "valid": df_valid, "test": df_test}

    # The three splits together must account for every row of df.
    assert sum(len(split_df) for split_df in splits.values()) == len(df)

    print(f"====================== bootstrap {i+1} ======================")
    for split_name, split_df in splits.items():
        stats(df=split_df, prefix_text=split_name)

    # Ensure the new dataframe is not equal to the prior
    if df_train_prior is not None:
        assert not df_train_prior.equals(df_train)
    if df_valid_prior is not None:
        assert not df_valid_prior.equals(df_valid)
    if df_test_prior is not None:
        assert not df_test_prior.equals(df_test)

    df_train_prior = df_train
    df_valid_prior = df_valid
    df_test_prior = df_test

    # exist_ok=True makes the cell safe to re-run without a separate isdir check.
    split_dir = f"{bootstrap_folder}/{i}"
    os.makedirs(split_dir, exist_ok=True)

    for split_name, split_df in splits.items():
        split_df.to_csv(f"{split_dir}/{split_name}.csv", index=False)