# Load Packages

In [None]:
# Important libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import joblib

# Dataset label; namespaces both the processed-data and the checkpoint folders.
data_label = "mimic"
# Seed value; in this notebook it names the per-seed checkpoint sub-folders.
seed = 2023

# File paths (all relative to the notebook's working directory)
fp_notebooks_folder = "../"
fp_code_folder = os.path.join(fp_notebooks_folder, "../")
# Use data_label rather than a second hard-coded "mimic" so the input data
# folder and the checkpoint folder can never drift apart.
fp_processed_folder = os.path.join(fp_code_folder, "processed_data", data_label)
fp_downsampled_folder = os.path.join(fp_processed_folder, "downsampled")
fp_downsampled_dropna_file = os.path.join(fp_downsampled_folder, "dropna.csv")
# NOTE(review): despite the "downsampled" name this resolves under the
# processed folder, not the downsampled one - confirm the intended location.
fp_downsampled_scaler_file = os.path.join(fp_processed_folder, "scaler.pkl")

# Checkpoint tree: one sub-folder per artefact category under
# <code folder>/checkpoints/<data_label>.
fp_project_checkpoints = os.path.join(fp_code_folder, "checkpoints", data_label)
fp_tuning = os.path.join(fp_project_checkpoints, "tuning")
fp_project_models = os.path.join(fp_project_checkpoints, "models")
fp_project_predictions = os.path.join(fp_project_checkpoints, "predictions")
fp_project_pi_predictions = os.path.join(fp_project_checkpoints, "pi_predictions")
fp_project_model_evaluations = os.path.join(fp_project_checkpoints, "model_evaluation")
fp_project_consolidated_results = os.path.join(fp_project_checkpoints, "consolidated_results")
fp_time_log = os.path.join(fp_project_consolidated_results, "runtime.log")

def create_folder(fp):
    """Create the directory ``fp`` (including missing parents) if it is absent.

    Args:
        fp: Path of the directory to create.

    Returns:
        True if the directory was created by this call, False if something
        already existed at ``fp`` and nothing was done.
    """
    if os.path.exists(fp):
        # Explicitly return False; a bare `False` expression would yield None.
        return False
    os.makedirs(fp)
    return True

def create_all_seed_folders(cur_seed):
    """Create the ``<category folder>/<cur_seed>`` directory for each checkpoint category.

    The categories are the models, tuning, predictions, model-evaluation and
    pi_predictions folders. A confirmation message is printed at the end.

    Args:
        cur_seed: Seed value; its string form is used as the sub-folder name.
    """
    seed_dirname = str(cur_seed)
    per_seed_roots = (
        fp_project_models,
        fp_tuning,
        fp_project_predictions,
        fp_project_model_evaluations,
        fp_project_pi_predictions,
    )
    for root in per_seed_roots:
        create_folder(os.path.join(root, seed_dirname))
    print(f"All folders created for seed = {cur_seed}!")

# Batch size constant; not referenced in the visible part of this notebook.
batch_size = 64

# Create the per-seed checkpoint folders, then the consolidated-results
# folder (the location that fp_time_log points into).
create_all_seed_folders(seed)
create_folder(fp_project_consolidated_results)

# Check GPU is available (disabled; `tf` is not among the imports shown above)
# print(tf.config.list_physical_devices('GPU'))

# function to show df
def display_df(df):
    """Render the first rows of ``df`` with ``display`` and print its shape."""
    preview = df.head()
    display(preview)
    print("Shape:", df.shape)

# Load Data

In [None]:
# Load dropna.csv from the downsampled folder; the first CSV column becomes
# the DataFrame index.
df = pd.read_csv(fp_downsampled_dropna_file, index_col=0)
df

In [None]:
# Sum of the `train` flag column; this equals the number of training rows
# if the flags are boolean (the column is used as a row mask in the split).
df["train"].sum()

In [None]:
# Sum of the `valid` flag column; this equals the number of validation rows
# if the flags are boolean (the column is used as a row mask in the split).
df["valid"].sum()

In [None]:
# Sum of the `test` flag column; this equals the number of test rows
# if the flags are boolean (the column is used as a row mask in the split).
df["test"].sum()

In [None]:
# Take the names of the first 60 columns as the predictor list.
# NOTE(review): 60 is a positional magic number tied to the CSV column
# order - confirm it still matches the file layout.
predictors = df.columns[:60].to_list()
print(predictors)

In [None]:
# Target columns, grouped by the "PredMin<k>" marker found in the column name
# (presumably one group per prediction horizon - confirm against the data).
# NOTE(review): this is a substring match, so "PredMin1" would also match a
# column containing "PredMin10" if one ever exists.
pred_cols_1 = [name for name in df.columns if "PredMin1" in name]
pred_cols_2 = [name for name in df.columns if "PredMin2" in name]
pred_cols_3 = [name for name in df.columns if "PredMin3" in name]
print(pred_cols_1, pred_cols_2, pred_cols_3, sep="\n")

In [None]:
# Make train, validation and test sets
def train_valid_test_split(df, pred_cols):
    """Split ``df`` by its flag columns and plot the target distributions.

    Args:
        df: DataFrame with ``train``, ``valid`` and ``test`` columns usable as
            boolean row masks, plus every column named in ``pred_cols``.
        pred_cols: Names of the target columns to plot, one subplot row per
            column. May be empty, in which case nothing is plotted.

    Returns:
        Tuple ``(df_train, df_valid, df_test)`` holding the rows selected by
        each flag column. The split itself does not depend on ``pred_cols``.
    """
    df_train, df_valid, df_test = df[df["train"]], df[df["valid"]], df[df["test"]]
    num_pred_cols = len(pred_cols)

    # Nothing to plot; a zero-row subplot grid cannot be created.
    if num_pred_cols == 0:
        return df_train, df_valid, df_test

    # squeeze=False keeps `axes` two-dimensional even for a single target
    # column, so the axes[i, j] indexing below works for any row count.
    _, axes = plt.subplots(
        num_pred_cols, 3, figsize=(10, 2 * num_pred_cols), squeeze=False
    )
    splits = (("Train", df_train), ("Valid", df_valid), ("Test", df_test))
    for i, col in enumerate(pred_cols):
        for j, (split_name, split_df) in enumerate(splits):
            axes[i, j].hist(split_df[col])
            axes[i, j].set_xlabel(split_name)
        # Label each row once, on its left-most panel, with the part of the
        # column name before the first underscore.
        axes[i, 0].set_ylabel(col.split("_")[0])

    plt.tight_layout()

    return df_train, df_valid, df_test

# Split once with the PredMin1 targets. The row subsets depend only on the
# train/valid/test flags; pred_cols only selects which histograms are drawn.
df_train_1, df_valid_1, df_test_1 = train_valid_test_split(df, pred_cols=pred_cols_1)

In [None]:
# Same row split as before; this call draws the histograms for the PredMin2 targets.
df_train_2, df_valid_2, df_test_2 = train_valid_test_split(df, pred_cols=pred_cols_2)

In [None]:
# Same row split as before; this call draws the histograms for the PredMin3 targets.
df_train_3, df_valid_3, df_test_3 = train_valid_test_split(df, pred_cols=pred_cols_3)

# Export

In [None]:
# Bundle the three splits and their target-column lists under the keys
# "t+1", "t+2" and "t+3", then persist the bundle with joblib in the
# processed-data folder.
split_dict = {
    label: {
        "train_df": train_part,
        "valid_df": valid_part,
        "test_df": test_part,
        "outputs": target_cols,
    }
    for label, train_part, valid_part, test_part, target_cols in (
        ("t+1", df_train_1, df_valid_1, df_test_1, pred_cols_1),
        ("t+2", df_train_2, df_valid_2, df_test_2, pred_cols_2),
        ("t+3", df_train_3, df_valid_3, df_test_3, pred_cols_3),
    )
}
joblib.dump(split_dict, os.path.join(fp_processed_folder, "mimic_split_dict.joblib"))