In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import splits_generation
import utils

## App2. Sliding window

In [21]:
# SEPSIS DEFINITION
# Question 1) How many hours do we propagate a True antibiotic (abx) flag
#             backwards and forwards? Set by 'Nts_pre' and 'Nts_post'.
# Question 2) How many hours do we propagate from sepsis onset? Set by 'N_prog_sep'.

sep_def = {
    # Parameters for antibiotic propagation
    'Nts_pre': 24, 'Nts_post': 24,
    # Parameter for sepsis propagation
    'N_prog_sep': 12,
    # Parameter for determining sepsis onset
    'increm_sofa': 2,
    # Time step of reference: ICU admission (True) or pre-ICU admission (False)
    'ref_sofa_icu': False
}

# Name of the propagated sepsis label column. It is built exactly once so that
# 'keys', 'label' and 'f_tr_te' can never disagree: the previous mix of
# 'sep_%2s' (space-padded to width 2) and 'sep_' + str(...) only matched while
# N_prog_sep had two or more digits.
sep_label_col = 'sep_' + str(sep_def['N_prog_sep'])

# Columns kept from the raw table before the sepsis labels are derived
# (includes 'abx' and 'sofa', which the labelling helpers are given).
keys_to_select_filter = [
    'stay_id', 'stay_time',
    'hr_raw', 'o2sat_raw', 'temp_raw', 'sbp_raw', 'map_raw', 'dbp_raw', 'resp_raw', 'etco2_raw', 'fio2_raw',
    'be_raw', 'bicar_raw', 'ph_raw', 'pco2_raw', 'cl_raw', 'mg_raw', 'phos_raw', 'k_raw', 'ast_raw', 'bun_raw',
    'alp_raw', 'ca_raw', 'crea_raw', 'bildir_raw', 'glu_raw', 'lact_raw', 'bili_raw', 'tri_raw', 'hct_raw',
    'hgb_raw', 'ptt_raw', 'wbc_raw', 'fgn_raw', 'plt_raw', 'age_static', 'female_static', 'cai_raw', 'na_raw',
    'po2_raw', 'alb_raw', 'alt_raw', 'ck_raw', 'ckmb_raw', 'crp_raw', 'tnt_raw', 'urine_raw', 'basos_raw',
    'bnd_raw', 'eos_raw', 'esr_raw', 'hbco_raw', 'inrpt_raw', 'lymph_raw', 'mch_raw', 'mchc_raw',
    'mcv_raw', 'methb_raw', 'neut_raw', 'pt_raw', 'rbc_raw', 'rdw_raw', 'tco2_raw',
    'weight_static', 'height_static', 'abx', 'sofa',
]

# Columns handed to the split generation: identifiers, selected features, labels.
keys_to_select = [
    'stay_id', 'stay_time', 'hr_raw', 'o2sat_raw', 'dbp_raw', 'map_raw', 'resp_raw', 'fio2_raw', 'crp_raw',
    'po2_raw', 'bili_raw', 'plt_raw', 'crea_raw', 'temp_raw',
    # 'age_static', 'female_static', 'weight_static', 'height_static',
    'sep_onset', sep_label_col,
]

params_to_configure = {
    # File to load
    'path': '../datasets/hirid_0.5.6.parquet',
    'w_pre_onset': None,   # Number of windows pre sep_onset = 1
    'w_post_onset': None,  # Number of windows post sep_onset = 1
    'keys': keys_to_select,
    'label': ['sep_onset', sep_label_col],
    'f_tr_te': ['stay_id', 'stay_time', 'sep_onset', sep_label_col, 'w_id'],
    # Sliding-window step
    'moving_span': 1,
    # Minimum patient length
    'min_length_pat': 0,  # default: 0
    # Type of imputation
    'imputationType': "LVCF",
    # Whether to filter out patients with too little information
    "filter_pat_nans": False,  # if True, the threshold 'th' below applies
    # Threshold: remove patients with less information than this value
    'th': 50,
    # Only select patients with sepsis
    "filter_pat": False,
    "length_window": 7,
}

# One random seed per split; folders[i] is the output sub-folder for seeds[i].
seeds = [34, 56, 78]
folders = ["s1", "s2", "s3"]

In [14]:
# Disabled scratch code, kept for reference only:
# keys_by_split = []
# idx_exp = 2
# params_to_configure['min_length_pat'] = 0
# df, min_length_pat = splits_generation.preprocessing(params_to_configure,
#                                                     sep_def,
#                                                     debug=False)

# --- Read the raw table from the configured parquet file ---
params = params_to_configure
df = pd.read_parquet(params['path'])
n_stays = len(df.stay_id.unique())
print("# of patients:", n_stays)

# --- Labelling step 1: utils.get_SI with the antibiotic propagation horizons ---
df = utils.get_SI(df, sep_def['Nts_pre'], sep_def['Nts_post'])

# Optional restriction to time steps from ICU admission onwards (disabled):
# if sep_def['ref_sofa_icu']:
#     df = df[df.stay_time >= 0].reset_index(drop=True)

# --- Labelling step 2: per-stay baseline SOFA, then sepsis columns ---
# 'bsofa' is a temporary column: it is written before utils.get_sep is called
# and removed again right after.
baseline_sofa = df.groupby('stay_id')['sofa'].apply(utils.f_baseline_sofa)
df['bsofa'] = baseline_sofa.reset_index(level=0, drop=True)
df = utils.get_sep(df, sep_def['N_prog_sep'], sep_def['increm_sofa'])
df = df.drop(columns=["bsofa"])

# of patients: 27374


In [22]:
# Restrict the table to the raw columns listed in keys_to_select_filter, then
# run utils.get_SI on that subset with the antibiotic propagation horizons.
aux_0 = df[keys_to_select_filter]
nts_pre, nts_post = sep_def['Nts_pre'], sep_def['Nts_post']
aux_1 = utils.get_SI(aux_0, nts_pre, nts_post)

In [23]:
# Apply utils.f_baseline_sofa to each stay's 'sofa' series; dropping the
# group level of the resulting index lets the values align with aux_1's rows.
bsofa_by_stay = aux_1.groupby('stay_id')['sofa'].apply(utils.f_baseline_sofa)
aux_1['bsofa'] = bsofa_by_stay.reset_index(level=0, drop=True)

In [27]:
# Derive the sepsis columns from aux_1 using the configured propagation
# horizon ('N_prog_sep') and SOFA increment ('increm_sofa').
aux_2 = utils.get_sep(
    aux_1,
    sep_def['N_prog_sep'],
    sep_def['increm_sofa'],
)

In [32]:
# Cohort summary: number of stays with and without a sepsis onset.
# A stay counts as septic when at least one of its rows has sep_onset == 1.
# Septic stays generally also contain rows with sep_onset == 0, so the
# non-septic count is the complement of the septic stays rather than the
# number of stays having any sep_onset == 0 row.
septic_stay_ids = aux_2.loc[aux_2.sep_onset == 1, 'stay_id'].unique()
all_stay_ids = aux_2.stay_id.unique()
print("# of patients with sepsis", len(septic_stay_ids))
print("# of patients without sepsis", len(all_stay_ids) - len(septic_stay_ids))

27374

In [34]:
# Build the sliding windows over the labelled table, using the configured
# step ('moving_span') and window length ('length_window').
# NOTE(review): this call appears to be long-running on the full table —
# consider caching its result to disk; confirm the runtime.
df_sw = utils.slidingWindow(
    aux_2,
    params_to_configure['moving_span'],
    params_to_configure['length_window'],
)

KeyboardInterrupt: 

In [None]:
def filterWindows(df_sw):
    """Truncate every septic stay after the window holding its last onset.

    For each stay that has at least one row with ``sep_onset == 1``, the
    ``w_id`` of the last such row is read (format ``"<prefix>_<number>"``),
    and every row from the first row of window ``"<prefix>_<number + 1>"``
    onwards is discarded. When that next window does not exist there is
    nothing to cut, so the whole stay is kept.

    NOTE(review): stays without any ``sep_onset == 1`` row are not part of
    the result, exactly as in the original loop — confirm this is intended
    before using the output as a train/test cohort.

    Parameters
    ----------
    df_sw : pandas.DataFrame
        Windowed table with at least the columns ``stay_id``, ``sep_onset``
        and ``w_id``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the truncated stays with a fresh integer index;
        an empty frame with the same columns when no stay has an onset.
    """
    septic_ids = df_sw.loc[df_sw.sep_onset == 1, 'stay_id'].unique()
    septic_rows = df_sw[df_sw.stay_id.isin(septic_ids)]

    kept = []
    # A single groupby pass replaces one full-table boolean scan per patient;
    # sort=False keeps the stays in order of first appearance.
    for _, pat in septic_rows.groupby('stay_id', sort=False):
        # After the reset, index labels and row positions coincide.
        pat = pat.reset_index(drop=True)

        # Row of the last occurrence of sep_onset == 1 within this stay.
        last_sep_onset_idx = pat.index[pat['sep_onset'] == 1][-1]

        # Identifier of the window that follows the one holding that row.
        w_id = pat.at[last_sep_onset_idx, 'w_id'].split("_")
        next_w_id = w_id[0] + "_" + str(int(w_id[1]) + 1)

        next_rows = pat.index[pat['w_id'] == next_w_id]
        if len(next_rows) > 0:
            # Keep only the rows before the next window starts.
            kept.append(pat.iloc[:next_rows[0]])
        else:
            # No later window exists, so there is nothing to cut off.
            kept.append(pat)

    if not kept:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df_sw.iloc[0:0].reset_index(drop=True)
    return pd.concat(kept, ignore_index=True)


# Filtered table consumed by the export and split-generation cells.
df_filter = filterWindows(df_sw)

In [None]:
# Persist the filtered windows next to the notebook; the DataFrame index is
# not written to the file.
df_filter.to_parquet(
    'df_app2_hirid.parquet',
    index=False,
)

In [None]:
# Index of this approach; it selects the output directory "App<idx_exp>".
idx_exp = 2
# Feature-key lists returned for each split, in the order of `seeds`.
keys_by_split = []

for seed, folder in zip(seeds, folders):
    print("split...", folder)
    # The minimum patient length is set to the window length before each split.
    params_to_configure['min_length_pat'] = params_to_configure['length_window']
    X_train, X_test, y_train, y_test, keys, w_id_tr, w_id_te = splits_generation.get_tr_te(
        df_filter,
        params_to_configure,
        seed,
    )

    keys_by_split.append(keys)
    print("X_train:", X_train.shape)
    print("X_test:", X_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    # One output folder per split; create it so saving works on a clean checkout.
    out_dir = Path("./splits/hirid") / ("App" + str(idx_exp)) / folder
    out_dir.mkdir(parents=True, exist_ok=True)

    np.save(out_dir / "X_train_tensor.npy", X_train)
    np.save(out_dir / "y_train_tensor.npy", y_train)
    np.save(out_dir / "X_test_tensor.npy", X_test)
    np.save(out_dir / "y_test_tensor.npy", y_test)

    pd.DataFrame(keys).to_csv(out_dir / "keys.csv")
    w_id_tr.to_csv(out_dir / "w_id_tr.csv")
    w_id_te.to_csv(out_dir / "w_id_te.csv")